No OneTemporary
Actions

Size

6 MB

Referenced Files

None

Subscribers

None

View Options

This file is larger than 256 KB, so syntax highlighting was skipped.

	Index: head/contrib/compiler-rt
	===================================================================
	--- head/contrib/compiler-rt (revision 322319)
	+++ head/contrib/compiler-rt (revision 322320)

	Property changes on: head/contrib/compiler-rt
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/compiler-rt/dist:r321713-322300
	Index: head/contrib/libc++/include/__bsd_locale_defaults.h
	===================================================================
	--- head/contrib/libc++/include/__bsd_locale_defaults.h (revision 322319)
	+++ head/contrib/libc++/include/__bsd_locale_defaults.h (revision 322320)
	@@ -1,33 +1,37 @@
	// -*- C++ -*-
	//===---------------------- __bsd_locale_defaults.h -----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// The BSDs have lots of *_l functions. We don't want to define those symbols
	// on other platforms though, for fear of conflicts with user code. So here,
	// we will define the mapping from an internal macro to the real BSD symbol.
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP_BSD_LOCALE_DEFAULTS_H
	#define _LIBCPP_BSD_LOCALE_DEFAULTS_H

	+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	+#pragma GCC system_header
	+#endif
	+
	#define __libcpp_mb_cur_max_l(loc) MB_CUR_MAX_L(loc)
	#define __libcpp_btowc_l(ch, loc) btowc_l(ch, loc)
	#define __libcpp_wctob_l(wch, loc) wctob_l(wch, loc)
	#define __libcpp_wcsnrtombs_l(dst, src, nwc, len, ps, loc) wcsnrtombs_l(dst, src, nwc, len, ps, loc)
	#define __libcpp_wcrtomb_l(src, wc, ps, loc) wcrtomb_l(src, wc, ps, loc)
	#define __libcpp_mbsnrtowcs_l(dst, src, nms, len, ps, loc) mbsnrtowcs_l(dst, src, nms, len, ps, loc)
	#define __libcpp_mbrtowc_l(pwc, s, n, ps, l) mbrtowc_l(pwc, s, n, ps, l)
	#define __libcpp_mbtowc_l(pwc, pmb, max, l) mbtowc_l(pwc, pmb, max, l)
	#define __libcpp_mbrlen_l(s, n, ps, l) mbrlen_l(s, n, ps, l)
	#define __libcpp_localeconv_l(l) localeconv_l(l)
	#define __libcpp_mbsrtowcs_l(dest, src, len, ps, l) mbsrtowcs_l(dest, src, len, ps, l)
	#define __libcpp_snprintf_l(...) snprintf_l(__VA_ARGS__)
	#define __libcpp_asprintf_l(...) asprintf_l(__VA_ARGS__)
	#define __libcpp_sscanf_l(...) sscanf_l(__VA_ARGS__)

	#endif // _LIBCPP_BSD_LOCALE_DEFAULTS_H
	Index: head/contrib/libc++/include/__bsd_locale_fallbacks.h
	===================================================================
	--- head/contrib/libc++/include/__bsd_locale_fallbacks.h (revision 322319)
	+++ head/contrib/libc++/include/__bsd_locale_fallbacks.h (revision 322320)
	@@ -1,136 +1,140 @@
	// -*- C++ -*-
	//===---------------------- __bsd_locale_fallbacks.h ----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	// The BSDs have lots of *_l functions. This file provides reimplementations
	// of those functions for non-BSD platforms.
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP_BSD_LOCALE_FALLBACKS_DEFAULTS_H
	#define _LIBCPP_BSD_LOCALE_FALLBACKS_DEFAULTS_H

	#include <stdlib.h>
	#include <stdarg.h>
	#include <memory>

	+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	+#pragma GCC system_header
	+#endif
	+
	_LIBCPP_BEGIN_NAMESPACE_STD

	inline _LIBCPP_ALWAYS_INLINE
	decltype(MB_CUR_MAX) __libcpp_mb_cur_max_l(locale_t __l)
	{
	__libcpp_locale_guard __current(__l);
	return MB_CUR_MAX;
	}

	inline _LIBCPP_ALWAYS_INLINE
	wint_t __libcpp_btowc_l(int __c, locale_t __l)
	{
	__libcpp_locale_guard __current(__l);
	return btowc(__c);
	}

	inline _LIBCPP_ALWAYS_INLINE
	int __libcpp_wctob_l(wint_t __c, locale_t __l)
	{
	__libcpp_locale_guard __current(__l);
	return wctob(__c);
	}

	// Fallback for wcsnrtombs_l: forwards every argument except __l to
	// wcsnrtombs() while a __libcpp_locale_guard built from __l is alive.
	// __dest and __src are passed through unchanged, so they must have the
	// pointer types wcsnrtombs() expects (char* and const wchar_t**).
	// Returns the value produced by wcsnrtombs().
	inline _LIBCPP_ALWAYS_INLINE
	size_t __libcpp_wcsnrtombs_l(char *__dest, const wchar_t **__src, size_t __nwc,
	                             size_t __len, mbstate_t *__ps, locale_t __l)
	{
	    __libcpp_locale_guard __current(__l);
	    return wcsnrtombs(__dest, __src, __nwc, __len, __ps);
	}

	// Fallback for wcrtomb_l: calls wcrtomb(__s, __wc, __ps) while a
	// __libcpp_locale_guard built from __l is alive.
	// __s and __ps are forwarded as the char* / mbstate_t* that wcrtomb() takes.
	// Returns the value produced by wcrtomb().
	inline _LIBCPP_ALWAYS_INLINE
	size_t __libcpp_wcrtomb_l(char *__s, wchar_t __wc, mbstate_t *__ps, locale_t __l)
	{
	    __libcpp_locale_guard __current(__l);
	    return wcrtomb(__s, __wc, __ps);
	}

	inline _LIBCPP_ALWAYS_INLINE
	size_t __libcpp_mbsnrtowcs_l(wchar_t * __dest, const char **__src, size_t __nms,
	size_t __len, mbstate_t *__ps, locale_t __l)
	{
	__libcpp_locale_guard __current(__l);
	return mbsnrtowcs(__dest, __src, __nms, __len, __ps);
	}

	// Fallback for mbrtowc_l: calls mbrtowc(__pwc, __s, __n, __ps) while a
	// __libcpp_locale_guard built from __l is alive.
	// __pwc and __s are forwarded as the wchar_t* / const char* that mbrtowc()
	// takes. Returns the value produced by mbrtowc().
	inline _LIBCPP_ALWAYS_INLINE
	size_t __libcpp_mbrtowc_l(wchar_t *__pwc, const char *__s, size_t __n,
	                          mbstate_t *__ps, locale_t __l)
	{
	    __libcpp_locale_guard __current(__l);
	    return mbrtowc(__pwc, __s, __n, __ps);
	}

	// Fallback for mbtowc_l: calls mbtowc(__pwc, __pmb, __max) while a
	// __libcpp_locale_guard built from __l is alive.
	// __pwc and __pmb are forwarded as the wchar_t* / const char* that mbtowc()
	// takes. Returns the value produced by mbtowc().
	inline _LIBCPP_ALWAYS_INLINE
	int __libcpp_mbtowc_l(wchar_t *__pwc, const char *__pmb, size_t __max, locale_t __l)
	{
	    __libcpp_locale_guard __current(__l);
	    return mbtowc(__pwc, __pmb, __max);
	}

	// Fallback for mbrlen_l: calls mbrlen(__s, __n, __ps) while a
	// __libcpp_locale_guard built from __l is alive.
	// __s and __ps are forwarded as the const char* / mbstate_t* that mbrlen()
	// takes. Returns the value produced by mbrlen().
	inline _LIBCPP_ALWAYS_INLINE
	size_t __libcpp_mbrlen_l(const char *__s, size_t __n, mbstate_t *__ps, locale_t __l)
	{
	    __libcpp_locale_guard __current(__l);
	    return mbrlen(__s, __n, __ps);
	}

	inline _LIBCPP_ALWAYS_INLINE
	lconv *__libcpp_localeconv_l(locale_t __l)
	{
	__libcpp_locale_guard __current(__l);
	return localeconv();
	}

	// Fallback for mbsrtowcs_l: calls mbsrtowcs(__dest, __src, __len, __ps)
	// while a __libcpp_locale_guard built from __l is alive.
	// __dest and __src are forwarded as the wchar_t* / const char** that
	// mbsrtowcs() takes. Returns the value produced by mbsrtowcs().
	inline _LIBCPP_ALWAYS_INLINE
	size_t __libcpp_mbsrtowcs_l(wchar_t *__dest, const char **__src, size_t __len,
	                            mbstate_t *__ps, locale_t __l)
	{
	    __libcpp_locale_guard __current(__l);
	    return mbsrtowcs(__dest, __src, __len, __ps);
	}

	// Fallback for snprintf_l: collects the variadic arguments and hands them to
	// vsnprintf(__s, __n, __format, ...) while a __libcpp_locale_guard built
	// from __l is alive.
	// Returns the value produced by vsnprintf().
	inline
	int __libcpp_snprintf_l(char *__s, size_t __n, locale_t __l, const char *__format, ...) {
	    va_list __va;
	    va_start(__va, __format);
	    __libcpp_locale_guard __current(__l);
	    int __res = vsnprintf(__s, __n, __format, __va);
	    va_end(__va);
	    return __res;
	}

	// Fallback for asprintf_l: collects the variadic arguments and hands them to
	// vasprintf(__s, __format, ...) while a __libcpp_locale_guard built from
	// __l is alive.
	// __s is forwarded as the char** out-parameter that vasprintf() takes.
	// Returns the value produced by vasprintf().
	inline
	int __libcpp_asprintf_l(char **__s, locale_t __l, const char *__format, ...) {
	    va_list __va;
	    va_start(__va, __format);
	    __libcpp_locale_guard __current(__l);
	    int __res = vasprintf(__s, __format, __va);
	    va_end(__va);
	    return __res;
	}

	// Fallback for sscanf_l: collects the variadic arguments and hands them to
	// vsscanf(__s, __format, ...) while a __libcpp_locale_guard built from __l
	// is alive.
	// Returns the value produced by vsscanf().
	inline
	int __libcpp_sscanf_l(const char *__s, locale_t __l, const char *__format, ...) {
	    va_list __va;
	    va_start(__va, __format);
	    __libcpp_locale_guard __current(__l);
	    int __res = vsscanf(__s, __format, __va);
	    va_end(__va);
	    return __res;
	}

	_LIBCPP_END_NAMESPACE_STD

	#endif // _LIBCPP_BSD_LOCALE_FALLBACKS_DEFAULTS_H
	Index: head/contrib/libc++/include/__locale
	===================================================================
	--- head/contrib/libc++/include/__locale (revision 322319)
	+++ head/contrib/libc++/include/__locale (revision 322320)
	@@ -1,1501 +1,1501 @@
	// -*- C++ -*-
	//===----------------------------------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP___LOCALE
	#define _LIBCPP___LOCALE

	#include <__config>
	#include <string>
	#include <memory>
	#include <utility>
	#include <mutex>
	#include <cstdint>
	#include <cctype>
	#include <locale.h>
	#if defined(_LIBCPP_MSVCRT_LIKE)
	# include <support/win32/locale_win32.h>
	#elif defined(_AIX)
	# include <support/ibm/xlocale.h>
	#elif defined(__ANDROID__)
	// Android gained the locale aware functions in L (API level 21)
	# include <android/api-level.h>
	# if __ANDROID_API__ <= 20
	# include <support/android/locale_bionic.h>
	# endif
	#elif defined(__sun__)
	# include <xlocale.h>
	# include <support/solaris/xlocale.h>
	#elif defined(_NEWLIB_VERSION)
	# include <support/newlib/xlocale.h>
	-#elif (defined(__GLIBC__) || defined(__APPLE__) || defined(__FreeBSD__) \
	+#elif (defined(__APPLE__) || defined(__FreeBSD__) \
	|| defined(__EMSCRIPTEN__) || defined(__IBMCPP__))
	# include <xlocale.h>
	#elif defined(__Fuchsia__)
	# include <support/fuchsia/xlocale.h>
	#elif defined(_LIBCPP_HAS_MUSL_LIBC)
	# include <support/musl/xlocale.h>
	#endif

	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	#pragma GCC system_header
	#endif

	_LIBCPP_BEGIN_NAMESPACE_STD

	#if !defined(_LIBCPP_LOCALE__L_EXTENSIONS) \|\| defined(_LIBCPP_MSVCRT)
	struct __libcpp_locale_guard {
	_LIBCPP_INLINE_VISIBILITY
	__libcpp_locale_guard(locale_t& __loc) : __old_loc_(uselocale(__loc)) {}

	_LIBCPP_INLINE_VISIBILITY
	~__libcpp_locale_guard() {
	if (__old_loc_)
	uselocale(__old_loc_);
	}

	locale_t __old_loc_;
	private:
	__libcpp_locale_guard(__libcpp_locale_guard const&);
	__libcpp_locale_guard& operator=(__libcpp_locale_guard const&);
	};
	#endif


	class _LIBCPP_TYPE_VIS locale;

	template <class _Facet>
	_LIBCPP_INLINE_VISIBILITY
	bool
	has_facet(const locale&) _NOEXCEPT;

	template <class _Facet>
	_LIBCPP_INLINE_VISIBILITY
	const _Facet&
	use_facet(const locale&);

	class _LIBCPP_TYPE_VIS locale
	{
	public:
	// types:
	class _LIBCPP_TYPE_VIS facet;
	class _LIBCPP_TYPE_VIS id;

	typedef int category;
	_LIBCPP_AVAILABILITY_LOCALE_CATEGORY
	static const category // values assigned here are for exposition only
	none = 0,
	collate = LC_COLLATE_MASK,
	ctype = LC_CTYPE_MASK,
	monetary = LC_MONETARY_MASK,
	numeric = LC_NUMERIC_MASK,
	time = LC_TIME_MASK,
	messages = LC_MESSAGES_MASK,
	all = collate | ctype | monetary | numeric | time | messages;

	// construct/copy/destroy:
	locale() _NOEXCEPT;
	locale(const locale&) _NOEXCEPT;
	explicit locale(const char*);
	explicit locale(const string&);
	locale(const locale&, const char*, category);
	locale(const locale&, const string&, category);
	template <class _Facet>
	_LIBCPP_INLINE_VISIBILITY locale(const locale&, _Facet*);
	locale(const locale&, const locale&, category);

	~locale();

	const locale& operator=(const locale&) _NOEXCEPT;

	template <class _Facet>
	_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
	locale combine(const locale&) const;

	// locale operations:
	string name() const;
	bool operator==(const locale&) const;
	bool operator!=(const locale& __y) const {return !(*this == __y);}
	template <class _CharT, class _Traits, class _Allocator>
	_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
	bool operator()(const basic_string<_CharT, _Traits, _Allocator>&,
	const basic_string<_CharT, _Traits, _Allocator>&) const;

	// global locale objects:
	static locale global(const locale&);
	static const locale& classic();

	private:
	class __imp;
	__imp* __locale_;

	void __install_ctor(const locale&, facet*, long);
	static locale& __global();
	bool has_facet(id&) const;
	const facet* use_facet(id&) const;

	template <class _Facet> friend bool has_facet(const locale&) _NOEXCEPT;
	template <class _Facet> friend const _Facet& use_facet(const locale&);
	};

	class _LIBCPP_TYPE_VIS locale::facet
	: public __shared_count
	{
	protected:
	_LIBCPP_INLINE_VISIBILITY
	explicit facet(size_t __refs = 0)
	: __shared_count(static_cast<long>(__refs)-1) {}

	virtual ~facet();

	// facet(const facet&) = delete; // effectively done in __shared_count
	// void operator=(const facet&) = delete;
	private:
	virtual void __on_zero_shared() _NOEXCEPT;
	};

	class _LIBCPP_TYPE_VIS locale::id
	{
	once_flag __flag_;
	int32_t __id_;

	static int32_t __next_id;
	public:
	_LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR id() :__id_(0) {}
	private:
	void __init();
	void operator=(const id&); // = delete;
	id(const id&); // = delete;
	public: // only needed for tests
	long __get();

	friend class locale;
	friend class locale::__imp;
	};

	template <class _Facet>
	inline _LIBCPP_INLINE_VISIBILITY
	locale::locale(const locale& __other, _Facet* __f)
	{
	__install_ctor(__other, __f, __f ? __f->id.__get() : 0);
	}

	template <class _Facet>
	locale
	locale::combine(const locale& __other) const
	{
	if (!_VSTD::has_facet<_Facet>(__other))
	__throw_runtime_error("locale::combine: locale missing facet");

	return locale(*this, &const_cast<_Facet&>(_VSTD::use_facet<_Facet>(__other)));
	}

	template <class _Facet>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	has_facet(const locale& __l) _NOEXCEPT
	{
	return __l.has_facet(_Facet::id);
	}

	template <class _Facet>
	inline _LIBCPP_INLINE_VISIBILITY
	const _Facet&
	use_facet(const locale& __l)
	{
	return static_cast<const _Facet&>(*__l.use_facet(_Facet::id));
	}

	// template <class _CharT> class collate;

	template <class _CharT>
	class _LIBCPP_TEMPLATE_VIS collate
	: public locale::facet
	{
	public:
	typedef _CharT char_type;
	typedef basic_string<char_type> string_type;

	_LIBCPP_INLINE_VISIBILITY
	explicit collate(size_t __refs = 0)
	: locale::facet(__refs) {}

	_LIBCPP_INLINE_VISIBILITY
	int compare(const char_type* __lo1, const char_type* __hi1,
	const char_type* __lo2, const char_type* __hi2) const
	{
	return do_compare(__lo1, __hi1, __lo2, __hi2);
	}

	_LIBCPP_INLINE_VISIBILITY
	string_type transform(const char_type* __lo, const char_type* __hi) const
	{
	return do_transform(__lo, __hi);
	}

	_LIBCPP_INLINE_VISIBILITY
	long hash(const char_type* __lo, const char_type* __hi) const
	{
	return do_hash(__lo, __hi);
	}

	static locale::id id;

	protected:
	~collate();
	virtual int do_compare(const char_type* __lo1, const char_type* __hi1,
	const char_type* __lo2, const char_type* __hi2) const;
	virtual string_type do_transform(const char_type* __lo, const char_type* __hi) const
	{return string_type(__lo, __hi);}
	virtual long do_hash(const char_type* __lo, const char_type* __hi) const;
	};

	template <class _CharT> locale::id collate<_CharT>::id;

	template <class _CharT>
	collate<_CharT>::~collate()
	{
	}

	// Default collate comparison: steps through both ranges together and
	// compares the characters they point at (not the pointer values).
	// Returns -1 if [__lo1, __hi1) runs out first or holds the smaller character
	// at the first difference, and 1 if it holds the larger one. Once
	// [__lo2, __hi2) is exhausted the result is (__lo1 != __hi1): 1 when the
	// first range still has characters left, 0 when both ended together.
	template <class _CharT>
	int
	collate<_CharT>::do_compare(const char_type* __lo1, const char_type* __hi1,
	                            const char_type* __lo2, const char_type* __hi2) const
	{
	    for (; __lo2 != __hi2; ++__lo1, ++__lo2)
	    {
	        if (__lo1 == __hi1 || *__lo1 < *__lo2)
	            return -1;
	        if (*__lo2 < *__lo1)
	            return 1;
	    }
	    return __lo1 != __hi1;
	}

	// Default collate hash over [__lo, __hi).
	// For each character: shift the accumulator left by four bits and add the
	// character, then take the accumulator's top four bits (__mask), clear them
	// and XOR them back in shifted down by __sr bits.
	// Returns the accumulator converted to long; an empty range yields 0.
	template <class _CharT>
	long
	collate<_CharT>::do_hash(const char_type* __lo, const char_type* __hi) const
	{
	    size_t __h = 0;
	    const size_t __sr = __CHAR_BIT__ * sizeof(size_t) - 8;
	    const size_t __mask = size_t(0xF) << (__sr + 4);  // top nibble of size_t
	    for(const char_type* __p = __lo; __p != __hi; ++__p)
	    {
	        __h = (__h << 4) + static_cast<size_t>(*__p);
	        size_t __g = __h & __mask;
	        __h ^= __g | (__g >> __sr);
	    }
	    return static_cast<long>(__h);
	}

	_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate<char>)
	_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS collate<wchar_t>)

	// template <class CharT> class collate_byname;

	template <class _CharT> class _LIBCPP_TEMPLATE_VIS collate_byname;

	template <>
	class _LIBCPP_TYPE_VIS collate_byname<char>
	: public collate<char>
	{
	locale_t __l;
	public:
	typedef char char_type;
	typedef basic_string<char_type> string_type;

	explicit collate_byname(const char* __n, size_t __refs = 0);
	explicit collate_byname(const string& __n, size_t __refs = 0);

	protected:
	~collate_byname();
	virtual int do_compare(const char_type* __lo1, const char_type* __hi1,
	const char_type* __lo2, const char_type* __hi2) const;
	virtual string_type do_transform(const char_type* __lo, const char_type* __hi) const;
	};

	template <>
	class _LIBCPP_TYPE_VIS collate_byname<wchar_t>
	: public collate<wchar_t>
	{
	locale_t __l;
	public:
	typedef wchar_t char_type;
	typedef basic_string<char_type> string_type;

	explicit collate_byname(const char* __n, size_t __refs = 0);
	explicit collate_byname(const string& __n, size_t __refs = 0);

	protected:
	~collate_byname();

	virtual int do_compare(const char_type* __lo1, const char_type* __hi1,
	const char_type* __lo2, const char_type* __hi2) const;
	virtual string_type do_transform(const char_type* __lo, const char_type* __hi) const;
	};

	template <class _CharT, class _Traits, class _Allocator>
	bool
	locale::operator()(const basic_string<_CharT, _Traits, _Allocator>& __x,
	const basic_string<_CharT, _Traits, _Allocator>& __y) const
	{
	return _VSTD::use_facet<_VSTD::collate<_CharT> >(*this).compare(
	__x.data(), __x.data() + __x.size(),
	__y.data(), __y.data() + __y.size()) < 0;
	}

	// template <class charT> class ctype

	class _LIBCPP_TYPE_VIS ctype_base
	{
	public:
	#if defined(__GLIBC__)
	typedef unsigned short mask;
	static const mask space = _ISspace;
	static const mask print = _ISprint;
	static const mask cntrl = _IScntrl;
	static const mask upper = _ISupper;
	static const mask lower = _ISlower;
	static const mask alpha = _ISalpha;
	static const mask digit = _ISdigit;
	static const mask punct = _ISpunct;
	static const mask xdigit = _ISxdigit;
	static const mask blank = _ISblank;
	#elif defined(_LIBCPP_MSVCRT_LIKE)
	typedef unsigned short mask;
	static const mask space = _SPACE;
	static const mask print = _BLANK|_PUNCT|_ALPHA|_DIGIT;
	static const mask cntrl = _CONTROL;
	static const mask upper = _UPPER;
	static const mask lower = _LOWER;
	static const mask alpha = _ALPHA;
	static const mask digit = _DIGIT;
	static const mask punct = _PUNCT;
	static const mask xdigit = _HEX;
	static const mask blank = _BLANK;
	# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT
	#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__EMSCRIPTEN__) || defined(__NetBSD__)
	# ifdef __APPLE__
	typedef __uint32_t mask;
	# elif defined(__FreeBSD__)
	typedef unsigned long mask;
	# elif defined(__EMSCRIPTEN__) || defined(__NetBSD__)
	typedef unsigned short mask;
	# endif
	static const mask space = _CTYPE_S;
	static const mask print = _CTYPE_R;
	static const mask cntrl = _CTYPE_C;
	static const mask upper = _CTYPE_U;
	static const mask lower = _CTYPE_L;
	static const mask alpha = _CTYPE_A;
	static const mask digit = _CTYPE_D;
	static const mask punct = _CTYPE_P;
	static const mask xdigit = _CTYPE_X;

	# if defined(__NetBSD__)
	static const mask blank = _CTYPE_BL;
	# else
	static const mask blank = _CTYPE_B;
	# endif
	#elif defined(__sun__) || defined(_AIX)
	typedef unsigned int mask;
	static const mask space = _ISSPACE;
	static const mask print = _ISPRINT;
	static const mask cntrl = _ISCNTRL;
	static const mask upper = _ISUPPER;
	static const mask lower = _ISLOWER;
	static const mask alpha = _ISALPHA;
	static const mask digit = _ISDIGIT;
	static const mask punct = _ISPUNCT;
	static const mask xdigit = _ISXDIGIT;
	static const mask blank = _ISBLANK;
	#elif defined(_NEWLIB_VERSION)
	// Same type as Newlib's _ctype_ array in newlib/libc/include/ctype.h.
	typedef char mask;
	static const mask space = _S;
	static const mask print = _P | _U | _L | _N | _B;
	static const mask cntrl = _C;
	static const mask upper = _U;
	static const mask lower = _L;
	static const mask alpha = _U | _L;
	static const mask digit = _N;
	static const mask punct = _P;
	static const mask xdigit = _X | _N;
	static const mask blank = _B;
	# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_PRINT
	# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_ALPHA
	# define _LIBCPP_CTYPE_MASK_IS_COMPOSITE_XDIGIT
	#else
	typedef unsigned long mask;
	static const mask space = 1<<0;
	static const mask print = 1<<1;
	static const mask cntrl = 1<<2;
	static const mask upper = 1<<3;
	static const mask lower = 1<<4;
	static const mask alpha = 1<<5;
	static const mask digit = 1<<6;
	static const mask punct = 1<<7;
	static const mask xdigit = 1<<8;
	static const mask blank = 1<<9;
	#endif
	static const mask alnum = alpha | digit;
	static const mask graph = alnum | punct;

	_LIBCPP_ALWAYS_INLINE ctype_base() {}
	};

	template <class _CharT> class _LIBCPP_TEMPLATE_VIS ctype;

	template <>
	class _LIBCPP_TYPE_VIS ctype<wchar_t>
	: public locale::facet,
	public ctype_base
	{
	public:
	typedef wchar_t char_type;

	_LIBCPP_ALWAYS_INLINE
	explicit ctype(size_t __refs = 0)
	: locale::facet(__refs) {}

	_LIBCPP_ALWAYS_INLINE
	bool is(mask __m, char_type __c) const
	{
	return do_is(__m, __c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const
	{
	return do_is(__low, __high, __vec);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* scan_is(mask __m, const char_type* __low, const char_type* __high) const
	{
	return do_scan_is(__m, __low, __high);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const
	{
	return do_scan_not(__m, __low, __high);
	}

	_LIBCPP_ALWAYS_INLINE
	char_type toupper(char_type __c) const
	{
	return do_toupper(__c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* toupper(char_type* __low, const char_type* __high) const
	{
	return do_toupper(__low, __high);
	}

	_LIBCPP_ALWAYS_INLINE
	char_type tolower(char_type __c) const
	{
	return do_tolower(__c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* tolower(char_type* __low, const char_type* __high) const
	{
	return do_tolower(__low, __high);
	}

	_LIBCPP_ALWAYS_INLINE
	char_type widen(char __c) const
	{
	return do_widen(__c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char* widen(const char* __low, const char* __high, char_type* __to) const
	{
	return do_widen(__low, __high, __to);
	}

	_LIBCPP_ALWAYS_INLINE
	char narrow(char_type __c, char __dfault) const
	{
	return do_narrow(__c, __dfault);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* narrow(const char_type* __low, const char_type* __high, char __dfault, char* __to) const
	{
	return do_narrow(__low, __high, __dfault, __to);
	}

	static locale::id id;

	protected:
	~ctype();
	virtual bool do_is(mask __m, char_type __c) const;
	virtual const char_type* do_is(const char_type* __low, const char_type* __high, mask* __vec) const;
	virtual const char_type* do_scan_is(mask __m, const char_type* __low, const char_type* __high) const;
	virtual const char_type* do_scan_not(mask __m, const char_type* __low, const char_type* __high) const;
	virtual char_type do_toupper(char_type) const;
	virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
	virtual char_type do_tolower(char_type) const;
	virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
	virtual char_type do_widen(char) const;
	virtual const char* do_widen(const char* __low, const char* __high, char_type* __dest) const;
	virtual char do_narrow(char_type, char __dfault) const;
	virtual const char_type* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __dest) const;
	};

	template <>
	class _LIBCPP_TYPE_VIS ctype<char>
	: public locale::facet, public ctype_base
	{
	const mask* __tab_;
	bool __del_;
	public:
	typedef char char_type;

	explicit ctype(const mask* __tab = 0, bool __del = false, size_t __refs = 0);

	_LIBCPP_ALWAYS_INLINE
	bool is(mask __m, char_type __c) const
	{
	return isascii(__c) ? (__tab_[static_cast<int>(__c)] & __m) !=0 : false;
	}

	// Classifies every character in [__low, __high): stores the character's
	// __tab_ entry in the matching slot of __vec, or 0 when isascii() rejects
	// the character. Returns __high (the loop runs until __low reaches it).
	_LIBCPP_ALWAYS_INLINE
	const char_type* is(const char_type* __low, const char_type* __high, mask* __vec) const
	{
	    for (; __low != __high; ++__low, ++__vec)
	        *__vec = isascii(*__low) ? __tab_[static_cast<int>(*__low)] : 0;
	    return __low;
	}

	// Returns a pointer to the first character in [__low, __high) that
	// isascii() accepts and whose __tab_ entry shares a bit with __m, or __high
	// if no character qualifies.
	_LIBCPP_ALWAYS_INLINE
	const char_type* scan_is (mask __m, const char_type* __low, const char_type* __high) const
	{
	    for (; __low != __high; ++__low)
	        if (isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m))
	            break;
	    return __low;
	}

	// Returns a pointer to the first character in [__low, __high) that either
	// fails isascii() or whose __tab_ entry shares no bit with __m, or __high if
	// every character matches.
	_LIBCPP_ALWAYS_INLINE
	const char_type* scan_not(mask __m, const char_type* __low, const char_type* __high) const
	{
	    for (; __low != __high; ++__low)
	        if (!(isascii(*__low) && (__tab_[static_cast<int>(*__low)] & __m)))
	            break;
	    return __low;
	}

	_LIBCPP_ALWAYS_INLINE
	char_type toupper(char_type __c) const
	{
	return do_toupper(__c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* toupper(char_type* __low, const char_type* __high) const
	{
	return do_toupper(__low, __high);
	}

	_LIBCPP_ALWAYS_INLINE
	char_type tolower(char_type __c) const
	{
	return do_tolower(__c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char_type* tolower(char_type* __low, const char_type* __high) const
	{
	return do_tolower(__low, __high);
	}

	_LIBCPP_ALWAYS_INLINE
	char_type widen(char __c) const
	{
	return do_widen(__c);
	}

	_LIBCPP_ALWAYS_INLINE
	const char* widen(const char* __low, const char* __high, char_type* __to) const
	{
	return do_widen(__low, __high, __to);
	}

	_LIBCPP_ALWAYS_INLINE
	char narrow(char_type __c, char __dfault) const
	{
	return do_narrow(__c, __dfault);
	}

	_LIBCPP_ALWAYS_INLINE
	const char* narrow(const char_type* __low, const char_type* __high, char __dfault, char* __to) const
	{
	return do_narrow(__low, __high, __dfault, __to);
	}

	static locale::id id;

	#ifdef _CACHED_RUNES
	static const size_t table_size = _CACHED_RUNES;
	#else
	static const size_t table_size = 256; // FIXME: Don't hardcode this.
	#endif
	_LIBCPP_ALWAYS_INLINE const mask* table() const _NOEXCEPT {return __tab_;}
	static const mask* classic_table() _NOEXCEPT;
	#if defined(__GLIBC__) || defined(__EMSCRIPTEN__)
	static const int* __classic_upper_table() _NOEXCEPT;
	static const int* __classic_lower_table() _NOEXCEPT;
	#endif
	#if defined(__NetBSD__)
	static const short* __classic_upper_table() _NOEXCEPT;
	static const short* __classic_lower_table() _NOEXCEPT;
	#endif

	protected:
	~ctype();
	virtual char_type do_toupper(char_type __c) const;
	virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
	virtual char_type do_tolower(char_type __c) const;
	virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
	virtual char_type do_widen(char __c) const;
	virtual const char* do_widen(const char* __low, const char* __high, char_type* __to) const;
	virtual char do_narrow(char_type __c, char __dfault) const;
	virtual const char* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __to) const;
	};

	// template <class CharT> class ctype_byname;

	template <class _CharT> class _LIBCPP_TEMPLATE_VIS ctype_byname;

	template <>
	class _LIBCPP_TYPE_VIS ctype_byname<char>
	: public ctype<char>
	{
	locale_t __l;

	public:
	explicit ctype_byname(const char*, size_t = 0);
	explicit ctype_byname(const string&, size_t = 0);

	protected:
	~ctype_byname();
	virtual char_type do_toupper(char_type) const;
	virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
	virtual char_type do_tolower(char_type) const;
	virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
	};

	template <>
	class _LIBCPP_TYPE_VIS ctype_byname<wchar_t>
	: public ctype<wchar_t>
	{
	locale_t __l;

	public:
	explicit ctype_byname(const char*, size_t = 0);
	explicit ctype_byname(const string&, size_t = 0);

	protected:
	~ctype_byname();
	virtual bool do_is(mask __m, char_type __c) const;
	virtual const char_type* do_is(const char_type* __low, const char_type* __high, mask* __vec) const;
	virtual const char_type* do_scan_is(mask __m, const char_type* __low, const char_type* __high) const;
	virtual const char_type* do_scan_not(mask __m, const char_type* __low, const char_type* __high) const;
	virtual char_type do_toupper(char_type) const;
	virtual const char_type* do_toupper(char_type* __low, const char_type* __high) const;
	virtual char_type do_tolower(char_type) const;
	virtual const char_type* do_tolower(char_type* __low, const char_type* __high) const;
	virtual char_type do_widen(char) const;
	virtual const char* do_widen(const char* __low, const char* __high, char_type* __dest) const;
	virtual char do_narrow(char_type, char __dfault) const;
	virtual const char_type* do_narrow(const char_type* __low, const char_type* __high, char __dfault, char* __dest) const;
	};

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isspace(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::space, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isprint(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::print, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	iscntrl(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::cntrl, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isupper(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::upper, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	islower(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::lower, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isalpha(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::alpha, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isdigit(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::digit, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	ispunct(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::punct, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isxdigit(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::xdigit, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isalnum(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::alnum, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	bool
	isgraph(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).is(ctype_base::graph, __c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	_CharT
	toupper(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).toupper(__c);
	}

	template <class _CharT>
	inline _LIBCPP_INLINE_VISIBILITY
	_CharT
	tolower(_CharT __c, const locale& __loc)
	{
	return use_facet<ctype<_CharT> >(__loc).tolower(__c);
	}

	// codecvt_base

	class _LIBCPP_TYPE_VIS codecvt_base
	{
	public:
	_LIBCPP_ALWAYS_INLINE codecvt_base() {}
	enum result {ok, partial, error, noconv};
	};

	// template <class internT, class externT, class stateT> class codecvt;

	template <class _InternT, class _ExternT, class _StateT> class _LIBCPP_TEMPLATE_VIS codecvt;

	// template <> class codecvt<char, char, mbstate_t>

	// Explicit specialization of codecvt with intern_type char and extern_type
	// char. Every public member is a thin inline forwarder to the protected
	// virtual do_* member of the same name; the virtuals are only declared here.
	template <>
	class _LIBCPP_TYPE_VIS codecvt<char, char, mbstate_t>
	: public locale::facet,
	public codecvt_base
	{
	public:
	typedef char intern_type;
	typedef char extern_type;
	typedef mbstate_t state_type;

	// __refs is handed straight to the locale::facet base.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt(size_t __refs = 0)
	: locale::facet(__refs) {}

	// Forwards to do_out with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_unshift with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_unshift(__st, __to, __to_end, __to_nxt);
	}

	// Forwards to do_in with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
	{
	return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_encoding.
	_LIBCPP_ALWAYS_INLINE
	int encoding() const _NOEXCEPT
	{
	return do_encoding();
	}

	// Forwards to do_always_noconv.
	_LIBCPP_ALWAYS_INLINE
	bool always_noconv() const _NOEXCEPT
	{
	return do_always_noconv();
	}

	// Forwards to do_length with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
	{
	return do_length(__st, __frm, __end, __mx);
	}

	// Forwards to do_max_length.
	_LIBCPP_ALWAYS_INLINE
	int max_length() const _NOEXCEPT
	{
	return do_max_length();
	}

	static locale::id id;

	protected:
	// Constructor taking a name; the const char* argument is unnamed and
	// unused here. This is the form codecvt_byname calls.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt(const char*, size_t __refs = 0)
	: locale::facet(__refs) {}

	~codecvt();

	// Customization points; declared here, defined outside this header section.
	virtual result do_out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual result do_in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
	virtual result do_unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual int do_encoding() const _NOEXCEPT;
	virtual bool do_always_noconv() const _NOEXCEPT;
	virtual int do_length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
	virtual int do_max_length() const _NOEXCEPT;
	};

	// template <> class codecvt<wchar_t, char, mbstate_t>

	// Explicit specialization of codecvt with intern_type wchar_t and
	// extern_type char. Unlike the char/char specialization it stores a
	// locale_t, and both constructors are only declared here (defined
	// elsewhere). Public members forward to the protected virtual do_* members.
	template <>
	class _LIBCPP_TYPE_VIS codecvt<wchar_t, char, mbstate_t>
	: public locale::facet,
	public codecvt_base
	{
	// Locale handle held by the facet; how it is set up is not visible here.
	locale_t __l;
	public:
	typedef wchar_t intern_type;
	typedef char extern_type;
	typedef mbstate_t state_type;

	explicit codecvt(size_t __refs = 0);

	// Forwards to do_out with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_unshift with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_unshift(__st, __to, __to_end, __to_nxt);
	}

	// Forwards to do_in with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
	{
	return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_encoding.
	_LIBCPP_ALWAYS_INLINE
	int encoding() const _NOEXCEPT
	{
	return do_encoding();
	}

	// Forwards to do_always_noconv.
	_LIBCPP_ALWAYS_INLINE
	bool always_noconv() const _NOEXCEPT
	{
	return do_always_noconv();
	}

	// Forwards to do_length with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
	{
	return do_length(__st, __frm, __end, __mx);
	}

	// Forwards to do_max_length.
	_LIBCPP_ALWAYS_INLINE
	int max_length() const _NOEXCEPT
	{
	return do_max_length();
	}

	static locale::id id;

	protected:
	// Named-locale constructor; this is the form codecvt_byname calls.
	explicit codecvt(const char*, size_t __refs = 0);

	~codecvt();

	// Customization points; declared here, defined outside this header section.
	virtual result do_out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual result do_in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
	virtual result do_unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual int do_encoding() const _NOEXCEPT;
	virtual bool do_always_noconv() const _NOEXCEPT;
	virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
	virtual int do_max_length() const _NOEXCEPT;
	};

	// template <> class codecvt<char16_t, char, mbstate_t>

	// Explicit specialization of codecvt with intern_type char16_t and
	// extern_type char. Public members forward to the protected virtual do_*
	// members. __narrow_to_utf8<16> and __widen_from_utf8<16> derive from this
	// class to reach do_out / do_in directly.
	template <>
	class _LIBCPP_TYPE_VIS codecvt<char16_t, char, mbstate_t>
	: public locale::facet,
	public codecvt_base
	{
	public:
	typedef char16_t intern_type;
	typedef char extern_type;
	typedef mbstate_t state_type;

	// __refs is handed straight to the locale::facet base.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt(size_t __refs = 0)
	: locale::facet(__refs) {}

	// Forwards to do_out with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_unshift with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_unshift(__st, __to, __to_end, __to_nxt);
	}

	// Forwards to do_in with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
	{
	return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_encoding.
	_LIBCPP_ALWAYS_INLINE
	int encoding() const _NOEXCEPT
	{
	return do_encoding();
	}

	// Forwards to do_always_noconv.
	_LIBCPP_ALWAYS_INLINE
	bool always_noconv() const _NOEXCEPT
	{
	return do_always_noconv();
	}

	// Forwards to do_length with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
	{
	return do_length(__st, __frm, __end, __mx);
	}

	// Forwards to do_max_length.
	_LIBCPP_ALWAYS_INLINE
	int max_length() const _NOEXCEPT
	{
	return do_max_length();
	}

	static locale::id id;

	protected:
	// Constructor taking a name; the const char* argument is unnamed and
	// unused here. This is the form codecvt_byname calls.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt(const char*, size_t __refs = 0)
	: locale::facet(__refs) {}

	~codecvt();

	// Customization points; declared here, defined outside this header section.
	virtual result do_out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual result do_in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
	virtual result do_unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual int do_encoding() const _NOEXCEPT;
	virtual bool do_always_noconv() const _NOEXCEPT;
	virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
	virtual int do_max_length() const _NOEXCEPT;
	};

	// template <> class codecvt<char32_t, char, mbstate_t>

	// Explicit specialization of codecvt with intern_type char32_t and
	// extern_type char. Public members forward to the protected virtual do_*
	// members. __narrow_to_utf8<32> and __widen_from_utf8<32> derive from this
	// class to reach do_out / do_in directly.
	template <>
	class _LIBCPP_TYPE_VIS codecvt<char32_t, char, mbstate_t>
	: public locale::facet,
	public codecvt_base
	{
	public:
	typedef char32_t intern_type;
	typedef char extern_type;
	typedef mbstate_t state_type;

	// __refs is handed straight to the locale::facet base.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt(size_t __refs = 0)
	: locale::facet(__refs) {}

	// Forwards to do_out with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_out(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_unshift with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const
	{
	return do_unshift(__st, __to, __to_end, __to_nxt);
	}

	// Forwards to do_in with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	result in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const
	{
	return do_in(__st, __frm, __frm_end, __frm_nxt, __to, __to_end, __to_nxt);
	}

	// Forwards to do_encoding.
	_LIBCPP_ALWAYS_INLINE
	int encoding() const _NOEXCEPT
	{
	return do_encoding();
	}

	// Forwards to do_always_noconv.
	_LIBCPP_ALWAYS_INLINE
	bool always_noconv() const _NOEXCEPT
	{
	return do_always_noconv();
	}

	// Forwards to do_length with the same arguments.
	_LIBCPP_ALWAYS_INLINE
	int length(state_type& __st, const extern_type* __frm, const extern_type* __end, size_t __mx) const
	{
	return do_length(__st, __frm, __end, __mx);
	}

	// Forwards to do_max_length.
	_LIBCPP_ALWAYS_INLINE
	int max_length() const _NOEXCEPT
	{
	return do_max_length();
	}

	static locale::id id;

	protected:
	// Constructor taking a name; the const char* argument is unnamed and
	// unused here. This is the form codecvt_byname calls.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt(const char*, size_t __refs = 0)
	: locale::facet(__refs) {}

	~codecvt();

	// Customization points; declared here, defined outside this header section.
	virtual result do_out(state_type& __st,
	const intern_type* __frm, const intern_type* __frm_end, const intern_type*& __frm_nxt,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual result do_in(state_type& __st,
	const extern_type* __frm, const extern_type* __frm_end, const extern_type*& __frm_nxt,
	intern_type* __to, intern_type* __to_end, intern_type*& __to_nxt) const;
	virtual result do_unshift(state_type& __st,
	extern_type* __to, extern_type* __to_end, extern_type*& __to_nxt) const;
	virtual int do_encoding() const _NOEXCEPT;
	virtual bool do_always_noconv() const _NOEXCEPT;
	virtual int do_length(state_type&, const extern_type* __frm, const extern_type* __end, size_t __mx) const;
	virtual int do_max_length() const _NOEXCEPT;
	};

	// template <class _InternT, class _ExternT, class _StateT> class codecvt_byname

	// Derives from codecvt<_InternT, _ExternT, _StateT> and constructs it via
	// the base's (const char*, size_t) constructor, passing the locale name.
	template <class _InternT, class _ExternT, class _StateT>
	class _LIBCPP_TEMPLATE_VIS codecvt_byname
	: public codecvt<_InternT, _ExternT, _StateT>
	{
	public:
	// __nm and __refs are forwarded unchanged to the base constructor.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt_byname(const char* __nm, size_t __refs = 0)
	: codecvt<_InternT, _ExternT, _StateT>(__nm, __refs) {}
	// Same as above, using __nm.c_str() as the name.
	_LIBCPP_ALWAYS_INLINE
	explicit codecvt_byname(const string& __nm, size_t __refs = 0)
	: codecvt<_InternT, _ExternT, _StateT>(__nm.c_str(), __refs) {}
	protected:
	~codecvt_byname();
	};

	// Out-of-line destructor with an empty body.
	template <class _InternT, class _ExternT, class _StateT>
	codecvt_byname<_InternT, _ExternT, _StateT>::~codecvt_byname()
	{
	}

	// codecvt_byname instantiations for the four character types specialized
	// above. NOTE(review): _LIBCPP_EXTERN_TEMPLATE2 is defined elsewhere;
	// presumably it expands to an extern template declaration -- confirm.
	_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<char, char, mbstate_t>)
	_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<wchar_t, char, mbstate_t>)
	_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<char16_t, char, mbstate_t>)
	_LIBCPP_EXTERN_TEMPLATE2(class _LIBCPP_EXTERN_TEMPLATE_TYPE_VIS codecvt_byname<char32_t, char, mbstate_t>)

	// Declared noreturn; called with a message by the __narrow_to_utf8 and
	// __widen_from_utf8 helpers below when a conversion fails.
	_LIBCPP_NORETURN _LIBCPP_FUNC_VIS void __throw_runtime_error(const char*);

	// Primary template of the wide-to-narrow helper, selected by _Np.
	// Specializations for _Np == 8, 16 and 32 follow; this primary template
	// only declares operator() and provides no definition for it here.
	template <size_t _Np>
	struct __narrow_to_utf8
	{
	// Takes an output iterator and the source range [__wb, __we).
	template <class _OutputIterator, class _CharT>
	_OutputIterator
	operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const;
	};

	// Specialization for _Np == 8: no conversion is performed, each source
	// element is copied to the output iterator as-is.
	template <>
	struct __narrow_to_utf8<8>
	{
	    // Copies every element of [__wb, __we) through __s and returns the
	    // advanced output iterator.
	    template <class _OutputIterator, class _CharT>
	    _LIBCPP_ALWAYS_INLINE
	    _OutputIterator
	    operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const
	    {
	        for (; __wb < __we; ++__wb, ++__s)
	            *__s = *__wb;   // write the element, not the pointer
	        return __s;
	    }
	};

	// Specialization for _Np == 16. Derives from codecvt<char16_t, char,
	// mbstate_t> so that it can call the protected do_out directly.
	template <>
	struct __narrow_to_utf8<16>
	    : public codecvt<char16_t, char, mbstate_t>
	{
	    // Passes 1 as the __refs argument of the base facet.
	    _LIBCPP_ALWAYS_INLINE
	    __narrow_to_utf8() : codecvt<char16_t, char, mbstate_t>(1) {}

	    ~__narrow_to_utf8();

	    // Converts [__wb, __we), reinterpreted as char16_t, through do_out in
	    // chunks of at most 32 output chars, writing each produced char through
	    // __s. Returns the advanced output iterator. Calls
	    // __throw_runtime_error("locale not supported") if do_out reports
	    // error or consumes no input.
	    template <class _OutputIterator, class _CharT>
	    _LIBCPP_ALWAYS_INLINE
	    _OutputIterator
	    operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const
	    {
	        result __r = ok;
	        // NOTE(review): __mb is handed to do_out without initialization --
	        // confirm the conversion never reads it.
	        mbstate_t __mb;
	        while (__wb < __we && __r != error)
	        {
	            const int __sz = 32;
	            char __buf[__sz];
	            char* __bn;
	            const char16_t* __wn = (const char16_t*)__wb;
	            __r = do_out(__mb, (const char16_t*)__wb, (const char16_t*)__we, __wn,
	                         __buf, __buf+__sz, __bn);
	            // No forward progress would loop forever, so treat it as failure.
	            if (__r == codecvt_base::error || __wn == (const char16_t*)__wb)
	                __throw_runtime_error("locale not supported");
	            for (const char* __p = __buf; __p < __bn; ++__p, ++__s)
	                *__s = *__p;
	            __wb = (const _CharT*)__wn;
	        }
	        return __s;
	    }
	};

	// Specialization for _Np == 32. Derives from codecvt<char32_t, char,
	// mbstate_t> so that it can call the protected do_out directly.
	template <>
	struct __narrow_to_utf8<32>
	    : public codecvt<char32_t, char, mbstate_t>
	{
	    // Passes 1 as the __refs argument of the base facet.
	    _LIBCPP_ALWAYS_INLINE
	    __narrow_to_utf8() : codecvt<char32_t, char, mbstate_t>(1) {}

	    ~__narrow_to_utf8();

	    // Converts [__wb, __we), reinterpreted as char32_t, through do_out in
	    // chunks of at most 32 output chars, writing each produced char through
	    // __s. Returns the advanced output iterator. Calls
	    // __throw_runtime_error("locale not supported") if do_out reports
	    // error or consumes no input.
	    template <class _OutputIterator, class _CharT>
	    _LIBCPP_ALWAYS_INLINE
	    _OutputIterator
	    operator()(_OutputIterator __s, const _CharT* __wb, const _CharT* __we) const
	    {
	        result __r = ok;
	        // NOTE(review): __mb is handed to do_out without initialization --
	        // confirm the conversion never reads it.
	        mbstate_t __mb;
	        while (__wb < __we && __r != error)
	        {
	            const int __sz = 32;
	            char __buf[__sz];
	            char* __bn;
	            const char32_t* __wn = (const char32_t*)__wb;
	            __r = do_out(__mb, (const char32_t*)__wb, (const char32_t*)__we, __wn,
	                         __buf, __buf+__sz, __bn);
	            // No forward progress would loop forever, so treat it as failure.
	            if (__r == codecvt_base::error || __wn == (const char32_t*)__wb)
	                __throw_runtime_error("locale not supported");
	            for (const char* __p = __buf; __p < __bn; ++__p, ++__s)
	                *__s = *__p;
	            __wb = (const _CharT*)__wn;
	        }
	        return __s;
	    }
	};

	// Primary template of the narrow-to-wide helper, selected by _Np.
	// Specializations for _Np == 8, 16 and 32 follow; this primary template
	// only declares operator() and provides no definition for it here.
	template <size_t _Np>
	struct __widen_from_utf8
	{
	// Takes an output iterator and the narrow source range [__nb, __ne).
	template <class _OutputIterator>
	_OutputIterator
	operator()(_OutputIterator __s, const char* __nb, const char* __ne) const;
	};

	// Specialization for _Np == 8: no conversion is performed, each source
	// char is copied to the output iterator as-is.
	template <>
	struct __widen_from_utf8<8>
	{
	    // Copies every char of [__nb, __ne) through __s and returns the
	    // advanced output iterator.
	    template <class _OutputIterator>
	    _LIBCPP_ALWAYS_INLINE
	    _OutputIterator
	    operator()(_OutputIterator __s, const char* __nb, const char* __ne) const
	    {
	        for (; __nb < __ne; ++__nb, ++__s)
	            *__s = *__nb;   // write the char, not the pointer
	        return __s;
	    }
	};

	// Specialization for _Np == 16. Derives from codecvt<char16_t, char,
	// mbstate_t> so that it can call the protected do_in directly.
	template <>
	struct __widen_from_utf8<16>
	    : public codecvt<char16_t, char, mbstate_t>
	{
	    // Passes 1 as the __refs argument of the base facet.
	    _LIBCPP_ALWAYS_INLINE
	    __widen_from_utf8() : codecvt<char16_t, char, mbstate_t>(1) {}

	    ~__widen_from_utf8();

	    // Converts the narrow range [__nb, __ne) through do_in, feeding it at
	    // most 32 input chars per call, and writes each resulting char16_t
	    // (cast to wchar_t) through __s. Returns the advanced output iterator.
	    // Calls __throw_runtime_error("locale not supported") if do_in reports
	    // error or consumes no input.
	    template <class _OutputIterator>
	    _LIBCPP_ALWAYS_INLINE
	    _OutputIterator
	    operator()(_OutputIterator __s, const char* __nb, const char* __ne) const
	    {
	        result __r = ok;
	        // NOTE(review): __mb is handed to do_in without initialization --
	        // confirm the conversion never reads it.
	        mbstate_t __mb;
	        while (__nb < __ne && __r != error)
	        {
	            const int __sz = 32;
	            char16_t __buf[__sz];
	            char16_t* __bn;
	            const char* __nn = __nb;
	            // Cap the input window at __sz chars per call.
	            __r = do_in(__mb, __nb, __ne - __nb > __sz ? __nb+__sz : __ne, __nn,
	                        __buf, __buf+__sz, __bn);
	            // No forward progress would loop forever, so treat it as failure.
	            if (__r == codecvt_base::error || __nn == __nb)
	                __throw_runtime_error("locale not supported");
	            for (const char16_t* __p = __buf; __p < __bn; ++__p, ++__s)
	                *__s = (wchar_t)*__p;
	            __nb = __nn;
	        }
	        return __s;
	    }
	};

	// Specialization for _Np == 32. Derives from codecvt<char32_t, char,
	// mbstate_t> so that it can call the protected do_in directly.
	template <>
	struct __widen_from_utf8<32>
	    : public codecvt<char32_t, char, mbstate_t>
	{
	    // Passes 1 as the __refs argument of the base facet.
	    _LIBCPP_ALWAYS_INLINE
	    __widen_from_utf8() : codecvt<char32_t, char, mbstate_t>(1) {}

	    ~__widen_from_utf8();

	    // Converts the narrow range [__nb, __ne) through do_in, feeding it at
	    // most 32 input chars per call, and writes each resulting char32_t
	    // (cast to wchar_t) through __s. Returns the advanced output iterator.
	    // Calls __throw_runtime_error("locale not supported") if do_in reports
	    // error or consumes no input.
	    template <class _OutputIterator>
	    _LIBCPP_ALWAYS_INLINE
	    _OutputIterator
	    operator()(_OutputIterator __s, const char* __nb, const char* __ne) const
	    {
	        result __r = ok;
	        // NOTE(review): __mb is handed to do_in without initialization --
	        // confirm the conversion never reads it.
	        mbstate_t __mb;
	        while (__nb < __ne && __r != error)
	        {
	            const int __sz = 32;
	            char32_t __buf[__sz];
	            char32_t* __bn;
	            const char* __nn = __nb;
	            // Cap the input window at __sz chars per call.
	            __r = do_in(__mb, __nb, __ne - __nb > __sz ? __nb+__sz : __ne, __nn,
	                        __buf, __buf+__sz, __bn);
	            // No forward progress would loop forever, so treat it as failure.
	            if (__r == codecvt_base::error || __nn == __nb)
	                __throw_runtime_error("locale not supported");
	            for (const char32_t* __p = __buf; __p < __bn; ++__p, ++__s)
	                *__s = (wchar_t)*__p;
	            __nb = __nn;
	        }
	        return __s;
	    }
	};

	// template <class charT> class numpunct

	// Forward declaration of the primary template; only the char and wchar_t
	// specializations are defined in this section.
	template <class _CharT> class _LIBCPP_TEMPLATE_VIS numpunct;

	// numpunct facet for char. The public accessors forward to the protected
	// virtual do_* members, which are declared here and defined elsewhere.
	template <>
	class _LIBCPP_TYPE_VIS numpunct<char>
	: public locale::facet
	{
	public:
	typedef char char_type;
	typedef basic_string<char_type> string_type;

	explicit numpunct(size_t __refs = 0);

	// Each accessor returns the result of the matching do_* virtual.
	_LIBCPP_ALWAYS_INLINE char_type decimal_point() const {return do_decimal_point();}
	_LIBCPP_ALWAYS_INLINE char_type thousands_sep() const {return do_thousands_sep();}
	_LIBCPP_ALWAYS_INLINE string grouping() const {return do_grouping();}
	_LIBCPP_ALWAYS_INLINE string_type truename() const {return do_truename();}
	_LIBCPP_ALWAYS_INLINE string_type falsename() const {return do_falsename();}

	static locale::id id;

	protected:
	~numpunct();
	virtual char_type do_decimal_point() const;
	virtual char_type do_thousands_sep() const;
	virtual string do_grouping() const;
	virtual string_type do_truename() const;
	virtual string_type do_falsename() const;

	// Stored punctuation data, accessible to derived classes.
	char_type __decimal_point_;
	char_type __thousands_sep_;
	string __grouping_;
	};

	// numpunct facet for wchar_t. The public accessors forward to the
	// protected virtual do_* members, which are declared here and defined
	// elsewhere. Note that grouping() returns a narrow `string`.
	template <>
	class _LIBCPP_TYPE_VIS numpunct<wchar_t>
	: public locale::facet
	{
	public:
	typedef wchar_t char_type;
	typedef basic_string<char_type> string_type;

	explicit numpunct(size_t __refs = 0);

	// Each accessor returns the result of the matching do_* virtual.
	_LIBCPP_ALWAYS_INLINE char_type decimal_point() const {return do_decimal_point();}
	_LIBCPP_ALWAYS_INLINE char_type thousands_sep() const {return do_thousands_sep();}
	_LIBCPP_ALWAYS_INLINE string grouping() const {return do_grouping();}
	_LIBCPP_ALWAYS_INLINE string_type truename() const {return do_truename();}
	_LIBCPP_ALWAYS_INLINE string_type falsename() const {return do_falsename();}

	static locale::id id;

	protected:
	~numpunct();
	virtual char_type do_decimal_point() const;
	virtual char_type do_thousands_sep() const;
	virtual string do_grouping() const;
	virtual string_type do_truename() const;
	virtual string_type do_falsename() const;

	// Stored punctuation data, accessible to derived classes.
	char_type __decimal_point_;
	char_type __thousands_sep_;
	string __grouping_;
	};

	// template <class charT> class numpunct_byname

	// Forward declaration of the primary template; only the char and wchar_t
	// specializations are defined in this section.
	template <class _CharT> class _LIBCPP_TEMPLATE_VIS numpunct_byname;

	// numpunct<char> constructed from a locale name. All members are only
	// declared here; their definitions are outside this header section.
	template <>
	class _LIBCPP_TYPE_VIS numpunct_byname<char>
	: public numpunct<char>
	{
	public:
	typedef char char_type;
	typedef basic_string<char_type> string_type;

	// The name may be given as a C string or as a std::string.
	explicit numpunct_byname(const char* __nm, size_t __refs = 0);
	explicit numpunct_byname(const string& __nm, size_t __refs = 0);

	protected:
	~numpunct_byname();

	private:
	// NOTE(review): presumably the shared initialization used by both
	// constructors, taking the locale name -- confirm against the definition.
	void __init(const char*);
	};

	// numpunct<wchar_t> constructed from a locale name. All members are only
	// declared here; their definitions are outside this header section.
	template <>
	class _LIBCPP_TYPE_VIS numpunct_byname<wchar_t>
	: public numpunct<wchar_t>
	{
	public:
	typedef wchar_t char_type;
	typedef basic_string<char_type> string_type;

	// The name may be given as a C string or as a std::string.
	explicit numpunct_byname(const char* __nm, size_t __refs = 0);
	explicit numpunct_byname(const string& __nm, size_t __refs = 0);

	protected:
	~numpunct_byname();

	private:
	// NOTE(review): presumably the shared initialization used by both
	// constructors, taking the locale name -- confirm against the definition.
	void __init(const char*);
	};

	_LIBCPP_END_NAMESPACE_STD

	#endif // _LIBCPP___LOCALE
	Index: head/contrib/libc++/include/mutex
	===================================================================
	--- head/contrib/libc++/include/mutex (revision 322319)
	+++ head/contrib/libc++/include/mutex (revision 322320)
	@@ -1,702 +1,702 @@
	// -*- C++ -*-
	//===--------------------------- mutex ------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP_MUTEX
	#define _LIBCPP_MUTEX

	/*
	mutex synopsis

	namespace std
	{

	class mutex
	{
	public:
	constexpr mutex() noexcept;
	~mutex();

	mutex(const mutex&) = delete;
	mutex& operator=(const mutex&) = delete;

	void lock();
	bool try_lock();
	void unlock();

	typedef pthread_mutex_t* native_handle_type;
	native_handle_type native_handle();
	};

	class recursive_mutex
	{
	public:
	recursive_mutex();
	~recursive_mutex();

	recursive_mutex(const recursive_mutex&) = delete;
	recursive_mutex& operator=(const recursive_mutex&) = delete;

	void lock();
	bool try_lock() noexcept;
	void unlock();

	typedef pthread_mutex_t* native_handle_type;
	native_handle_type native_handle();
	};

	class timed_mutex
	{
	public:
	timed_mutex();
	~timed_mutex();

	timed_mutex(const timed_mutex&) = delete;
	timed_mutex& operator=(const timed_mutex&) = delete;

	void lock();
	bool try_lock();
	template <class Rep, class Period>
	bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
	template <class Clock, class Duration>
	bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
	void unlock();
	};

	class recursive_timed_mutex
	{
	public:
	recursive_timed_mutex();
	~recursive_timed_mutex();

	recursive_timed_mutex(const recursive_timed_mutex&) = delete;
	recursive_timed_mutex& operator=(const recursive_timed_mutex&) = delete;

	void lock();
	bool try_lock() noexcept;
	template <class Rep, class Period>
	bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
	template <class Clock, class Duration>
	bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);
	void unlock();
	};

	struct defer_lock_t {};
	struct try_to_lock_t {};
	struct adopt_lock_t {};

	constexpr defer_lock_t defer_lock{};
	constexpr try_to_lock_t try_to_lock{};
	constexpr adopt_lock_t adopt_lock{};

	template <class Mutex>
	class lock_guard
	{
	public:
	typedef Mutex mutex_type;

	explicit lock_guard(mutex_type& m);
	lock_guard(mutex_type& m, adopt_lock_t);
	~lock_guard();

	lock_guard(lock_guard const&) = delete;
	lock_guard& operator=(lock_guard const&) = delete;
	};

	template <class... MutexTypes>
	class scoped_lock // C++17
	{
	public:
	using mutex_type = Mutex; // If MutexTypes... consists of the single type Mutex

	explicit scoped_lock(MutexTypes&... m);
	- scoped_lock(MutexTypes&... m, adopt_lock_t);
	+ scoped_lock(adopt_lock_t, MutexTypes&... m);
	~scoped_lock();
	scoped_lock(scoped_lock const&) = delete;
	scoped_lock& operator=(scoped_lock const&) = delete;
	private:
	tuple<MutexTypes&...> pm; // exposition only
	};

	template <class Mutex>
	class unique_lock
	{
	public:
	typedef Mutex mutex_type;
	unique_lock() noexcept;
	explicit unique_lock(mutex_type& m);
	unique_lock(mutex_type& m, defer_lock_t) noexcept;
	unique_lock(mutex_type& m, try_to_lock_t);
	unique_lock(mutex_type& m, adopt_lock_t);
	template <class Clock, class Duration>
	unique_lock(mutex_type& m, const chrono::time_point<Clock, Duration>& abs_time);
	template <class Rep, class Period>
	unique_lock(mutex_type& m, const chrono::duration<Rep, Period>& rel_time);
	~unique_lock();

	unique_lock(unique_lock const&) = delete;
	unique_lock& operator=(unique_lock const&) = delete;

	unique_lock(unique_lock&& u) noexcept;
	unique_lock& operator=(unique_lock&& u) noexcept;

	void lock();
	bool try_lock();

	template <class Rep, class Period>
	bool try_lock_for(const chrono::duration<Rep, Period>& rel_time);
	template <class Clock, class Duration>
	bool try_lock_until(const chrono::time_point<Clock, Duration>& abs_time);

	void unlock();

	void swap(unique_lock& u) noexcept;
	mutex_type* release() noexcept;

	bool owns_lock() const noexcept;
	explicit operator bool () const noexcept;
	mutex_type* mutex() const noexcept;
	};

	template <class Mutex>
	void swap(unique_lock<Mutex>& x, unique_lock<Mutex>& y) noexcept;

	template <class L1, class L2, class... L3>
	int try_lock(L1&, L2&, L3&...);
	template <class L1, class L2, class... L3>
	void lock(L1&, L2&, L3&...);

	struct once_flag
	{
	constexpr once_flag() noexcept;

	once_flag(const once_flag&) = delete;
	once_flag& operator=(const once_flag&) = delete;
	};

	template<class Callable, class ...Args>
	void call_once(once_flag& flag, Callable&& func, Args&&... args);

	} // std

	*/

	#include <__config>
	#include <__mutex_base>
	#include <functional>
	#include <memory>
	#ifndef _LIBCPP_CXX03_LANG
	#include <tuple>
	#endif
	#include <__threading_support>

	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	#pragma GCC system_header
	#endif

	_LIBCPP_PUSH_MACROS
	#include <__undef_macros>


	_LIBCPP_BEGIN_NAMESPACE_STD

	#ifndef _LIBCPP_HAS_NO_THREADS

	// Recursive mutex built on a __libcpp_recursive_mutex_t. The member
	// functions other than native_handle() are declared here and defined
	// elsewhere.
	class _LIBCPP_TYPE_VIS recursive_mutex
	{
	// Underlying platform mutex object.
	__libcpp_recursive_mutex_t __m_;

	public:
	recursive_mutex();
	~recursive_mutex();

	private:
	// Copying is disabled by declaring these private and leaving them
	// undefined here.
	recursive_mutex(const recursive_mutex&); // = delete;
	recursive_mutex& operator=(const recursive_mutex&); // = delete;

	public:
	void lock();
	bool try_lock() _NOEXCEPT;
	void unlock() _NOEXCEPT;

	typedef __libcpp_recursive_mutex_t* native_handle_type;

	// Returns the address of the underlying platform mutex.
	_LIBCPP_INLINE_VISIBILITY
	native_handle_type native_handle() {return &__m_;}
	};

	// Timed mutex implemented with a plain mutex, a condition variable and a
	// boolean "locked" flag (see try_lock_until below for how they interact).
	class _LIBCPP_TYPE_VIS timed_mutex
	{
	mutex __m_;                 // protects __locked_
	condition_variable __cv_;   // waited on in try_lock_until
	bool __locked_;             // true while the timed_mutex is held
	public:
	timed_mutex();
	~timed_mutex();

	private:
	// Copying is disabled by declaring these private and leaving them
	// undefined here.
	timed_mutex(const timed_mutex&); // = delete;
	timed_mutex& operator=(const timed_mutex&); // = delete;

	public:
	void lock();
	bool try_lock() _NOEXCEPT;
	// Converts the relative timeout to a steady_clock deadline and defers to
	// try_lock_until.
	template <class _Rep, class _Period>
	_LIBCPP_INLINE_VISIBILITY
	bool try_lock_for(const chrono::duration<_Rep, _Period>& __d)
	{return try_lock_until(chrono::steady_clock::now() + __d);}
	template <class _Clock, class _Duration>
	_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
	bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t);
	void unlock() _NOEXCEPT;
	};

	// Tries to acquire the timed_mutex until the deadline __t.
	// Returns true if the lock was acquired, false if __locked_ was still set
	// once waiting stopped.
	// Local names use reserved (double-underscore) identifiers so that a
	// user-defined macro cannot rewrite them in this header.
	template <class _Clock, class _Duration>
	bool
	timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
	{
	    using namespace chrono;
	    unique_lock<mutex> __lk(__m_);
	    // Only wait if the deadline has not already passed.
	    bool __no_timeout = _Clock::now() < __t;
	    while (__no_timeout && __locked_)
	        __no_timeout = __cv_.wait_until(__lk, __t) == cv_status::no_timeout;
	    // Even after a timeout, claim the lock if it happens to be free.
	    if (!__locked_)
	    {
	        __locked_ = true;
	        return true;
	    }
	    return false;
	}

	// Recursive timed mutex implemented with a plain mutex, a condition
	// variable, a recursion count and the id of the owning thread (see
	// try_lock_until below for how they interact).
	class _LIBCPP_TYPE_VIS recursive_timed_mutex
	{
	mutex __m_;                 // protects __count_ and __id_
	condition_variable __cv_;   // waited on in try_lock_until
	size_t __count_;            // recursion depth; 0 means not held
	__libcpp_thread_id __id_;   // id of the thread currently holding the lock
	public:
	recursive_timed_mutex();
	~recursive_timed_mutex();

	private:
	// Copying is disabled by declaring these private and leaving them
	// undefined here.
	recursive_timed_mutex(const recursive_timed_mutex&); // = delete;
	recursive_timed_mutex& operator=(const recursive_timed_mutex&); // = delete;

	public:
	void lock();
	bool try_lock() _NOEXCEPT;
	// Converts the relative timeout to a steady_clock deadline and defers to
	// try_lock_until.
	template <class _Rep, class _Period>
	_LIBCPP_INLINE_VISIBILITY
	bool try_lock_for(const chrono::duration<_Rep, _Period>& __d)
	{return try_lock_until(chrono::steady_clock::now() + __d);}
	template <class _Clock, class _Duration>
	_LIBCPP_METHOD_TEMPLATE_IMPLICIT_INSTANTIATION_VIS
	bool try_lock_until(const chrono::time_point<_Clock, _Duration>& __t);
	void unlock() _NOEXCEPT;
	};

	// Tries to acquire the recursive_timed_mutex until the deadline __t.
	// If the calling thread already owns it, the recursion count is bumped
	// (failing only when the count is already at its maximum). Otherwise waits
	// on the condition variable until the count drops to zero or the deadline
	// passes. Returns true if the lock was acquired.
	// Local names use reserved (double-underscore) identifiers so that a
	// user-defined macro cannot rewrite them in this header.
	template <class _Clock, class _Duration>
	bool
	recursive_timed_mutex::try_lock_until(const chrono::time_point<_Clock, _Duration>& __t)
	{
	    using namespace chrono;
	    __libcpp_thread_id __id = __libcpp_thread_get_current_id();
	    unique_lock<mutex> __lk(__m_);
	    // Re-entrant acquisition by the current owner.
	    if (__libcpp_thread_id_equal(__id, __id_))
	    {
	        if (__count_ == numeric_limits<size_t>::max())
	            return false;
	        ++__count_;
	        return true;
	    }
	    // Only wait if the deadline has not already passed.
	    bool __no_timeout = _Clock::now() < __t;
	    while (__no_timeout && __count_ != 0)
	        __no_timeout = __cv_.wait_until(__lk, __t) == cv_status::no_timeout;
	    // Even after a timeout, claim the lock if it happens to be free.
	    if (__count_ == 0)
	    {
	        __count_ = 1;
	        __id_ = __id;
	        return true;
	    }
	    return false;
	}

	template <class _L0, class _L1>
	int
	try_lock(_L0& __l0, _L1& __l1)
	{
	unique_lock<_L0> __u0(__l0, try_to_lock);
	if (__u0.owns_lock())
	{
	if (__l1.try_lock())
	{
	__u0.release();
	return -1;
	}
	else
	return 1;
	}
	return 0;
	}

	#ifndef _LIBCPP_CXX03_LANG

	template <class _L0, class _L1, class _L2, class... _L3>
	int
	try_lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
	{
	int __r = 0;
	unique_lock<_L0> __u0(__l0, try_to_lock);
	if (__u0.owns_lock())
	{
	__r = try_lock(__l1, __l2, __l3...);
	if (__r == -1)
	__u0.release();
	else
	++__r;
	}
	return __r;
	}

	#endif // _LIBCPP_CXX03_LANG

	// Locks both __l0 and __l1. Loops until both are held: blocks on one lock,
	// tries the other, and on failure releases the first (via the unique_lock
	// destructor), yields, and repeats with the roles swapped.
	template <class _L0, class _L1>
	void
	lock(_L0& __l0, _L1& __l1)
	{
	while (true)
	{
	{
	// Block on __l0, then try __l1.
	unique_lock<_L0> __u0(__l0);
	if (__l1.try_lock())
	{
	// Both held: detach the guard so __l0 stays locked.
	__u0.release();
	break;
	}
	}
	__libcpp_thread_yield();
	{
	// Block on __l1, then try __l0.
	unique_lock<_L1> __u1(__l1);
	if (__l0.try_lock())
	{
	// Both held: detach the guard so __l1 stays locked.
	__u1.release();
	break;
	}
	}
	__libcpp_thread_yield();
	}
	}

	#ifndef _LIBCPP_CXX03_LANG

	// Helper for the variadic lock(): locks all of the given lockables.
	// __i is the index (within this call's argument list) of the lock to block
	// on first; the remaining locks are then attempted with try_lock. When
	// try_lock fails, __i is updated to the index of the lock that could not
	// be taken, and the loop retries by blocking on that one.
	template <class _L0, class _L1, class _L2, class ..._L3>
	void
	__lock_first(int __i, _L0& __l0, _L1& __l1, _L2& __l2, _L3& ...__l3)
	{
	while (true)
	{
	switch (__i)
	{
	case 0:
	{
	// Block on __l0, then try the rest in order.
	unique_lock<_L0> __u0(__l0);
	__i = try_lock(__l1, __l2, __l3...);
	if (__i == -1)
	{
	__u0.release();
	return;
	}
	}
	// try_lock's index is relative to __l1; shift it to be relative to __l0.
	++__i;
	__libcpp_thread_yield();
	break;
	case 1:
	{
	// Block on __l1, then try __l2, __l3..., and finally __l0.
	unique_lock<_L1> __u1(__l1);
	__i = try_lock(__l2, __l3..., __l0);
	if (__i == -1)
	{
	__u1.release();
	return;
	}
	}
	// In the rotated list __l0 is last, at index sizeof...(_L3) + 1; map
	// that back to 0. Any other index is relative to __l2, so shift by 2.
	if (__i == sizeof...(_L3) + 1)
	__i = 0;
	else
	__i += 2;
	__libcpp_thread_yield();
	break;
	default:
	// __i >= 2: rotate the argument list left by two and recurse so that
	// the target lock eventually lands in position 0 or 1.
	__lock_first(__i - 2, __l2, __l3..., __l0, __l1);
	return;
	}
	}
	}

	// Locks three or more lockables by delegating to __lock_first, starting
	// with the first argument as the lock to block on.
	template <class _L0, class _L1, class _L2, class ..._L3>
	inline _LIBCPP_INLINE_VISIBILITY
	void
	lock(_L0& __l0, _L1& __l1, _L2& __l2, _L3& ...__l3)
	{
	__lock_first(0, __l0, __l1, __l2, __l3...);
	}

	// Unlocks a single lockable.
	template <class _L0>
	inline _LIBCPP_INLINE_VISIBILITY
	void __unlock(_L0& __l0)
	{
	    __l0.unlock();
	}

	// Unlocks two lockables, in argument order.
	template <class _L0, class _L1>
	inline _LIBCPP_INLINE_VISIBILITY
	void __unlock(_L0& __l0, _L1& __l1)
	{
	    _VSTD::__unlock(__l0);
	    _VSTD::__unlock(__l1);
	}

	// Unlocks three or more lockables, in argument order: the first two via
	// the two-argument overload, the remainder via recursion.
	template <class _L0, class _L1, class _L2, class ..._L3>
	inline _LIBCPP_INLINE_VISIBILITY
	void __unlock(_L0& __l0, _L1& __l1, _L2& __l2, _L3&... __l3)
	{
	    _VSTD::__unlock(__l0, __l1);
	    _VSTD::__unlock(__l2, __l3...);
	}

	#endif // _LIBCPP_CXX03_LANG

	#if _LIBCPP_STD_VER > 14
	// Forward declaration of the variadic scoped_lock template.
	template <class ..._Mutexes>
	class _LIBCPP_TEMPLATE_VIS scoped_lock;

	// Specialization for zero mutexes: construction and destruction do nothing.
	template <>
	class _LIBCPP_TEMPLATE_VIS scoped_lock<> {
	public:
	explicit scoped_lock() {}
	~scoped_lock() = default;

	// adopt_lock_t form; with no mutexes there is nothing to adopt.
	_LIBCPP_INLINE_VISIBILITY
	explicit scoped_lock(adopt_lock_t) {}

	scoped_lock(scoped_lock const&) = delete;
	scoped_lock& operator=(scoped_lock const&) = delete;
	};

	// Specialization for exactly one mutex: holds a reference to it, locks it
	// in the constructor (unless adopting) and unlocks it in the destructor.
	template <class _Mutex>
	class _LIBCPP_TEMPLATE_VIS scoped_lock<_Mutex> {
	public:
	typedef _Mutex mutex_type;
	private:
	mutex_type& __m_;
	public:
	// Stores the reference and locks the mutex.
	explicit scoped_lock(mutex_type & __m) _LIBCPP_THREAD_SAFETY_ANNOTATION(acquire_capability(__m))
	: __m_(__m) {__m_.lock();}

	// Unlocks the referenced mutex.
	~scoped_lock() _LIBCPP_THREAD_SAFETY_ANNOTATION(release_capability()) {__m_.unlock();}

	// Adopting constructor: stores the reference without locking. This
	// revision moves the adopt_lock_t tag to the first parameter.
	_LIBCPP_INLINE_VISIBILITY
	- explicit scoped_lock(mutex_type& __m, adopt_lock_t) _LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
	+ explicit scoped_lock(adopt_lock_t, mutex_type& __m) _LIBCPP_THREAD_SAFETY_ANNOTATION(requires_capability(__m))
	: __m_(__m) {}

	scoped_lock(scoped_lock const&) = delete;
	scoped_lock& operator=(scoped_lock const&) = delete;
	};

	// General case for two or more mutexes: keeps a tuple of references,
	// locks them all with _VSTD::lock in the constructor (unless adopting) and
	// unlocks them all in the destructor.
	template <class ..._MArgs>
	class _LIBCPP_TEMPLATE_VIS scoped_lock
	{
	static_assert(sizeof...(_MArgs) > 1, "At least 2 lock types required");
	typedef tuple<_MArgs&...> _MutexTuple;

	public:
	// Stores the references and then locks all mutexes.
	_LIBCPP_INLINE_VISIBILITY
	explicit scoped_lock(_MArgs&... __margs)
	: __t_(__margs...)
	{
	_VSTD::lock(__margs...);
	}

	// Adopting constructor: stores the references without locking. This
	// revision moves the adopt_lock_t tag to the first parameter.
	_LIBCPP_INLINE_VISIBILITY
	- scoped_lock(_MArgs&... __margs, adopt_lock_t)
	+ scoped_lock(adopt_lock_t, _MArgs&... __margs)
	: __t_(__margs...)
	{
	}

	// Unlocks every stored mutex by expanding the tuple over its indices.
	_LIBCPP_INLINE_VISIBILITY
	~scoped_lock() {
	typedef typename __make_tuple_indices<sizeof...(_MArgs)>::type _Indices;
	__unlock_unpack(_Indices{}, __t_);
	}

	scoped_lock(scoped_lock const&) = delete;
	scoped_lock& operator=(scoped_lock const&) = delete;

	private:
	// Passes each tuple element, in index order, to _VSTD::__unlock.
	template <size_t ..._Indx>
	_LIBCPP_INLINE_VISIBILITY
	static void __unlock_unpack(__tuple_indices<_Indx...>, _MutexTuple& __mt) {
	_VSTD::__unlock(_VSTD::get<_Indx>(__mt)...);
	}

	// References to the managed mutexes.
	_MutexTuple __t_;
	};

	#endif // _LIBCPP_STD_VER > 14
	#endif // !_LIBCPP_HAS_NO_THREADS

	// Forward declarations: once_flag befriends call_once, so call_once must be
	// declared before once_flag is defined.
	struct _LIBCPP_TEMPLATE_VIS once_flag;

	#ifndef _LIBCPP_CXX03_LANG

	// Variadic form, available when not compiling as C++03.
	template<class _Callable, class... _Args>
	_LIBCPP_INLINE_VISIBILITY
	void call_once(once_flag&, _Callable&&, _Args&&...);

	#else // _LIBCPP_CXX03_LANG

	// C++03 fallbacks: a nullary callable taken by non-const or const reference.
	template<class _Callable>
	_LIBCPP_INLINE_VISIBILITY
	void call_once(once_flag&, _Callable&);

	template<class _Callable>
	_LIBCPP_INLINE_VISIBILITY
	void call_once(once_flag&, const _Callable&);

	#endif // _LIBCPP_CXX03_LANG

	// Flag object used with call_once. It holds a single unsigned long state,
	// initialized to 0; call_once (a friend) reads and passes that state on.
	struct _LIBCPP_TEMPLATE_VIS once_flag
	{
	_LIBCPP_INLINE_VISIBILITY
	_LIBCPP_CONSTEXPR
	once_flag() _NOEXCEPT : __state_(0) {}

	private:
	// Copying is disabled by declaring these private and leaving them
	// undefined here.
	once_flag(const once_flag&); // = delete;
	once_flag& operator=(const once_flag&); // = delete;

	// call_once compares this against ~0ul to detect the "already done" case.
	unsigned long __state_;

	// call_once needs access to __state_.
	#ifndef _LIBCPP_CXX03_LANG
	template<class _Callable, class... _Args>
	friend
	void call_once(once_flag&, _Callable&&, _Args&&...);
	#else // _LIBCPP_CXX03_LANG
	template<class _Callable>
	friend
	void call_once(once_flag&, _Callable&);

	template<class _Callable>
	friend
	void call_once(once_flag&, const _Callable&);
	#endif // _LIBCPP_CXX03_LANG
	};

	#ifndef _LIBCPP_CXX03_LANG

	// Wraps a reference to a tuple-like _Fp whose element 0 is the callable
	// and whose remaining elements are its arguments; operator() invokes it.
	template <class _Fp>
	class __call_once_param
	{
	_Fp& __f_;
	public:
	_LIBCPP_INLINE_VISIBILITY
	explicit __call_once_param(_Fp& __f) : __f_(__f) {}

	// Builds the index list for the argument elements and dispatches to
	// __execute. NOTE(review): the second template argument 1 presumably makes
	// the indices start at 1, skipping the callable -- confirm against
	// __make_tuple_indices.
	_LIBCPP_INLINE_VISIBILITY
	void operator()()
	{
	typedef typename __make_tuple_indices<tuple_size<_Fp>::value, 1>::type _Index;
	__execute(_Index());
	}

	private:
	// Calls __invoke with element 0 as the callable and the elements selected
	// by _Indices as arguments, each obtained from the moved tuple.
	template <size_t ..._Indices>
	_LIBCPP_INLINE_VISIBILITY
	void __execute(__tuple_indices<_Indices...>)
	{
	__invoke(_VSTD::get<0>(_VSTD::move(__f_)), _VSTD::get<_Indices>(_VSTD::move(__f_))...);
	}
	};

	#else

	// C++03 variant: wraps a reference to a nullary callable; operator()
	// simply calls it.
	template <class _Fp>
	class __call_once_param
	{
	_Fp& __f_;
	public:
	_LIBCPP_INLINE_VISIBILITY
	explicit __call_once_param(_Fp& __f) : __f_(__f) {}

	// Invokes the wrapped callable with no arguments.
	_LIBCPP_INLINE_VISIBILITY
	void operator()()
	{
	__f_();
	}
	};

	#endif

	template <class _Fp>
	void
	__call_once_proxy(void* __vp)
	{
	__call_once_param<_Fp>* __p = static_cast<__call_once_param<_Fp>*>(__vp);
	(*__p)();
	}

	// Out-of-line worker behind call_once. Takes the flag's state word, an
	// opaque pointer, and a function that receives that pointer (callers in
	// this header pass a __call_once_param and the matching __call_once_proxy).
	_LIBCPP_FUNC_VIS void __call_once(volatile unsigned long&, void*, void(*)(void*));

	#ifndef _LIBCPP_CXX03_LANG

	// Invokes __func with __args through __call_once unless the flag's state
	// already reads as ~0ul, in which case nothing is done.
	template<class _Callable, class... _Args>
	inline _LIBCPP_INLINE_VISIBILITY
	void
	call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args)
	{
	    // Fast path: state already marked complete.
	    if (__libcpp_acquire_load(&__flag.__state_) == ~0ul)
	        return;
	    // Bundle the callable and its arguments as a tuple of references so
	    // they can travel through the void* interface of __call_once.
	    typedef tuple<_Callable&&, _Args&&...> _BoundCall;
	    _BoundCall __bound(_VSTD::forward<_Callable>(__func), _VSTD::forward<_Args>(__args)...);
	    __call_once_param<_BoundCall> __param(__bound);
	    __call_once(__flag.__state_, &__param, &__call_once_proxy<_BoundCall>);
	}

	#else // _LIBCPP_CXX03_LANG

	// C++03 call_once for a non-const callable: runs __func through __call_once
	// unless the flag's state word already reads ~0ul.
	template<class _Callable>
	inline _LIBCPP_INLINE_VISIBILITY
	void
	call_once(once_flag& __flag, _Callable& __func)
	{
	    if (__libcpp_acquire_load(&__flag.__state_) == ~0ul)
	        return;

	    __call_once_param<_Callable> __param(__func);
	    __call_once(__flag.__state_, &__param, &__call_once_proxy<_Callable>);
	}

	// C++03 call_once for a const callable: same as the non-const overload but
	// the payload and proxy are instantiated on const _Callable.
	template<class _Callable>
	inline _LIBCPP_INLINE_VISIBILITY
	void
	call_once(once_flag& __flag, const _Callable& __func)
	{
	    if (__libcpp_acquire_load(&__flag.__state_) == ~0ul)
	        return;

	    __call_once_param<const _Callable> __param(__func);
	    __call_once(__flag.__state_, &__param, &__call_once_proxy<const _Callable>);
	}

	#endif // _LIBCPP_CXX03_LANG

	_LIBCPP_END_NAMESPACE_STD

	_LIBCPP_POP_MACROS

	#endif // _LIBCPP_MUTEX
	Index: head/contrib/libc++/include/sstream
	===================================================================
	--- head/contrib/libc++/include/sstream (revision 322319)
	+++ head/contrib/libc++/include/sstream (revision 322320)
	@@ -1,977 +1,978 @@
	// -*- C++ -*-
	//===--------------------------- sstream ----------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is dual licensed under the MIT and the University of Illinois Open
	// Source Licenses. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef _LIBCPP_SSTREAM
	#define _LIBCPP_SSTREAM

	/*
	sstream synopsis

	template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
	class basic_stringbuf
	: public basic_streambuf<charT, traits>
	{
	public:
	typedef charT char_type;
	typedef traits traits_type;
	typedef typename traits_type::int_type int_type;
	typedef typename traits_type::pos_type pos_type;
	typedef typename traits_type::off_type off_type;
	typedef Allocator allocator_type;

	// 27.8.1.1 Constructors:
	explicit basic_stringbuf(ios_base::openmode which = ios_base::in | ios_base::out);
	explicit basic_stringbuf(const basic_string<char_type, traits_type, allocator_type>& str,
	ios_base::openmode which = ios_base::in | ios_base::out);
	basic_stringbuf(basic_stringbuf&& rhs);

	// 27.8.1.2 Assign and swap:
	basic_stringbuf& operator=(basic_stringbuf&& rhs);
	void swap(basic_stringbuf& rhs);

	// 27.8.1.3 Get and set:
	basic_string<char_type, traits_type, allocator_type> str() const;
	void str(const basic_string<char_type, traits_type, allocator_type>& s);

	protected:
	// 27.8.1.4 Overridden virtual functions:
	virtual int_type underflow();
	virtual int_type pbackfail(int_type c = traits_type::eof());
	virtual int_type overflow (int_type c = traits_type::eof());
	virtual basic_streambuf<char_type, traits_type>* setbuf(char_type*, streamsize);
	virtual pos_type seekoff(off_type off, ios_base::seekdir way,
	ios_base::openmode which = ios_base::in | ios_base::out);
	virtual pos_type seekpos(pos_type sp,
	ios_base::openmode which = ios_base::in | ios_base::out);
	};

	template <class charT, class traits, class Allocator>
	void swap(basic_stringbuf<charT, traits, Allocator>& x,
	basic_stringbuf<charT, traits, Allocator>& y);

	typedef basic_stringbuf<char> stringbuf;
	typedef basic_stringbuf<wchar_t> wstringbuf;

	template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
	class basic_istringstream
	: public basic_istream<charT, traits>
	{
	public:
	typedef charT char_type;
	typedef traits traits_type;
	typedef typename traits_type::int_type int_type;
	typedef typename traits_type::pos_type pos_type;
	typedef typename traits_type::off_type off_type;
	typedef Allocator allocator_type;

	// 27.8.2.1 Constructors:
	explicit basic_istringstream(ios_base::openmode which = ios_base::in);
	explicit basic_istringstream(const basic_string<char_type, traits_type,allocator_type>& str,
	ios_base::openmode which = ios_base::in);
	basic_istringstream(basic_istringstream&& rhs);

	// 27.8.2.2 Assign and swap:
	basic_istringstream& operator=(basic_istringstream&& rhs);
	void swap(basic_istringstream& rhs);

	// 27.8.2.3 Members:
	basic_stringbuf<char_type, traits_type, allocator_type>* rdbuf() const;
	basic_string<char_type, traits_type, allocator_type> str() const;
	void str(const basic_string<char_type, traits_type, allocator_type>& s);
	};

	template <class charT, class traits, class Allocator>
	void swap(basic_istringstream<charT, traits, Allocator>& x,
	basic_istringstream<charT, traits, Allocator>& y);

	typedef basic_istringstream<char> istringstream;
	typedef basic_istringstream<wchar_t> wistringstream;

	template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
	class basic_ostringstream
	: public basic_ostream<charT, traits>
	{
	public:
	// types:
	typedef charT char_type;
	typedef traits traits_type;
	typedef typename traits_type::int_type int_type;
	typedef typename traits_type::pos_type pos_type;
	typedef typename traits_type::off_type off_type;
	typedef Allocator allocator_type;

	// 27.8.3.1 Constructors/destructor:
	explicit basic_ostringstream(ios_base::openmode which = ios_base::out);
	explicit basic_ostringstream(const basic_string<char_type, traits_type, allocator_type>& str,
	ios_base::openmode which = ios_base::out);
	basic_ostringstream(basic_ostringstream&& rhs);

	// 27.8.3.2 Assign/swap:
	basic_ostringstream& operator=(basic_ostringstream&& rhs);
	void swap(basic_ostringstream& rhs);

	// 27.8.3.3 Members:
	basic_stringbuf<char_type, traits_type, allocator_type>* rdbuf() const;
	basic_string<char_type, traits_type, allocator_type> str() const;
	void str(const basic_string<char_type, traits_type, allocator_type>& s);
	};

	template <class charT, class traits, class Allocator>
	void swap(basic_ostringstream<charT, traits, Allocator>& x,
	basic_ostringstream<charT, traits, Allocator>& y);

	typedef basic_ostringstream<char> ostringstream;
	typedef basic_ostringstream<wchar_t> wostringstream;

	template <class charT, class traits = char_traits<charT>, class Allocator = allocator<charT> >
	class basic_stringstream
	: public basic_iostream<charT, traits>
	{
	public:
	// types:
	typedef charT char_type;
	typedef traits traits_type;
	typedef typename traits_type::int_type int_type;
	typedef typename traits_type::pos_type pos_type;
	typedef typename traits_type::off_type off_type;
	typedef Allocator allocator_type;

	// constructors/destructor
	explicit basic_stringstream(ios_base::openmode which = ios_base::out|ios_base::in);
	explicit basic_stringstream(const basic_string<char_type, traits_type, allocator_type>& str,
	ios_base::openmode which = ios_base::out|ios_base::in);
	basic_stringstream(basic_stringstream&& rhs);

	// 27.8.5.1 Assign/swap:
	basic_stringstream& operator=(basic_stringstream&& rhs);
	void swap(basic_stringstream& rhs);

	// Members:
	basic_stringbuf<char_type, traits_type, allocator_type>* rdbuf() const;
	basic_string<char_type, traits_type, allocator_type> str() const;
	void str(const basic_string<char_type, traits_type, allocator_type>& str);
	};

	template <class charT, class traits, class Allocator>
	void swap(basic_stringstream<charT, traits, Allocator>& x,
	basic_stringstream<charT, traits, Allocator>& y);

	typedef basic_stringstream<char> stringstream;
	typedef basic_stringstream<wchar_t> wstringstream;

	} // std

	*/

	#include <__config>
	#include <ostream>
	#include <istream>
	#include <string>

	#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
	#pragma GCC system_header
	#endif

	_LIBCPP_PUSH_MACROS
	#include <__undef_macros>


	_LIBCPP_BEGIN_NAMESPACE_STD

	// basic_stringbuf

	// Stream buffer backed by a basic_string.
	//   __str_  - owns the character storage the get/put areas point into.
	//   __hm_   - high-water mark: the furthest position known to hold written
	//             data. It is mutable because the const str() refreshes it.
	//   __mode_ - the open mode the buffer was created with.
	template <class _CharT, class _Traits, class _Allocator>
	class _LIBCPP_TEMPLATE_VIS basic_stringbuf
	    : public basic_streambuf<_CharT, _Traits>
	{
	public:
	    typedef _CharT                         char_type;
	    typedef _Traits                        traits_type;
	    typedef typename traits_type::int_type int_type;
	    typedef typename traits_type::pos_type pos_type;
	    typedef typename traits_type::off_type off_type;
	    typedef _Allocator                     allocator_type;

	    typedef basic_string<char_type, traits_type, allocator_type> string_type;

	private:

	    string_type __str_;
	    mutable char_type* __hm_;
	    ios_base::openmode __mode_;

	public:
	    // 27.8.1.1 Constructors:
	    inline _LIBCPP_INLINE_VISIBILITY
	    explicit basic_stringbuf(ios_base::openmode __wch = ios_base::in | ios_base::out);
	    inline _LIBCPP_INLINE_VISIBILITY
	    explicit basic_stringbuf(const string_type& __s,
	                             ios_base::openmode __wch = ios_base::in | ios_base::out);
	#ifndef _LIBCPP_CXX03_LANG
	    basic_stringbuf(basic_stringbuf&& __rhs);

	    // 27.8.1.2 Assign and swap:
	    basic_stringbuf& operator=(basic_stringbuf&& __rhs);
	#endif
	    void swap(basic_stringbuf& __rhs);

	    // 27.8.1.3 Get and set:
	    string_type str() const;
	    void str(const string_type& __s);

	protected:
	    // 27.8.1.4 Overridden virtual functions:
	    virtual int_type underflow();
	    virtual int_type pbackfail(int_type __c = traits_type::eof());
	    virtual int_type overflow (int_type __c = traits_type::eof());
	    virtual pos_type seekoff(off_type __off, ios_base::seekdir __way,
	                             ios_base::openmode __wch = ios_base::in | ios_base::out);
	    inline _LIBCPP_INLINE_VISIBILITY
	    virtual pos_type seekpos(pos_type __sp,
	                             ios_base::openmode __wch = ios_base::in | ios_base::out);
	};

	// Constructs an empty buffer opened with mode __wch. The high-water mark
	// starts out null; str() then installs an empty string and sets up the
	// get/put areas according to the mode.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>::basic_stringbuf(ios_base::openmode __wch)
	    : __hm_(0), __mode_(__wch)
	{
	    this->str(string_type());
	}

	// Constructs a buffer opened with mode __wch whose contents are a copy of
	// __s (installed via str(__s)). The change shown in this hunk additionally
	// initialises __str_ with __s's allocator before the contents are copied.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>::basic_stringbuf(const string_type& __s,
	ios_base::openmode __wch)
	- : __hm_(0),
	+ : __str_(__s.get_allocator()),
	+ __hm_(0),
	__mode_(__wch)
	{
	str(__s);
	}

	#ifndef _LIBCPP_CXX03_LANG

	// Move constructor. The get/put area pointers and the high-water mark of
	// __rhs point into __rhs.__str_, so they are first recorded as offsets from
	// the string's data pointer, the string is moved, and the areas are rebuilt
	// relative to the new storage. An offset of -1 means "pointer was null".
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>::basic_stringbuf(basic_stringbuf&& __rhs)
	: __mode_(__rhs.__mode_)
	{
	char_type* __p = const_cast<char_type*>(__rhs.__str_.data());
	// Get area of __rhs as offsets (begin / next / end).
	ptrdiff_t __binp = -1;
	ptrdiff_t __ninp = -1;
	ptrdiff_t __einp = -1;
	if (__rhs.eback() != nullptr)
	{
	__binp = __rhs.eback() - __p;
	__ninp = __rhs.gptr() - __p;
	__einp = __rhs.egptr() - __p;
	}
	// Put area of __rhs as offsets (begin / next / end).
	ptrdiff_t __bout = -1;
	ptrdiff_t __nout = -1;
	ptrdiff_t __eout = -1;
	if (__rhs.pbase() != nullptr)
	{
	__bout = __rhs.pbase() - __p;
	__nout = __rhs.pptr() - __p;
	__eout = __rhs.epptr() - __p;
	}
	// High-water mark of __rhs as an offset.
	ptrdiff_t __hm = __rhs.__hm_ == nullptr ? -1 : __rhs.__hm_ - __p;
	// Take over the storage, then re-base everything on the new data pointer.
	__str_ = _VSTD::move(__rhs.__str_);
	__p = const_cast<char_type*>(__str_.data());
	if (__binp != -1)
	this->setg(__p + __binp, __p + __ninp, __p + __einp);
	if (__bout != -1)
	{
	this->setp(__p + __bout, __p + __eout);
	this->pbump(__nout);
	}
	__hm_ = __hm == -1 ? nullptr : __p + __hm;
	// Leave __rhs with empty get/put areas anchored on its own (moved-from) string.
	__p = const_cast<char_type*>(__rhs.__str_.data());
	__rhs.setg(__p, __p, __p);
	__rhs.setp(__p, __p);
	__rhs.__hm_ = __p;
	// Adopt the locale of the source buffer.
	this->pubimbue(__rhs.getloc());
	}

	// Move assignment. Same offset-capture / move / re-base sequence as the move
	// constructor, except that areas which were null in __rhs are explicitly
	// reset to null here (this object may already have live areas), and the
	// mode is copied from __rhs. An offset of -1 means "pointer was null".
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>&
	basic_stringbuf<_CharT, _Traits, _Allocator>::operator=(basic_stringbuf&& __rhs)
	{
	char_type* __p = const_cast<char_type*>(__rhs.__str_.data());
	// Get area of __rhs as offsets.
	ptrdiff_t __binp = -1;
	ptrdiff_t __ninp = -1;
	ptrdiff_t __einp = -1;
	if (__rhs.eback() != nullptr)
	{
	__binp = __rhs.eback() - __p;
	__ninp = __rhs.gptr() - __p;
	__einp = __rhs.egptr() - __p;
	}
	// Put area of __rhs as offsets.
	ptrdiff_t __bout = -1;
	ptrdiff_t __nout = -1;
	ptrdiff_t __eout = -1;
	if (__rhs.pbase() != nullptr)
	{
	__bout = __rhs.pbase() - __p;
	__nout = __rhs.pptr() - __p;
	__eout = __rhs.epptr() - __p;
	}
	// High-water mark of __rhs as an offset.
	ptrdiff_t __hm = __rhs.__hm_ == nullptr ? -1 : __rhs.__hm_ - __p;
	// Take over the storage and rebuild the areas on the new data pointer.
	__str_ = _VSTD::move(__rhs.__str_);
	__p = const_cast<char_type*>(__str_.data());
	if (__binp != -1)
	this->setg(__p + __binp, __p + __ninp, __p + __einp);
	else
	this->setg(nullptr, nullptr, nullptr);
	if (__bout != -1)
	{
	this->setp(__p + __bout, __p + __eout);
	this->pbump(__nout);
	}
	else
	this->setp(nullptr, nullptr);

	__hm_ = __hm == -1 ? nullptr : __p + __hm;
	__mode_ = __rhs.__mode_;
	// Leave __rhs with empty get/put areas anchored on its own (moved-from) string.
	__p = const_cast<char_type*>(__rhs.__str_.data());
	__rhs.setg(__p, __p, __p);
	__rhs.setp(__p, __p);
	__rhs.__hm_ = __p;
	// Adopt the locale of the source buffer.
	this->pubimbue(__rhs.getloc());
	return *this;
	}

	#endif // _LIBCPP_CXX03_LANG

	// Exchanges the complete state of *this and __rhs: mode, string storage,
	// get/put areas, high-water mark and locale. Area pointers of both sides are
	// captured as offsets from their string's data pointer before the strings
	// are swapped, then re-applied to the opposite object. Variables prefixed
	// __r hold offsets taken from __rhs, those prefixed __l from *this; -1 means
	// "pointer was null".
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_stringbuf<_CharT, _Traits, _Allocator>::swap(basic_stringbuf& __rhs)
	{
	// --- capture __rhs ---
	char_type* __p = const_cast<char_type*>(__rhs.__str_.data());
	ptrdiff_t __rbinp = -1;
	ptrdiff_t __rninp = -1;
	ptrdiff_t __reinp = -1;
	if (__rhs.eback() != nullptr)
	{
	__rbinp = __rhs.eback() - __p;
	__rninp = __rhs.gptr() - __p;
	__reinp = __rhs.egptr() - __p;
	}
	ptrdiff_t __rbout = -1;
	ptrdiff_t __rnout = -1;
	ptrdiff_t __reout = -1;
	if (__rhs.pbase() != nullptr)
	{
	__rbout = __rhs.pbase() - __p;
	__rnout = __rhs.pptr() - __p;
	__reout = __rhs.epptr() - __p;
	}
	ptrdiff_t __rhm = __rhs.__hm_ == nullptr ? -1 : __rhs.__hm_ - __p;
	// --- capture *this ---
	__p = const_cast<char_type*>(__str_.data());
	ptrdiff_t __lbinp = -1;
	ptrdiff_t __lninp = -1;
	ptrdiff_t __leinp = -1;
	if (this->eback() != nullptr)
	{
	__lbinp = this->eback() - __p;
	__lninp = this->gptr() - __p;
	__leinp = this->egptr() - __p;
	}
	ptrdiff_t __lbout = -1;
	ptrdiff_t __lnout = -1;
	ptrdiff_t __leout = -1;
	if (this->pbase() != nullptr)
	{
	__lbout = this->pbase() - __p;
	__lnout = this->pptr() - __p;
	__leout = this->epptr() - __p;
	}
	ptrdiff_t __lhm = __hm_ == nullptr ? -1 : __hm_ - __p;
	// --- exchange mode and storage ---
	_VSTD::swap(__mode_, __rhs.__mode_);
	__str_.swap(__rhs.__str_);
	// --- *this receives what used to be __rhs's layout ---
	__p = const_cast<char_type*>(__str_.data());
	if (__rbinp != -1)
	this->setg(__p + __rbinp, __p + __rninp, __p + __reinp);
	else
	this->setg(nullptr, nullptr, nullptr);
	if (__rbout != -1)
	{
	this->setp(__p + __rbout, __p + __reout);
	this->pbump(__rnout);
	}
	else
	this->setp(nullptr, nullptr);
	__hm_ = __rhm == -1 ? nullptr : __p + __rhm;
	// --- __rhs receives what used to be this object's layout ---
	__p = const_cast<char_type*>(__rhs.__str_.data());
	if (__lbinp != -1)
	__rhs.setg(__p + __lbinp, __p + __lninp, __p + __leinp);
	else
	__rhs.setg(nullptr, nullptr, nullptr);
	if (__lbout != -1)
	{
	__rhs.setp(__p + __lbout, __p + __leout);
	__rhs.pbump(__lnout);
	}
	else
	__rhs.setp(nullptr, nullptr);
	__rhs.__hm_ = __lhm == -1 ? nullptr : __p + __lhm;
	// --- exchange locales ---
	locale __tl = __rhs.getloc();
	__rhs.pubimbue(this->getloc());
	this->pubimbue(__tl);
	}

	// Non-member swap for basic_stringbuf; forwards to the member swap of __x.
	template <class _CharT, class _Traits, class _Allocator>
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(basic_stringbuf<_CharT, _Traits, _Allocator>& __x,
	          basic_stringbuf<_CharT, _Traits, _Allocator>& __y)
	{
	    __x.swap(__y);
	}

	// Returns a copy of the buffered character sequence, built with the
	// allocator of the underlying string:
	//   - output mode: [pbase(), high-water mark), after raising the mark to
	//     pptr() if the put pointer has moved past it;
	//   - input-only mode: [eback(), egptr());
	//   - neither: an empty string.
	template <class _CharT, class _Traits, class _Allocator>
	basic_string<_CharT, _Traits, _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>::str() const
	{
	    const bool __writable = (__mode_ & ios_base::out) != 0;
	    if (__writable)
	    {
	        if (__hm_ < this->pptr())
	            __hm_ = this->pptr();
	        return string_type(this->pbase(), __hm_, __str_.get_allocator());
	    }

	    const bool __readable = (__mode_ & ios_base::in) != 0;
	    if (__readable)
	        return string_type(this->eback(), this->egptr(), __str_.get_allocator());

	    return string_type(__str_.get_allocator());
	}

	// Replaces the buffer contents with a copy of __s and rebuilds the get and
	// put areas according to the open mode.
	//   - input mode: the get area spans the copied characters.
	//   - output mode: the string is grown to its full capacity so the whole
	//     allocation is writable; the put pointer starts at the beginning, or
	//     just past the copied characters when app or ate is set.
	// The high-water mark ends up just past the copied characters whenever in
	// or out is set, and stays null otherwise.
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_stringbuf<_CharT, _Traits, _Allocator>::str(const string_type& __s)
	{
	    __str_ = __s;
	    __hm_ = 0;
	    if (__mode_ & ios_base::in)
	    {
	        __hm_ = const_cast<char_type*>(__str_.data()) + __str_.size();
	        this->setg(const_cast<char_type*>(__str_.data()),
	                   const_cast<char_type*>(__str_.data()),
	                   __hm_);
	    }
	    if (__mode_ & ios_base::out)
	    {
	        // Remember the logical size before padding the string to capacity.
	        typename string_type::size_type __sz = __str_.size();
	        __hm_ = const_cast<char_type*>(__str_.data()) + __sz;
	        __str_.resize(__str_.capacity());
	        this->setp(const_cast<char_type*>(__str_.data()),
	                   const_cast<char_type*>(__str_.data()) + __str_.size());
	        // NOTE(review): pbump presumably takes an int, so a __sz above
	        // INT_MAX would be narrowed here - confirm against <streambuf>.
	        if (__mode_ & (ios_base::app | ios_base::ate))
	            this->pbump(__sz);
	    }
	}

	// Makes characters written through the put area visible to the get area and
	// returns the next readable character without consuming it, or eof() when
	// the buffer is not open for input or nothing is left to read.
	template <class _CharT, class _Traits, class _Allocator>
	typename basic_stringbuf<_CharT, _Traits, _Allocator>::int_type
	basic_stringbuf<_CharT, _Traits, _Allocator>::underflow()
	{
	    // Raise the high-water mark if the put pointer has moved past it.
	    if (__hm_ < this->pptr())
	        __hm_ = this->pptr();

	    if (!(__mode_ & ios_base::in))
	        return traits_type::eof();

	    // Extend the end of the get area up to the high-water mark.
	    if (this->egptr() < __hm_)
	        this->setg(this->eback(), this->gptr(), __hm_);

	    return this->gptr() < this->egptr()
	               ? traits_type::to_int_type(*this->gptr())
	               : traits_type::eof();
	}

	// Puts a character back into the get area.
	//   - If no character precedes the get pointer, fails with eof().
	//   - If __c is eof(), just steps the get pointer back one position and
	//     returns not_eof(__c).
	//   - Otherwise steps back and stores __c, which is permitted when the
	//     buffer is writable or __c equals the character already there.
	// In every other case eof() is returned.
	template <class _CharT, class _Traits, class _Allocator>
	typename basic_stringbuf<_CharT, _Traits, _Allocator>::int_type
	basic_stringbuf<_CharT, _Traits, _Allocator>::pbackfail(int_type __c)
	{
	    // Raise the high-water mark if the put pointer has moved past it.
	    if (__hm_ < this->pptr())
	        __hm_ = this->pptr();
	    if (this->eback() < this->gptr())
	    {
	        if (traits_type::eq_int_type(__c, traits_type::eof()))
	        {
	            this->setg(this->eback(), this->gptr()-1, __hm_);
	            return traits_type::not_eof(__c);
	        }
	        if ((__mode_ & ios_base::out) ||
	            traits_type::eq(traits_type::to_char_type(__c), this->gptr()[-1]))
	        {
	            this->setg(this->eback(), this->gptr()-1, __hm_);
	            *this->gptr() = traits_type::to_char_type(__c);
	            return __c;
	        }
	    }
	    return traits_type::eof();
	}

	// Appends __c to the buffer, growing the underlying string when the put
	// area is full. Returns eof() if the buffer is not open for output or if
	// growing the string throws; returns not_eof(__c) when __c is eof() (nothing
	// is written in that case).
	template <class _CharT, class _Traits, class _Allocator>
	typename basic_stringbuf<_CharT, _Traits, _Allocator>::int_type
	basic_stringbuf<_CharT, _Traits, _Allocator>::overflow(int_type __c)
	{
	if (!traits_type::eq_int_type(__c, traits_type::eof()))
	{
	// Read position as an offset, captured before the string may be regrown.
	ptrdiff_t __ninp = this->gptr() - this->eback();
	if (this->pptr() == this->epptr())
	{
	if (!(__mode_ & ios_base::out))
	return traits_type::eof();
	#ifndef _LIBCPP_NO_EXCEPTIONS
	try
	{
	#endif // _LIBCPP_NO_EXCEPTIONS
	// Save put position and high-water mark as offsets, grow the string by
	// one character, pad it out to its full capacity, then re-base the put
	// area and the high-water mark on the (possibly new) data pointer.
	ptrdiff_t __nout = this->pptr() - this->pbase();
	ptrdiff_t __hm = __hm_ - this->pbase();
	__str_.push_back(char_type());
	__str_.resize(__str_.capacity());
	char_type* __p = const_cast<char_type*>(__str_.data());
	this->setp(__p, __p + __str_.size());
	this->pbump(__nout);
	__hm_ = this->pbase() + __hm;
	#ifndef _LIBCPP_NO_EXCEPTIONS
	}
	catch (...)
	{
	return traits_type::eof();
	}
	#endif // _LIBCPP_NO_EXCEPTIONS
	}
	// Account for the character about to be written.
	__hm_ = _VSTD::max(this->pptr() + 1, __hm_);
	if (__mode_ & ios_base::in)
	{
	// Rebuild the get area on the current data pointer, ending at the
	// high-water mark.
	char_type* __p = const_cast<char_type*>(__str_.data());
	this->setg(__p, __p + __ninp, __hm_);
	}
	return this->sputc(__c);
	}
	return traits_type::not_eof(__c);
	}

	// Repositions the get and/or put pointer.
	//   __off - offset relative to the position selected by __way.
	//   __way - ios_base::beg, ios_base::cur or ios_base::end.
	//   __wch - which area(s) to move: ios_base::in, ios_base::out or both.
	// Returns the new absolute position, or pos_type(-1) when
	//   - neither in nor out is requested,
	//   - both are requested together with ios_base::cur,
	//   - __way is not one of the three seek directions,
	//   - the target lies before the start or past the high-water mark, or
	//   - a non-zero target is requested for an area whose pointer is null.
	template <class _CharT, class _Traits, class _Allocator>
	typename basic_stringbuf<_CharT, _Traits, _Allocator>::pos_type
	basic_stringbuf<_CharT, _Traits, _Allocator>::seekoff(off_type __off,
	                                                      ios_base::seekdir __way,
	                                                      ios_base::openmode __wch)
	{
	    // Raise the high-water mark if the put pointer has moved past it.
	    if (__hm_ < this->pptr())
	        __hm_ = this->pptr();
	    if ((__wch & (ios_base::in | ios_base::out)) == 0)
	        return pos_type(-1);
	    if ((__wch & (ios_base::in | ios_base::out)) == (ios_base::in | ios_base::out)
	        && __way == ios_base::cur)
	        return pos_type(-1);
	    // Length of the written sequence. A null high-water mark counts as
	    // length 0; subtracting the data pointer from a null pointer would be
	    // undefined behaviour.
	    const ptrdiff_t __hm = __hm_ == nullptr ? 0 : __hm_ - __str_.data();
	    off_type __noff;
	    switch (__way)
	    {
	    case ios_base::beg:
	        __noff = 0;
	        break;
	    case ios_base::cur:
	        if (__wch & ios_base::in)
	            __noff = this->gptr() - this->eback();
	        else
	            __noff = this->pptr() - this->pbase();
	        break;
	    case ios_base::end:
	        __noff = __hm;
	        break;
	    default:
	        return pos_type(-1);
	    }
	    __noff += __off;
	    if (__noff < 0 || __hm < __noff)
	        return pos_type(-1);
	    if (__noff != 0)
	    {
	        if ((__wch & ios_base::in) && this->gptr() == 0)
	            return pos_type(-1);
	        if ((__wch & ios_base::out) && this->pptr() == 0)
	            return pos_type(-1);
	    }
	    if (__wch & ios_base::in)
	        this->setg(this->eback(), this->eback() + __noff, __hm_);
	    if (__wch & ios_base::out)
	    {
	        // setp resets pptr() to pbase(); pbump then moves it to the target.
	        this->setp(this->pbase(), this->epptr());
	        this->pbump(__noff);
	    }
	    return pos_type(__noff);
	}

	// Absolute seek: forwards to seekoff using __sp as an offset from the
	// beginning of the sequence.
	template <class _CharT, class _Traits, class _Allocator>
	typename basic_stringbuf<_CharT, _Traits, _Allocator>::pos_type
	basic_stringbuf<_CharT, _Traits, _Allocator>::seekpos(pos_type __sp,
	                                                      ios_base::openmode __wch)
	{
	    return this->seekoff(__sp, ios_base::beg, __wch);
	}

	// basic_istringstream

	// Input stream that reads from a string. The stream owns its buffer as the
	// member __sb_ and forwards rdbuf()/str() to it.
	template <class _CharT, class _Traits, class _Allocator>
	class _LIBCPP_TEMPLATE_VIS basic_istringstream
	: public basic_istream<_CharT, _Traits>
	{
	public:
	typedef _CharT char_type;
	typedef _Traits traits_type;
	typedef typename traits_type::int_type int_type;
	typedef typename traits_type::pos_type pos_type;
	typedef typename traits_type::off_type off_type;
	typedef _Allocator allocator_type;

	typedef basic_string<char_type, traits_type, allocator_type> string_type;

	private:
	// The string buffer this stream reads from.
	basic_stringbuf<char_type, traits_type, allocator_type> __sb_;

	public:
	// 27.8.2.1 Constructors:
	inline _LIBCPP_INLINE_VISIBILITY
	explicit basic_istringstream(ios_base::openmode __wch = ios_base::in);
	inline _LIBCPP_INLINE_VISIBILITY
	explicit basic_istringstream(const string_type& __s,
	ios_base::openmode __wch = ios_base::in);
	#ifndef _LIBCPP_CXX03_LANG
	inline _LIBCPP_INLINE_VISIBILITY
	basic_istringstream(basic_istringstream&& __rhs);

	// 27.8.2.2 Assign and swap:
	basic_istringstream& operator=(basic_istringstream&& __rhs);
	#endif // _LIBCPP_CXX03_LANG
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(basic_istringstream& __rhs);

	// 27.8.2.3 Members:
	inline _LIBCPP_INLINE_VISIBILITY
	basic_stringbuf<char_type, traits_type, allocator_type>* rdbuf() const;
	inline _LIBCPP_INLINE_VISIBILITY
	string_type str() const;
	inline _LIBCPP_INLINE_VISIBILITY
	void str(const string_type& __s);
	};

	// Constructs an empty input string stream. ios_base::in is always OR-ed
	// into the requested mode before it is passed to the buffer.
	template <class _CharT, class _Traits, class _Allocator>
	basic_istringstream<_CharT, _Traits, _Allocator>::basic_istringstream(ios_base::openmode __wch)
	    : basic_istream<_CharT, _Traits>(&__sb_),
	      __sb_(__wch | ios_base::in)
	{
	}

	// Constructs an input string stream whose buffer is initialised from __s.
	// ios_base::in is always OR-ed into the requested mode.
	template <class _CharT, class _Traits, class _Allocator>
	basic_istringstream<_CharT, _Traits, _Allocator>::basic_istringstream(const string_type& __s,
	                                                                      ios_base::openmode __wch)
	    : basic_istream<_CharT, _Traits>(&__sb_),
	      __sb_(__s, __wch | ios_base::in)
	{
	}

	#ifndef _LIBCPP_CXX03_LANG

	// Move constructor: moves the stream base and the buffer, then points the
	// stream at its own buffer member.
	template <class _CharT, class _Traits, class _Allocator>
	basic_istringstream<_CharT, _Traits, _Allocator>::basic_istringstream(basic_istringstream&& __rhs)
	    : basic_istream<_CharT, _Traits>(_VSTD::move(__rhs)),
	      __sb_(_VSTD::move(__rhs.__sb_))
	{
	    typedef basic_istream<_CharT, _Traits> _Base;
	    _Base::set_rdbuf(&__sb_);
	}

	// Move assignment: move-assigns the stream base first, then the buffer.
	template <class _CharT, class _Traits, class _Allocator>
	basic_istringstream<_CharT, _Traits, _Allocator>&
	basic_istringstream<_CharT, _Traits, _Allocator>::operator=(basic_istringstream&& __rhs)
	{
	    typedef basic_istream<char_type, traits_type> _Base;
	    _Base::operator=(_VSTD::move(__rhs));
	    __sb_ = _VSTD::move(__rhs.__sb_);
	    return *this;
	}

	#endif // _LIBCPP_CXX03_LANG

	// Swaps the stream base state first, then the two string buffers.
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_istringstream<_CharT, _Traits, _Allocator>::swap(basic_istringstream& __rhs)
	{
	    typedef basic_istream<char_type, traits_type> _Base;
	    _Base::swap(__rhs);
	    __sb_.swap(__rhs.__sb_);
	}

	// Non-member swap for basic_istringstream; forwards to the member swap of __x.
	template <class _CharT, class _Traits, class _Allocator>
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(basic_istringstream<_CharT, _Traits, _Allocator>& __x,
	          basic_istringstream<_CharT, _Traits, _Allocator>& __y)
	{
	    __x.swap(__y);
	}

	// Returns a non-const pointer to the stream's own string buffer; constness
	// of *this is cast away.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>*
	basic_istringstream<_CharT, _Traits, _Allocator>::rdbuf() const
	{
	    typedef basic_stringbuf<char_type, traits_type, allocator_type> _Buf;
	    return const_cast<_Buf*>(&__sb_);
	}

	// Returns a copy of the buffer contents, as produced by the buffer's str().
	template <class _CharT, class _Traits, class _Allocator>
	basic_string<_CharT, _Traits, _Allocator>
	basic_istringstream<_CharT, _Traits, _Allocator>::str() const
	{
	    return this->__sb_.str();
	}

	// Replaces the buffer contents with __s via the buffer's str(__s).
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_istringstream<_CharT, _Traits, _Allocator>::str(const string_type& __s)
	{
	    this->__sb_.str(__s);
	}

	// basic_ostringstream

	// Output stream that writes into a string. The stream owns its buffer as
	// the member __sb_ and forwards rdbuf()/str() to it.
	template <class _CharT, class _Traits, class _Allocator>
	class _LIBCPP_TEMPLATE_VIS basic_ostringstream
	: public basic_ostream<_CharT, _Traits>
	{
	public:
	typedef _CharT char_type;
	typedef _Traits traits_type;
	typedef typename traits_type::int_type int_type;
	typedef typename traits_type::pos_type pos_type;
	typedef typename traits_type::off_type off_type;
	typedef _Allocator allocator_type;

	typedef basic_string<char_type, traits_type, allocator_type> string_type;

	private:
	// The string buffer this stream writes to.
	basic_stringbuf<char_type, traits_type, allocator_type> __sb_;

	public:
	// 27.8.3.1 Constructors:
	inline _LIBCPP_INLINE_VISIBILITY
	explicit basic_ostringstream(ios_base::openmode __wch = ios_base::out);
	inline _LIBCPP_INLINE_VISIBILITY
	explicit basic_ostringstream(const string_type& __s,
	ios_base::openmode __wch = ios_base::out);
	#ifndef _LIBCPP_CXX03_LANG
	inline _LIBCPP_INLINE_VISIBILITY
	basic_ostringstream(basic_ostringstream&& __rhs);

	// 27.8.3.2 Assign and swap:
	basic_ostringstream& operator=(basic_ostringstream&& __rhs);
	#endif // _LIBCPP_CXX03_LANG
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(basic_ostringstream& __rhs);

	// 27.8.3.3 Members:
	inline _LIBCPP_INLINE_VISIBILITY
	basic_stringbuf<char_type, traits_type, allocator_type>* rdbuf() const;
	inline _LIBCPP_INLINE_VISIBILITY
	string_type str() const;
	inline _LIBCPP_INLINE_VISIBILITY
	void str(const string_type& __s);
	};

	// Constructs an empty output string stream. ios_base::out is always OR-ed
	// into the requested mode before it is passed to the buffer.
	template <class _CharT, class _Traits, class _Allocator>
	basic_ostringstream<_CharT, _Traits, _Allocator>::basic_ostringstream(ios_base::openmode __wch)
	    : basic_ostream<_CharT, _Traits>(&__sb_),
	      __sb_(__wch | ios_base::out)
	{
	}

	// Constructs an output string stream whose buffer is initialised from __s.
	// ios_base::out is always OR-ed into the requested mode.
	template <class _CharT, class _Traits, class _Allocator>
	basic_ostringstream<_CharT, _Traits, _Allocator>::basic_ostringstream(const string_type& __s,
	                                                                      ios_base::openmode __wch)
	    : basic_ostream<_CharT, _Traits>(&__sb_),
	      __sb_(__s, __wch | ios_base::out)
	{
	}

	#ifndef _LIBCPP_CXX03_LANG

	// Move constructor: moves the stream base and the buffer, then points the
	// stream at its own buffer member.
	template <class _CharT, class _Traits, class _Allocator>
	basic_ostringstream<_CharT, _Traits, _Allocator>::basic_ostringstream(basic_ostringstream&& __rhs)
	    : basic_ostream<_CharT, _Traits>(_VSTD::move(__rhs)),
	      __sb_(_VSTD::move(__rhs.__sb_))
	{
	    typedef basic_ostream<_CharT, _Traits> _Base;
	    _Base::set_rdbuf(&__sb_);
	}

	// Move assignment: move-assigns the stream base first, then the buffer.
	template <class _CharT, class _Traits, class _Allocator>
	basic_ostringstream<_CharT, _Traits, _Allocator>&
	basic_ostringstream<_CharT, _Traits, _Allocator>::operator=(basic_ostringstream&& __rhs)
	{
	    typedef basic_ostream<char_type, traits_type> _Base;
	    _Base::operator=(_VSTD::move(__rhs));
	    __sb_ = _VSTD::move(__rhs.__sb_);
	    return *this;
	}

	#endif // _LIBCPP_CXX03_LANG

	// Swaps the stream base state first, then the two string buffers.
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_ostringstream<_CharT, _Traits, _Allocator>::swap(basic_ostringstream& __rhs)
	{
	    typedef basic_ostream<char_type, traits_type> _Base;
	    _Base::swap(__rhs);
	    __sb_.swap(__rhs.__sb_);
	}

	// Non-member swap for basic_ostringstream; forwards to the member swap of __x.
	template <class _CharT, class _Traits, class _Allocator>
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(basic_ostringstream<_CharT, _Traits, _Allocator>& __x,
	          basic_ostringstream<_CharT, _Traits, _Allocator>& __y)
	{
	    __x.swap(__y);
	}

	// Returns a non-const pointer to the stream's own string buffer; constness
	// of *this is cast away.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>*
	basic_ostringstream<_CharT, _Traits, _Allocator>::rdbuf() const
	{
	    typedef basic_stringbuf<char_type, traits_type, allocator_type> _Buf;
	    return const_cast<_Buf*>(&__sb_);
	}

	// Returns a copy of the buffer contents, as produced by the buffer's str().
	template <class _CharT, class _Traits, class _Allocator>
	basic_string<_CharT, _Traits, _Allocator>
	basic_ostringstream<_CharT, _Traits, _Allocator>::str() const
	{
	    return this->__sb_.str();
	}

	// Replaces the buffer contents with __s via the buffer's str(__s).
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_ostringstream<_CharT, _Traits, _Allocator>::str(const string_type& __s)
	{
	    this->__sb_.str(__s);
	}

	// basic_stringstream

	// Bidirectional stream that reads from and writes to a string. The stream
	// owns its buffer as the member __sb_ and forwards rdbuf()/str() to it.
	template <class _CharT, class _Traits, class _Allocator>
	class _LIBCPP_TEMPLATE_VIS basic_stringstream
	    : public basic_iostream<_CharT, _Traits>
	{
	public:
	    typedef _CharT                         char_type;
	    typedef _Traits                        traits_type;
	    typedef typename traits_type::int_type int_type;
	    typedef typename traits_type::pos_type pos_type;
	    typedef typename traits_type::off_type off_type;
	    typedef _Allocator                     allocator_type;

	    typedef basic_string<char_type, traits_type, allocator_type> string_type;

	private:
	    // The string buffer this stream reads from and writes to.
	    basic_stringbuf<char_type, traits_type, allocator_type> __sb_;

	public:
	    // Constructors:
	    inline _LIBCPP_INLINE_VISIBILITY
	    explicit basic_stringstream(ios_base::openmode __wch = ios_base::in | ios_base::out);
	    inline _LIBCPP_INLINE_VISIBILITY
	    explicit basic_stringstream(const string_type& __s,
	                                ios_base::openmode __wch = ios_base::in | ios_base::out);
	#ifndef _LIBCPP_CXX03_LANG
	    inline _LIBCPP_INLINE_VISIBILITY
	    basic_stringstream(basic_stringstream&& __rhs);

	    // 27.8.5.1 Assign and swap:
	    basic_stringstream& operator=(basic_stringstream&& __rhs);
	#endif // _LIBCPP_CXX03_LANG
	    inline _LIBCPP_INLINE_VISIBILITY
	    void swap(basic_stringstream& __rhs);

	    // Members:
	    inline _LIBCPP_INLINE_VISIBILITY
	    basic_stringbuf<char_type, traits_type, allocator_type>* rdbuf() const;
	    inline _LIBCPP_INLINE_VISIBILITY
	    string_type str() const;
	    inline _LIBCPP_INLINE_VISIBILITY
	    void str(const string_type& __s);
	};

	// Constructs an empty string stream; the requested mode is passed to the
	// buffer unchanged.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringstream<_CharT, _Traits, _Allocator>::basic_stringstream(ios_base::openmode __wch)
	    : basic_iostream<_CharT, _Traits>(&__sb_), __sb_(__wch)
	{
	}

	// Constructs a string stream whose buffer is initialised from __s; the
	// requested mode is passed to the buffer unchanged.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringstream<_CharT, _Traits, _Allocator>::basic_stringstream(const string_type& __s,
	                                                                    ios_base::openmode __wch)
	    : basic_iostream<_CharT, _Traits>(&__sb_), __sb_(__s, __wch)
	{
	}

	#ifndef _LIBCPP_CXX03_LANG

	// Move constructor: moves the stream base and the buffer, then points the
	// stream at its own buffer member (through the basic_istream base).
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringstream<_CharT, _Traits, _Allocator>::basic_stringstream(basic_stringstream&& __rhs)
	    : basic_iostream<_CharT, _Traits>(_VSTD::move(__rhs)),
	      __sb_(_VSTD::move(__rhs.__sb_))
	{
	    typedef basic_istream<_CharT, _Traits> _IBase;
	    _IBase::set_rdbuf(&__sb_);
	}

	// Move assignment: move-assigns the stream base first, then the buffer.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringstream<_CharT, _Traits, _Allocator>&
	basic_stringstream<_CharT, _Traits, _Allocator>::operator=(basic_stringstream&& __rhs)
	{
	    typedef basic_iostream<char_type, traits_type> _Base;
	    _Base::operator=(_VSTD::move(__rhs));
	    __sb_ = _VSTD::move(__rhs.__sb_);
	    return *this;
	}

	#endif // _LIBCPP_CXX03_LANG

	// Swaps the stream base state first, then the two string buffers.
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_stringstream<_CharT, _Traits, _Allocator>::swap(basic_stringstream& __rhs)
	{
	    typedef basic_iostream<char_type, traits_type> _Base;
	    _Base::swap(__rhs);
	    __sb_.swap(__rhs.__sb_);
	}

	// Non-member swap for basic_stringstream; forwards to the member swap of __x.
	template <class _CharT, class _Traits, class _Allocator>
	inline _LIBCPP_INLINE_VISIBILITY
	void swap(basic_stringstream<_CharT, _Traits, _Allocator>& __x,
	          basic_stringstream<_CharT, _Traits, _Allocator>& __y)
	{
	    __x.swap(__y);
	}

	// Returns a non-const pointer to the stream's own string buffer; constness
	// of *this is cast away.
	template <class _CharT, class _Traits, class _Allocator>
	basic_stringbuf<_CharT, _Traits, _Allocator>*
	basic_stringstream<_CharT, _Traits, _Allocator>::rdbuf() const
	{
	    typedef basic_stringbuf<char_type, traits_type, allocator_type> _Buf;
	    return const_cast<_Buf*>(&__sb_);
	}

	// Returns a copy of the buffer contents, as produced by the buffer's str().
	template <class _CharT, class _Traits, class _Allocator>
	basic_string<_CharT, _Traits, _Allocator>
	basic_stringstream<_CharT, _Traits, _Allocator>::str() const
	{
	    return this->__sb_.str();
	}

	// Replaces the buffer contents with __s via the buffer's str(__s).
	template <class _CharT, class _Traits, class _Allocator>
	void
	basic_stringstream<_CharT, _Traits, _Allocator>::str(const string_type& __s)
	{
	    this->__sb_.str(__s);
	}

	_LIBCPP_END_NAMESPACE_STD

	_LIBCPP_POP_MACROS

	#endif // _LIBCPP_SSTREAM
	Index: head/contrib/libc++
	===================================================================
	--- head/contrib/libc++ (revision 322319)
	+++ head/contrib/libc++ (revision 322320)

	Property changes on: head/contrib/libc++
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/libc++/dist:r321713-322300
	Index: head/contrib/llvm/include/llvm/Analysis/ValueTracking.h
	===================================================================
	--- head/contrib/llvm/include/llvm/Analysis/ValueTracking.h (revision 322319)
	+++ head/contrib/llvm/include/llvm/Analysis/ValueTracking.h (revision 322320)
	@@ -1,532 +1,538 @@
	//===- llvm/Analysis/ValueTracking.h - Walk computations --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains routines that help analyze properties that chains of
	// computations have.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_ANALYSIS_VALUETRACKING_H
	#define LLVM_ANALYSIS_VALUETRACKING_H

	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/Support/DataTypes.h"

	namespace llvm {
	template <typename T> class ArrayRef;
	class APInt;
	class AddOperator;
	class AssumptionCache;
	class DataLayout;
	class DominatorTree;
	class GEPOperator;
	class Instruction;
	struct KnownBits;
	class Loop;
	class LoopInfo;
	class OptimizationRemarkEmitter;
	class MDNode;
	class StringRef;
	class TargetLibraryInfo;
	class Value;

	namespace Intrinsic {
	enum ID : unsigned;
	}

	/// Determine which bits of V are known to be either zero or one and return
	/// them in \p Known.
	///
	/// This function is defined on values with integer type, values with pointer
	/// type, and vectors of integers. In the case
	/// where V is a vector, the known zero and known one values are the
	/// same width as the vector element, and the bit is set only if it is true
	/// for all of the elements in the vector.
	void computeKnownBits(const Value *V, KnownBits &Known,
	const DataLayout &DL, unsigned Depth = 0,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr,
	OptimizationRemarkEmitter *ORE = nullptr);
	/// Returns the known bits rather than passing by reference.
	KnownBits computeKnownBits(const Value *V, const DataLayout &DL,
	unsigned Depth = 0, AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr,
	OptimizationRemarkEmitter *ORE = nullptr);
	/// Compute known bits from the range metadata.
	/// \p Known receives the set of bits that are known to be zero and the set
	/// of bits that are known to be one.
	void computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
	KnownBits &Known);
	/// Return true if LHS and RHS have no common bits set.
	bool haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
	                         const DataLayout &DL,
	                         AssumptionCache *AC = nullptr,
	                         const Instruction *CxtI = nullptr,
	                         const DominatorTree *DT = nullptr);

	/// Return true if the given value is known to have exactly one bit set when
	/// defined. For vectors return true if every element is known to be a power
	/// of two when defined. Supports values with integer or pointer type and
	/// vectors of integers. If 'OrZero' is set, then return true if the given
	/// value is either a power of two or zero.
	bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
	bool OrZero = false, unsigned Depth = 0,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI);

	/// Return true if the given value is known to be non-zero when defined. For
	/// vectors, return true if every element is known to be non-zero when
	/// defined. For pointers, if the context instruction and dominator tree are
	/// specified, perform context-sensitive analysis and return true if the
	/// pointer couldn't possibly be null at the specified instruction.
	/// Supports values with integer or pointer type and vectors of integers.
	bool isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth = 0,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Returns true if the given value is known to be non-negative.
	bool isKnownNonNegative(const Value *V, const DataLayout &DL,
	unsigned Depth = 0,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Returns true if the given value is known to be positive (i.e. non-negative
	/// and non-zero).
	bool isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth = 0,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Returns true if the given value is known to be negative (i.e. non-positive
	/// and non-zero).
	bool isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth = 0,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Return true if the given values are known to be non-equal when defined.
	/// Supports scalar integer types only.
	bool isKnownNonEqual(const Value *V1, const Value *V2, const DataLayout &DL,
	                     AssumptionCache *AC = nullptr,
	                     const Instruction *CxtI = nullptr,
	                     const DominatorTree *DT = nullptr);

	/// Return true if 'V & Mask' is known to be zero. We use this predicate to
	/// simplify operations downstream. Mask is known to be zero for bits that V
	/// cannot have.
	///
	/// This function is defined on values with integer type, values with pointer
	/// type, and vectors of integers. In the case
	/// where V is a vector, the mask, known zero, and known one values are the
	/// same width as the vector element, and the bit is set only if it is true
	/// for all of the elements in the vector.
	bool MaskedValueIsZero(const Value *V, const APInt &Mask,
	const DataLayout &DL,
	unsigned Depth = 0, AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Return the number of times the sign bit of the register is replicated into
	/// the other bits. We know that at least 1 bit is always equal to the sign
	/// bit (itself), but other cases can give us information. For example,
	/// immediately after an "ashr X, 2", we know that the top 3 bits are all
	/// equal to each other, so we return 3. For vectors, return the number of
	/// sign bits for the vector element with the minimum number of known sign
	/// bits.
	unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL,
	unsigned Depth = 0, AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// This function computes the integer multiple of Base that equals V. If
	/// successful, it returns true and returns the multiple in Multiple. If
	/// unsuccessful, it returns false. Also, if V can be simplified to an
	/// integer, then the simplified V is returned in Val. Look through sext only
	/// if LookThroughSExt=true.
	bool ComputeMultiple(Value V, unsigned Base, Value &Multiple,
	bool LookThroughSExt = false,
	unsigned Depth = 0);

	/// Map a call instruction to an intrinsic ID. Libcalls which have equivalent
	/// intrinsics are treated as-if they were intrinsics.
	Intrinsic::ID getIntrinsicForCallSite(ImmutableCallSite ICS,
	const TargetLibraryInfo *TLI);

	/// Return true if we can prove that the specified FP value is never equal to
	/// -0.0.
	bool CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
	                          unsigned Depth = 0);

	/// Return true if we can prove that the specified FP value is either NaN or
	/// never less than -0.0.
	///
	///      NaN --> true
	///       +0 --> true
	///       -0 --> true
	///   x > +0 --> true
	///   x < -0 --> false
	///
	bool CannotBeOrderedLessThanZero(const Value *V, const TargetLibraryInfo *TLI);

	/// Return true if we can prove that the specified FP value's sign bit is 0.
	///
	///      NaN --> true/false (depending on the NaN's sign bit)
	///       +0 --> true
	///       -0 --> false
	///   x > +0 --> true
	///   x < -0 --> false
	///
	bool SignBitMustBeZero(const Value *V, const TargetLibraryInfo *TLI);

	/// If the specified value can be set by repeating the same byte in memory,
	/// return the i8 value that it is represented with. This is true for all i8
	/// values obviously, but is also true for i32 0, i32 -1, i16 0xF0F0, double
	/// 0.0 etc. If the value can't be handled with a repeated byte store (e.g.
	/// i16 0x1234), return null.
	Value *isBytewiseValue(Value *V);

	/// Given an aggregate and a sequence of indices, see if the scalar value
	/// indexed is already around as a register, for example if it were inserted
	/// directly into the aggregate.
	///
	/// If InsertBefore is not null, this function will duplicate (modified)
	/// insertvalues when a part of a nested struct is extracted.
	Value *FindInsertedValue(Value *V,
	                         ArrayRef<unsigned> idx_range,
	                         Instruction *InsertBefore = nullptr);

	/// Analyze the specified pointer to see if it can be expressed as a base
	/// pointer plus a constant offset. Return the base and offset to the caller.
	Value *GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset,
	                                        const DataLayout &DL);
	/// Overload for const pointers: strips constness from \p Ptr, forwards to
	/// the non-const GetPointerBaseWithConstantOffset, and hands the result back
	/// as a pointer-to-const.
	static inline const Value *
	GetPointerBaseWithConstantOffset(const Value *Ptr, int64_t &Offset,
	                                 const DataLayout &DL) {
	  Value *MutablePtr = const_cast<Value *>(Ptr);
	  return GetPointerBaseWithConstantOffset(MutablePtr, Offset, DL);
	}

	/// Returns true if the GEP is based on a pointer to a string (array of
	/// \p CharSize integers) and is indexing into this string.
	bool isGEPBasedOnPointerToString(const GEPOperator *GEP,
	unsigned CharSize = 8);

	/// An (offset, length) window into a ConstantDataArray.
	struct ConstantDataArraySlice {
	  /// The backing ConstantDataArray. A null pointer stands for a
	  /// zeroinitializer, which is a valid initializer but does not fit the
	  /// ConstantDataArray interface.
	  const ConstantDataArray *Array;
	  /// Position at which the slice begins.
	  uint64_t Offset;
	  /// Number of elements in the slice.
	  uint64_t Length;

	  /// Advance the start of the slice by \p Delta, shrinking Length by the
	  /// same amount. \p Delta must be strictly less than Length.
	  void move(uint64_t Delta) {
	    assert(Delta < Length);
	    Length -= Delta;
	    Offset += Delta;
	  }

	  /// Read element \p I of the slice; a null Array yields 0 for every index.
	  uint64_t operator[](unsigned I) const {
	    if (Array == nullptr)
	      return 0;
	    return Array->getElementAsInteger(I + Offset);
	  }
	};

	/// Returns true if the value \p V is a pointer into a ConstantDataArray.
	/// If successful \p Slice will point to a ConstantDataArray info object
	/// with an appropriate offset.
	bool getConstantDataArrayInfo(const Value *V, ConstantDataArraySlice &Slice,
	unsigned ElementSize, uint64_t Offset = 0);

	/// This function computes the length of a null-terminated C string pointed to
	/// by V. If successful, it returns true and returns the string in Str. If
	/// unsuccessful, it returns false. This does not include the trailing null
	/// character by default. If TrimAtNul is set to false, then this returns any
	/// trailing null characters as well as any other characters that come after
	/// it.
	bool getConstantStringInfo(const Value *V, StringRef &Str,
	uint64_t Offset = 0, bool TrimAtNul = true);

	/// If we can compute the length of the string pointed to by the specified
	/// pointer, return 'len+1'. If we can't, return 0.
	uint64_t GetStringLength(const Value *V, unsigned CharSize = 8);

	/// This method strips off any GEP address adjustments and pointer casts from
	/// the specified value, returning the original object being addressed. Note
	/// that the returned value has pointer type if the specified value does. If
	/// the MaxLookup value is non-zero, it limits the number of instructions to
	/// be stripped off.
	Value *GetUnderlyingObject(Value *V, const DataLayout &DL,
	                           unsigned MaxLookup = 6);
	/// Overload for const pointers: casts away constness, forwards to the
	/// non-const GetUnderlyingObject, and returns the result as pointer-to-const.
	static inline const Value *GetUnderlyingObject(const Value *V,
	                                               const DataLayout &DL,
	                                               unsigned MaxLookup = 6) {
	  return GetUnderlyingObject(const_cast<Value *>(V), DL, MaxLookup);
	}

	/// \brief This method is similar to GetUnderlyingObject except that it can
	/// look through phi and select instructions and return multiple objects.
	///
	/// If LoopInfo is passed, loop phis are further analyzed. If a pointer
	/// accesses different objects in each iteration, we don't look through the
	/// phi node. E.g. consider this loop nest:
	///
	/// int **A;
	/// for (i)
	/// for (j) {
	/// A[i][j] = A[i-1][j] * B[j]
	/// }
	///
	/// This is transformed by Load-PRE to stash away A[i] for the next iteration
	/// of the outer loop:
	///
	/// Curr = A[0]; // Prev_0
	/// for (i: 1..N) {
	/// Prev = Curr; // Prev = PHI (Prev_0, Curr)
	/// Curr = A[i];
	/// for (j: 0..N) {
	/// Curr[j] = Prev[j] * B[j]
	/// }
	/// }
	///
	/// Since A[i] and A[i-1] are independent pointers, getUnderlyingObjects
	/// should not assume that Curr and Prev share the same underlying object thus
	/// it shouldn't look through the phi above.
	void GetUnderlyingObjects(Value V, SmallVectorImpl<Value > &Objects,
	const DataLayout &DL, LoopInfo *LI = nullptr,
	unsigned MaxLookup = 6);

	+ /// This is a wrapper around GetUnderlyingObjects and adds support for basic
	+ /// ptrtoint+arithmetic+inttoptr sequences.
	+ void getUnderlyingObjectsForCodeGen(const Value *V,
	+ SmallVectorImpl<Value *> &Objects,
	+ const DataLayout &DL);
	+
	/// Return true if the only users of this pointer are lifetime markers.
	bool onlyUsedByLifetimeMarkers(const Value *V);

	/// Return true if the instruction does not have any effects besides
	/// calculating the result and does not have undefined behavior.
	///
	/// This method never returns true for an instruction that returns true for
	/// mayHaveSideEffects; however, this method also does some other checks in
	/// addition. It checks for undefined behavior, like dividing by zero or
	/// loading from an invalid pointer (but not for undefined results, like a
	/// shift with a shift amount larger than the width of the result). It checks
	/// for malloc and alloca because speculatively executing them might cause a
	/// memory leak. It also returns false for instructions related to control
	/// flow, specifically terminators and PHI nodes.
	///
	/// If the CtxI is specified this method performs context-sensitive analysis
	/// and returns true if it is safe to execute the instruction immediately
	/// before the CtxI.
	///
	/// If the CtxI is NOT specified this method only looks at the instruction
	/// itself and its operands, so if this method returns true, it is safe to
	/// move the instruction as long as the correct dominance relationships for
	/// the operands and users hold.
	///
	/// This method can return true for instructions that read memory;
	/// for such instructions, moving them may change the resulting value.
	bool isSafeToSpeculativelyExecute(const Value *V,
	const Instruction *CtxI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Returns true if the result or effects of the given instructions \p I
	/// depend on or influence global memory.
	/// Memory dependence arises for example if the instruction reads from
	/// memory or may produce effects or undefined behaviour. Memory dependent
	/// instructions generally cannot be reordered with respect to other memory
	/// dependent instructions or moved into non-dominated basic blocks.
	/// Instructions which just compute a value based on the values of their
	/// operands are not memory dependent.
	bool mayBeMemoryDependent(const Instruction &I);

	/// Return true if this pointer couldn't possibly be null by its definition.
	/// This returns true for allocas, non-extern-weak globals, and byval
	/// arguments.
	bool isKnownNonNull(const Value *V);

	/// Return true if this pointer couldn't possibly be null. If the context
	/// instruction and dominator tree are specified, perform context-sensitive
	/// analysis and return true if the pointer couldn't possibly be null at the
	/// specified instruction.
	bool isKnownNonNullAt(const Value *V,
	const Instruction *CtxI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Return true if it is valid to use the assumptions provided by an
	/// assume intrinsic, I, at the point in the control-flow identified by the
	/// context instruction, CxtI.
	bool isValidAssumeForContext(const Instruction *I, const Instruction *CxtI,
	                             const DominatorTree *DT = nullptr);

	/// Result type of the computeOverflowFor* queries declared below.
	enum class OverflowResult { AlwaysOverflows, MayOverflow, NeverOverflows };
	OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
	const Value *RHS,
	const DataLayout &DL,
	AssumptionCache *AC,
	const Instruction *CxtI,
	const DominatorTree *DT);
	OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
	const Value *RHS,
	const DataLayout &DL,
	AssumptionCache *AC,
	const Instruction *CxtI,
	const DominatorTree *DT);
	/// Classify whether the signed addition of \p LHS and \p RHS overflows.
	OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS,
	                                           const DataLayout &DL,
	                                           AssumptionCache *AC = nullptr,
	                                           const Instruction *CxtI = nullptr,
	                                           const DominatorTree *DT = nullptr);
	/// This version also leverages the sign bit of Add if known.
	OverflowResult computeOverflowForSignedAdd(const AddOperator *Add,
	const DataLayout &DL,
	AssumptionCache *AC = nullptr,
	const Instruction *CxtI = nullptr,
	const DominatorTree *DT = nullptr);

	/// Returns true if the arithmetic part of the \p II 's result is
	/// used only along the paths control dependent on the computation
	/// not overflowing, \p II being an <op>.with.overflow intrinsic.
	bool isOverflowIntrinsicNoWrap(const IntrinsicInst *II,
	const DominatorTree &DT);

	/// Return true if this function can prove that the instruction I will
	/// always transfer execution to one of its successors (including the next
	/// instruction that follows within a basic block). E.g. this is not
	/// guaranteed for function calls that could loop infinitely.
	///
	/// In other words, this function returns false for instructions that may
	/// transfer execution or fail to transfer execution in a way that is not
	/// captured in the CFG nor in the sequence of instructions within a basic
	/// block.
	///
	/// Undefined behavior is assumed not to happen, so e.g. division is
	/// guaranteed to transfer execution to the following instruction even
	/// though division by zero might cause undefined behavior.
	bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I);

	/// Return true if this function can prove that the instruction I
	/// is executed for every iteration of the loop L.
	///
	/// Note that this currently only considers the loop header.
	bool isGuaranteedToExecuteForEveryIteration(const Instruction *I,
	const Loop *L);

	/// Return true if this function can prove that I is guaranteed to yield
	/// full-poison (all bits poison) if at least one of its operands are
	/// full-poison (all bits poison).
	///
	/// The exact rules for how poison propagates through instructions have
	/// not been settled as of 2015-07-10, so this function is conservative
	/// and only considers poison to be propagated in uncontroversial
	/// cases. There is no attempt to track values that may be only partially
	/// poison.
	bool propagatesFullPoison(const Instruction *I);

	/// Return either nullptr or an operand of I such that I will trigger
	/// undefined behavior if I is executed and that operand has a full-poison
	/// value (all bits poison).
	const Value *getGuaranteedNonFullPoisonOp(const Instruction *I);

	/// Return true if this function can prove that if PoisonI is executed
	/// and yields a full-poison value (all bits poison), then that will
	/// trigger undefined behavior.
	///
	/// Note that this currently only considers the basic block that is
	/// the parent of I.
	bool programUndefinedIfFullPoison(const Instruction *PoisonI);

	/// \brief Specific patterns of select instructions we can match.
	enum SelectPatternFlavor {
	SPF_UNKNOWN = 0,
	SPF_SMIN, ///< Signed minimum
	SPF_UMIN, ///< Unsigned minimum
	SPF_SMAX, ///< Signed maximum
	SPF_UMAX, ///< Unsigned maximum
	SPF_FMINNUM, ///< Floating point minnum
	SPF_FMAXNUM, ///< Floating point maxnum
	SPF_ABS, ///< Absolute value
	SPF_NABS ///< Negated absolute value
	};
	/// \brief Behavior when a floating point min/max is given one NaN and one
	/// non-NaN as input.
	enum SelectPatternNaNBehavior {
	SPNB_NA = 0, ///< NaN behavior not applicable.
	SPNB_RETURNS_NAN, ///< Given one NaN input, returns the NaN.
	SPNB_RETURNS_OTHER, ///< Given one NaN input, returns the non-NaN.
	SPNB_RETURNS_ANY ///< Given one NaN input, can return either (or
	///< it has been determined that no operands can
	///< be NaN).
	};
	/// Result of matching a select pattern: the kind of pattern plus, for the
	/// floating point flavors, how NaN inputs and comparison ordering are handled.
	struct SelectPatternResult {
	  SelectPatternFlavor Flavor;
	  SelectPatternNaNBehavior NaNBehavior; ///< Only applicable if Flavor is
	                                        ///< SPF_FMINNUM or SPF_FMAXNUM.
	  bool Ordered;                         ///< When implementing this min/max
	                                        ///< pattern as fcmp; select, does the
	                                        ///< fcmp have to be ordered?

	  /// \brief Return true if \p SPF is a min or a max pattern, i.e. anything
	  /// other than SPF_UNKNOWN, SPF_ABS and SPF_NABS.
	  static bool isMinOrMax(SelectPatternFlavor SPF) {
	    return !(SPF == SPF_UNKNOWN || SPF == SPF_ABS || SPF == SPF_NABS);
	  }
	};
	/// Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind
	/// and providing the out parameter results if we successfully match.
	///
	/// If CastOp is not nullptr, also match MIN/MAX idioms where the type does
	/// not match that of the original select. If this is the case, the cast
	/// operation (one of Trunc,SExt,Zext) that must be done to transform the
	/// type of LHS and RHS into the type of V is returned in CastOp.
	///
	/// For example:
	///   %1 = icmp slt i32 %a, i32 4
	///   %2 = sext i32 %a to i64
	///   %3 = select i1 %1, i64 %2, i64 4
	///
	/// -> LHS = %a, RHS = i32 4, *CastOp = Instruction::SExt
	///
	SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
	                                       Instruction::CastOps *CastOp = nullptr);
	/// Overload of matchSelectPattern for const values. It casts away constness,
	/// forwards to the non-const overload, and writes the matched operands back
	/// through \p LHS and \p RHS.
	///
	/// \p CastOp is forwarded so a caller that supplies it actually receives the
	/// cast operation; previously the argument was accepted but silently ignored.
	static inline SelectPatternResult
	matchSelectPattern(const Value *V, const Value *&LHS, const Value *&RHS,
	                   Instruction::CastOps *CastOp = nullptr) {
	  Value *L = const_cast<Value *>(LHS);
	  Value *R = const_cast<Value *>(RHS);
	  auto Result = matchSelectPattern(const_cast<Value *>(V), L, R, CastOp);
	  LHS = L;
	  RHS = R;
	  return Result;
	}

	/// Return true if RHS is known to be implied true by LHS. Return false if
	/// RHS is known to be implied false by LHS. Otherwise, return None if no
	/// implication can be made.
	/// A & B must be i1 (boolean) values or a vector of such values. Note that
	/// the truth table for implication is the same as <=u on i1 values (but not
	/// <=s!). The truth table for both is:
	///    | T | F (B)
	///  T | T | F
	///  F | T | T
	/// (A)
	Optional<bool> isImpliedCondition(const Value *LHS, const Value *RHS,
	                                  const DataLayout &DL,
	                                  bool LHSIsFalse = false, unsigned Depth = 0,
	                                  AssumptionCache *AC = nullptr,
	                                  const Instruction *CxtI = nullptr,
	                                  const DominatorTree *DT = nullptr);
	} // end namespace llvm

	#endif
	Index: head/contrib/llvm/include/llvm/CodeGen/MachineFunction.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/MachineFunction.h (revision 322319)
	+++ head/contrib/llvm/include/llvm/CodeGen/MachineFunction.h (revision 322320)
	@@ -1,920 +1,926 @@
	//===- llvm/CodeGen/MachineFunction.h ---------------------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Collect native machine code for a function. This class contains a list of
	// MachineBasicBlock instances that make up the current compiled function.
	//
	// This class also contains pointers to various classes which hold
	// target-specific information about the generated code.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_MACHINEFUNCTION_H
	#define LLVM_CODEGEN_MACHINEFUNCTION_H

	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/GraphTraits.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/ilist.h"
	#include "llvm/ADT/iterator.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/Allocator.h"
	#include "llvm/Support/ArrayRecycler.h"
	#include "llvm/Support/AtomicOrdering.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/Recycler.h"
	#include <cassert>
	#include <cstdint>
	#include <memory>
	#include <utility>
	#include <vector>

	namespace llvm {

	class BasicBlock;
	class BlockAddress;
	class DataLayout;
	class DIExpression;
	class DILocalVariable;
	class DILocation;
	class Function;
	class GlobalValue;
	class MachineConstantPool;
	class MachineFrameInfo;
	class MachineFunction;
	class MachineJumpTableInfo;
	class MachineModuleInfo;
	class MachineRegisterInfo;
	class MCContext;
	class MCInstrDesc;
	class Pass;
	class PseudoSourceValueManager;
	class raw_ostream;
	class SlotIndexes;
	class TargetMachine;
	class TargetRegisterClass;
	class TargetSubtargetInfo;
	struct WinEHFuncInfo;

	// Specialization of ilist_alloc_traits for MachineBasicBlock. deleteNode is
	// only declared here; its definition is not part of this header excerpt.
	template <> struct ilist_alloc_traits<MachineBasicBlock> {
	void deleteNode(MachineBasicBlock *MBB);
	};

	// Specialization of ilist_callback_traits for MachineBasicBlock.
	template <> struct ilist_callback_traits<MachineBasicBlock> {
	  // Declared here, defined out of line.
	  void addNodeToList(MachineBasicBlock *MBB);
	  void removeNodeFromList(MachineBasicBlock *MBB);

	  // Transferring blocks between lists is not supported: reaching this
	  // callback is flagged as unreachable.
	  template <class IteratorT>
	  void transferNodesFromList(ilist_callback_traits &OldList, IteratorT,
	                             IteratorT) {
	    llvm_unreachable("Never transfer between lists");
	  }
	};

	/// MachineFunctionInfo - Base class that targets derive from to keep private,
	/// target-specific information for each MachineFunction. Objects of this
	/// type are accessed/created with MF::getInfo and destroyed when the
	/// MachineFunction is destroyed.
	struct MachineFunctionInfo {
	  virtual ~MachineFunctionInfo();

	  /// \brief Factory function: by default, obtains storage for a \p Ty from
	  /// the supplied allocator and constructs the object there from \p MF.
	  ///
	  /// This function can be overridden in a derived class.
	  template <typename Ty>
	  static Ty *create(BumpPtrAllocator &Allocator, MachineFunction &MF) {
	    Ty *Storage = Allocator.Allocate<Ty>();
	    return new (Storage) Ty(MF);
	  }
	};

	/// Properties which a MachineFunction may have at a given point in time.
	/// Each of these has checking code in the MachineVerifier, and passes can
	/// require that a property be set.
	class MachineFunctionProperties {
	  // Possible TODO: Allow targets to extend this (perhaps by allowing the
	  // constructor to specify the size of the bit vector)
	  // Possible TODO: Allow requiring the negative (e.g. VRegsAllocated could be
	  // stated as the negative of "has vregs")

	public:
	  // The properties are stated in "positive" form; i.e. a pass could require
	  // that the property hold, but not that it does not hold.

	  // Property descriptions:
	  // IsSSA: True when the machine function is in SSA form and virtual registers
	  //  have a single def.
	  // NoPHIs: The machine function does not contain any PHI instruction.
	  // TracksLiveness: True when tracking register liveness accurately.
	  //  While this property is set, register liveness information in basic block
	  //  live-in lists and machine instruction operands (e.g. kill flags, implicit
	  //  defs) is accurate. This means it can be used to change the code in ways
	  //  that affect the values in registers, for example by the register
	  //  scavenger.
	  //  When this property is clear, liveness is no longer reliable.
	  // NoVRegs: The machine function does not use any virtual registers.
	  // FailedISel: NOTE(review): not described here; presumably records that
	  //  instruction selection failed for this function -- confirm.
	  // Legalized: In GlobalISel: the MachineLegalizer ran and all pre-isel generic
	  //  instructions have been legalized; i.e., all instructions are now one of:
	  //   - generic and always legal (e.g., COPY)
	  //   - target-specific
	  //   - legal pre-isel generic instructions.
	  // RegBankSelected: In GlobalISel: the RegBankSelect pass ran and all generic
	  //  virtual registers have been assigned to a register bank.
	  // Selected: In GlobalISel: the InstructionSelect pass ran and all pre-isel
	  //  generic instructions have been eliminated; i.e., all instructions are now
	  //  target-specific or non-pre-isel generic instructions (e.g., COPY).
	  //  Since only pre-isel generic instructions can have generic virtual register
	  //  operands, this also means that all generic virtual registers have been
	  //  constrained to virtual registers (assigned to register classes) and that
	  //  all sizes attached to them have been eliminated.
	  enum class Property : unsigned {
	    IsSSA,
	    NoPHIs,
	    TracksLiveness,
	    NoVRegs,
	    FailedISel,
	    Legalized,
	    RegBankSelected,
	    Selected,
	    LastProperty = Selected,
	  };

	  /// Return true if property \p P is currently set.
	  bool hasProperty(Property P) const {
	    return Properties[static_cast<unsigned>(P)];
	  }

	  /// Set property \p P. Returns *this so calls can be chained.
	  MachineFunctionProperties &set(Property P) {
	    Properties.set(static_cast<unsigned>(P));
	    return *this;
	  }

	  /// Clear property \p P. Returns *this so calls can be chained.
	  MachineFunctionProperties &reset(Property P) {
	    Properties.reset(static_cast<unsigned>(P));
	    return *this;
	  }

	  /// Reset all the properties.
	  MachineFunctionProperties &reset() {
	    Properties.reset();
	    return *this;
	  }

	  /// Set every property that is set in \p MFP (bitwise OR of the two sets).
	  MachineFunctionProperties &set(const MachineFunctionProperties &MFP) {
	    Properties |= MFP.Properties;
	    return *this;
	  }

	  /// Clear every property that is set in \p MFP.
	  MachineFunctionProperties &reset(const MachineFunctionProperties &MFP) {
	    Properties.reset(MFP.Properties);
	    return *this;
	  }

	  // Returns true if all properties set in V (i.e. required by a pass) are set
	  // in this.
	  bool verifyRequiredProperties(const MachineFunctionProperties &V) const {
	    return !V.Properties.test(Properties);
	  }

	  /// Print the MachineFunctionProperties in human-readable form.
	  void print(raw_ostream &OS) const;

	private:
	  // One bit per Property enumerator, sized from LastProperty.
	  BitVector Properties =
	      BitVector(static_cast<unsigned>(Property::LastProperty)+1);
	};

	/// Describes one SEH handler; LandingPadInfo keeps a list of these for the
	/// handlers active at a landing pad.
	struct SEHHandler {
	/// Filter or finally function. Null indicates a catch-all.
	const Function *FilterOrFinally;

	/// Address of block to recover at. Null for a finally handler.
	const BlockAddress *RecoverBA;
	};

	/// This structure is used to retain landing pad info for the current function.
	struct LandingPadInfo {
	  /// Landing pad block.
	  MachineBasicBlock *LandingPadBlock;
	  /// Labels prior to invoke.
	  SmallVector<MCSymbol *, 1> BeginLabels;
	  /// Labels after invoke.
	  SmallVector<MCSymbol *, 1> EndLabels;
	  /// SEH handlers active at this lpad.
	  SmallVector<SEHHandler, 1> SEHHandlers;
	  /// Label at beginning of landing pad.
	  MCSymbol *LandingPadLabel = nullptr;
	  /// List of type ids (filters negative).
	  std::vector<int> TypeIds;

	  /// Create the record for landing pad block \p MBB; the label lists,
	  /// handler list and type ids start out empty.
	  explicit LandingPadInfo(MachineBasicBlock *MBB) : LandingPadBlock(MBB) {}
	};

	class MachineFunction {
	const Function *Fn;
	const TargetMachine &Target;
	const TargetSubtargetInfo *STI;
	MCContext &Ctx;
	MachineModuleInfo &MMI;

	// RegInfo - Information about each register in use in the function.
	MachineRegisterInfo *RegInfo;

	// Used to keep track of target-specific per-machine function information for
	// the target implementation.
	MachineFunctionInfo *MFInfo;

	// Keep track of objects allocated on the stack.
	MachineFrameInfo *FrameInfo;

	// Keep track of constants which are spilled to memory
	MachineConstantPool *ConstantPool;

	// Keep track of jump tables for switch instructions
	MachineJumpTableInfo *JumpTableInfo;

	// Keeps track of Windows exception handling related data. This will be null
	// for functions that aren't using a funclet-based EH personality.
	WinEHFuncInfo *WinEHInfo = nullptr;

	// Function-level unique numbering for MachineBasicBlocks. When a
	// MachineBasicBlock is inserted into a MachineFunction is it automatically
	// numbered and this vector keeps track of the mapping from ID's to MBB's.
	std::vector<MachineBasicBlock*> MBBNumbering;

	// Pool-allocate MachineFunction-lifetime and IR objects.
	BumpPtrAllocator Allocator;

	// Allocation management for instructions in function.
	Recycler<MachineInstr> InstructionRecycler;

	// Allocation management for operand arrays on instructions.
	ArrayRecycler<MachineOperand> OperandRecycler;

	// Allocation management for basic blocks in function.
	Recycler<MachineBasicBlock> BasicBlockRecycler;

	// List of machine basic blocks in function
	using BasicBlockListType = ilist<MachineBasicBlock>;
	BasicBlockListType BasicBlocks;

	/// FunctionNumber - This provides a unique ID for each function emitted in
	/// this translation unit.
	///
	unsigned FunctionNumber;

	/// Alignment - The alignment of the function.
	unsigned Alignment;

	/// ExposesReturnsTwice - True if the function calls setjmp or related
	/// functions with attribute "returns twice", but doesn't have
	/// the attribute itself.
	/// This is used to limit optimizations which cannot reason
	/// about the control flow of such functions.
	bool ExposesReturnsTwice = false;

	/// True if the function includes any inline assembly.
	bool HasInlineAsm = false;

	/// True if any WinCFI instructions have been emitted in this function.
	Optional<bool> HasWinCFI;

	/// Current high-level properties of the IR of the function (e.g. is in SSA
	/// form or whether registers have been allocated)
	MachineFunctionProperties Properties;

	// Allocation management for pseudo source values.
	std::unique_ptr<PseudoSourceValueManager> PSVManager;

	/// List of moves done by a function's prolog. Used to construct frame maps
	/// by debug and exception handling consumers.
	std::vector<MCCFIInstruction> FrameInstructions;

	/// \name Exception Handling
	/// \{

	/// List of LandingPadInfo describing the landing pad information.
	std::vector<LandingPadInfo> LandingPads;

	/// Map a landing pad's EH symbol to the call site indexes.
	DenseMap<MCSymbol*, SmallVector<unsigned, 4>> LPadToCallSiteMap;

	/// Map of invoke call site index values to associated begin EH_LABEL.
	DenseMap<MCSymbol*, unsigned> CallSiteMap;

	bool CallsEHReturn = false;
	bool CallsUnwindInit = false;
	bool HasEHFunclets = false;

	/// List of C++ TypeInfo used.
	std::vector<const GlobalValue *> TypeInfos;

	/// List of typeids encoding filters used.
	std::vector<unsigned> FilterIds;

	/// List of the indices in FilterIds corresponding to filter terminators.
	std::vector<unsigned> FilterEnds;

	EHPersonality PersonalityTypeCache = EHPersonality::Unknown;

	/// \}

	/// Clear all the members of this MachineFunction, but the ones used
	/// to initialize again the MachineFunction.
	/// More specifically, this deallocates all the dynamically allocated
	/// objects and get rid of all the XXXInfo data structure, but keep
	/// unchanged the references to Fn, Target, MMI, and FunctionNumber.
	void clear();
	/// Allocate and initialize the different members.
	/// In particular, the XXXInfo data structure.
	/// \pre Fn, Target, MMI, and FunctionNumber are properly set.
	void init();

	public:
	/// Information used to emit debugging information of one variable: the
	/// variable, its expression, a slot number and a source location.
	struct VariableDbgInfo {
	const DILocalVariable *Var;
	const DIExpression *Expr;
	unsigned Slot;
	const DILocation *Loc;

	// Var and Expr are taken by pointer, matching the pointer members they
	// initialize.
	VariableDbgInfo(const DILocalVariable *Var, const DIExpression *Expr,
	unsigned Slot, const DILocation *Loc)
	: Var(Var), Expr(Expr), Slot(Slot), Loc(Loc) {}
	};
	using VariableDbgInfoMapTy = SmallVector<VariableDbgInfo, 4>;
	VariableDbgInfoMapTy VariableDbgInfos;

	MachineFunction(const Function *Fn, const TargetMachine &TM,
	unsigned FunctionNum, MachineModuleInfo &MMI);
	MachineFunction(const MachineFunction &) = delete;
	MachineFunction &operator=(const MachineFunction &) = delete;
	~MachineFunction();

	/// Reset the instance as if it was just created.
	void reset() {
	clear();
	init();
	}

	MachineModuleInfo &getMMI() const { return MMI; }
	MCContext &getContext() const { return Ctx; }

	PseudoSourceValueManager &getPSVManager() const { return *PSVManager; }

	/// Return the DataLayout attached to the Module associated to this MF.
	const DataLayout &getDataLayout() const;

	/// getFunction - Return the LLVM function that this machine code represents
	const Function *getFunction() const { return Fn; }

	/// getName - Return the name of the corresponding LLVM function.
	StringRef getName() const;

	/// getFunctionNumber - Return a unique ID for the current function.
	unsigned getFunctionNumber() const { return FunctionNumber; }

	/// getTarget - Return the target machine this machine code is compiled with
	const TargetMachine &getTarget() const { return Target; }

	/// getSubtarget - Return the subtarget for which this machine code is being
	/// compiled.
	const TargetSubtargetInfo &getSubtarget() const { return *STI; }
	void setSubtarget(const TargetSubtargetInfo *ST) { STI = ST; }

	/// getSubtarget - This method returns a reference to the subtarget viewed
	/// as the specified type of TargetSubtargetInfo. The conversion is a plain
	/// static_cast, so the caller is responsible for requesting the correct
	/// type.
	template<typename STC> const STC &getSubtarget() const {
	return *static_cast<const STC *>(STI);
	}

	/// getRegInfo - Return information about the registers currently in use.
	MachineRegisterInfo &getRegInfo() { return *RegInfo; }
	const MachineRegisterInfo &getRegInfo() const { return *RegInfo; }

	/// getFrameInfo - Return the frame info object for the current function.
	/// This object contains information about objects allocated on the stack
	/// frame of the current function in an abstract way.
	MachineFrameInfo &getFrameInfo() { return *FrameInfo; }
	const MachineFrameInfo &getFrameInfo() const { return *FrameInfo; }

	/// getJumpTableInfo - Return the jump table info object for the current
	/// function. This object contains information about jump tables in the
	/// current function. If the current function has no jump tables, this will
	/// return null.
	const MachineJumpTableInfo *getJumpTableInfo() const { return JumpTableInfo; }
	MachineJumpTableInfo *getJumpTableInfo() { return JumpTableInfo; }

	/// getOrCreateJumpTableInfo - Get the JumpTableInfo for this function, if it
	/// does not already exist, allocate one.
	MachineJumpTableInfo *getOrCreateJumpTableInfo(unsigned JTEntryKind);

	/// getConstantPool - Return the constant pool object for the current
	/// function.
	MachineConstantPool *getConstantPool() { return ConstantPool; }
	const MachineConstantPool *getConstantPool() const { return ConstantPool; }

	/// getWinEHFuncInfo - Return information about how the current function uses
	/// Windows exception handling. Returns null for functions that don't use
	/// funclets for exception handling.
	const WinEHFuncInfo *getWinEHFuncInfo() const { return WinEHInfo; }
	WinEHFuncInfo *getWinEHFuncInfo() { return WinEHInfo; }

	/// getAlignment - Return the alignment (log2, not bytes) of the function.
	unsigned getAlignment() const { return Alignment; }

	/// setAlignment - Set the alignment (log2, not bytes) of the function.
	void setAlignment(unsigned A) { Alignment = A; }

	/// ensureAlignment - Raise the function's alignment (log2) to A when it is
	/// currently smaller, so the function is at least 1 << A bytes aligned.
	void ensureAlignment(unsigned A) {
	// Already aligned strictly enough: nothing to do.
	if (A <= Alignment)
	return;
	Alignment = A;
	}

	/// exposesReturnsTwice - Returns true if the function calls setjmp or
	/// any other similar functions with attribute "returns twice" without
	/// having the attribute itself.
	bool exposesReturnsTwice() const {
	return ExposesReturnsTwice;
	}

	/// setExposesReturnsTwice - Set a flag that indicates if there's a call to
	/// a "returns twice" function.
	void setExposesReturnsTwice(bool B) {
	ExposesReturnsTwice = B;
	}

	/// Returns true if the function contains any inline assembly.
	bool hasInlineAsm() const {
	return HasInlineAsm;
	}

	/// Set a flag that indicates that the function contains inline assembly.
	void setHasInlineAsm(bool B) {
	HasInlineAsm = B;
	}

	/// Returns true if any WinCFI instruction has been emitted in this function.
	/// Asserts that the flag has already been given a value (see setHasWinCFI).
	bool hasWinCFI() const {
	assert(HasWinCFI.hasValue() && "HasWinCFI not set yet!");
	return *HasWinCFI;
	}
	/// Record whether any WinCFI instruction has been emitted in this function.
	void setHasWinCFI(bool v) { HasWinCFI = v; }

	/// Get the function properties
	const MachineFunctionProperties &getProperties() const { return Properties; }
	MachineFunctionProperties &getProperties() { return Properties; }

	/// getInfo - Return the target-specific per-function information object as
	/// a Ty. If no such object exists yet, one is created through Ty::create
	/// using this function's allocator.
	///
	template<typename Ty>
	Ty *getInfo() {
	if (MFInfo == nullptr) {
	// First request: build the info object on demand.
	MFInfo = Ty::template create<Ty>(Allocator, *this);
	}
	return static_cast<Ty*>(MFInfo);
	}

	template<typename Ty>
	const Ty *getInfo() const {
	return const_cast<MachineFunction*>(this)->getInfo<Ty>();
	}

	/// getBlockNumbered - Return the MachineBasicBlock that was assigned number
	/// N when it was inserted into this function. This is the inverse of
	/// MBB::getNumber. N must be a valid block number whose block has not been
	/// removed from the function.
	MachineBasicBlock *getBlockNumbered(unsigned N) const {
	assert(N < MBBNumbering.size() && "Illegal block number");
	MachineBasicBlock *MBB = MBBNumbering[N];
	assert(MBB && "Block was removed from the machine function!");
	return MBB;
	}

	/// Should we be emitting segmented stack stuff for the function
	bool shouldSplitStack() const;

	/// getNumBlockIDs - Return the number of MBB ID's allocated.
	unsigned getNumBlockIDs() const { return (unsigned)MBBNumbering.size(); }

	/// RenumberBlocks - This discards all of the MachineBasicBlock numbers and
	/// recomputes them. This guarantees that the MBB numbers are sequential,
	/// dense, and match the ordering of the blocks within the function. If a
	/// specific MachineBasicBlock is specified, only that block and those after
	/// it are renumbered.
	void RenumberBlocks(MachineBasicBlock *MBBFrom = nullptr);

	/// print - Print out the MachineFunction in a format suitable for debugging
	/// to the specified stream.
	void print(raw_ostream &OS, const SlotIndexes* = nullptr) const;

	/// viewCFG - This function is meant for use from the debugger. You can just
	/// say 'call F->viewCFG()' and a ghostview window should pop up from the
	/// program, displaying the CFG of the current function with the code for each
	/// basic block inside. This depends on there being a 'dot' and 'gv' program
	/// in your path.
	void viewCFG() const;

	/// viewCFGOnly - This function is meant for use from the debugger. It works
	/// just like viewCFG, but it does not include the contents of basic blocks
	/// into the nodes, just the label. If you are only interested in the CFG
	/// this can make the graph smaller.
	///
	void viewCFGOnly() const;

	/// dump - Print the current MachineFunction to cerr, useful for debugger use.
	void dump() const;

	/// Run the current MachineFunction through the machine code verifier, useful
	/// for debugger use.
	/// \returns true if no problems were found.
	bool verify(Pass p = nullptr, const char Banner = nullptr,
	bool AbortOnError = true) const;

	// Provide accessors for the MachineBasicBlock list...
	using iterator = BasicBlockListType::iterator;
	using const_iterator = BasicBlockListType::const_iterator;
	using const_reverse_iterator = BasicBlockListType::const_reverse_iterator;
	using reverse_iterator = BasicBlockListType::reverse_iterator;

	/// Support for MachineBasicBlock::getNextNode().
	static BasicBlockListType MachineFunction::*
	getSublistAccess(MachineBasicBlock *) {
	return &MachineFunction::BasicBlocks;
	}

	/// addLiveIn - Add the specified physical register as a live-in value and
	/// create a corresponding virtual register for it.
	unsigned addLiveIn(unsigned PReg, const TargetRegisterClass *RC);

	//===--------------------------------------------------------------------===//
	// BasicBlock accessor functions.
	//
	iterator begin() { return BasicBlocks.begin(); }
	const_iterator begin() const { return BasicBlocks.begin(); }
	iterator end () { return BasicBlocks.end(); }
	const_iterator end () const { return BasicBlocks.end(); }

	reverse_iterator rbegin() { return BasicBlocks.rbegin(); }
	const_reverse_iterator rbegin() const { return BasicBlocks.rbegin(); }
	reverse_iterator rend () { return BasicBlocks.rend(); }
	const_reverse_iterator rend () const { return BasicBlocks.rend(); }

	unsigned size() const { return (unsigned)BasicBlocks.size();}
	bool empty() const { return BasicBlocks.empty(); }
	const MachineBasicBlock &front() const { return BasicBlocks.front(); }
	MachineBasicBlock &front() { return BasicBlocks.front(); }
	const MachineBasicBlock & back() const { return BasicBlocks.back(); }
	MachineBasicBlock & back() { return BasicBlocks.back(); }

	void push_back (MachineBasicBlock *MBB) { BasicBlocks.push_back (MBB); }
	void push_front(MachineBasicBlock *MBB) { BasicBlocks.push_front(MBB); }
	void insert(iterator MBBI, MachineBasicBlock *MBB) {
	BasicBlocks.insert(MBBI, MBB);
	}
	void splice(iterator InsertPt, iterator MBBI) {
	BasicBlocks.splice(InsertPt, BasicBlocks, MBBI);
	}
	void splice(iterator InsertPt, MachineBasicBlock *MBB) {
	BasicBlocks.splice(InsertPt, BasicBlocks, MBB);
	}
	void splice(iterator InsertPt, iterator MBBI, iterator MBBE) {
	BasicBlocks.splice(InsertPt, BasicBlocks, MBBI, MBBE);
	}

	void remove(iterator MBBI) { BasicBlocks.remove(MBBI); }
	void remove(MachineBasicBlock *MBBI) { BasicBlocks.remove(MBBI); }
	void erase(iterator MBBI) { BasicBlocks.erase(MBBI); }
	void erase(MachineBasicBlock *MBBI) { BasicBlocks.erase(MBBI); }

	template <typename Comp>
	void sort(Comp comp) {
	BasicBlocks.sort(comp);
	}

	//===--------------------------------------------------------------------===//
	// Internal functions used to automatically number MachineBasicBlocks

	/// \brief Appends the MBB to the internal numbering vector and returns the
	/// number assigned to it, which is its index in that vector.
	unsigned addToMBBNumbering(MachineBasicBlock *MBB) {
	// The new block's number equals the vector size before the insertion.
	const unsigned Number = (unsigned)MBBNumbering.size();
	MBBNumbering.push_back(MBB);
	return Number;
	}

	/// removeFromMBBNumbering - Remove the specific machine basic block from our
	/// tracker, this is only really to be used by the MachineBasicBlock
	/// implementation.
	void removeFromMBBNumbering(unsigned N) {
	assert(N < MBBNumbering.size() && "Illegal basic block #");
	MBBNumbering[N] = nullptr;
	}

	/// CreateMachineInstr - Allocate a new MachineInstr. Use this instead
	/// of `new MachineInstr'.
	MachineInstr *CreateMachineInstr(const MCInstrDesc &MCID, const DebugLoc &DL,
	bool NoImp = false);

	/// CloneMachineInstr - Create a new MachineInstr which is a copy of the
	/// 'Orig' instruction, identical in all ways except the instruction
	/// has no parent, prev, or next.
	///
	/// See also TargetInstrInfo::duplicate() for target-specific fixes to cloned
	/// instructions.
	MachineInstr *CloneMachineInstr(const MachineInstr *Orig);

	/// DeleteMachineInstr - Delete the given MachineInstr.
	void DeleteMachineInstr(MachineInstr *MI);

	/// CreateMachineBasicBlock - Allocate a new MachineBasicBlock. Use this
	/// instead of `new MachineBasicBlock'.
	MachineBasicBlock *CreateMachineBasicBlock(const BasicBlock *bb = nullptr);

	/// DeleteMachineBasicBlock - Delete the given MachineBasicBlock.
	void DeleteMachineBasicBlock(MachineBasicBlock *MBB);

	/// getMachineMemOperand - Allocate a new MachineMemOperand.
	/// MachineMemOperands are owned by the MachineFunction and need not be
	/// explicitly deallocated.
	MachineMemOperand *getMachineMemOperand(
	MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
	unsigned base_alignment, const AAMDNodes &AAInfo = AAMDNodes(),
	const MDNode *Ranges = nullptr,
	SyncScope::ID SSID = SyncScope::System,
	AtomicOrdering Ordering = AtomicOrdering::NotAtomic,
	AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic);

	/// getMachineMemOperand - Allocate a new MachineMemOperand by copying
	/// an existing one, adjusting by an offset and using the given size.
	/// MachineMemOperands are owned by the MachineFunction and need not be
	/// explicitly deallocated.
	MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
	int64_t Offset, uint64_t Size);

	+ /// Allocate a new MachineMemOperand by copying an existing one,
	+ /// replacing only AliasAnalysis information. MachineMemOperands are owned
	+ /// by the MachineFunction and need not be explicitly deallocated.
	+ MachineMemOperand *getMachineMemOperand(const MachineMemOperand *MMO,
	+ const AAMDNodes &AAInfo);
	+
	using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity;

	/// Allocate an array of MachineOperands. This is only intended for use by
	/// internal MachineInstr functions.
	MachineOperand *allocateOperandArray(OperandCapacity Cap) {
	return OperandRecycler.allocate(Cap, Allocator);
	}

	/// Deallocate an array of MachineOperands and recycle the memory. This is
	/// only intended for use by internal MachineInstr functions.
	/// Cap must be the same capacity that was used to allocate the array.
	void deallocateOperandArray(OperandCapacity Cap, MachineOperand *Array) {
	OperandRecycler.deallocate(Cap, Array);
	}

	/// \brief Allocate a register mask with room for @p NumRegister bits and
	/// set every word of it to zero.
	uint32_t *allocateRegisterMask(unsigned NumRegister) {
	// One 32-bit word per 32 registers, rounded up.
	const unsigned NumWords = (NumRegister + 31) / 32;
	uint32_t *Words = Allocator.Allocate<uint32_t>(NumWords);
	for (uint32_t *W = Words, *E = Words + NumWords; W != E; ++W)
	*W = 0;
	return Words;
	}

	/// allocateMemRefsArray - Allocate an array to hold MachineMemOperand
	/// pointers. This array is owned by the MachineFunction.
	MachineInstr::mmo_iterator allocateMemRefsArray(unsigned long Num);

	/// extractLoadMemRefs - Allocate an array and populate it with just the
	/// load information from the given MachineMemOperand sequence.
	std::pair<MachineInstr::mmo_iterator,
	MachineInstr::mmo_iterator>
	extractLoadMemRefs(MachineInstr::mmo_iterator Begin,
	MachineInstr::mmo_iterator End);

	/// extractStoreMemRefs - Allocate an array and populate it with just the
	/// store information from the given MachineMemOperand sequence.
	std::pair<MachineInstr::mmo_iterator,
	MachineInstr::mmo_iterator>
	extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
	MachineInstr::mmo_iterator End);

	/// Allocate a string and populate it with the given external symbol name.
	const char *createExternalSymbolName(StringRef Name);

	//===--------------------------------------------------------------------===//
	// Label Manipulation.

	/// getJTISymbol - Return the MCSymbol for the specified non-empty jump table.
	/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a
	/// normal 'L' label is returned.
	MCSymbol *getJTISymbol(unsigned JTI, MCContext &Ctx,
	bool isLinkerPrivate = false) const;

	/// getPICBaseSymbol - Return a function-local symbol to represent the PIC
	/// base.
	MCSymbol *getPICBaseSymbol() const;

	/// Returns a reference to a list of cfi instructions in the function's
	/// prologue. Used to construct frame maps for debug and exception handling
	/// consumers.
	const std::vector<MCCFIInstruction> &getFrameInstructions() const {
	return FrameInstructions;
	}

	/// Append \p Inst to the list of frame instructions and return its index
	/// in that list.
	LLVM_NODISCARD unsigned addFrameInst(const MCCFIInstruction &Inst) {
	FrameInstructions.push_back(Inst);
	return FrameInstructions.size() - 1;
	}

	/// \name Exception Handling
	/// \{

	bool callsEHReturn() const { return CallsEHReturn; }
	void setCallsEHReturn(bool b) { CallsEHReturn = b; }

	bool callsUnwindInit() const { return CallsUnwindInit; }
	void setCallsUnwindInit(bool b) { CallsUnwindInit = b; }

	bool hasEHFunclets() const { return HasEHFunclets; }
	void setHasEHFunclets(bool V) { HasEHFunclets = V; }

	/// Find or create a LandingPadInfo for the specified MachineBasicBlock.
	LandingPadInfo &getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad);

	/// Remap landing pad labels and remove any deleted landing pads.
	void tidyLandingPads(DenseMap<MCSymbol, uintptr_t> LPMap = nullptr);

	/// Return a reference to the landing pad info for the current function.
	const std::vector<LandingPadInfo> &getLandingPads() const {
	return LandingPads;
	}

	/// Provide the begin and end labels of an invoke style call and associate it
	/// with a try landing pad block.
	void addInvoke(MachineBasicBlock *LandingPad,
	MCSymbol BeginLabel, MCSymbol EndLabel);

	/// Add a new landing pad. Returns the label ID for the landing pad entry.
	MCSymbol *addLandingPad(MachineBasicBlock *LandingPad);

	/// Provide the catch typeinfo for a landing pad.
	void addCatchTypeInfo(MachineBasicBlock *LandingPad,
	ArrayRef<const GlobalValue *> TyInfo);

	/// Provide the filter typeinfo for a landing pad.
	void addFilterTypeInfo(MachineBasicBlock *LandingPad,
	ArrayRef<const GlobalValue *> TyInfo);

	/// Add a cleanup action for a landing pad.
	void addCleanup(MachineBasicBlock *LandingPad);

	void addSEHCatchHandler(MachineBasicBlock LandingPad, const Function Filter,
	const BlockAddress *RecoverLabel);

	void addSEHCleanupHandler(MachineBasicBlock *LandingPad,
	const Function *Cleanup);

	/// Return the type id for the specified typeinfo. This is function wide.
	unsigned getTypeIDFor(const GlobalValue *TI);

	/// Return the id of the filter encoded by TyIds. This is function wide.
	int getFilterIDFor(std::vector<unsigned> &TyIds);

	/// Map the landing pad's EH symbol to the call site indexes.
	void setCallSiteLandingPad(MCSymbol *Sym, ArrayRef<unsigned> Sites);

	/// Get the call site indexes for a landing pad EH symbol.
	SmallVectorImpl<unsigned> &getCallSiteLandingPad(MCSymbol *Sym) {
	assert(hasCallSiteLandingPad(Sym) &&
	"missing call site number for landing pad!");
	return LPadToCallSiteMap[Sym];
	}

	/// Return true if the landing pad EH symbol has an associated call site.
	/// The map is searched with find() rather than operator[] so that asking
	/// about an unknown symbol does not add an empty entry for it.
	bool hasCallSiteLandingPad(MCSymbol *Sym) {
	auto It = LPadToCallSiteMap.find(Sym);
	return It != LPadToCallSiteMap.end() && !It->second.empty();
	}

	/// Map the begin label for a call site.
	void setCallSiteBeginLabel(MCSymbol *BeginLabel, unsigned Site) {
	CallSiteMap[BeginLabel] = Site;
	}

	/// Get the call site number for a begin label.
	unsigned getCallSiteBeginLabel(MCSymbol *BeginLabel) const {
	assert(hasCallSiteBeginLabel(BeginLabel) &&
	"Missing call site number for EH_LABEL!");
	return CallSiteMap.lookup(BeginLabel);
	}

	/// Return true if the begin label has a call site number associated with it.
	bool hasCallSiteBeginLabel(MCSymbol *BeginLabel) const {
	return CallSiteMap.count(BeginLabel);
	}

	/// Return a reference to the C++ typeinfo for the current function.
	const std::vector<const GlobalValue *> &getTypeInfos() const {
	return TypeInfos;
	}

	/// Return a reference to the typeids encoding filters used in the current
	/// function.
	const std::vector<unsigned> &getFilterIds() const {
	return FilterIds;
	}

	/// \}

	/// Collect information used to emit debugging information of a variable.
	/// The four values are stored as a new VariableDbgInfo entry at the end of
	/// VariableDbgInfos.
	void setVariableDbgInfo(const DILocalVariable *Var, const DIExpression *Expr,
	unsigned Slot, const DILocation *Loc) {
	VariableDbgInfos.emplace_back(Var, Expr, Slot, Loc);
	}

	VariableDbgInfoMapTy &getVariableDbgInfo() { return VariableDbgInfos; }
	const VariableDbgInfoMapTy &getVariableDbgInfo() const {
	return VariableDbgInfos;
	}
	};

	/// \name Exception Handling
	/// \{

	/// Extract the exception handling information from the landingpad instruction
	/// and add them to the specified machine module info.
	void addLandingPadInfo(const LandingPadInst &I, MachineBasicBlock &MBB);

	/// \}

	//===--------------------------------------------------------------------===//
	// GraphTraits specializations for function basic block graphs (CFGs)
	//===--------------------------------------------------------------------===//

	// Provide specializations of GraphTraits to be able to treat a
	// machine function as a graph of machine basic blocks... these are
	// the same as the machine basic block iterators, except that the root
	// node is implicitly the first node of the function.
	//
	template <> struct GraphTraits<MachineFunction*> :
	public GraphTraits<MachineBasicBlock*> {
	// nodes_iterator/begin/end - Allow iteration over all nodes in the graph
	using nodes_iterator = pointer_iterator<MachineFunction::iterator>;

	// The entry node is the first block of the function.
	static NodeRef getEntryNode(MachineFunction *F) {
	MachineBasicBlock &Entry = F->front();
	return &Entry;
	}

	static nodes_iterator nodes_begin(MachineFunction *F) {
	nodes_iterator First(F->begin());
	return First;
	}

	static nodes_iterator nodes_end(MachineFunction *F) {
	nodes_iterator Last(F->end());
	return Last;
	}

	// Number of nodes in the graph, i.e. blocks in the function.
	static unsigned size (MachineFunction *F) {
	return F->size();
	}
	};
	template <> struct GraphTraits<const MachineFunction*> :
	public GraphTraits<const MachineBasicBlock*> {
	// nodes_iterator/begin/end - Allow iteration over all nodes in the graph
	using nodes_iterator = pointer_iterator<MachineFunction::const_iterator>;

	// The entry node is the first block of the function.
	static NodeRef getEntryNode(const MachineFunction *F) {
	const MachineBasicBlock &Entry = F->front();
	return &Entry;
	}

	static nodes_iterator nodes_begin(const MachineFunction *F) {
	nodes_iterator First(F->begin());
	return First;
	}

	static nodes_iterator nodes_end (const MachineFunction *F) {
	nodes_iterator Last(F->end());
	return Last;
	}

	// Number of nodes in the graph, i.e. blocks in the function.
	static unsigned size (const MachineFunction *F) { return F->size(); }
	};

	// Provide specializations of GraphTraits to be able to treat a function as a
	// graph of basic blocks... and to walk it in inverse order. Inverse order for
	// a function is considered to be when traversing the predecessor edges of a BB
	// instead of the successor edges.
	//
	template <> struct GraphTraits<Inverse<MachineFunction*>> :
	public GraphTraits<Inverse<MachineBasicBlock*>> {
	// The entry node is the first block of the wrapped function.
	static NodeRef getEntryNode(Inverse<MachineFunction *> G) {
	MachineFunction *F = G.Graph;
	return &F->front();
	}
	};
	template <> struct GraphTraits<Inverse<const MachineFunction*>> :
	public GraphTraits<Inverse<const MachineBasicBlock*>> {
	// The entry node is the first block of the wrapped function.
	static NodeRef getEntryNode(Inverse<const MachineFunction *> G) {
	const MachineFunction *F = G.Graph;
	return &F->front();
	}
	};

	} // end namespace llvm

	#endif // LLVM_CODEGEN_MACHINEFUNCTION_H
	Index: head/contrib/llvm/include/llvm/CodeGen/MachineInstr.h
	===================================================================
	--- head/contrib/llvm/include/llvm/CodeGen/MachineInstr.h (revision 322319)
	+++ head/contrib/llvm/include/llvm/CodeGen/MachineInstr.h (revision 322320)
	@@ -1,1346 +1,1349 @@
	//===- llvm/CodeGen/MachineInstr.h - MachineInstr class ---------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the declaration of the MachineInstr class, which is the
	// basic representation for all target dependent machine instructions used by
	// the back end.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CODEGEN_MACHINEINSTR_H
	#define LLVM_CODEGEN_MACHINEINSTR_H

	#include "llvm/ADT/DenseMapInfo.h"
	#include "llvm/ADT/ilist.h"
	#include "llvm/ADT/ilist_node.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/Support/ArrayRecycler.h"
	#include "llvm/Target/TargetOpcodes.h"
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <utility>

	namespace llvm {

	template <typename T> class ArrayRef;
	class DIExpression;
	class DILocalVariable;
	class MachineBasicBlock;
	class MachineFunction;
	class MachineMemOperand;
	class MachineRegisterInfo;
	class ModuleSlotTracker;
	class raw_ostream;
	template <typename T> class SmallVectorImpl;
	class StringRef;
	class TargetInstrInfo;
	class TargetRegisterClass;
	class TargetRegisterInfo;

	//===----------------------------------------------------------------------===//
	/// Representation of each machine instruction.
	///
	/// This class isn't a POD type, but it must have a trivial destructor. When a
	/// MachineFunction is deleted, all the contained MachineInstrs are deallocated
	/// without having their destructor called.
	///
	class MachineInstr
	: public ilist_node_with_parent<MachineInstr, MachineBasicBlock,
	ilist_sentinel_tracking<true>> {
	public:
	using mmo_iterator = MachineMemOperand **;

	/// Flags to specify different kinds of comments to output in
	/// assembly code. These flags carry semantic information not
	/// otherwise easily derivable from the IR text.
	///
	enum CommentFlag {
	ReloadReuse = 0x1 // higher bits are reserved for target dep comments.
	};

	enum MIFlag {
	NoFlags = 0,
	FrameSetup = 1 << 0, // Instruction is used as a part of
	// function frame setup code.
	FrameDestroy = 1 << 1, // Instruction is used as a part of
	// function frame destruction code.
	BundledPred = 1 << 2, // Instruction has bundled predecessors.
	BundledSucc = 1 << 3 // Instruction has bundled successors.
	};

	private:
	const MCInstrDesc *MCID; // Instruction descriptor.
	MachineBasicBlock *Parent = nullptr; // Pointer to the owning basic block.

	// Operands are allocated by an ArrayRecycler.
	MachineOperand *Operands = nullptr; // Pointer to the first operand.
	unsigned NumOperands = 0; // Number of operands on instruction.
	using OperandCapacity = ArrayRecycler<MachineOperand>::Capacity;
	OperandCapacity CapOperands; // Capacity of the Operands array.

	uint8_t Flags = 0; // Various bits of additional
	// information about machine
	// instruction.

	uint8_t AsmPrinterFlags = 0; // Various bits of information used by
	// the AsmPrinter to emit helpful
	// comments. This is not semantic
	// information. Do not use this for
	// anything other than to convey comment
	// information to AsmPrinter.

	uint8_t NumMemRefs = 0; // Information on memory references.
	// Note that MemRefs == nullptr, means 'don't know', not 'no memory access'.
	// Calling code must treat missing information conservatively. If the number
	// of memory operands required to be precise exceeds the maximum value of
	// NumMemRefs - currently 256 - we remove the operands entirely. Note also
	// that this is a non-owning reference to a shared copy on write buffer owned
	// by the MachineFunction and created via MF.allocateMemRefsArray.
	mmo_iterator MemRefs = nullptr;

	DebugLoc debugLoc; // Source line information.

	// Intrusive list support
	friend struct ilist_traits<MachineInstr>;
	friend struct ilist_callback_traits<MachineBasicBlock>;
	void setParent(MachineBasicBlock *P) { Parent = P; }

	/// This constructor creates a copy of the given
	/// MachineInstr in the given MachineFunction.
	MachineInstr(MachineFunction &, const MachineInstr &);

	/// This constructor creates a MachineInstr and adds the implicit operands.
	/// It reserves space for the number of operands specified by
	/// MCInstrDesc. An explicit DebugLoc is supplied.
	MachineInstr(MachineFunction &, const MCInstrDesc &MCID, DebugLoc dl,
	bool NoImp = false);

	// MachineInstrs are pool-allocated and owned by MachineFunction.
	friend class MachineFunction;

	public:
	MachineInstr(const MachineInstr &) = delete;
	MachineInstr &operator=(const MachineInstr &) = delete;
	// Use MachineFunction::DeleteMachineInstr() instead.
	~MachineInstr() = delete;

	const MachineBasicBlock* getParent() const { return Parent; }
	MachineBasicBlock* getParent() { return Parent; }

	/// Return the asm printer flags bitvector.
	uint8_t getAsmPrinterFlags() const { return AsmPrinterFlags; }

	/// Clear the AsmPrinter bitvector.
	void clearAsmPrinterFlags() { AsmPrinterFlags = 0; }

	/// Return whether an AsmPrinter flag is set.
	bool getAsmPrinterFlag(CommentFlag Flag) const {
	return AsmPrinterFlags & Flag;
	}

	/// Set a flag for the AsmPrinter by OR-ing \p Flag into AsmPrinterFlags.
	void setAsmPrinterFlag(uint8_t Flag) {
	AsmPrinterFlags |= Flag;
	}

	/// Clear specific AsmPrinter flags.
	void clearAsmPrinterFlag(CommentFlag Flag) {
	AsmPrinterFlags &= ~Flag;
	}

	/// Return the MI flags bitvector.
	uint8_t getFlags() const {
	return Flags;
	}

	/// Return whether an MI flag is set.
	bool getFlag(MIFlag Flag) const {
	return Flags & Flag;
	}

	/// Set a MI flag by OR-ing \p Flag into Flags.
	void setFlag(MIFlag Flag) {
	Flags |= (uint8_t)Flag;
	}

	/// Replace the MI flags with \p flags, except for the BundledPred and
	/// BundledSucc bits, which keep their current values.
	void setFlags(unsigned flags) {
	// Filter out the automatically maintained flags.
	unsigned Mask = BundledPred | BundledSucc;
	Flags = (Flags & Mask) | (flags & ~Mask);
	}

	/// clearFlag - Clear a MI flag.
	void clearFlag(MIFlag Flag) {
	Flags &= ~((uint8_t)Flag);
	}

	/// Return true if MI is in a bundle (but not the first MI in a bundle).
	///
	/// A bundle looks like this before it's finalized:
	///   ----------------
	///   |      MI      |
	///   ----------------
	///          |
	///   ----------------
	///   |      MI    * |
	///   ----------------
	///          |
	///   ----------------
	///   |      MI    * |
	///   ----------------
	/// In this case, the first MI starts a bundle but is not inside a bundle, the
	/// next 2 MIs are considered "inside" the bundle.
	///
	/// After a bundle is finalized, it looks like this:
	///   ----------------
	///   |    Bundle    |
	///   ----------------
	///          |
	///   ----------------
	///   |      MI    * |
	///   ----------------
	///          |
	///   ----------------
	///   |      MI    * |
	///   ----------------
	///          |
	///   ----------------
	///   |      MI    * |
	///   ----------------
	/// The first instruction has the special opcode "BUNDLE". It's not "inside"
	/// a bundle, but the next three MIs are.
	bool isInsideBundle() const {
	return getFlag(BundledPred);
	}

	/// Return true if this instruction is part of a bundle. This is true
	/// if either itself or its following instruction is marked "InsideBundle".
	bool isBundled() const {
	  // Being tied to a predecessor already settles the question.
	  if (isBundledWithPred())
	    return true;
	  return isBundledWithSucc();
	}

	/// Return true if this instruction is part of a bundle, and it is not the
	/// first instruction in the bundle.
	bool isBundledWithPred() const { return getFlag(BundledPred); }

	/// Return true if this instruction is part of a bundle, and it is not the
	/// last instruction in the bundle.
	bool isBundledWithSucc() const { return getFlag(BundledSucc); }

	/// Bundle this instruction with its predecessor. This can be an unbundled
	/// instruction, or it can be the first instruction in a bundle.
	void bundleWithPred();

	/// Bundle this instruction with its successor. This can be an unbundled
	/// instruction, or it can be the last instruction in a bundle.
	void bundleWithSucc();

	/// Break bundle above this instruction.
	void unbundleFromPred();

	/// Break bundle below this instruction.
	void unbundleFromSucc();

	/// Returns the debug location id of this MachineInstr.
	const DebugLoc &getDebugLoc() const {
	  // Returned by reference; no copy of the stored location is made.
	  return debugLoc;
	}

	/// Return the debug variable referenced by
	/// this DBG_VALUE instruction.
	const DILocalVariable *getDebugVariable() const;

	/// Return the complex address expression referenced by
	/// this DBG_VALUE instruction.
	const DIExpression *getDebugExpression() const;

	/// Emit an error referring to the source location of this instruction.
	/// This should only be used for inline assembly that is somehow
	/// impossible to compile. Other errors should have been handled much
	/// earlier.
	///
	/// If this method returns, the caller should try to recover from the error.
	void emitError(StringRef Msg) const;

	/// Returns the target instruction descriptor of this MachineInstr.
	const MCInstrDesc &getDesc() const {
	  // MCID is dereferenced unconditionally.
	  return *MCID;
	}

	/// Returns the opcode of this MachineInstr.
	unsigned getOpcode() const { return MCID->Opcode; }

	/// Access to explicit operands of the instruction.
	unsigned getNumOperands() const {
	  // Size of the range [operands_begin(), operands_end()).
	  return NumOperands;
	}

	/// Bounds-asserted read-only access to operand \p i.
	const MachineOperand &getOperand(unsigned i) const {
	  assert(i < getNumOperands() && "getOperand() out of range!");
	  return *(Operands + i);
	}
	/// Bounds-asserted mutable access to operand \p i.
	MachineOperand &getOperand(unsigned i) {
	  assert(i < getNumOperands() && "getOperand() out of range!");
	  return *(Operands + i);
	}

	/// Returns the number of non-implicit operands.
	unsigned getNumExplicitOperands() const;

	/// iterator/begin/end - Iterate over all operands of a machine instruction.
	using mop_iterator = MachineOperand *;
	using const_mop_iterator = const MachineOperand *;

	mop_iterator operands_begin() {
	  return Operands;
	}
	mop_iterator operands_end() {
	  // One past the last operand.
	  return operands_begin() + NumOperands;
	}

	const_mop_iterator operands_begin() const {
	  return Operands;
	}
	const_mop_iterator operands_end() const {
	  // One past the last operand.
	  return operands_begin() + NumOperands;
	}

	/// Range over every operand, in operand order.
	iterator_range<mop_iterator> operands() {
	  mop_iterator First = operands_begin();
	  mop_iterator Last = operands_end();
	  return make_range(First, Last);
	}
	/// Read-only range over every operand, in operand order.
	iterator_range<const_mop_iterator> operands() const {
	  const_mop_iterator First = operands_begin();
	  const_mop_iterator Last = operands_end();
	  return make_range(First, Last);
	}
	/// Range over the first getNumExplicitOperands() operands.
	iterator_range<mop_iterator> explicit_operands() {
	  mop_iterator First = operands_begin();
	  mop_iterator Last = First + getNumExplicitOperands();
	  return make_range(First, Last);
	}
	/// Read-only range over the first getNumExplicitOperands() operands.
	iterator_range<const_mop_iterator> explicit_operands() const {
	  const_mop_iterator First = operands_begin();
	  const_mop_iterator Last = First + getNumExplicitOperands();
	  return make_range(First, Last);
	}
	/// Range over the operands that follow the explicit ones.
	iterator_range<mop_iterator> implicit_operands() {
	  mop_iterator First = explicit_operands().end();
	  mop_iterator Last = operands_end();
	  return make_range(First, Last);
	}
	/// Read-only range over the operands that follow the explicit ones.
	iterator_range<const_mop_iterator> implicit_operands() const {
	  const_mop_iterator First = explicit_operands().end();
	  const_mop_iterator Last = operands_end();
	  return make_range(First, Last);
	}
	/// Returns a range over all explicit operands that are register definitions.
	/// Implicit definition are not included!
	iterator_range<mop_iterator> defs() {
	  // The descriptor's getNumDefs() leading operands form the range.
	  mop_iterator First = operands_begin();
	  mop_iterator Last = First + getDesc().getNumDefs();
	  return make_range(First, Last);
	}
	/// \copydoc defs()
	iterator_range<const_mop_iterator> defs() const {
	  const_mop_iterator First = operands_begin();
	  const_mop_iterator Last = First + getDesc().getNumDefs();
	  return make_range(First, Last);
	}
	/// Returns a range that includes all operands that are register uses.
	/// This may include unrelated operands which are not register uses.
	iterator_range<mop_iterator> uses() {
	  // Everything after the descriptor's getNumDefs() leading operands.
	  mop_iterator First = operands_begin() + getDesc().getNumDefs();
	  mop_iterator Last = operands_end();
	  return make_range(First, Last);
	}
	/// \copydoc uses()
	iterator_range<const_mop_iterator> uses() const {
	  const_mop_iterator First = operands_begin() + getDesc().getNumDefs();
	  const_mop_iterator Last = operands_end();
	  return make_range(First, Last);
	}
	/// Range from the end of the descriptor's defs up to the end of the
	/// explicit operands.
	iterator_range<mop_iterator> explicit_uses() {
	  mop_iterator First = operands_begin() + getDesc().getNumDefs();
	  mop_iterator Last = operands_begin() + getNumExplicitOperands();
	  return make_range(First, Last);
	}
	/// Read-only variant of explicit_uses().
	iterator_range<const_mop_iterator> explicit_uses() const {
	  const_mop_iterator First = operands_begin() + getDesc().getNumDefs();
	  const_mop_iterator Last = operands_begin() + getNumExplicitOperands();
	  return make_range(First, Last);
	}

	/// Returns the number of the operand iterator \p I points to.
	unsigned getOperandNo(const_mop_iterator I) const {
	  // Distance of I from the first operand, narrowed to unsigned.
	  const auto Distance = I - operands_begin();
	  return Distance;
	}

	/// Access to memory operands of the instruction
	mmo_iterator memoperands_begin() const {
	  return MemRefs;
	}
	mmo_iterator memoperands_end() const {
	  // One past the last memory operand.
	  return memoperands_begin() + NumMemRefs;
	}
	/// Return true if we don't have any memory operands which describe the
	/// memory access done by this instruction. If this is true, calling code
	/// must be conservative.
	bool memoperands_empty() const { return NumMemRefs == 0; }

	/// Range over the attached memory operands.
	iterator_range<mmo_iterator> memoperands() {
	  mmo_iterator First = memoperands_begin();
	  mmo_iterator Last = memoperands_end();
	  return make_range(First, Last);
	}
	/// Const-callable variant; it yields the same iterator type.
	iterator_range<mmo_iterator> memoperands() const {
	  mmo_iterator First = memoperands_begin();
	  mmo_iterator Last = memoperands_end();
	  return make_range(First, Last);
	}

	/// Return true if this instruction has exactly one MachineMemOperand.
	bool hasOneMemOperand() const { return 1 == NumMemRefs; } // exactly one, not "at least one"

	+ /// Return the number of memory operands (the NumMemRefs count).
	+ unsigned getNumMemOperands() const { return NumMemRefs; }
	+
	/// API for querying MachineInstr properties. They are the same as MCInstrDesc
	/// queries but they are bundle aware.

	/// Selects how a property query treats instruction bundles; see
	/// hasProperty().
	enum QueryType {
	IgnoreBundle, // Ignore bundles
	AnyInBundle, // Return true if any instruction in bundle has property
	AllInBundle // Return true if all instructions in bundle have property
	};

	/// Return true if the instruction (or in the case of a bundle,
	/// the instructions inside the bundle) has the specified property.
	/// The first argument is the property being queried.
	/// The second argument indicates whether the query should look inside
	/// instruction bundles.
	bool hasProperty(unsigned MCFlag, QueryType Type = AnyInBundle) const {
	  // MCFlag is a bit index; turn it into a single-bit mask once.
	  const auto Mask = 1ULL << MCFlag;

	  // Slow path: only for the first instruction of a bundle, and only when
	  // the caller asked for a bundle-aware answer.
	  if (Type != IgnoreBundle && isBundled() && !isBundledWithPred())
	    return hasPropertyInBundle(Mask, Type);

	  // Fast path: unbundled or bundle-internal instruction; consult only our
	  // own descriptor.
	  return getDesc().getFlags() & Mask;
	}

	/// Return true if this instruction can have a variable number of operands.
	/// In this case, the variable operands will be after the normal
	/// operands but before the implicit definitions and uses (if any are
	/// present).
	bool isVariadic(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::Variadic;
	  return hasProperty(Property, Type);
	}

	/// Set if this instruction has an optional definition, e.g.
	/// ARM instructions which can set condition code if 's' bit is set.
	bool hasOptionalDef(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::HasOptionalDef;
	  return hasProperty(Property, Type);
	}

	/// Return true if this is a pseudo instruction that doesn't
	/// correspond to a real machine instruction.
	bool isPseudo(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::Pseudo;
	  return hasProperty(Property, Type);
	}

	bool isReturn(QueryType Type = AnyInBundle) const {
	  // Tests the MCID::Return descriptor flag, honoring \p Type for bundles.
	  const bool Result = hasProperty(MCID::Return, Type);
	  return Result;
	}

	bool isCall(QueryType Type = AnyInBundle) const {
	  // Tests the MCID::Call descriptor flag, honoring \p Type for bundles.
	  const bool Result = hasProperty(MCID::Call, Type);
	  return Result;
	}

	/// Returns true if the specified instruction stops control flow
	/// from executing the instruction immediately following it. Examples include
	/// unconditional branches and return instructions.
	bool isBarrier(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::Barrier;
	  return hasProperty(Property, Type);
	}

	/// Returns true if this instruction is part of the terminator for a basic block.
	/// Typically this is things like return and branch instructions.
	///
	/// Various passes use this to insert code into the bottom of a basic block,
	/// but before control flow occurs.
	bool isTerminator(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::Terminator;
	  return hasProperty(Property, Type);
	}

	/// Returns true if this is a conditional, unconditional, or indirect branch.
	/// Predicates below can be used to discriminate between
	/// these cases, and the TargetInstrInfo::AnalyzeBranch method can be used to
	/// get more information.
	bool isBranch(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::Branch;
	  return hasProperty(Property, Type);
	}

	/// Return true if this is an indirect branch, such as a
	/// branch through a register.
	bool isIndirectBranch(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::IndirectBranch;
	  return hasProperty(Property, Type);
	}

	/// Return true if this is a branch which may fall
	/// through to the next instruction or may transfer control flow to some other
	/// block. The TargetInstrInfo::AnalyzeBranch method can be used to get more
	/// information about this branch.
	bool isConditionalBranch(QueryType Type = AnyInBundle) const {
	  // All three queries are evaluated unconditionally (no short-circuit).
	  const bool Branch = isBranch(Type);
	  const bool Barrier = isBarrier(Type);
	  const bool Indirect = isIndirectBranch(Type);
	  // A branch that is neither a barrier nor indirect.
	  return Branch && !Barrier && !Indirect;
	}

	/// Return true if this is a branch which always
	/// transfers control flow to some other block. The
	/// TargetInstrInfo::AnalyzeBranch method can be used to get more information
	/// about this branch.
	bool isUnconditionalBranch(QueryType Type = AnyInBundle) const {
	  // All three queries are evaluated unconditionally (no short-circuit).
	  const bool Branch = isBranch(Type);
	  const bool Barrier = isBarrier(Type);
	  const bool Indirect = isIndirectBranch(Type);
	  // A branch that is also a barrier, but not indirect.
	  return Branch && Barrier && !Indirect;
	}

	/// Return true if this instruction has a predicate operand that
	/// controls execution. It may be set to 'always', or may be set to other
	/// values. There are various methods in TargetInstrInfo that can be used to
	/// control and modify the predicate in this instruction.
	bool isPredicable(QueryType Type = AllInBundle) const {
	  // With the default AllInBundle mode, a bundle is predicable only when
	  // every bundled instruction is.
	  const unsigned Property = MCID::Predicable;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction is a comparison.
	bool isCompare(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::Compare;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction is a move immediate
	/// (including conditional moves) instruction.
	bool isMoveImmediate(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::MoveImm;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction is a bitcast instruction.
	bool isBitcast(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::Bitcast;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction is a select instruction.
	bool isSelect(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::Select;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction cannot be safely duplicated.
	/// For example, if the instruction has unique labels attached
	/// to it, duplicating it would cause multiple definition errors.
	bool isNotDuplicable(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::NotDuplicable;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction is convergent.
	/// Convergent instructions can not be made control-dependent on any
	/// additional values.
	bool isConvergent(QueryType Type = AnyInBundle) const {
	  // Anything but inline asm is decided by the descriptor flag alone.
	  if (!isInlineAsm())
	    return hasProperty(MCID::Convergent, Type);

	  // Inline asm carries its own marker in the extra-info immediate operand.
	  const unsigned AsmBits = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	  if (AsmBits & InlineAsm::Extra_IsConvergent)
	    return true;

	  // Marker not set: fall back to the descriptor flag.
	  return hasProperty(MCID::Convergent, Type);
	}

	/// Returns true if the specified instruction has a delay slot
	/// which must be filled by the code generator.
	bool hasDelaySlot(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::DelaySlot;
	  return hasProperty(Property, Type);
	}

	/// Return true for instructions that can be folded as
	/// memory operands in other instructions. The most common use for this
	/// is instructions that are simple loads from memory that don't modify
	/// the loaded value in any way, but it can also be used for instructions
	/// that can be expressed as constant-pool loads, such as V_SETALLONES
	/// on x86, to allow them to be folded when it is beneficial.
	/// This should only be set on instructions that return a value in their
	/// only virtual register definition.
	bool canFoldAsLoad(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::FoldableAsLoad;
	  return hasProperty(Property, Type);
	}

	/// \brief Return true if this instruction behaves
	/// the same way as the generic REG_SEQUENCE instructions.
	/// E.g., on ARM,
	/// dX = VMOVDRR rY, rZ
	/// is equivalent to
	/// dX = REG_SEQUENCE rY, ssub_0, rZ, ssub_1.
	///
	/// Note that for the optimizers to be able to take advantage of
	/// this property, TargetInstrInfo::getRegSequenceLikeInputs has to be
	/// overridden accordingly.
	bool isRegSequenceLike(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::RegSequence;
	  return hasProperty(Property, Type);
	}

	/// \brief Return true if this instruction behaves
	/// the same way as the generic EXTRACT_SUBREG instructions.
	/// E.g., on ARM,
	/// rX, rY = VMOVRRD dZ
	/// is equivalent to two EXTRACT_SUBREG:
	/// rX = EXTRACT_SUBREG dZ, ssub_0
	/// rY = EXTRACT_SUBREG dZ, ssub_1
	///
	/// Note that for the optimizers to be able to take advantage of
	/// this property, TargetInstrInfo::getExtractSubregLikeInputs has to be
	/// overridden accordingly.
	bool isExtractSubregLike(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::ExtractSubreg;
	  return hasProperty(Property, Type);
	}

	/// \brief Return true if this instruction behaves
	/// the same way as the generic INSERT_SUBREG instructions.
	/// E.g., on ARM,
	/// dX = VSETLNi32 dY, rZ, Imm
	/// is equivalent to a INSERT_SUBREG:
	/// dX = INSERT_SUBREG dY, rZ, translateImmToSubIdx(Imm)
	///
	/// Note that for the optimizers to be able to take advantage of
	/// this property, TargetInstrInfo::getInsertSubregLikeInputs has to be
	/// overridden accordingly.
	bool isInsertSubregLike(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::InsertSubreg;
	  return hasProperty(Property, Type);
	}

	//===--------------------------------------------------------------------===//
	// Side Effect Analysis
	//===--------------------------------------------------------------------===//

	/// Return true if this instruction could possibly read memory.
	/// Instructions with this flag set are not necessarily simple load
	/// instructions, they may load a value and modify it, for example.
	bool mayLoad(QueryType Type = AnyInBundle) const {
	  // Anything but inline asm is decided by the descriptor flag alone.
	  if (!isInlineAsm())
	    return hasProperty(MCID::MayLoad, Type);

	  // Inline asm records may-load in its extra-info immediate operand.
	  const unsigned AsmBits = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	  if (AsmBits & InlineAsm::Extra_MayLoad)
	    return true;

	  // Bit not set: fall back to the descriptor flag.
	  return hasProperty(MCID::MayLoad, Type);
	}

	/// Return true if this instruction could possibly modify memory.
	/// Instructions with this flag set are not necessarily simple store
	/// instructions, they may store a modified value based on their operands, or
	/// may not actually modify anything, for example.
	bool mayStore(QueryType Type = AnyInBundle) const {
	  // Anything but inline asm is decided by the descriptor flag alone.
	  if (!isInlineAsm())
	    return hasProperty(MCID::MayStore, Type);

	  // Inline asm records may-store in its extra-info immediate operand.
	  const unsigned AsmBits = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	  if (AsmBits & InlineAsm::Extra_MayStore)
	    return true;

	  // Bit not set: fall back to the descriptor flag.
	  return hasProperty(MCID::MayStore, Type);
	}

	/// Return true if this instruction could possibly read or modify memory.
	bool mayLoadOrStore(QueryType Type = AnyInBundle) const {
	  // A possible load settles it; otherwise the store query decides.
	  if (mayLoad(Type))
	    return true;
	  return mayStore(Type);
	}

	//===--------------------------------------------------------------------===//
	// Flags that indicate whether an instruction can be modified by a method.
	//===--------------------------------------------------------------------===//

	/// Return true if this may be a 2- or 3-address
	/// instruction (of the form "X = op Y, Z, ..."), which produces the same
	/// result if Y and Z are exchanged. If this flag is set, then the
	/// TargetInstrInfo::commuteInstruction method may be used to hack on the
	/// instruction.
	///
	/// Note that this flag may be set on instructions that are only commutable
	/// sometimes. In these cases, the call to commuteInstruction will fail.
	/// Also note that some instructions require non-trivial modification to
	/// commute them.
	bool isCommutable(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::Commutable;
	  return hasProperty(Property, Type);
	}

	/// Return true if this is a 2-address instruction
	/// which can be changed into a 3-address instruction if needed. Doing this
	/// transformation can be profitable in the register allocator, because it
	/// means that the instruction can use a 2-address form if possible, but
	/// degrade into a less efficient form if the source and dest register cannot
	/// be assigned to the same register. For example, this allows the x86
	/// backend to turn a "shl reg, 3" instruction into an LEA instruction, which
	/// is the same speed as the shift but has bigger code size.
	///
	/// If this returns true, then the target must implement the
	/// TargetInstrInfo::convertToThreeAddress method for this instruction, which
	/// is allowed to fail if the transformation isn't valid for this specific
	/// instruction (e.g. shl reg, 4 on x86).
	///
	bool isConvertibleTo3Addr(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::ConvertibleTo3Addr;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction requires
	/// custom insertion support when the DAG scheduler is inserting it into a
	/// machine basic block. If this is true for the instruction, it basically
	/// means that it is a pseudo instruction used at SelectionDAG time that is
	/// expanded out into magic code by the target when MachineInstrs are formed.
	///
	/// If this is true, the TargetLoweringInfo::InsertAtEndOfBasicBlock method
	/// is used to insert this into the MachineBasicBlock.
	bool usesCustomInsertionHook(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::UsesCustomInserter;
	  return hasProperty(Property, Type);
	}

	/// Return true if this instruction requires adjustment
	/// after instruction selection by calling a target hook. For example, this
	/// can be used to fill in ARM 's' optional operand depending on whether
	/// the conditional flag register is used.
	bool hasPostISelHook(QueryType Type = IgnoreBundle) const {
	  const unsigned Property = MCID::HasPostISelHook;
	  return hasProperty(Property, Type);
	}

	/// Returns true if this instruction is a candidate for remat.
	/// This flag is deprecated, please don't use it anymore. If this
	/// flag is set, the isReallyTriviallyReMaterializable() method is called to
	/// verify the instruction is really rematable.
	bool isRematerializable(QueryType Type = AllInBundle) const {
	  // Default mode is AllInBundle: a bundle can be re-materialized only if
	  // each bundled instruction can.
	  const unsigned Property = MCID::Rematerializable;
	  return hasProperty(Property, Type);
	}

	/// Returns true if this instruction has the same cost (or less) than a move
	/// instruction. This is useful during certain types of optimizations
	/// (e.g., remat during two-address conversion or machine licm)
	/// where we would like to remat or hoist the instruction, but not if it costs
	/// more than moving the instruction into the appropriate register. Note, we
	/// are not marking copies from and to the same register class with this flag.
	bool isAsCheapAsAMove(QueryType Type = AllInBundle) const {
	  // Default mode is AllInBundle: a bundle counts as cheap only if each
	  // bundled instruction is cheap.
	  const unsigned Property = MCID::CheapAsAMove;
	  return hasProperty(Property, Type);
	}

	/// Returns true if this instruction source operands
	/// have special register allocation requirements that are not captured by the
	/// operand register classes. e.g. ARM::STRD's two source registers must be an
	/// even / odd pair, ARM::STM registers have to be in ascending order.
	/// Post-register allocation passes should not attempt to change allocations
	/// for sources of instructions with this flag.
	bool hasExtraSrcRegAllocReq(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::ExtraSrcRegAllocReq;
	  return hasProperty(Property, Type);
	}

	/// Returns true if this instruction def operands
	/// have special register allocation requirements that are not captured by the
	/// operand register classes. e.g. ARM::LDRD's two def registers must be an
	/// even / odd pair, ARM::LDM registers have to be in ascending order.
	/// Post-register allocation passes should not attempt to change allocations
	/// for definitions of instructions with this flag.
	bool hasExtraDefRegAllocReq(QueryType Type = AnyInBundle) const {
	  const unsigned Property = MCID::ExtraDefRegAllocReq;
	  return hasProperty(Property, Type);
	}

	/// Comparison modes accepted by isIdenticalTo(); each enumerator states
	/// which operands take part in the comparison.
	enum MICheckType {
	CheckDefs, // Check all operands for equality
	CheckKillDead, // Check all operands including kill / dead markers
	IgnoreDefs, // Ignore all definitions
	IgnoreVRegDefs // Ignore virtual register definitions
	};

	/// Return true if this instruction is identical to \p Other.
	/// Two instructions are identical if they have the same opcode and all their
	/// operands are identical (with respect to MachineOperand::isIdenticalTo()).
	/// Note that this means liveness related flags (dead, undef, kill) do not
	/// affect the notion of identical.
	bool isIdenticalTo(const MachineInstr &Other,
	MICheckType Check = CheckDefs) const;

	/// Unlink 'this' from the containing basic block, and return it without
	/// deleting it.
	///
	/// This function can not be used on bundled instructions, use
	/// removeFromBundle() to remove individual instructions from a bundle.
	MachineInstr *removeFromParent();

	/// Unlink this instruction from its basic block and return it without
	/// deleting it.
	///
	/// If the instruction is part of a bundle, the other instructions in the
	/// bundle remain bundled.
	MachineInstr *removeFromBundle();

	/// Unlink 'this' from the containing basic block and delete it.
	///
	/// If this instruction is the header of a bundle, the whole bundle is erased.
	/// This function can not be used for instructions inside a bundle, use
	/// eraseFromBundle() to erase individual bundled instructions.
	void eraseFromParent();

	/// Unlink 'this' from the containing basic block and delete it.
	///
	/// For all definitions mark their uses in DBG_VALUE nodes
	/// as undefined. Otherwise like eraseFromParent().
	void eraseFromParentAndMarkDBGValuesForRemoval();

	/// Unlink 'this' from its basic block and delete it.
	///
	/// If the instruction is part of a bundle, the other instructions in the
	/// bundle remain bundled.
	void eraseFromBundle();

	bool isEHLabel() const {
	  const unsigned Opc = getOpcode();
	  return Opc == TargetOpcode::EH_LABEL;
	}
	bool isGCLabel() const {
	  const unsigned Opc = getOpcode();
	  return Opc == TargetOpcode::GC_LABEL;
	}

	/// Returns true if the MachineInstr represents a label.
	bool isLabel() const {
	  // Two label kinds are recognised: EH labels and GC labels.
	  if (isEHLabel())
	    return true;
	  return isGCLabel();
	}

	bool isCFIInstruction() const {
	  const unsigned Opc = getOpcode();
	  return Opc == TargetOpcode::CFI_INSTRUCTION;
	}

	// True if the instruction represents a position in the function.
	bool isPosition() const {
	  // Labels and CFI instructions both count as positions.
	  if (isLabel())
	    return true;
	  return isCFIInstruction();
	}

	bool isDebugValue() const {
	  const unsigned Opc = getOpcode();
	  return Opc == TargetOpcode::DBG_VALUE;
	}

	/// A DBG_VALUE is indirect iff the first operand is a register and
	/// the second operand is an immediate.
	bool isIndirectDebugValue() const {
	  // Operands are only inspected once we know this is a DBG_VALUE.
	  if (!isDebugValue())
	    return false;
	  // Operand 0 must be a register ...
	  if (!getOperand(0).isReg())
	    return false;
	  // ... and operand 1 an immediate.
	  return getOperand(1).isImm();
	}

	bool isPHI() const {
	  return TargetOpcode::PHI == getOpcode();
	}
	bool isKill() const {
	  return TargetOpcode::KILL == getOpcode();
	}
	bool isImplicitDef() const {
	  return TargetOpcode::IMPLICIT_DEF == getOpcode();
	}
	bool isInlineAsm() const {
	  return TargetOpcode::INLINEASM == getOpcode();
	}

	bool isMSInlineAsm() const {
	  // The dialect is only queried for INLINEASM instructions.
	  if (getOpcode() != TargetOpcode::INLINEASM)
	    return false;
	  // Any non-zero dialect value yields true.
	  return getInlineAsmDialect() ? true : false;
	}

	bool isStackAligningInlineAsm() const;
	InlineAsm::AsmDialect getInlineAsmDialect() const;

	bool isInsertSubreg() const { return TargetOpcode::INSERT_SUBREG == getOpcode(); }

	bool isSubregToReg() const { return TargetOpcode::SUBREG_TO_REG == getOpcode(); }

	bool isRegSequence() const { return TargetOpcode::REG_SEQUENCE == getOpcode(); }

	bool isBundle() const { return TargetOpcode::BUNDLE == getOpcode(); }

	bool isCopy() const { return TargetOpcode::COPY == getOpcode(); }

	bool isFullCopy() const {
	  // Operands are only inspected for COPY instructions.
	  if (!isCopy())
	    return false;
	  // A sub-register index on operand 0 disqualifies the copy ...
	  if (getOperand(0).getSubReg())
	    return false;
	  // ... and so does one on operand 1.
	  return !getOperand(1).getSubReg();
	}

	bool isExtractSubreg() const { return TargetOpcode::EXTRACT_SUBREG == getOpcode(); }

	/// Return true if the instruction behaves like a copy.
	/// This does not include native copy instructions.
	bool isCopyLike() const {
	  // COPY and SUBREG_TO_REG are the two opcodes accepted here.
	  if (isCopy())
	    return true;
	  return isSubregToReg();
	}

	/// Return true if the instruction is an identity copy.
	bool isIdentityCopy() const {
	return isCopy() && getOperand(0).getReg() == getOperand(1).getReg() &&
	getOperand(0).getSubReg() == getOperand(1).getSubReg();
	}

	/// Return true if this instruction doesn't produce any output in the form of
	/// executable instructions.
	bool isMetaInstruction() const {
	  switch (getOpcode()) {
	  // The complete list of opcodes treated as meta instructions.
	  case TargetOpcode::IMPLICIT_DEF:
	  case TargetOpcode::KILL:
	  case TargetOpcode::CFI_INSTRUCTION:
	  case TargetOpcode::EH_LABEL:
	  case TargetOpcode::GC_LABEL:
	  case TargetOpcode::DBG_VALUE:
	    return true;
	  default:
	    break;
	  }
	  return false;
	}

	/// Return true if this is a transient instruction that is either very likely
	/// to be eliminated during register allocation (such as copy-like
	/// instructions), or if this instruction doesn't have an execution-time cost.
	bool isTransient() const {
	  switch (getOpcode()) {
	  // Copy-like opcodes: usually gone after register allocation.
	  case TargetOpcode::PHI:
	  case TargetOpcode::COPY:
	  case TargetOpcode::INSERT_SUBREG:
	  case TargetOpcode::SUBREG_TO_REG:
	  case TargetOpcode::REG_SEQUENCE:
	    return true;
	  default:
	    break;
	  }
	  // Every other opcode is transient only if it is a meta instruction.
	  return isMetaInstruction();
	}

	/// Return the number of instructions inside the MI bundle, excluding the
	/// bundle header.
	///
	/// This is the number of instructions that MachineBasicBlock::iterator
	/// skips, 0 for unbundled instructions.
	unsigned getBundleSize() const;

	/// Return true if the MachineInstr reads the specified register.
	/// If TargetRegisterInfo is passed, then it also checks if there
	/// is a read of a super-register.
	/// This does not count partial redefines of virtual registers as reads:
	/// %reg1024:6 = OP.
	bool readsRegister(unsigned Reg,
	                   const TargetRegisterInfo *TRI = nullptr) const {
	  // -1 is the "no such operand" result of the index search.
	  const int UseIdx = findRegisterUseOperandIdx(Reg, /*isKill=*/false, TRI);
	  return UseIdx != -1;
	}

	/// Return true if the MachineInstr reads the specified virtual register.
	/// Take into account that a partial define is a
	/// read-modify-write operation.
	bool readsVirtualRegister(unsigned Reg) const {
	  // The pair is (reads, writes); only the "reads" half is reported.
	  const std::pair<bool, bool> ReadsWrites = readsWritesVirtualRegister(Reg);
	  return ReadsWrites.first;
	}

	/// Return a pair of bools (reads, writes) indicating if this instruction
	/// reads or writes Reg. This also considers partial defines.
	/// If Ops is not null, all operand indices for Reg are added.
	std::pair<bool,bool> readsWritesVirtualRegister(unsigned Reg,
	SmallVectorImpl<unsigned> *Ops = nullptr) const;

	/// Return true if the MachineInstr kills the specified register.
	/// If TargetRegisterInfo is passed, then it also checks if there is
	/// a kill of a super-register.
	bool killsRegister(unsigned Reg,
	                   const TargetRegisterInfo *TRI = nullptr) const {
	  // Same search as readsRegister(), restricted to killing uses.
	  const int KillIdx = findRegisterUseOperandIdx(Reg, /*isKill=*/true, TRI);
	  return KillIdx != -1;
	}

	/// Return true if the MachineInstr fully defines the specified register.
	/// If TargetRegisterInfo is passed, then it also checks
	/// if there is a def of a super-register.
	/// NOTE: It's ignoring subreg indices on virtual registers.
	bool definesRegister(unsigned Reg,
	                     const TargetRegisterInfo *TRI = nullptr) const {
	  const int DefIdx = findRegisterDefOperandIdx(Reg, /*isDead=*/false,
	                                               /*Overlap=*/false, TRI);
	  return DefIdx != -1;
	}

	/// Return true if the MachineInstr modifies (fully define or partially
	/// define) the specified register.
	/// NOTE: It's ignoring subreg indices on virtual registers.
	bool modifiesRegister(unsigned Reg, const TargetRegisterInfo *TRI) const {
	  // Overlap=true also accepts defs that merely overlap Reg.
	  const int DefIdx = findRegisterDefOperandIdx(Reg, /*isDead=*/false,
	                                               /*Overlap=*/true, TRI);
	  return DefIdx != -1;
	}

	/// Returns true if the register is dead in this machine instruction.
	/// If TargetRegisterInfo is passed, then it also checks
	/// if there is a dead def of a super-register.
	bool registerDefIsDead(unsigned Reg,
	                       const TargetRegisterInfo *TRI = nullptr) const {
	  // isDead=true skips defs that are not marked dead.
	  const int DeadDefIdx = findRegisterDefOperandIdx(Reg, /*isDead=*/true,
	                                                   /*Overlap=*/false, TRI);
	  return DeadDefIdx != -1;
	}

	/// Returns true if the MachineInstr has an implicit-use operand of exactly
	/// the given register (not considering sub/super-registers).
	bool hasRegisterImplicitUseOperand(unsigned Reg) const;

	/// Returns the operand index that is a use of the specific register or -1
	/// if it is not found. It further tightens the search criteria to a use
	/// that kills the register if isKill is true.
	int findRegisterUseOperandIdx(unsigned Reg, bool isKill = false,
	const TargetRegisterInfo *TRI = nullptr) const;

	/// Wrapper for findRegisterUseOperandIdx, it returns
	/// a pointer to the MachineOperand rather than an index.
	MachineOperand *findRegisterUseOperand(unsigned Reg, bool isKill = false,
	                                       const TargetRegisterInfo *TRI = nullptr) {
	  const int UseIdx = findRegisterUseOperandIdx(Reg, isKill, TRI);
	  // -1 means the search found nothing; report that as a null pointer.
	  if (UseIdx == -1)
	    return nullptr;
	  return &getOperand(UseIdx);
	}

	const MachineOperand *findRegisterUseOperand(
	    unsigned Reg, bool isKill = false,
	    const TargetRegisterInfo *TRI = nullptr) const {
	  // Reuse the non-const overload; the result is handed out as
	  // pointer-to-const.
	  MachineInstr *Self = const_cast<MachineInstr *>(this);
	  return Self->findRegisterUseOperand(Reg, isKill, TRI);
	}

	/// Returns the operand index that is a def of the specified register or
	/// -1 if it is not found. If isDead is true, defs that are not dead are
	/// skipped. If Overlap is true, then it also looks for defs that merely
	/// overlap the specified register. If TargetRegisterInfo is non-null,
	/// then it also checks if there is a def of a super-register.
	/// This may also return a register mask operand when Overlap is true.
	int findRegisterDefOperandIdx(unsigned Reg,
	bool isDead = false, bool Overlap = false,
	const TargetRegisterInfo *TRI = nullptr) const;

	/// Wrapper for findRegisterDefOperandIdx, it returns
	/// a pointer to the MachineOperand rather than an index.
	MachineOperand *findRegisterDefOperand(unsigned Reg, bool isDead = false,
	                                       const TargetRegisterInfo *TRI = nullptr) {
	  // Overlap is always false here.
	  const int DefIdx =
	      findRegisterDefOperandIdx(Reg, isDead, /*Overlap=*/false, TRI);
	  // -1 means the search found nothing; report that as a null pointer.
	  if (DefIdx == -1)
	    return nullptr;
	  return &getOperand(DefIdx);
	}

	/// Find the index of the first operand in the
	/// operand list that is used to represent the predicate. It returns -1 if
	/// none is found.
	int findFirstPredOperandIdx() const;

	/// Find the index of the flag word operand that
	/// corresponds to operand OpIdx on an inline asm instruction. Returns -1 if
	/// getOperand(OpIdx) does not belong to an inline asm operand group.
	///
	/// If GroupNo is not NULL, it will receive the number of the operand group
	/// containing OpIdx.
	///
	/// The flag operand is an immediate that can be decoded with methods like
	/// InlineAsm::hasRegClassConstraint().
	int findInlineAsmFlagIdx(unsigned OpIdx, unsigned *GroupNo = nullptr) const;

	/// Compute the static register class constraint for operand OpIdx.
	/// For normal instructions, this is derived from the MCInstrDesc.
	/// For inline assembly it is derived from the flag words.
	///
	/// Returns NULL if the static register class constraint cannot be
	/// determined.
	const TargetRegisterClass*
	getRegClassConstraint(unsigned OpIdx,
	const TargetInstrInfo *TII,
	const TargetRegisterInfo *TRI) const;

	/// \brief Applies the constraints (def/use) implied by this MI on \p Reg to
	/// the given \p CurRC.
	/// If \p ExploreBundle is set and MI is part of a bundle, all the
	/// instructions inside the bundle will be taken into account. In other words,
	/// this method accumulates all the constraints of the operand of this MI and
	/// the related bundle if MI is a bundle or inside a bundle.
	///
	/// \param Reg The register whose constraints are applied.
	/// \param CurRC The register class to start from.
	/// \param TII Target instruction info, passed by pointer like in
	/// getRegClassConstraint() and getRegClassConstraintEffect().
	/// \param TRI Target register info, passed by pointer like in
	/// getRegClassConstraint() and getRegClassConstraintEffect().
	/// \param ExploreBundle Whether to take the other instructions of the bundle
	/// into account.
	///
	/// Returns the register class that satisfies both \p CurRC and the
	/// constraints set by MI. Returns NULL if such a register class does not
	/// exist.
	///
	/// \pre CurRC must not be NULL.
	const TargetRegisterClass *getRegClassConstraintEffectForVReg(
	    unsigned Reg, const TargetRegisterClass *CurRC,
	    const TargetInstrInfo *TII, const TargetRegisterInfo *TRI,
	    bool ExploreBundle = false) const;

	/// \brief Applies the constraints (def/use) implied by the \p OpIdx operand
	/// to the given \p CurRC.
	///
	/// Returns the register class that satisfies both \p CurRC and the
	/// constraints set by \p OpIdx MI. Returns NULL if such a register class
	/// does not exist.
	///
	/// \pre CurRC must not be NULL.
	/// \pre The operand at \p OpIdx must be a register.
	const TargetRegisterClass *
	getRegClassConstraintEffect(unsigned OpIdx, const TargetRegisterClass *CurRC,
	const TargetInstrInfo *TII,
	const TargetRegisterInfo *TRI) const;

	/// Add a tie between the register operands at DefIdx and UseIdx.
	/// The tie will cause the register allocator to ensure that the two
	/// operands are assigned the same physical register.
	///
	/// Tied operands are managed automatically for explicit operands in the
	/// MCInstrDesc. This method is for exceptional cases like inline asm.
	void tieOperands(unsigned DefIdx, unsigned UseIdx);

	/// Given the index of a tied register operand, find the
	/// operand it is tied to. Defs are tied to uses and vice versa. Returns the
	/// index of the tied operand which must exist.
	unsigned findTiedOperandIdx(unsigned OpIdx) const;

	/// Given the index of a register def operand, check if the register def is
	/// tied to a source operand, due to either two-address elimination or inline
	/// assembly constraints.
	///
	/// Returns false if the operand at \p DefOpIdx is not a register, not a def,
	/// or not tied. Otherwise returns true and, if \p UseOpIdx is not null,
	/// stores the index reported by findTiedOperandIdx through it.
	bool isRegTiedToUseOperand(unsigned DefOpIdx,
	                           unsigned *UseOpIdx = nullptr) const {
	  const MachineOperand &MO = getOperand(DefOpIdx);
	  if (!MO.isReg() || !MO.isDef() || !MO.isTied())
	    return false;
	  if (UseOpIdx)
	    *UseOpIdx = findTiedOperandIdx(DefOpIdx);
	  return true;
	}

	/// Return true if the use operand of the specified index is tied to a def
	/// operand.
	///
	/// Returns false if the operand at \p UseOpIdx is not a register, not a use,
	/// or not tied. Otherwise returns true and, if \p DefOpIdx is not null,
	/// stores the index reported by findTiedOperandIdx through it.
	bool isRegTiedToDefOperand(unsigned UseOpIdx,
	                           unsigned *DefOpIdx = nullptr) const {
	  const MachineOperand &MO = getOperand(UseOpIdx);
	  if (!MO.isReg() || !MO.isUse() || !MO.isTied())
	    return false;
	  if (DefOpIdx)
	    *DefOpIdx = findTiedOperandIdx(UseOpIdx);
	  return true;
	}

	/// Clears kill flags on all operands.
	void clearKillInfo();

	/// Replace all occurrences of FromReg with ToReg:SubIdx,
	/// properly composing subreg indices where necessary.
	void substituteRegister(unsigned FromReg, unsigned ToReg, unsigned SubIdx,
	const TargetRegisterInfo &RegInfo);

	/// We have determined MI kills a register. Look for the
	/// operand that uses it and mark it as IsKill. If AddIfNotFound is true,
	/// add an implicit operand if it's not found. Returns true if the operand
	/// exists / is added.
	bool addRegisterKilled(unsigned IncomingReg,
	const TargetRegisterInfo *RegInfo,
	bool AddIfNotFound = false);

	/// Clear all kill flags affecting Reg. If RegInfo is provided, this includes
	/// all aliasing registers.
	void clearRegisterKills(unsigned Reg, const TargetRegisterInfo *RegInfo);

	/// We have determined MI defined a register without a use.
	/// Look for the operand that defines it and mark it as IsDead. If
	/// AddIfNotFound is true, add an implicit operand if it's not found. Returns
	/// true if the operand exists / is added.
	bool addRegisterDead(unsigned Reg, const TargetRegisterInfo *RegInfo,
	bool AddIfNotFound = false);

	/// Clear all dead flags on operands defining register @p Reg.
	void clearRegisterDeads(unsigned Reg);

	/// Mark all subregister defs of register @p Reg with the undef flag.
	/// This function is used when we determined to have a subregister def in an
	/// otherwise undefined super register.
	void setRegisterDefReadUndef(unsigned Reg, bool IsUndef = true);

	/// We have determined MI defines a register. Make sure there is an operand
	/// defining Reg.
	void addRegisterDefined(unsigned Reg,
	const TargetRegisterInfo *RegInfo = nullptr);

	/// Mark every physreg used by this instruction as
	/// dead except those in the UsedRegs list.
	///
	/// On instructions with register mask operands, also add implicit-def
	/// operands for all registers in UsedRegs.
	void setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
	const TargetRegisterInfo &TRI);

	/// Return true if it is safe to move this instruction. If
	/// SawStore is set to true, it means that there is a store (or call) between
	/// the instruction's location and its intended destination.
	bool isSafeToMove(AliasAnalysis *AA, bool &SawStore) const;

	/// Returns true if this instruction's memory access aliases the memory
	/// access of Other.
	///
	/// Assumes any physical registers used to compute addresses
	/// have the same value for both instructions. Returns false if neither
	/// instruction writes to memory.
	///
	/// @param AA Optional alias analysis, used to compare memory operands.
	/// @param Other MachineInstr to check aliasing against.
	/// @param UseTBAA Whether to pass TBAA information to alias analysis.
	bool mayAlias(AliasAnalysis *AA, MachineInstr &Other, bool UseTBAA);

	/// Return true if this instruction may have an ordered
	/// or volatile memory reference, or if the information describing the memory
	/// reference is not available. Return false if it is known to have no
	/// ordered or volatile memory references.
	bool hasOrderedMemoryRef() const;

	/// Return true if this load instruction never traps and points to a memory
	/// location whose value doesn't change during the execution of this function.
	///
	/// Examples include loading a value from the constant pool or from the
	/// argument area of a function (if it does not change). If the instruction
	/// does multiple loads, this returns true only if all of the loads are
	/// dereferenceable and invariant.
	bool isDereferenceableInvariantLoad(AliasAnalysis *AA) const;

	/// If the specified instruction is a PHI that always merges together the
	/// same virtual register, return the register, otherwise return 0.
	unsigned isConstantValuePHI() const;

	/// Return true if this instruction has side effects that are not modeled
	/// by mayLoad / mayStore, etc.
	/// For all instructions, the property is encoded in MCInstrDesc::Flags
	/// (see MCInstrDesc::hasUnmodeledSideEffects()). The only exception is
	/// INLINEASM instruction, in which case the side effect property is encoded
	/// in one of its operands (see InlineAsm::Extra_HasSideEffect).
	///
	bool hasUnmodeledSideEffects() const;

	/// Returns true if it is illegal to fold a load across this instruction.
	bool isLoadFoldBarrier() const;

	/// Return true if all the defs of this instruction are dead.
	bool allDefsAreDead() const;

	/// Copy implicit register operands from specified
	/// instruction to this instruction.
	void copyImplicitOps(MachineFunction &MF, const MachineInstr &MI);

	/// Debugging support
	/// @{
	/// Print this MI to \p OS.
	/// Only print the defs and the opcode if \p SkipOpers is true.
	/// Otherwise, also print operands if \p SkipDebugLoc is true.
	/// Otherwise, also print the debug loc, with a terminating newline.
	/// \p TII is used to print the opcode name. If it's not present, but the
	/// MI is in a function, the opcode will be printed using the function's TII.
	void print(raw_ostream &OS, bool SkipOpers = false, bool SkipDebugLoc = false,
	const TargetInstrInfo *TII = nullptr) const;
	void print(raw_ostream &OS, ModuleSlotTracker &MST, bool SkipOpers = false,
	bool SkipDebugLoc = false,
	const TargetInstrInfo *TII = nullptr) const;
	void dump() const;
	/// @}

	//===--------------------------------------------------------------------===//
	// Accessors used to build up machine instructions.

	/// Add the specified operand to the instruction. If it is an implicit
	/// operand, it is added to the end of the operand list. If it is an
	/// explicit operand it is added at the end of the explicit operand list
	/// (before the first implicit operand).
	///
	/// MF must be the machine function that was used to allocate this
	/// instruction.
	///
	/// MachineInstrBuilder provides a more convenient interface for creating
	/// instructions and adding operands.
	void addOperand(MachineFunction &MF, const MachineOperand &Op);

	/// Add an operand without providing an MF reference. This only works for
	/// instructions that are inserted in a basic block.
	///
	/// MachineInstrBuilder and the two-argument addOperand(MF, MO) should be
	/// preferred.
	void addOperand(const MachineOperand &Op);

	/// Swap in a new instruction descriptor for this instruction, which also
	/// changes its opcode. Only the address of \p tid is stored.
	void setDesc(const MCInstrDesc &tid) {
	  MCID = &tid;
	}

	/// Overwrite the source location attached to this instruction.
	/// Supplying the location through the constructor is preferred over
	/// calling this.
	void setDebugLoc(DebugLoc dl) {
	  debugLoc = std::move(dl);
	  // The stored location is expected to be trivially destructible.
	  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
	}

	/// Erase an operand from an instruction, leaving it with one
	/// fewer operand than it started with.
	void RemoveOperand(unsigned i);

	/// Add a MachineMemOperand to the machine instruction.
	/// This function should be used only occasionally. The setMemRefs function
	/// is the primary method for setting up a MachineInstr's MemRefs list.
	void addMemOperand(MachineFunction &MF, MachineMemOperand *MO);

	/// Install the memory reference descriptor list delimited by the half-open
	/// range [NewMemRefs, NewMemRefsEnd). Ownership is not transferred.
	void setMemRefs(mmo_iterator NewMemRefs, mmo_iterator NewMemRefsEnd) {
	  // Forward as (begin, count) to the pair-based overload.
	  auto BeginAndCount = std::make_pair(NewMemRefs, NewMemRefsEnd - NewMemRefs);
	  setMemRefs(BeginAndCount);
	}

	/// Install a memory reference descriptor list given as a (begin, count)
	/// pair: the first element points at the array, the second is the number of
	/// MemoryOperands in it. Ownership of the underlying memory is not
	/// transferred.
	void setMemRefs(std::pair<mmo_iterator, unsigned> NewMemRefs) {
	  const unsigned Count = NewMemRefs.second;
	  MemRefs = NewMemRefs.first;
	  // The count is stored in 8 bits; the assert catches truncation.
	  NumMemRefs = uint8_t(Count);
	  assert(NumMemRefs == Count &&
	         "Too many memrefs - must drop memory operands");
	}

	/// Return a set of memrefs (begin iterator, size) which conservatively
	/// describe the memory behavior of both MachineInstrs. This is appropriate
	/// for use when merging two MachineInstrs into one. This routine does not
	/// modify the memrefs of this MachineInstr.
	std::pair<mmo_iterator, unsigned> mergeMemRefsWith(const MachineInstr& Other);

	/// Forget every memory reference descriptor attached to this instruction,
	/// putting the memrefs back in their most conservative state. Treat this as
	/// a last resort: it greatly pessimizes what is known about the memory
	/// access performed by the instruction.
	void dropMemRefs() {
	  NumMemRefs = 0;
	  MemRefs = nullptr;
	}

	/// Break any tie involving OpIdx.
	void untieRegOperand(unsigned OpIdx) {
	MachineOperand &MO = getOperand(OpIdx);
	if (MO.isReg() && MO.isTied()) {
	getOperand(findTiedOperandIdx(OpIdx)).TiedTo = 0;
	MO.TiedTo = 0;
	}
	}

	/// Add all implicit def and use operands to this instruction.
	void addImplicitDefUseOperands(MachineFunction &MF);

	private:
	/// If this instruction is embedded into a MachineFunction, return the
	/// MachineRegisterInfo object for the current function, otherwise
	/// return null.
	MachineRegisterInfo *getRegInfo();

	/// Unlink all of the register operands in this instruction from their
	/// respective use lists. This requires that the operands already be on their
	/// use lists.
	void RemoveRegOperandsFromUseLists(MachineRegisterInfo&);

	/// Add all of the register operands in this instruction to their
	/// respective use lists. This requires that the operands not be on their
	/// use lists yet.
	void AddRegOperandsToUseLists(MachineRegisterInfo&);

	/// Slow path for hasProperty when we're dealing with a bundle.
	bool hasPropertyInBundle(unsigned Mask, QueryType Type) const;

	/// \brief Implements the logic of getRegClassConstraintEffectForVReg for
	/// this MI and the given operand index \p OpIdx.
	/// If the related operand does not constrain Reg, this returns CurRC.
	///
	/// \p TII and \p TRI are passed by pointer, matching
	/// getRegClassConstraintEffect.
	const TargetRegisterClass *getRegClassConstraintEffectForVRegImpl(
	    unsigned OpIdx, unsigned Reg, const TargetRegisterClass *CurRC,
	    const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const;
	};

	/// Special DenseMapInfo traits to compare MachineInstr* by value of the
	/// instruction rather than by pointer value.
	/// The hashing and equality testing functions ignore definitions so this is
	/// useful for CSE, etc.
	struct MachineInstrExpressionTrait : DenseMapInfo<MachineInstr*> {
	  /// Sentinel key marking an empty bucket: the null pointer.
	  static inline MachineInstr *getEmptyKey() {
	    return nullptr;
	  }

	  /// Sentinel key marking an erased bucket: the all-ones pointer value.
	  static inline MachineInstr *getTombstoneKey() {
	    return reinterpret_cast<MachineInstr*>(-1);
	  }

	  static unsigned getHashValue(const MachineInstr* const &MI);

	  /// Compare two keys. If either side is a sentinel the comparison is by
	  /// pointer, since sentinels must never be dereferenced; otherwise the
	  /// instructions are compared with isIdenticalTo, ignoring virtual
	  /// register defs.
	  static bool isEqual(const MachineInstr* const &LHS,
	                      const MachineInstr* const &RHS) {
	    if (RHS == getEmptyKey() || RHS == getTombstoneKey() ||
	        LHS == getEmptyKey() || LHS == getTombstoneKey())
	      return LHS == RHS;
	    return LHS->isIdenticalTo(*RHS, MachineInstr::IgnoreVRegDefs);
	  }
	};

	//===----------------------------------------------------------------------===//
	// Debugging Support

	/// Stream insertion for MachineInstr: prints \p MI to \p OS using
	/// MachineInstr::print with its default arguments and returns \p OS so
	/// insertions can be chained.
	inline raw_ostream &operator<<(raw_ostream &OS, const MachineInstr &MI) {
	  MI.print(OS);
	  return OS;
	}

	} // end namespace llvm

	#endif // LLVM_CODEGEN_MACHINEINSTR_H
	Index: head/contrib/llvm/lib/Analysis/ValueTracking.cpp
	===================================================================
	--- head/contrib/llvm/lib/Analysis/ValueTracking.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Analysis/ValueTracking.cpp (revision 322320)
	@@ -1,4478 +1,4541 @@
	//===- ValueTracking.cpp - Walk computations to compute properties --------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains routines that help analyze properties that chains of
	// computations have.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/Analysis/AssumptionCache.h"
	#include "llvm/Analysis/InstructionSimplify.h"
	#include "llvm/Analysis/Loads.h"
	#include "llvm/Analysis/LoopInfo.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/IR/Statepoint.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include <algorithm>
	#include <array>
	#include <cstring>
	using namespace llvm;
	using namespace llvm::PatternMatch;

	// Recursion depth limit for the analyses in this file (for example,
	// computeKnownBitsFromAssume stops recursing once Depth reaches it). It also
	// sizes Query::Excluded.
	const unsigned MaxDepth = 6;

	// Controls the number of uses of the value searched for possible
	// dominating comparisons.
	static cl::opt<unsigned> DomConditionsMaxUses("dom-conditions-max-uses",
	cl::Hidden, cl::init(20));

	// This optimization is known to cause performance regressions in some cases,
	// so keep it under a temporary flag for now.
	static cl::opt<bool>
	DontImproveNonNegativePhiBits("dont-improve-non-negative-phi-bits",
	cl::Hidden, cl::init(true));

	/// Bit width of \p Ty. Uses the scalar size in bits when that is non-zero
	/// (for vector types this is the element type's width); otherwise asks
	/// \p DL for the pointer type size in bits.
	static unsigned getBitWidth(Type *Ty, const DataLayout &DL) {
	  const unsigned ScalarBits = Ty->getScalarSizeInBits();
	  return ScalarBits ? ScalarBits : DL.getPointerTypeSizeInBits(Ty);
	}

	namespace {
	// Simplifying using an assume can only be done in a particular control-flow
	// context (the context instruction provides that context). If an assume and
	// the context instruction are not in the same block then the DT helps in
	// figuring out if we can use it.
	struct Query {
	  const DataLayout &DL;
	  AssumptionCache *AC;
	  const Instruction *CxtI;
	  const DominatorTree *DT;
	  // Unlike the other analyses, this may be a nullptr because not all clients
	  // provide it currently.
	  OptimizationRemarkEmitter *ORE;

	  /// Set of assumptions that should be excluded from further queries.
	  /// This is because of the potential for mutual recursion to cause
	  /// computeKnownBits to repeatedly visit the same assume intrinsic. The
	  /// classic case of this is assume(x = y), which will attempt to determine
	  /// bits in x from bits in y, which will attempt to determine bits in y from
	  /// bits in x, etc. Regarding the mutual recursion, computeKnownBits can call
	  /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo
	  /// (all of which can call computeKnownBits), and so on.
	  std::array<const Value *, MaxDepth> Excluded;
	  /// Number of leading entries of Excluded that are in use.
	  unsigned NumExcluded;

	  /// Build a fresh query with an empty exclusion set. All analysis handles
	  /// are non-owning pointers.
	  Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI,
	        const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr)
	      : DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE), NumExcluded(0) {}

	  /// Copy \p Q and append \p NewExcl to the exclusion set.
	  Query(const Query &Q, const Value *NewExcl)
	      : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE),
	        NumExcluded(Q.NumExcluded) {
	    Excluded = Q.Excluded;
	    // Check the bound before writing so an overflow is caught by the assert
	    // rather than after the out-of-range store has already happened.
	    assert(NumExcluded < Excluded.size() && "Too many excluded values");
	    Excluded[NumExcluded++] = NewExcl;
	  }

	  /// Return true if \p Value is among the in-use entries of Excluded.
	  bool isExcluded(const Value *Value) const {
	    if (NumExcluded == 0)
	      return false;
	    auto End = Excluded.begin() + NumExcluded;
	    return std::find(Excluded.begin(), End, Value) != End;
	  }
	};
	} // end anonymous namespace

	// Given the provided Value and, potentially, a context instruction, return
	// the preferred context instruction (if any).
	//
	// Preference order: the supplied CxtI if it has a parent block, then V itself
	// if it is an instruction with a parent block, otherwise nullptr.
	static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
	  // If we've been provided with a context instruction, then use that (provided
	  // it has been inserted).
	  if (CxtI && CxtI->getParent())
	    return CxtI;

	  // If the value is really an already-inserted instruction, then use that.
	  CxtI = dyn_cast<Instruction>(V);
	  if (CxtI && CxtI->getParent())
	    return CxtI;

	  return nullptr;
	}

	static void computeKnownBits(const Value *V, KnownBits &Known,
	unsigned Depth, const Query &Q);

	/// Public entry point: compute the known bits of \p V into \p Known.
	/// Packs the analysis handles into a Query (with the context instruction
	/// chosen by safeCxtI) and forwards to the file-local implementation.
	void llvm::computeKnownBits(const Value *V, KnownBits &Known,
	                            const DataLayout &DL, unsigned Depth,
	                            AssumptionCache *AC, const Instruction *CxtI,
	                            const DominatorTree *DT,
	                            OptimizationRemarkEmitter *ORE) {
	  ::computeKnownBits(V, Known, Depth,
	                     Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
	}

	static KnownBits computeKnownBits(const Value *V, unsigned Depth,
	const Query &Q);

	/// Public entry point returning the known bits of \p V by value.
	/// Builds a Query from the supplied analyses (context instruction chosen by
	/// safeCxtI) and delegates to the file-local overload.
	KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
	                                 unsigned Depth, AssumptionCache *AC,
	                                 const Instruction *CxtI,
	                                 const DominatorTree *DT,
	                                 OptimizationRemarkEmitter *ORE) {
	  const Query Q(DL, AC, safeCxtI(V, CxtI), DT, ORE);
	  return ::computeKnownBits(V, Depth, Q);
	}

	bool llvm::haveNoCommonBitsSet(const Value LHS, const Value RHS,
	const DataLayout &DL,
	AssumptionCache AC, const Instruction CxtI,
	const DominatorTree *DT) {
	assert(LHS->getType() == RHS->getType() &&
	"LHS and RHS should have the same type");
	assert(LHS->getType()->isIntOrIntVectorTy() &&
	"LHS and RHS should be integers");
	IntegerType *IT = cast<IntegerType>(LHS->getType()->getScalarType());
	KnownBits LHSKnown(IT->getBitWidth());
	KnownBits RHSKnown(IT->getBitWidth());
	computeKnownBits(LHS, LHSKnown, DL, 0, AC, CxtI, DT);
	computeKnownBits(RHS, RHSKnown, DL, 0, AC, CxtI, DT);
	return (LHSKnown.Zero \| RHSKnown.Zero).isAllOnesValue();
	}


	/// Return true if every user of \p CxtI is an equality icmp whose second
	/// operand is a null constant. An instruction with no users yields true.
	bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
	  for (const User *U : CxtI->users()) {
	    // Any user that is not an equality comparison disqualifies the value.
	    const auto *Cmp = dyn_cast<ICmpInst>(U);
	    if (!Cmp || !Cmp->isEquality())
	      return false;

	    // The compared-against operand must be a constant null value.
	    const auto *RHS = dyn_cast<Constant>(Cmp->getOperand(1));
	    if (!RHS || !RHS->isNullValue())
	      return false;
	  }
	  return true;
	}

	static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
	const Query &Q);

	bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
	bool OrZero,
	unsigned Depth, AssumptionCache *AC,
	const Instruction *CxtI,
	const DominatorTree *DT) {
	return ::isKnownToBeAPowerOfTwo(V, OrZero, Depth,
	Query(DL, AC, safeCxtI(V, CxtI), DT));
	}

	static bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q);

	/// Public entry point for the non-zero query on \p V. Builds a Query from
	/// the supplied analyses (context instruction chosen by safeCxtI) and
	/// delegates to the file-local implementation.
	bool llvm::isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth,
	                          AssumptionCache *AC, const Instruction *CxtI,
	                          const DominatorTree *DT) {
	  return ::isKnownNonZero(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
	}

	/// Return true if the known bits computed for \p V report it as
	/// non-negative.
	bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL,
	                              unsigned Depth,
	                              AssumptionCache *AC, const Instruction *CxtI,
	                              const DominatorTree *DT) {
	  KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
	  return Known.isNonNegative();
	}

	/// Return true if \p V is known to be strictly positive. Constant integers
	/// are answered directly; anything else must be both known non-negative and
	/// known non-zero.
	bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth,
	                           AssumptionCache *AC, const Instruction *CxtI,
	                           const DominatorTree *DT) {
	  if (auto *CI = dyn_cast<ConstantInt>(V))
	    return CI->getValue().isStrictlyPositive();

	  // TODO: We're doing two recursive queries here. We should factor this such
	  // that only a single query is needed.
	  return isKnownNonNegative(V, DL, Depth, AC, CxtI, DT) &&
	         isKnownNonZero(V, DL, Depth, AC, CxtI, DT);
	}

	/// Return true if the known bits computed for \p V report it as negative.
	bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth,
	                           AssumptionCache *AC, const Instruction *CxtI,
	                           const DominatorTree *DT) {
	  KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT);
	  return Known.isNegative();
	}

	static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q);

	/// Public entry point for the inequality query on \p V1 and \p V2.
	/// The context instruction is chosen by applying safeCxtI first with V2 and
	/// then with V1 (the inner result is passed as the candidate context to the
	/// outer call); the work is delegated to the file-local implementation.
	bool llvm::isKnownNonEqual(const Value *V1, const Value *V2,
	                           const DataLayout &DL,
	                           AssumptionCache *AC, const Instruction *CxtI,
	                           const DominatorTree *DT) {
	  return ::isKnownNonEqual(V1, V2, Query(DL, AC,
	                                         safeCxtI(V1, safeCxtI(V2, CxtI)),
	                                         DT));
	}

	static bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
	const Query &Q);

	/// Public entry point for the masked-zero query on \p V with \p Mask.
	/// Builds a Query from the supplied analyses (context instruction chosen by
	/// safeCxtI) and delegates to the file-local implementation.
	bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask,
	                             const DataLayout &DL,
	                             unsigned Depth, AssumptionCache *AC,
	                             const Instruction *CxtI, const DominatorTree *DT) {
	  return ::MaskedValueIsZero(V, Mask, Depth,
	                             Query(DL, AC, safeCxtI(V, CxtI), DT));
	}

	static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
	const Query &Q);

	unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
	unsigned Depth, AssumptionCache *AC,
	const Instruction *CxtI,
	const DominatorTree *DT) {
	return ::ComputeNumSignBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT));
	}

	/// Compute the known bits of Op0 + Op1 (when \p Add is true) or Op0 - Op1
	/// (when it is false) into \p KnownOut.
	///
	/// \p Known2 is a caller-provided scratch object that receives the known
	/// bits of Op1; for a subtract its Zero and One masks are swapped on return.
	/// \p NSW states that the operation carries the no-signed-wrap flag, which
	/// lets the sign bit be inferred from the operand signs when the bitwise
	/// computation could not determine it.
	static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
	                                   bool NSW,
	                                   KnownBits &KnownOut, KnownBits &Known2,
	                                   unsigned Depth, const Query &Q) {
	  unsigned BitWidth = KnownOut.getBitWidth();

	  // Start from the known bits of both operands.
	  KnownBits LHSKnown(BitWidth);
	  computeKnownBits(Op0, LHSKnown, Depth + 1, Q);
	  computeKnownBits(Op1, Known2, Depth + 1, Q);

	  // Carry in a 1 for a subtract, rather than a 0.
	  uint64_t CarryIn = 0;
	  if (!Add) {
	    // Sum = LHS + ~RHS + 1
	    std::swap(Known2.Zero, Known2.One);
	    CarryIn = 1;
	  }

	  APInt PossibleSumZero = ~LHSKnown.Zero + ~Known2.Zero + CarryIn;
	  APInt PossibleSumOne = LHSKnown.One + Known2.One + CarryIn;

	  // Compute known bits of the carry.
	  APInt CarryKnownZero = ~(PossibleSumZero ^ LHSKnown.Zero ^ Known2.Zero);
	  APInt CarryKnownOne = PossibleSumOne ^ LHSKnown.One ^ Known2.One;

	  // Compute set of known bits (where all three relevant bits are known).
	  APInt LHSKnownUnion = LHSKnown.Zero | LHSKnown.One;
	  APInt RHSKnownUnion = Known2.Zero | Known2.One;
	  APInt CarryKnownUnion = CarryKnownZero | CarryKnownOne;
	  APInt Known = LHSKnownUnion & RHSKnownUnion & CarryKnownUnion;

	  assert((PossibleSumZero & Known) == (PossibleSumOne & Known) &&
	         "known bits of sum differ");

	  // Compute known bits of the result.
	  KnownOut.Zero = ~PossibleSumOne & Known;
	  KnownOut.One = PossibleSumOne & Known;

	  // Are we still trying to solve for the sign bit?
	  if (!Known.isSignBitSet()) {
	    if (NSW) {
	      // Adding two non-negative numbers, or subtracting a negative number from
	      // a non-negative one, can't wrap into negative.
	      if (LHSKnown.isNonNegative() && Known2.isNonNegative())
	        KnownOut.makeNonNegative();
	      // Adding two negative numbers, or subtracting a non-negative number from
	      // a negative one, can't wrap into non-negative.
	      else if (LHSKnown.isNegative() && Known2.isNegative())
	        KnownOut.makeNegative();
	    }
	  }
	}

	/// Compute the known bits of Op0 * Op1 into \p Known.
	///
	/// \p Known2 is a caller-provided scratch object; on return it holds the
	/// known bits of Op0. \p NSW states that the multiplication carries the
	/// no-signed-wrap flag, which allows the sign of the product to be derived
	/// from the signs of the operands.
	static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
	                                KnownBits &Known, KnownBits &Known2,
	                                unsigned Depth, const Query &Q) {
	  unsigned BitWidth = Known.getBitWidth();
	  computeKnownBits(Op1, Known, Depth + 1, Q);
	  computeKnownBits(Op0, Known2, Depth + 1, Q);

	  bool isKnownNegative = false;
	  bool isKnownNonNegative = false;
	  // If the multiplication is known not to overflow, compute the sign bit.
	  if (NSW) {
	    if (Op0 == Op1) {
	      // The product of a number with itself is non-negative.
	      isKnownNonNegative = true;
	    } else {
	      bool isKnownNonNegativeOp1 = Known.isNonNegative();
	      bool isKnownNonNegativeOp0 = Known2.isNonNegative();
	      bool isKnownNegativeOp1 = Known.isNegative();
	      bool isKnownNegativeOp0 = Known2.isNegative();
	      // The product of two numbers with the same sign is non-negative.
	      isKnownNonNegative = (isKnownNegativeOp1 && isKnownNegativeOp0) ||
	                           (isKnownNonNegativeOp1 && isKnownNonNegativeOp0);
	      // The product of a negative number and a non-negative number is either
	      // negative or zero.
	      if (!isKnownNonNegative)
	        isKnownNegative = (isKnownNegativeOp1 && isKnownNonNegativeOp0 &&
	                           isKnownNonZero(Op0, Depth, Q)) ||
	                          (isKnownNegativeOp0 && isKnownNonNegativeOp1 &&
	                           isKnownNonZero(Op1, Depth, Q));
	    }
	  }

	  // If low bits are zero in either operand, output low known-0 bits.
	  // Also compute a conservative estimate for high known-0 bits.
	  // More trickiness is possible, but this is sufficient for the
	  // interesting case of alignment computation.
	  unsigned TrailZ = Known.countMinTrailingZeros() +
	                    Known2.countMinTrailingZeros();
	  unsigned LeadZ = std::max(Known.countMinLeadingZeros() +
	                                Known2.countMinLeadingZeros(),
	                            BitWidth) - BitWidth;

	  TrailZ = std::min(TrailZ, BitWidth);
	  LeadZ = std::min(LeadZ, BitWidth);
	  Known.resetAll();
	  Known.Zero.setLowBits(TrailZ);
	  Known.Zero.setHighBits(LeadZ);

	  // Only make use of no-wrap flags if we failed to compute the sign bit
	  // directly. This matters if the multiplication always overflows, in
	  // which case we prefer to follow the result of the direct computation,
	  // though as the program is invoking undefined behaviour we can choose
	  // whatever we like here.
	  if (isKnownNonNegative && !Known.isNegative())
	    Known.makeNonNegative();
	  else if (isKnownNegative && !Known.isNonNegative())
	    Known.makeNegative();
	}

	void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
	KnownBits &Known) {
	unsigned BitWidth = Known.getBitWidth();
	unsigned NumRanges = Ranges.getNumOperands() / 2;
	assert(NumRanges >= 1);

	Known.Zero.setAllBits();
	Known.One.setAllBits();

	for (unsigned i = 0; i < NumRanges; ++i) {
	ConstantInt *Lower =
	mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 0));
	ConstantInt *Upper =
	mdconst::extract<ConstantInt>(Ranges.getOperand(2 * i + 1));
	ConstantRange Range(Lower->getValue(), Upper->getValue());

	// The first CommonPrefixBits of all values in Range are equal.
	unsigned CommonPrefixBits =
	(Range.getUnsignedMax() ^ Range.getUnsignedMin()).countLeadingZeros();

	APInt Mask = APInt::getHighBitsSet(BitWidth, CommonPrefixBits);
	Known.One &= Range.getUnsignedMax() & Mask;
	Known.Zero &= ~Range.getUnsignedMax() & Mask;
	}
	}

	static bool isEphemeralValueOf(const Instruction I, const Value E) {
	SmallVector<const Value *, 16> WorkSet(1, I);
	SmallPtrSet<const Value *, 32> Visited;
	SmallPtrSet<const Value *, 16> EphValues;

	// The instruction defining an assumption's condition itself is always
	// considered ephemeral to that assumption (even if it has other
	// non-ephemeral users). See r246696's test case for an example.
	if (is_contained(I->operands(), E))
	return true;

	while (!WorkSet.empty()) {
	const Value *V = WorkSet.pop_back_val();
	if (!Visited.insert(V).second)
	continue;

	// If all uses of this value are ephemeral, then so is this value.
	if (all_of(V->users(), [&](const User *U) { return EphValues.count(U); })) {
	if (V == E)
	return true;

	EphValues.insert(V);
	if (const User *U = dyn_cast<User>(V))
	for (User::const_op_iterator J = U->op_begin(), JE = U->op_end();
	J != JE; ++J) {
	if (isSafeToSpeculativelyExecute(*J))
	WorkSet.push_back(*J);
	}
	}
	}

	return false;
	}

	// Is this an intrinsic that cannot be speculated but also cannot trap?
	static bool isAssumeLikeIntrinsic(const Instruction *I) {
	if (const CallInst *CI = dyn_cast<CallInst>(I))
	if (Function *F = CI->getCalledFunction())
	switch (F->getIntrinsicID()) {
	default: break;
	// FIXME: This list is repeated from NoTTI::getIntrinsicCost.
	case Intrinsic::assume:
	case Intrinsic::dbg_declare:
	case Intrinsic::dbg_value:
	case Intrinsic::invariant_start:
	case Intrinsic::invariant_end:
	case Intrinsic::lifetime_start:
	case Intrinsic::lifetime_end:
	case Intrinsic::objectsize:
	case Intrinsic::ptr_annotation:
	case Intrinsic::var_annotation:
	return true;
	}

	return false;
	}

	/// Return true if the assume \p Inv may be used when reasoning at the
	/// context instruction \p CxtI. \p DT is optional; without it only simple
	/// structural checks (single predecessor, same block) are available.
	bool llvm::isValidAssumeForContext(const Instruction *Inv,
	                                   const Instruction *CxtI,
	                                   const DominatorTree *DT) {

	  // There are two restrictions on the use of an assume:
	  // 1. The assume must dominate the context (or the control flow must
	  // reach the assume whenever it reaches the context).
	  // 2. The context must not be in the assume's set of ephemeral values
	  // (otherwise we will use the assume to prove that the condition
	  // feeding the assume is trivially true, thus causing the removal of
	  // the assume).

	  if (DT) {
	    if (DT->dominates(Inv, CxtI))
	      return true;
	  } else if (Inv->getParent() == CxtI->getParent()->getSinglePredecessor()) {
	    // We don't have a DT, but this trivially dominates.
	    return true;
	  }

	  // With or without a DT, the only remaining case we will check is if the
	  // instructions are in the same BB. Give up if that is not the case.
	  if (Inv->getParent() != CxtI->getParent())
	    return false;

	  // If we have a dom tree, then we now know that the assume doesn't dominate
	  // the other instruction. If we don't have a dom tree then we can check if
	  // the assume is first in the BB.
	  if (!DT) {
	    // Search forward from the assume until we reach the context (or the end
	    // of the block); the common case is that the assume will come first.
	    for (auto I = std::next(BasicBlock::const_iterator(Inv)),
	              IE = Inv->getParent()->end(); I != IE; ++I)
	      if (&*I == CxtI)
	        return true;
	  }

	  // The context comes first, but they're both in the same block. Make sure
	  // there is nothing in between that might interrupt the control flow.
	  // Note: the helpers take an Instruction pointer, so the iterator must be
	  // dereferenced before taking the address.
	  for (BasicBlock::const_iterator I =
	           std::next(BasicBlock::const_iterator(CxtI)), IE(Inv);
	       I != IE; ++I)
	    if (!isSafeToSpeculativelyExecute(&*I) && !isAssumeLikeIntrinsic(&*I))
	      return false;

	  return !isEphemeralValueOf(Inv, CxtI);
	}

	static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
	unsigned Depth, const Query &Q) {
	// Use of assumptions is context-sensitive. If we don't have a context, we
	// cannot use them!
	if (!Q.AC \|\| !Q.CxtI)
	return;

	unsigned BitWidth = Known.getBitWidth();

	// Note that the patterns below need to be kept in sync with the code
	// in AssumptionCache::updateAffectedValues.

	for (auto &AssumeVH : Q.AC->assumptionsFor(V)) {
	if (!AssumeVH)
	continue;
	CallInst *I = cast<CallInst>(AssumeVH);
	assert(I->getParent()->getParent() == Q.CxtI->getParent()->getParent() &&
	"Got assumption for the wrong function!");
	if (Q.isExcluded(I))
	continue;

	// Warning: This loop can end up being somewhat performance sensetive.
	// We're running this loop for once for each value queried resulting in a
	// runtime of ~O(#assumes * #values).

	assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume &&
	"must be an assume intrinsic");

	Value *Arg = I->getArgOperand(0);

	if (Arg == V && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	assert(BitWidth == 1 && "assume operand is not i1?");
	Known.setAllOnes();
	return;
	}
	if (match(Arg, m_Not(m_Specific(V))) &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	assert(BitWidth == 1 && "assume operand is not i1?");
	Known.setAllZero();
	return;
	}

	// The remaining tests are all recursive, so bail out if we hit the limit.
	if (Depth == MaxDepth)
	continue;

	Value A, B;
	auto m_V = m_CombineOr(m_Specific(V),
	m_CombineOr(m_PtrToInt(m_Specific(V)),
	m_BitCast(m_Specific(V))));

	CmpInst::Predicate Pred;
	ConstantInt *C;
	// assume(v = a)
	if (match(Arg, m_c_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ && isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	Known.Zero \|= RHSKnown.Zero;
	Known.One \|= RHSKnown.One;
	// assume(v & b = a)
	} else if (match(Arg,
	m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	KnownBits MaskKnown(BitWidth);
	computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));

	// For those bits in the mask that are known to be one, we can propagate
	// known bits from the RHS to V.
	Known.Zero \|= RHSKnown.Zero & MaskKnown.One;
	Known.One \|= RHSKnown.One & MaskKnown.One;
	// assume(~(v & b) = a)
	} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	KnownBits MaskKnown(BitWidth);
	computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));

	// For those bits in the mask that are known to be one, we can propagate
	// inverted known bits from the RHS to V.
	Known.Zero \|= RHSKnown.One & MaskKnown.One;
	Known.One \|= RHSKnown.Zero & MaskKnown.One;
	// assume(v \| b = a)
	} else if (match(Arg,
	m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	KnownBits BKnown(BitWidth);
	computeKnownBits(B, BKnown, Depth+1, Query(Q, I));

	// For those bits in B that are known to be zero, we can propagate known
	// bits from the RHS to V.
	Known.Zero \|= RHSKnown.Zero & BKnown.Zero;
	Known.One \|= RHSKnown.One & BKnown.Zero;
	// assume(~(v \| b) = a)
	} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	KnownBits BKnown(BitWidth);
	computeKnownBits(B, BKnown, Depth+1, Query(Q, I));

	// For those bits in B that are known to be zero, we can propagate
	// inverted known bits from the RHS to V.
	Known.Zero \|= RHSKnown.One & BKnown.Zero;
	Known.One \|= RHSKnown.Zero & BKnown.Zero;
	// assume(v ^ b = a)
	} else if (match(Arg,
	m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	KnownBits BKnown(BitWidth);
	computeKnownBits(B, BKnown, Depth+1, Query(Q, I));

	// For those bits in B that are known to be zero, we can propagate known
	// bits from the RHS to V. For those bits in B that are known to be one,
	// we can propagate inverted known bits from the RHS to V.
	Known.Zero \|= RHSKnown.Zero & BKnown.Zero;
	Known.One \|= RHSKnown.One & BKnown.Zero;
	Known.Zero \|= RHSKnown.One & BKnown.One;
	Known.One \|= RHSKnown.Zero & BKnown.One;
	// assume(~(v ^ b) = a)
	} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	KnownBits BKnown(BitWidth);
	computeKnownBits(B, BKnown, Depth+1, Query(Q, I));

	// For those bits in B that are known to be zero, we can propagate
	// inverted known bits from the RHS to V. For those bits in B that are
	// known to be one, we can propagate known bits from the RHS to V.
	Known.Zero \|= RHSKnown.One & BKnown.Zero;
	Known.One \|= RHSKnown.Zero & BKnown.Zero;
	Known.Zero \|= RHSKnown.Zero & BKnown.One;
	Known.One \|= RHSKnown.One & BKnown.One;
	// assume(v << c = a)
	} else if (match(Arg, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	// For those bits in RHS that are known, we can propagate them to known
	// bits in V shifted to the right by C.
	RHSKnown.Zero.lshrInPlace(C->getZExtValue());
	Known.Zero \|= RHSKnown.Zero;
	RHSKnown.One.lshrInPlace(C->getZExtValue());
	Known.One \|= RHSKnown.One;
	// assume(~(v << c) = a)
	} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	// For those bits in RHS that are known, we can propagate them inverted
	// to known bits in V shifted to the right by C.
	RHSKnown.One.lshrInPlace(C->getZExtValue());
	Known.Zero \|= RHSKnown.One;
	RHSKnown.Zero.lshrInPlace(C->getZExtValue());
	Known.One \|= RHSKnown.Zero;
	// assume(v >> c = a)
	} else if (match(Arg,
	m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	// For those bits in RHS that are known, we can propagate them to known
	// bits in V shifted to the right by C.
	Known.Zero \|= RHSKnown.Zero << C->getZExtValue();
	Known.One \|= RHSKnown.One << C->getZExtValue();
	// assume(~(v >> c) = a)
	} else if (match(Arg, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))),
	m_Value(A))) &&
	Pred == ICmpInst::ICMP_EQ &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
	// For those bits in RHS that are known, we can propagate them inverted
	// to known bits in V shifted to the right by C.
	Known.Zero \|= RHSKnown.One << C->getZExtValue();
	Known.One \|= RHSKnown.Zero << C->getZExtValue();
	// assume(v >=_s c) where c is non-negative
	} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_SGE &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));

	if (RHSKnown.isNonNegative()) {
	// We know that the sign bit is zero.
	Known.makeNonNegative();
	}
	// assume(v >_s c) where c is at least -1.
	} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_SGT &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));

	if (RHSKnown.isAllOnes() \|\| RHSKnown.isNonNegative()) {
	// We know that the sign bit is zero.
	Known.makeNonNegative();
	}
	// assume(v <=_s c) where c is negative
	} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_SLE &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));

	if (RHSKnown.isNegative()) {
	// We know that the sign bit is one.
	Known.makeNegative();
	}
	// assume(v <_s c) where c is non-positive
	} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_SLT &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));

	if (RHSKnown.isZero() \|\| RHSKnown.isNegative()) {
	// We know that the sign bit is one.
	Known.makeNegative();
	}
	// assume(v <=_u c)
	} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_ULE &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));

	// Whatever high bits in c are zero are known to be zero.
	Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
	// assume(v <_u c)
	} else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
	Pred == ICmpInst::ICMP_ULT &&
	isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
	KnownBits RHSKnown(BitWidth);
	computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));

	// Whatever high bits in c are zero are known to be zero (if c is a power
	// of 2, then one more).
	if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
	Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1);
	else
	Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
	}
	}

	// If assumptions conflict with each other or previous known bits, then we
	// have a logical fallacy. It's possible that the assumption is not reachable,
	// so this isn't a real bug. On the other hand, the program may have undefined
	// behavior, or we might have a bug in the compiler. We can't assert/crash, so
	// clear out the known bits, try to warn the user, and hope for the best.
	if (Known.Zero.intersects(Known.One)) {
	Known.resetAll();

	if (Q.ORE) {
	auto CxtI = const_cast<Instruction >(Q.CxtI);
	OptimizationRemarkAnalysis ORA("value-tracking", "BadAssumption", CxtI);
	Q.ORE->emit(ORA << "Detected conflicting code assumptions. Program may "
	"have undefined behavior, or compiler may have "
	"internal error.");
	}
	}
	}

	// Compute known bits from a shift operator, including those with a
	// non-constant shift amount.
	//
	//  I      - the shift; operand 0 is the value being shifted and operand 1 is
	//           the shift amount.
	//  Known  - the output of this function.
	//  Known2 - a pre-allocated temporary with the same bit width as Known; it
	//           receives the known bits of operand 0 on the non-constant path.
	//  Depth  - current recursion depth; sub-queries use Depth + 1.
	//  Q      - query state forwarded to the sub-queries.
	//  KZF and KOF are operator-specific functors that, given the known-zero or
	//  known-one bits respectively, and a shift amount, compute the implied
	//  known-zero or known-one bits of the shift operator's result respectively
	//  for that shift amount. The results from calling KZF and KOF are
	//  conservatively combined for all permitted shift amounts.
	static void computeKnownBitsFromShiftOperator(
	    const Operator *I, KnownBits &Known, KnownBits &Known2,
	    unsigned Depth, const Query &Q,
	    function_ref<APInt(const APInt &, unsigned)> KZF,
	    function_ref<APInt(const APInt &, unsigned)> KOF) {
	  unsigned BitWidth = Known.getBitWidth();

	  // Fast path: the shift amount is a compile-time constant.
	  if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
	    // Clamp so the functors are never asked to shift by BitWidth or more.
	    unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1);

	    computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	    Known.Zero = KZF(Known.Zero, ShiftAmt);
	    Known.One = KOF(Known.One, ShiftAmt);
	    // If there is conflict between Known.Zero and Known.One, this must be an
	    // overflowing left shift, so the shift result is undefined. Clear Known
	    // bits so that other code could propagate this undef.
	    if (Known.Zero.intersects(Known.One))
	      Known.resetAll();

	    return;
	  }

	  // From here on Known temporarily holds what is known about the shift
	  // *amount*, not about the result.
	  computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);

	  // If the shift amount could be greater than or equal to the bit-width of the
	  // LHS, the value could be undef, so we don't know anything about it.
	  if ((~Known.Zero).uge(BitWidth)) {
	    Known.resetAll();
	    return;
	  }

	  // Note: We cannot use Known.Zero.getLimitedValue() here, because if
	  // BitWidth > 64 and any upper bits are known, we'll end up returning the
	  // limit value (which implies all bits are known).
	  uint64_t ShiftAmtKZ = Known.Zero.zextOrTrunc(64).getZExtValue();
	  uint64_t ShiftAmtKO = Known.One.zextOrTrunc(64).getZExtValue();

	  // It would be more-clearly correct to use the two temporaries for this
	  // calculation. Reusing the APInts here to prevent unnecessary allocations.
	  Known.resetAll();

	  // If we know the shifter operand is nonzero, we can sometimes infer more
	  // known bits. However this is expensive to compute, so be lazy about it and
	  // only compute it when absolutely necessary.
	  Optional<bool> ShifterOperandIsNonZero;

	  // Early exit if we can't constrain any well-defined shift amount.
	  const uint64_t ShiftAmtMask = PowerOf2Ceil(BitWidth) - 1;
	  if (!(ShiftAmtKZ & ShiftAmtMask) && !(ShiftAmtKO & ShiftAmtMask)) {
	    ShifterOperandIsNonZero =
	        isKnownNonZero(I->getOperand(1), Depth + 1, Q);
	    if (!*ShifterOperandIsNonZero)
	      return;
	  }

	  computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);

	  // Start from "everything known" and intersect over each feasible amount.
	  Known.Zero.setAllBits();
	  Known.One.setAllBits();
	  for (unsigned ShiftAmt = 0; ShiftAmt < BitWidth; ++ShiftAmt) {
	    // Combine the shifted known input bits only for those shift amounts
	    // compatible with its known constraints.
	    if ((ShiftAmt & ~ShiftAmtKZ) != ShiftAmt)
	      continue;
	    if ((ShiftAmt | ShiftAmtKO) != ShiftAmt)
	      continue;
	    // If we know the shifter is nonzero, we may be able to infer more known
	    // bits. This check is sunk down as far as possible to avoid the expensive
	    // call to isKnownNonZero if the cheaper checks above fail.
	    if (ShiftAmt == 0) {
	      if (!ShifterOperandIsNonZero.hasValue())
	        ShifterOperandIsNonZero =
	            isKnownNonZero(I->getOperand(1), Depth + 1, Q);
	      if (*ShifterOperandIsNonZero)
	        continue;
	    }

	    Known.Zero &= KZF(Known2.Zero, ShiftAmt);
	    Known.One &= KOF(Known2.One, ShiftAmt);
	  }

	  // If there are no compatible shift amounts, then we've proven that the shift
	  // amount must be >= the BitWidth, and the result is undefined. We could
	  // return anything we'd like, but we need to make sure the sets of known bits
	  // stay disjoint (it should be better for some other code to actually
	  // propagate the undef than to pick a value here using known bits).
	  if (Known.Zero.intersects(Known.One))
	    Known.resetAll();
	}

	static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
	unsigned Depth, const Query &Q) {
	unsigned BitWidth = Known.getBitWidth();

	KnownBits Known2(Known);
	switch (I->getOpcode()) {
	default: break;
	case Instruction::Load:
	if (MDNode *MD = cast<LoadInst>(I)->getMetadata(LLVMContext::MD_range))
	computeKnownBitsFromRangeMetadata(*MD, Known);
	break;
	case Instruction::And: {
	// If either the LHS or the RHS are Zero, the result is zero.
	computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);

	// Output known-1 bits are only known if set in both the LHS & RHS.
	Known.One &= Known2.One;
	// Output known-0 are known to be clear if zero in either the LHS \| RHS.
	Known.Zero \|= Known2.Zero;

	// and(x, add (x, -1)) is a common idiom that always clears the low bit;
	// here we handle the more general case of adding any odd number by
	// matching the form add(x, add(x, y)) where y is odd.
	// TODO: This could be generalized to clearing any bit set in y where the
	// following bit is known to be unset in y.
	Value *Y = nullptr;
	if (!Known.Zero[0] && !Known.One[0] &&
	(match(I->getOperand(0), m_Add(m_Specific(I->getOperand(1)),
	m_Value(Y))) \|\|
	match(I->getOperand(1), m_Add(m_Specific(I->getOperand(0)),
	m_Value(Y))))) {
	Known2.resetAll();
	computeKnownBits(Y, Known2, Depth + 1, Q);
	if (Known2.countMinTrailingOnes() > 0)
	Known.Zero.setBit(0);
	}
	break;
	}
	case Instruction::Or: {
	computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);

	// Output known-0 bits are only known if clear in both the LHS & RHS.
	Known.Zero &= Known2.Zero;
	// Output known-1 are known to be set if set in either the LHS \| RHS.
	Known.One \|= Known2.One;
	break;
	}
	case Instruction::Xor: {
	computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);

	// Output known-0 bits are known if clear or set in both the LHS & RHS.
	APInt KnownZeroOut = (Known.Zero & Known2.Zero) \| (Known.One & Known2.One);
	// Output known-1 are known to be set if set in only one of the LHS, RHS.
	Known.One = (Known.Zero & Known2.One) \| (Known.One & Known2.Zero);
	Known.Zero = std::move(KnownZeroOut);
	break;
	}
	case Instruction::Mul: {
	bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
	computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known,
	Known2, Depth, Q);
	break;
	}
	case Instruction::UDiv: {
	// For the purposes of computing leading zeros we can conservatively
	// treat a udiv as a logical right shift by the power of 2 known to
	// be less than the denominator.
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	unsigned LeadZ = Known2.countMinLeadingZeros();

	Known2.resetAll();
	computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
	unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros();
	if (RHSMaxLeadingZeros != BitWidth)
	LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1);

	Known.Zero.setHighBits(LeadZ);
	break;
	}
	case Instruction::Select: {
	const Value LHS, RHS;
	SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
	if (SelectPatternResult::isMinOrMax(SPF)) {
	computeKnownBits(RHS, Known, Depth + 1, Q);
	computeKnownBits(LHS, Known2, Depth + 1, Q);
	} else {
	computeKnownBits(I->getOperand(2), Known, Depth + 1, Q);
	computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);
	}

	unsigned MaxHighOnes = 0;
	unsigned MaxHighZeros = 0;
	if (SPF == SPF_SMAX) {
	// If both sides are negative, the result is negative.
	if (Known.isNegative() && Known2.isNegative())
	// We can derive a lower bound on the result by taking the max of the
	// leading one bits.
	MaxHighOnes =
	std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
	// If either side is non-negative, the result is non-negative.
	else if (Known.isNonNegative() \|\| Known2.isNonNegative())
	MaxHighZeros = 1;
	} else if (SPF == SPF_SMIN) {
	// If both sides are non-negative, the result is non-negative.
	if (Known.isNonNegative() && Known2.isNonNegative())
	// We can derive an upper bound on the result by taking the max of the
	// leading zero bits.
	MaxHighZeros = std::max(Known.countMinLeadingZeros(),
	Known2.countMinLeadingZeros());
	// If either side is negative, the result is negative.
	else if (Known.isNegative() \|\| Known2.isNegative())
	MaxHighOnes = 1;
	} else if (SPF == SPF_UMAX) {
	// We can derive a lower bound on the result by taking the max of the
	// leading one bits.
	MaxHighOnes =
	std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes());
	} else if (SPF == SPF_UMIN) {
	// We can derive an upper bound on the result by taking the max of the
	// leading zero bits.
	MaxHighZeros =
	std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
	}

	// Only known if known in both the LHS and RHS.
	Known.One &= Known2.One;
	Known.Zero &= Known2.Zero;
	if (MaxHighOnes > 0)
	Known.One.setHighBits(MaxHighOnes);
	if (MaxHighZeros > 0)
	Known.Zero.setHighBits(MaxHighZeros);
	break;
	}
	case Instruction::FPTrunc:
	case Instruction::FPExt:
	case Instruction::FPToUI:
	case Instruction::FPToSI:
	case Instruction::SIToFP:
	case Instruction::UIToFP:
	break; // Can't work with floating point.
	case Instruction::PtrToInt:
	case Instruction::IntToPtr:
	// Fall through and handle them the same as zext/trunc.
	LLVM_FALLTHROUGH;
	case Instruction::ZExt:
	case Instruction::Trunc: {
	Type *SrcTy = I->getOperand(0)->getType();

	unsigned SrcBitWidth;
	// Note that we handle pointer operands here because of inttoptr/ptrtoint
	// which fall through here.
	SrcBitWidth = Q.DL.getTypeSizeInBits(SrcTy->getScalarType());

	assert(SrcBitWidth && "SrcBitWidth can't be zero");
	Known = Known.zextOrTrunc(SrcBitWidth);
	computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	Known = Known.zextOrTrunc(BitWidth);
	// Any top bits are known to be zero.
	if (BitWidth > SrcBitWidth)
	Known.Zero.setBitsFrom(SrcBitWidth);
	break;
	}
	case Instruction::BitCast: {
	Type *SrcTy = I->getOperand(0)->getType();
	if ((SrcTy->isIntegerTy() \|\| SrcTy->isPointerTy()) &&
	// TODO: For now, not handling conversions like:
	// (bitcast i64 %x to <2 x i32>)
	!I->getType()->isVectorTy()) {
	computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	break;
	}
	break;
	}
	case Instruction::SExt: {
	// Compute the bits in the result that are not present in the input.
	unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();

	Known = Known.trunc(SrcBitWidth);
	computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	// If the sign bit of the input is known set or clear, then we know the
	// top bits of the result.
	Known = Known.sext(BitWidth);
	break;
	}
	case Instruction::Shl: {
	// (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
	bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
	auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) {
	APInt KZResult = KnownZero << ShiftAmt;
	KZResult.setLowBits(ShiftAmt); // Low bits known 0.
	// If this shift has "nsw" keyword, then the result is either a poison
	// value or has the same sign bit as the first operand.
	if (NSW && KnownZero.isSignBitSet())
	KZResult.setSignBit();
	return KZResult;
	};

	auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) {
	APInt KOResult = KnownOne << ShiftAmt;
	if (NSW && KnownOne.isSignBitSet())
	KOResult.setSignBit();
	return KOResult;
	};

	computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
	break;
	}
	case Instruction::LShr: {
	// (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
	auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
	APInt KZResult = KnownZero.lshr(ShiftAmt);
	// High bits known zero.
	KZResult.setHighBits(ShiftAmt);
	return KZResult;
	};

	auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
	return KnownOne.lshr(ShiftAmt);
	};

	computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
	break;
	}
	case Instruction::AShr: {
	// (ashr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
	auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
	return KnownZero.ashr(ShiftAmt);
	};

	auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
	return KnownOne.ashr(ShiftAmt);
	};

	computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
	break;
	}
	case Instruction::Sub: {
	bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
	computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
	Known, Known2, Depth, Q);
	break;
	}
	case Instruction::Add: {
	bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
	computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
	Known, Known2, Depth, Q);
	break;
	}
	case Instruction::SRem:
	if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
	APInt RA = Rem->getValue().abs();
	if (RA.isPowerOf2()) {
	APInt LowBits = RA - 1;
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);

	// The low bits of the first operand are unchanged by the srem.
	Known.Zero = Known2.Zero & LowBits;
	Known.One = Known2.One & LowBits;

	// If the first operand is non-negative or has all low bits zero, then
	// the upper bits are all zero.
	if (Known2.isNonNegative() \|\| LowBits.isSubsetOf(Known2.Zero))
	Known.Zero \|= ~LowBits;

	// If the first operand is negative and not all low bits are zero, then
	// the upper bits are all one.
	if (Known2.isNegative() && LowBits.intersects(Known2.One))
	Known.One \|= ~LowBits;

	assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
	break;
	}
	}

	// The sign bit is the LHS's sign bit, except when the result of the
	// remainder is zero.
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	// If it's known zero, our sign bit is also zero.
	if (Known2.isNonNegative())
	Known.makeNonNegative();

	break;
	case Instruction::URem: {
	if (ConstantInt *Rem = dyn_cast<ConstantInt>(I->getOperand(1))) {
	const APInt &RA = Rem->getValue();
	if (RA.isPowerOf2()) {
	APInt LowBits = (RA - 1);
	computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	Known.Zero \|= ~LowBits;
	Known.One &= LowBits;
	break;
	}
	}

	// Since the result is less than or equal to either operand, any leading
	// zero bits in either operand must also exist in the result.
	computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q);

	unsigned Leaders =
	std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros());
	Known.resetAll();
	Known.Zero.setHighBits(Leaders);
	break;
	}

	case Instruction::Alloca: {
	const AllocaInst *AI = cast<AllocaInst>(I);
	unsigned Align = AI->getAlignment();
	if (Align == 0)
	Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());

	if (Align > 0)
	Known.Zero.setLowBits(countTrailingZeros(Align));
	break;
	}
	case Instruction::GetElementPtr: {
	// Analyze all of the subscripts of this getelementptr instruction
	// to determine if we can prove known low zero bits.
	KnownBits LocalKnown(BitWidth);
	computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q);
	unsigned TrailZ = LocalKnown.countMinTrailingZeros();

	gep_type_iterator GTI = gep_type_begin(I);
	for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
	Value *Index = I->getOperand(i);
	if (StructType *STy = GTI.getStructTypeOrNull()) {
	// Handle struct member offset arithmetic.

	// Handle case when index is vector zeroinitializer
	Constant *CIndex = cast<Constant>(Index);
	if (CIndex->isZeroValue())
	continue;

	if (CIndex->getType()->isVectorTy())
	Index = CIndex->getSplatValue();

	unsigned Idx = cast<ConstantInt>(Index)->getZExtValue();
	const StructLayout *SL = Q.DL.getStructLayout(STy);
	uint64_t Offset = SL->getElementOffset(Idx);
	TrailZ = std::min<unsigned>(TrailZ,
	countTrailingZeros(Offset));
	} else {
	// Handle array index arithmetic.
	Type *IndexedTy = GTI.getIndexedType();
	if (!IndexedTy->isSized()) {
	TrailZ = 0;
	break;
	}
	unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
	uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy);
	LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0);
	computeKnownBits(Index, LocalKnown, Depth + 1, Q);
	TrailZ = std::min(TrailZ,
	unsigned(countTrailingZeros(TypeSize) +
	LocalKnown.countMinTrailingZeros()));
	}
	}

	Known.Zero.setLowBits(TrailZ);
	break;
	}
	case Instruction::PHI: {
	const PHINode *P = cast<PHINode>(I);
	// Handle the case of a simple two-predecessor recurrence PHI.
	// There's a lot more that could theoretically be done here, but
	// this is sufficient to catch some interesting cases.
	if (P->getNumIncomingValues() == 2) {
	for (unsigned i = 0; i != 2; ++i) {
	Value *L = P->getIncomingValue(i);
	Value *R = P->getIncomingValue(!i);
	Operator *LU = dyn_cast<Operator>(L);
	if (!LU)
	continue;
	unsigned Opcode = LU->getOpcode();
	// Check for operations that have the property that if
	// both their operands have low zero bits, the result
	// will have low zero bits.
	if (Opcode == Instruction::Add \|\|
	Opcode == Instruction::Sub \|\|
	Opcode == Instruction::And \|\|
	Opcode == Instruction::Or \|\|
	Opcode == Instruction::Mul) {
	Value *LL = LU->getOperand(0);
	Value *LR = LU->getOperand(1);
	// Find a recurrence.
	if (LL == I)
	L = LR;
	else if (LR == I)
	L = LL;
	else
	break;
	// Ok, we have a PHI of the form L op= R. Check for low
	// zero bits.
	computeKnownBits(R, Known2, Depth + 1, Q);

	// We need to take the minimum number of known bits
	KnownBits Known3(Known);
	computeKnownBits(L, Known3, Depth + 1, Q);

	Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(),
	Known3.countMinTrailingZeros()));

	if (DontImproveNonNegativePhiBits)
	break;

	auto *OverflowOp = dyn_cast<OverflowingBinaryOperator>(LU);
	if (OverflowOp && OverflowOp->hasNoSignedWrap()) {
	// If initial value of recurrence is nonnegative, and we are adding
	// a nonnegative number with nsw, the result can only be nonnegative
	// or poison value regardless of the number of times we execute the
	// add in phi recurrence. If initial value is negative and we are
	// adding a negative number with nsw, the result can only be
	// negative or poison value. Similar arguments apply to sub and mul.
	//
	// (add non-negative, non-negative) --> non-negative
	// (add negative, negative) --> negative
	if (Opcode == Instruction::Add) {
	if (Known2.isNonNegative() && Known3.isNonNegative())
	Known.makeNonNegative();
	else if (Known2.isNegative() && Known3.isNegative())
	Known.makeNegative();
	}

	// (sub nsw non-negative, negative) --> non-negative
	// (sub nsw negative, non-negative) --> negative
	else if (Opcode == Instruction::Sub && LL == I) {
	if (Known2.isNonNegative() && Known3.isNegative())
	Known.makeNonNegative();
	else if (Known2.isNegative() && Known3.isNonNegative())
	Known.makeNegative();
	}

	// (mul nsw non-negative, non-negative) --> non-negative
	else if (Opcode == Instruction::Mul && Known2.isNonNegative() &&
	Known3.isNonNegative())
	Known.makeNonNegative();
	}

	break;
	}
	}
	}

	// Unreachable blocks may have zero-operand PHI nodes.
	if (P->getNumIncomingValues() == 0)
	break;

	// Otherwise take the unions of the known bit sets of the operands,
	// taking conservative care to avoid excessive recursion.
	if (Depth < MaxDepth - 1 && !Known.Zero && !Known.One) {
	// Skip if every incoming value references to ourself.
	if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
	break;

	Known.Zero.setAllBits();
	Known.One.setAllBits();
	for (Value *IncValue : P->incoming_values()) {
	// Skip direct self references.
	if (IncValue == P) continue;

	Known2 = KnownBits(BitWidth);
	// Recurse, but cap the recursion to one level, because we don't
	// want to waste time spinning around in loops.
	computeKnownBits(IncValue, Known2, MaxDepth - 1, Q);
	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	// If all bits have been ruled out, there's no need to check
	// more operands.
	if (!Known.Zero && !Known.One)
	break;
	}
	}
	break;
	}
	case Instruction::Call:
	case Instruction::Invoke:
	// If range metadata is attached to this call, set known bits from that,
	// and then intersect with known bits based on other properties of the
	// function.
	if (MDNode *MD = cast<Instruction>(I)->getMetadata(LLVMContext::MD_range))
	computeKnownBitsFromRangeMetadata(*MD, Known);
	if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) {
	computeKnownBits(RV, Known2, Depth + 1, Q);
	Known.Zero \|= Known2.Zero;
	Known.One \|= Known2.One;
	}
	if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
	switch (II->getIntrinsicID()) {
	default: break;
	case Intrinsic::bitreverse:
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	Known.Zero \|= Known2.Zero.reverseBits();
	Known.One \|= Known2.One.reverseBits();
	break;
	case Intrinsic::bswap:
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	Known.Zero \|= Known2.Zero.byteSwap();
	Known.One \|= Known2.One.byteSwap();
	break;
	case Intrinsic::ctlz: {
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	// If we have a known 1, its position is our upper bound.
	unsigned PossibleLZ = Known2.One.countLeadingZeros();
	// If this call is undefined for 0, the result will be less than 2^n.
	if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
	PossibleLZ = std::min(PossibleLZ, BitWidth - 1);
	unsigned LowBits = Log2_32(PossibleLZ)+1;
	Known.Zero.setBitsFrom(LowBits);
	break;
	}
	case Intrinsic::cttz: {
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	// If we have a known 1, its position is our upper bound.
	unsigned PossibleTZ = Known2.One.countTrailingZeros();
	// If this call is undefined for 0, the result will be less than 2^n.
	if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
	PossibleTZ = std::min(PossibleTZ, BitWidth - 1);
	unsigned LowBits = Log2_32(PossibleTZ)+1;
	Known.Zero.setBitsFrom(LowBits);
	break;
	}
	case Intrinsic::ctpop: {
	computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
	// We can bound the space the count needs. Also, bits known to be zero
	// can't contribute to the population.
	unsigned BitsPossiblySet = Known2.countMaxPopulation();
	unsigned LowBits = Log2_32(BitsPossiblySet)+1;
	Known.Zero.setBitsFrom(LowBits);
	// TODO: we could bound KnownOne using the lower bound on the number
	// of bits which might be set provided by popcnt KnownOne2.
	break;
	}
	case Intrinsic::x86_sse42_crc32_64_64:
	Known.Zero.setBitsFrom(32);
	break;
	}
	}
	break;
	case Instruction::ExtractElement:
	// Look through extract element. At the moment we keep this simple and skip
	// tracking the specific element. But at least we might find information
	// valid for all elements of the vector (for example if vector is sign
	// extended, shifted, etc).
	computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
	break;
	case Instruction::ExtractValue:
	if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
	const ExtractValueInst *EVI = cast<ExtractValueInst>(I);
	if (EVI->getNumIndices() != 1) break;
	if (EVI->getIndices()[0] == 0) {
	switch (II->getIntrinsicID()) {
	default: break;
	case Intrinsic::uadd_with_overflow:
	case Intrinsic::sadd_with_overflow:
	computeKnownBitsAddSub(true, II->getArgOperand(0),
	II->getArgOperand(1), false, Known, Known2,
	Depth, Q);
	break;
	case Intrinsic::usub_with_overflow:
	case Intrinsic::ssub_with_overflow:
	computeKnownBitsAddSub(false, II->getArgOperand(0),
	II->getArgOperand(1), false, Known, Known2,
	Depth, Q);
	break;
	case Intrinsic::umul_with_overflow:
	case Intrinsic::smul_with_overflow:
	computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false,
	Known, Known2, Depth, Q);
	break;
	}
	}
	}
	}
	}

	/// Convenience overload: compute which bits of \p V are known to be zero or
	/// one and hand the result back by value. The result is sized from V's type
	/// using getBitWidth(); the analysis itself is done by the out-parameter
	/// overload.
	KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
	  const unsigned Width = getBitWidth(V->getType(), Q.DL);
	  KnownBits Result(Width);
	  computeKnownBits(V, Result, Depth, Q);
	  return Result;
	}

	/// Determine which bits of V are known to be either zero or one and return
	/// them in the Known bit set.
	///
	/// NOTE: we cannot consider 'undef' to be "IsZero" here. The problem is that
	/// we cannot optimize based on the assumption that it is zero without changing
	/// it to be an explicit zero. If we don't change it to zero, other code could
	/// optimized based on the contradictory assumption that it is non-zero.
	/// Because instcombine aggressively folds operations with undef args anyway,
	/// this won't lose us code quality.
	///
	/// This function is defined on values with integer type, values with pointer
	/// type, and vectors of integers. In the case
	/// where V is a vector, known zero, and known one values are the
	/// same width as the vector element, and the bit is set only if it is true
	/// for all of the elements in the vector.
	void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
	const Query &Q) {
	assert(V && "No Value?");
	assert(Depth <= MaxDepth && "Limit Search Depth");
	unsigned BitWidth = Known.getBitWidth();

	assert((V->getType()->isIntOrIntVectorTy(BitWidth) \|\|
	V->getType()->isPtrOrPtrVectorTy()) &&
	"Not integer or pointer type!");
	assert(Q.DL.getTypeSizeInBits(V->getType()->getScalarType()) == BitWidth &&
	"V and Known should have same BitWidth");
	(void)BitWidth;

	const APInt *C;
	if (match(V, m_APInt(C))) {
	// We know all of the bits for a scalar constant or a splat vector constant!
	Known.One = *C;
	Known.Zero = ~Known.One;
	return;
	}
	// Null and aggregate-zero are all-zeros.
	if (isa<ConstantPointerNull>(V) \|\| isa<ConstantAggregateZero>(V)) {
	Known.setAllZero();
	return;
	}
	// Handle a constant vector by taking the intersection of the known bits of
	// each element.
	if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
	// We know that CDS must be a vector of integers. Take the intersection of
	// each element.
	Known.Zero.setAllBits(); Known.One.setAllBits();
	APInt Elt(BitWidth, 0);
	for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
	Elt = CDS->getElementAsInteger(i);
	Known.Zero &= ~Elt;
	Known.One &= Elt;
	}
	return;
	}

	if (const auto *CV = dyn_cast<ConstantVector>(V)) {
	// We know that CV must be a vector of integers. Take the intersection of
	// each element.
	Known.Zero.setAllBits(); Known.One.setAllBits();
	APInt Elt(BitWidth, 0);
	for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
	Constant *Element = CV->getAggregateElement(i);
	auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
	if (!ElementCI) {
	Known.resetAll();
	return;
	}
	Elt = ElementCI->getValue();
	Known.Zero &= ~Elt;
	Known.One &= Elt;
	}
	return;
	}

	// Start out not knowing anything.
	Known.resetAll();

	// We can't imply anything about undefs.
	if (isa<UndefValue>(V))
	return;

	// There's no point in looking through other users of ConstantData for
	// assumptions. Confirm that we've handled them all.
	assert(!isa<ConstantData>(V) && "Unhandled constant data!");

	// Limit search depth.
	// All recursive calls that increase depth must come after this.
	if (Depth == MaxDepth)
	return;

	// A weak GlobalAlias is totally unknown. A non-weak GlobalAlias has
	// the bits of its aliasee.
	if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
	if (!GA->isInterposable())
	computeKnownBits(GA->getAliasee(), Known, Depth + 1, Q);
	return;
	}

	if (const Operator *I = dyn_cast<Operator>(V))
	computeKnownBitsFromOperator(I, Known, Depth, Q);

	// Aligned pointers have trailing zeros - refine Known.Zero set
	if (V->getType()->isPointerTy()) {
	unsigned Align = V->getPointerAlignment(Q.DL);
	if (Align)
	Known.Zero.setLowBits(countTrailingZeros(Align));
	}

	// computeKnownBitsFromAssume strictly refines Known.
	// Therefore, we run them after computeKnownBitsFromOperator.

	// Check whether a nearby assume intrinsic can determine some known bits.
	computeKnownBitsFromAssume(V, Known, Depth, Q);

	assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
	}

	/// Return true if the given value is known to have exactly one
	/// bit set when defined. For vectors return true if every element is known to
	/// be a power of two when defined. Supports values with integer or pointer
	/// types and vectors of integers.
	bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
	const Query &Q) {
	if (const Constant *C = dyn_cast<Constant>(V)) {
	if (C->isNullValue())
	return OrZero;

	const APInt *ConstIntOrConstSplatInt;
	if (match(C, m_APInt(ConstIntOrConstSplatInt)))
	return ConstIntOrConstSplatInt->isPowerOf2();
	}

	// 1 << X is clearly a power of two if the one is not shifted off the end. If
	// it is shifted off the end then the result is undefined.
	if (match(V, m_Shl(m_One(), m_Value())))
	return true;

	// (signmask) >>l X is clearly a power of two if the one is not shifted off
	// the bottom. If it is shifted off the bottom then the result is undefined.
	if (match(V, m_LShr(m_SignMask(), m_Value())))
	return true;

	// The remaining tests are all recursive, so bail out if we hit the limit.
	if (Depth++ == MaxDepth)
	return false;

	Value X = nullptr, Y = nullptr;
	// A shift left or a logical shift right of a power of two is a power of two
	// or zero.
	if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) \|\|
	match(V, m_LShr(m_Value(X), m_Value()))))
	return isKnownToBeAPowerOfTwo(X, /OrZero/ true, Depth, Q);

	if (const ZExtInst *ZI = dyn_cast<ZExtInst>(V))
	return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth, Q);

	if (const SelectInst *SI = dyn_cast<SelectInst>(V))
	return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth, Q) &&
	isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth, Q);

	if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) {
	// A power of two and'd with anything is a power of two or zero.
	if (isKnownToBeAPowerOfTwo(X, /OrZero/ true, Depth, Q) \|\|
	isKnownToBeAPowerOfTwo(Y, /OrZero/ true, Depth, Q))
	return true;
	// X & (-X) is always a power of two or zero.
	if (match(X, m_Neg(m_Specific(Y))) \|\| match(Y, m_Neg(m_Specific(X))))
	return true;
	return false;
	}

	// Adding a power-of-two or zero to the same power-of-two or zero yields
	// either the original power-of-two, a larger power-of-two or zero.
	if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
	const OverflowingBinaryOperator *VOBO = cast<OverflowingBinaryOperator>(V);
	if (OrZero \|\| VOBO->hasNoUnsignedWrap() \|\| VOBO->hasNoSignedWrap()) {
	if (match(X, m_And(m_Specific(Y), m_Value())) \|\|
	match(X, m_And(m_Value(), m_Specific(Y))))
	if (isKnownToBeAPowerOfTwo(Y, OrZero, Depth, Q))
	return true;
	if (match(Y, m_And(m_Specific(X), m_Value())) \|\|
	match(Y, m_And(m_Value(), m_Specific(X))))
	if (isKnownToBeAPowerOfTwo(X, OrZero, Depth, Q))
	return true;

	unsigned BitWidth = V->getType()->getScalarSizeInBits();
	KnownBits LHSBits(BitWidth);
	computeKnownBits(X, LHSBits, Depth, Q);

	KnownBits RHSBits(BitWidth);
	computeKnownBits(Y, RHSBits, Depth, Q);
	// If i8 V is a power of two or zero:
	// ZeroBits: 1 1 1 0 1 1 1 1
	// ~ZeroBits: 0 0 0 1 0 0 0 0
	if ((~(LHSBits.Zero & RHSBits.Zero)).isPowerOf2())
	// If OrZero isn't set, we cannot give back a zero result.
	// Make sure either the LHS or RHS has a bit set.
	if (OrZero \|\| RHSBits.One.getBoolValue() \|\| LHSBits.One.getBoolValue())
	return true;
	}
	}

	// An exact divide or right shift can only shift off zero bits, so the result
	// is a power of two only if the first operand is a power of two and not
	// copying a sign bit (sdiv int_min, 2).
	if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) \|\|
	match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) {
	return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero,
	Depth, Q);
	}

	return false;
	}

	/// \brief Test whether a GEP's result is known to be non-null.
	///
	/// Uses properties inherent in a GEP to try to determine whether it is known
	/// to be non-null.
	///
	/// Only inbounds GEPs in address space zero are considered; anything else
	/// yields false.
	///
	/// Currently this routine does not support vector GEPs.
	static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth,
	                              const Query &Q) {
	  if (!GEP->isInBounds() || GEP->getPointerAddressSpace() != 0)
	    return false;

	  // FIXME: Support vector-GEPs.
	  assert(GEP->getType()->isPointerTy() && "We only support plain pointer GEP");

	  // If the base pointer is non-null, we cannot walk to a null address with an
	  // inbounds GEP in address space zero.
	  if (isKnownNonZero(GEP->getPointerOperand(), Depth, Q))
	    return true;

	  // Walk the GEP operands and see if any operand introduces a non-zero offset.
	  // If so, then the GEP cannot produce a null pointer, as doing so would
	  // inherently violate the inbounds contract within address space zero.
	  for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
	       GTI != GTE; ++GTI) {
	    // Struct types are easy -- they must always be indexed by a constant.
	    if (StructType *STy = GTI.getStructTypeOrNull()) {
	      ConstantInt *OpC = cast<ConstantInt>(GTI.getOperand());
	      unsigned ElementIdx = OpC->getZExtValue();
	      const StructLayout *SL = Q.DL.getStructLayout(STy);
	      uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
	      if (ElementOffset > 0)
	        return true;
	      continue;
	    }

	    // If we have a zero-sized type, the index doesn't matter. Keep looping.
	    if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0)
	      continue;

	    // Fast path the constant operand case both for efficiency and so we don't
	    // increment Depth when just zipping down an all-constant GEP.
	    if (ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand())) {
	      if (!OpC->isZero())
	        return true;
	      continue;
	    }

	    // We post-increment Depth here because while isKnownNonZero increments it
	    // as well, when we pop back up that increment won't persist. We don't want
	    // to recurse 10k times just because we have 10k GEP operands. We don't
	    // bail completely out because we want to handle constant GEPs regardless
	    // of depth.
	    if (Depth++ >= MaxDepth)
	      continue;

	    if (isKnownNonZero(GTI.getOperand(), Depth, Q))
	      return true;
	  }

	  return false;
	}

	/// Does the 'Range' metadata (which must be a valid MD_range operand list)
	/// ensure that the value it's attached to is never Value? 'RangeType' is
	/// is the type of the value described by the range.
	static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value) {
	const unsigned NumRanges = Ranges->getNumOperands() / 2;
	assert(NumRanges >= 1);
	for (unsigned i = 0; i < NumRanges; ++i) {
	ConstantInt *Lower =
	mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 0));
	ConstantInt *Upper =
	mdconst::extract<ConstantInt>(Ranges->getOperand(2 * i + 1));
	ConstantRange Range(Lower->getValue(), Upper->getValue());
	if (Range.contains(Value))
	return false;
	}
	return true;
	}

	/// Return true if the given value is known to be non-zero when defined. For
	/// vectors, return true if every element is known to be non-zero when
	/// defined. For pointers, if the context instruction and dominator tree are
	/// specified, perform context-sensitive analysis and return true if the
	/// pointer couldn't possibly be null at the specified instruction.
	/// Supports values with integer or pointer type and vectors of integers.
	bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
	if (auto *C = dyn_cast<Constant>(V)) {
	if (C->isNullValue())
	return false;
	if (isa<ConstantInt>(C))
	// Must be non-zero due to null test above.
	return true;

	// For constant vectors, check that all elements are undefined or known
	// non-zero to determine that the whole vector is known non-zero.
	if (auto *VecTy = dyn_cast<VectorType>(C->getType())) {
	for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
	Constant *Elt = C->getAggregateElement(i);
	if (!Elt \|\| Elt->isNullValue())
	return false;
	if (!isa<UndefValue>(Elt) && !isa<ConstantInt>(Elt))
	return false;
	}
	return true;
	}

	return false;
	}

	if (auto *I = dyn_cast<Instruction>(V)) {
	if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range)) {
	// If the possible ranges don't contain zero, then the value is
	// definitely non-zero.
	if (auto *Ty = dyn_cast<IntegerType>(V->getType())) {
	const APInt ZeroValue(Ty->getBitWidth(), 0);
	if (rangeMetadataExcludesValue(Ranges, ZeroValue))
	return true;
	}
	}
	}

	// The remaining tests are all recursive, so bail out if we hit the limit.
	if (Depth++ >= MaxDepth)
	return false;

	// Check for pointer simplifications.
	if (V->getType()->isPointerTy()) {
	if (isKnownNonNullAt(V, Q.CxtI, Q.DT))
	return true;
	if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
	if (isGEPKnownNonNull(GEP, Depth, Q))
	return true;
	}

	unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), Q.DL);

	// X \| Y != 0 if X != 0 or Y != 0.
	Value X = nullptr, Y = nullptr;
	if (match(V, m_Or(m_Value(X), m_Value(Y))))
	return isKnownNonZero(X, Depth, Q) \|\| isKnownNonZero(Y, Depth, Q);

	// ext X != 0 if X != 0.
	if (isa<SExtInst>(V) \|\| isa<ZExtInst>(V))
	return isKnownNonZero(cast<Instruction>(V)->getOperand(0), Depth, Q);

	// shl X, Y != 0 if X is odd. Note that the value of the shift is undefined
	// if the lowest bit is shifted off the end.
	if (match(V, m_Shl(m_Value(X), m_Value(Y)))) {
	// shl nuw can't remove any non-zero bits.
	const OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
	if (BO->hasNoUnsignedWrap())
	return isKnownNonZero(X, Depth, Q);

	KnownBits Known(BitWidth);
	computeKnownBits(X, Known, Depth, Q);
	if (Known.One[0])
	return true;
	}
	// shr X, Y != 0 if X is negative. Note that the value of the shift is not
	// defined if the sign bit is shifted off the end.
	else if (match(V, m_Shr(m_Value(X), m_Value(Y)))) {
	// shr exact can only shift out zero bits.
	const PossiblyExactOperator *BO = cast<PossiblyExactOperator>(V);
	if (BO->isExact())
	return isKnownNonZero(X, Depth, Q);

	KnownBits Known = computeKnownBits(X, Depth, Q);
	if (Known.isNegative())
	return true;

	// If the shifter operand is a constant, and all of the bits shifted
	// out are known to be zero, and X is known non-zero then at least one
	// non-zero bit must remain.
	if (ConstantInt *Shift = dyn_cast<ConstantInt>(Y)) {
	auto ShiftVal = Shift->getLimitedValue(BitWidth - 1);
	// Is there a known one in the portion not shifted out?
	if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal)
	return true;
	// Are all the bits to be shifted out known zero?
	if (Known.countMinTrailingZeros() >= ShiftVal)
	return isKnownNonZero(X, Depth, Q);
	}
	}
	// div exact can only produce a zero if the dividend is zero.
	else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) {
	return isKnownNonZero(X, Depth, Q);
	}
	// X + Y.
	else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
	KnownBits XKnown = computeKnownBits(X, Depth, Q);
	KnownBits YKnown = computeKnownBits(Y, Depth, Q);

	// If X and Y are both non-negative (as signed values) then their sum is not
	// zero unless both X and Y are zero.
	if (XKnown.isNonNegative() && YKnown.isNonNegative())
	if (isKnownNonZero(X, Depth, Q) \|\| isKnownNonZero(Y, Depth, Q))
	return true;

	// If X and Y are both negative (as signed values) then their sum is not
	// zero unless both X and Y equal INT_MIN.
	if (XKnown.isNegative() && YKnown.isNegative()) {
	APInt Mask = APInt::getSignedMaxValue(BitWidth);
	// The sign bit of X is set. If some other bit is set then X is not equal
	// to INT_MIN.
	if (XKnown.One.intersects(Mask))
	return true;
	// The sign bit of Y is set. If some other bit is set then Y is not equal
	// to INT_MIN.
	if (YKnown.One.intersects(Mask))
	return true;
	}

	// The sum of a non-negative number and a power of two is not zero.
	if (XKnown.isNonNegative() &&
	isKnownToBeAPowerOfTwo(Y, /OrZero/ false, Depth, Q))
	return true;
	if (YKnown.isNonNegative() &&
	isKnownToBeAPowerOfTwo(X, /OrZero/ false, Depth, Q))
	return true;
	}
	// X * Y.
	else if (match(V, m_Mul(m_Value(X), m_Value(Y)))) {
	const OverflowingBinaryOperator *BO = cast<OverflowingBinaryOperator>(V);
	// If X and Y are non-zero then so is X * Y as long as the multiplication
	// does not overflow.
	if ((BO->hasNoSignedWrap() \|\| BO->hasNoUnsignedWrap()) &&
	isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q))
	return true;
	}
	// (C ? X : Y) != 0 if X != 0 and Y != 0.
	else if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
	if (isKnownNonZero(SI->getTrueValue(), Depth, Q) &&
	isKnownNonZero(SI->getFalseValue(), Depth, Q))
	return true;
	}
	// PHI
	else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
	// Try and detect a recurrence that monotonically increases from a
	// starting value, as these are common as induction variables.
	if (PN->getNumIncomingValues() == 2) {
	Value *Start = PN->getIncomingValue(0);
	Value *Induction = PN->getIncomingValue(1);
	if (isa<ConstantInt>(Induction) && !isa<ConstantInt>(Start))
	std::swap(Start, Induction);
	if (ConstantInt *C = dyn_cast<ConstantInt>(Start)) {
	if (!C->isZero() && !C->isNegative()) {
	ConstantInt *X;
	if ((match(Induction, m_NSWAdd(m_Specific(PN), m_ConstantInt(X))) \|\|
	match(Induction, m_NUWAdd(m_Specific(PN), m_ConstantInt(X)))) &&
	!X->isNegative())
	return true;
	}
	}
	}
	// Check if all incoming values are non-zero constant.
	bool AllNonZeroConstants = all_of(PN->operands(), [](Value *V) {
	return isa<ConstantInt>(V) && !cast<ConstantInt>(V)->isZero();
	});
	if (AllNonZeroConstants)
	return true;
	}

	KnownBits Known(BitWidth);
	computeKnownBits(V, Known, Depth, Q);
	return Known.One != 0;
	}

	/// Return true if V1 == V2 + X, where X is known non-zero.
	///
	/// That is: \p V1 must be an 'add' BinaryOperator with \p V2 as one of its
	/// operands, and the other operand must satisfy isKnownNonZero (queried at
	/// depth 0).
	static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) {
	  const BinaryOperator *BO = dyn_cast<BinaryOperator>(V1);
	  if (!BO || BO->getOpcode() != Instruction::Add)
	    return false;
	  // Pick whichever operand of the add is not V2.
	  Value *Op = nullptr;
	  if (V2 == BO->getOperand(0))
	    Op = BO->getOperand(1);
	  else if (V2 == BO->getOperand(1))
	    Op = BO->getOperand(0);
	  else
	    return false;
	  return isKnownNonZero(Op, 0, Q);
	}

	/// Return true if it is known that V1 != V2.
	static bool isKnownNonEqual(const Value V1, const Value V2, const Query &Q) {
	if (V1 == V2)
	return false;
	if (V1->getType() != V2->getType())
	// We can't look through casts yet.
	return false;
	if (isAddOfNonZero(V1, V2, Q) \|\| isAddOfNonZero(V2, V1, Q))
	return true;

	if (V1->getType()->isIntOrIntVectorTy()) {
	// Are any known bits in V1 contradictory to known bits in V2? If V1
	// has a known zero where V2 has a known one, they must not be equal.
	KnownBits Known1 = computeKnownBits(V1, 0, Q);
	KnownBits Known2 = computeKnownBits(V2, 0, Q);

	if (Known1.Zero.intersects(Known2.One) \|\|
	Known2.Zero.intersects(Known1.One))
	return true;
	}
	return false;
	}

	/// Return true if 'V & Mask' is known to be zero. We use this predicate to
	/// simplify operations downstream. Mask is known to be zero for bits that V
	/// cannot have.
	///
	/// This function is defined on values with integer type, values with pointer
	/// type, and vectors of integers. In the case
	/// where V is a vector, the mask, known zero, and known one values are the
	/// same width as the vector element, and the bit is set only if it is true
	/// for all of the elements in the vector.
	bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth,
	const Query &Q) {
	KnownBits Known(Mask.getBitWidth());
	computeKnownBits(V, Known, Depth, Q);
	return Mask.isSubsetOf(Known.Zero);
	}

	/// For vector constants, loop over the elements and find the constant with the
	/// minimum number of sign bits. Return 0 if the value is not a vector constant
	/// or if any element was not analyzed; otherwise, return the count for the
	/// element with the minimum number of sign bits.
	///
	/// \param V       The value to inspect.
	/// \param TyBits  Bit width of one element; used as the starting minimum.
	static unsigned computeNumSignBitsVectorConstant(const Value *V,
	                                                 unsigned TyBits) {
	  const auto *CV = dyn_cast<Constant>(V);
	  if (!CV || !CV->getType()->isVectorTy())
	    return 0;

	  unsigned MinSignBits = TyBits;
	  unsigned NumElts = CV->getType()->getVectorNumElements();
	  for (unsigned i = 0; i != NumElts; ++i) {
	    // If we find a non-ConstantInt, bail out.
	    auto *Elt = dyn_cast_or_null<ConstantInt>(CV->getAggregateElement(i));
	    if (!Elt)
	      return 0;

	    // If the sign bit is 1, flip the bits, so we always count leading zeros.
	    APInt EltVal = Elt->getValue();
	    if (EltVal.isNegative())
	      EltVal = ~EltVal;
	    MinSignBits = std::min(MinSignBits, EltVal.countLeadingZeros());
	  }

	  return MinSignBits;
	}

	// Forward declaration; the definition follows the wrapper below.
	static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
	                                       const Query &Q);

	/// Checked entry point for the sign-bit analysis: forwards to
	/// ComputeNumSignBitsImpl and asserts that the returned count is at least one
	/// before passing it on.
	static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
	                                   const Query &Q) {
	  const unsigned NumSignBits = ComputeNumSignBitsImpl(V, Depth, Q);
	  assert(NumSignBits > 0 && "At least one sign bit needs to be present!");
	  return NumSignBits;
	}

	/// Return the number of times the sign bit of the register is replicated into
	/// the other bits. We know that at least 1 bit is always equal to the sign bit
	/// (itself), but other cases can give us information. For example, immediately
	/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
	/// other, so we return 3. For vectors, return the number of sign bits for the
	/// vector element with the mininum number of known sign bits.
	static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
	const Query &Q) {

	// We return the minimum number of sign bits that are guaranteed to be present
	// in V, so for undef we have to conservatively return 1. We don't have the
	// same behavior for poison though -- that's a FIXME today.

	unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
	unsigned Tmp, Tmp2;
	unsigned FirstAnswer = 1;

	// Note that ConstantInt is handled by the general computeKnownBits case
	// below.

	if (Depth == MaxDepth)
	return 1; // Limit search depth.

	const Operator *U = dyn_cast<Operator>(V);
	switch (Operator::getOpcode(V)) {
	default: break;
	case Instruction::SExt:
	Tmp = TyBits - U->getOperand(0)->getType()->getScalarSizeInBits();
	return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q) + Tmp;

	case Instruction::SDiv: {
	const APInt *Denominator;
	// sdiv X, C -> adds log(C) sign bits.
	if (match(U->getOperand(1), m_APInt(Denominator))) {

	// Ignore non-positive denominator.
	if (!Denominator->isStrictlyPositive())
	break;

	// Calculate the incoming numerator bits.
	unsigned NumBits = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);

	// Add floor(log(C)) bits to the numerator bits.
	return std::min(TyBits, NumBits + Denominator->logBase2());
	}
	break;
	}

	case Instruction::SRem: {
	const APInt *Denominator;
	// srem X, C -> we know that the result is within [-C+1,C) when C is a
	// positive constant. This let us put a lower bound on the number of sign
	// bits.
	if (match(U->getOperand(1), m_APInt(Denominator))) {

	// Ignore non-positive denominator.
	if (!Denominator->isStrictlyPositive())
	break;

	// Calculate the incoming numerator bits. SRem by a positive constant
	// can't lower the number of sign bits.
	unsigned NumrBits =
	ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);

	// Calculate the leading sign bit constraints by examining the
	// denominator. Given that the denominator is positive, there are two
	// cases:
	//
	// 1. the numerator is positive. The result range is [0,C) and [0,C) u<
	// (1 << ceilLogBase2(C)).
	//
	// 2. the numerator is negative. Then the result range is (-C,0] and
	// integers in (-C,0] are either 0 or >u (-1 << ceilLogBase2(C)).
	//
	// Thus a lower bound on the number of sign bits is `TyBits -
	// ceilLogBase2(C)`.

	unsigned ResBits = TyBits - Denominator->ceilLogBase2();
	return std::max(NumrBits, ResBits);
	}
	break;
	}

	case Instruction::AShr: {
	Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
	// ashr X, C -> adds C sign bits. Vectors too.
	const APInt *ShAmt;
	if (match(U->getOperand(1), m_APInt(ShAmt))) {
	unsigned ShAmtLimited = ShAmt->getZExtValue();
	if (ShAmtLimited >= TyBits)
	break; // Bad shift.
	Tmp += ShAmtLimited;
	if (Tmp > TyBits) Tmp = TyBits;
	}
	return Tmp;
	}
	case Instruction::Shl: {
	const APInt *ShAmt;
	if (match(U->getOperand(1), m_APInt(ShAmt))) {
	// shl destroys sign bits.
	Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
	Tmp2 = ShAmt->getZExtValue();
	if (Tmp2 >= TyBits \|\| // Bad shift.
	Tmp2 >= Tmp) break; // Shifted all sign bits out.
	return Tmp - Tmp2;
	}
	break;
	}
	case Instruction::And:
	case Instruction::Or:
	case Instruction::Xor: // NOT is handled here.
	// Logical binary ops preserve the number of sign bits at the worst.
	Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
	if (Tmp != 1) {
	Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
	FirstAnswer = std::min(Tmp, Tmp2);
	// We computed what we know about the sign bits as our first
	// answer. Now proceed to the generic code that uses
	// computeKnownBits, and pick whichever answer is better.
	}
	break;

	case Instruction::Select:
	Tmp = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
	if (Tmp == 1) return 1; // Early out.
	Tmp2 = ComputeNumSignBits(U->getOperand(2), Depth + 1, Q);
	return std::min(Tmp, Tmp2);

	case Instruction::Add:
	// Add can have at most one carry bit. Thus we know that the output
	// is, at worst, one more bit than the inputs.
	Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
	if (Tmp == 1) return 1; // Early out.

	// Special case decrementing a value (ADD X, -1):
	if (const auto *CRHS = dyn_cast<Constant>(U->getOperand(1)))
	if (CRHS->isAllOnesValue()) {
	KnownBits Known(TyBits);
	computeKnownBits(U->getOperand(0), Known, Depth + 1, Q);

	// If the input is known to be 0 or 1, the output is 0/-1, which is all
	// sign bits set.
	if ((Known.Zero \| 1).isAllOnesValue())
	return TyBits;

	// If we are subtracting one from a positive number, there is no carry
	// out of the result.
	if (Known.isNonNegative())
	return Tmp;
	}

	Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
	if (Tmp2 == 1) return 1;
	return std::min(Tmp, Tmp2)-1;

	case Instruction::Sub:
	Tmp2 = ComputeNumSignBits(U->getOperand(1), Depth + 1, Q);
	if (Tmp2 == 1) return 1;

	// Handle NEG.
	if (const auto *CLHS = dyn_cast<Constant>(U->getOperand(0)))
	if (CLHS->isNullValue()) {
	KnownBits Known(TyBits);
	computeKnownBits(U->getOperand(1), Known, Depth + 1, Q);
	// If the input is known to be 0 or 1, the output is 0/-1, which is all
	// sign bits set.
	if ((Known.Zero \| 1).isAllOnesValue())
	return TyBits;

	// If the input is known to be positive (the sign bit is known clear),
	// the output of the NEG has the same number of sign bits as the input.
	if (Known.isNonNegative())
	return Tmp2;

	// Otherwise, we treat this like a SUB.
	}

	// Sub can have at most one carry bit. Thus we know that the output
	// is, at worst, one more bit than the inputs.
	Tmp = ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
	if (Tmp == 1) return 1; // Early out.
	return std::min(Tmp, Tmp2)-1;

	case Instruction::PHI: {
	const PHINode *PN = cast<PHINode>(U);
	unsigned NumIncomingValues = PN->getNumIncomingValues();
	// Don't analyze large in-degree PHIs.
	if (NumIncomingValues > 4) break;
	// Unreachable blocks may have zero-operand PHI nodes.
	if (NumIncomingValues == 0) break;

	// Take the minimum of all incoming values. This can't infinitely loop
	// because of our depth threshold.
	Tmp = ComputeNumSignBits(PN->getIncomingValue(0), Depth + 1, Q);
	for (unsigned i = 1, e = NumIncomingValues; i != e; ++i) {
	if (Tmp == 1) return Tmp;
	Tmp = std::min(
	Tmp, ComputeNumSignBits(PN->getIncomingValue(i), Depth + 1, Q));
	}
	return Tmp;
	}

	case Instruction::Trunc:
	// FIXME: it's tricky to do anything useful for this, but it is an important
	// case for targets like X86.
	break;

	case Instruction::ExtractElement:
	// Look through extract element. At the moment we keep this simple and skip
	// tracking the specific element. But at least we might find information
	// valid for all elements of the vector (for example if vector is sign
	// extended, shifted, etc).
	return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
	}

	// Finally, if we can prove that the top bits of the result are 0's or 1's,
	// use this information.

	// If we can examine all elements of a vector constant successfully, we're
	// done (we can't do any better than that). If not, keep trying.
	if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits))
	return VecSignBits;

	KnownBits Known(TyBits);
	computeKnownBits(V, Known, Depth, Q);

	// If we know that the sign bit is either zero or one, determine the number of
	// identical bits in the top of the input value.
	return std::max(FirstAnswer, Known.countMinSignBits());
	}

	/// This function computes the integer multiple of Base that equals V.
	/// If successful, it returns true and returns the multiple in
	/// Multiple. If unsuccessful, it returns false. It looks
	/// through SExt instructions only if LookThroughSExt is true.
	///
	/// A Base of 0 always fails; a Base of 1 always succeeds with Multiple set to
	/// V itself. Recursion into operands stops once Depth reaches the local
	/// MaxDepth of 6.
	bool llvm::ComputeMultiple(Value V, unsigned Base, Value &Multiple,
	bool LookThroughSExt, unsigned Depth) {
	const unsigned MaxDepth = 6;

	assert(V && "No Value?");
	assert(Depth <= MaxDepth && "Limit Search Depth");
	// Only integer types pass this check, despite the wording of the message.
	assert(V->getType()->isIntegerTy() && "Not integer or pointer type!");

	Type *T = V->getType();

	ConstantInt *CI = dyn_cast<ConstantInt>(V);

	// Nothing is a meaningful multiple of zero.
	if (Base == 0)
	return false;

	// Everything is 1 * itself.
	if (Base == 1) {
	Multiple = V;
	return true;
	}

	// NOTE(review): CO is a ConstantExpr while BaseVal comes from
	// ConstantInt::get; confirm whether this comparison can ever be true.
	ConstantExpr *CO = dyn_cast<ConstantExpr>(V);
	Constant *BaseVal = ConstantInt::get(T, Base);
	if (CO && CO == BaseVal) {
	// Multiple is 1.
	Multiple = ConstantInt::get(T, 1);
	return true;
	}

	// A constant integer that Base divides evenly: the multiple is the quotient.
	if (CI && CI->getZExtValue() % Base == 0) {
	Multiple = ConstantInt::get(T, CI->getZExtValue() / Base);
	return true;
	}

	if (Depth == MaxDepth) return false; // Limit search depth.

	Operator *I = dyn_cast<Operator>(V);
	if (!I) return false;

	switch (I->getOpcode()) {
	default: break;
	case Instruction::SExt:
	if (!LookThroughSExt) return false;
	// otherwise fall through to ZExt
	LLVM_FALLTHROUGH;
	case Instruction::ZExt:
	// Look through the extension and answer for its operand.
	return ComputeMultiple(I->getOperand(0), Base, Multiple,
	LookThroughSExt, Depth+1);
	case Instruction::Shl:
	case Instruction::Mul: {
	Value *Op0 = I->getOperand(0);
	Value *Op1 = I->getOperand(1);

	if (I->getOpcode() == Instruction::Shl) {
	// Only shifts by a constant amount are handled.
	ConstantInt *Op1CI = dyn_cast<ConstantInt>(Op1);
	if (!Op1CI) return false;
	// Turn Op0 << Op1 into Op0 * 2^Op1
	APInt Op1Int = Op1CI->getValue();
	// Cap the shift amount at BitWidth - 1 so the bit set below is in range.
	uint64_t BitToSet = Op1Int.getLimitedValue(Op1Int.getBitWidth() - 1);
	APInt API(Op1Int.getBitWidth(), 0);
	API.setBit(BitToSet);
	Op1 = ConstantInt::get(V->getContext(), API);
	}

	// First try to express the left operand as Base * Mul0.
	Value *Mul0 = nullptr;
	if (ComputeMultiple(Op0, Base, Mul0, LookThroughSExt, Depth+1)) {
	if (Constant *Op1C = dyn_cast<Constant>(Op1))
	if (Constant *MulC = dyn_cast<Constant>(Mul0)) {
	// Zero-extend whichever constant is narrower so both share one type.
	if (Op1C->getType()->getPrimitiveSizeInBits() <
	MulC->getType()->getPrimitiveSizeInBits())
	Op1C = ConstantExpr::getZExt(Op1C, MulC->getType());
	if (Op1C->getType()->getPrimitiveSizeInBits() >
	MulC->getType()->getPrimitiveSizeInBits())
	MulC = ConstantExpr::getZExt(MulC, Op1C->getType());

	// V == Base * (Mul0 * Op1), so return (Mul0 * Op1)
	Multiple = ConstantExpr::getMul(MulC, Op1C);
	return true;
	}

	if (ConstantInt *Mul0CI = dyn_cast<ConstantInt>(Mul0))
	if (Mul0CI->getValue() == 1) {
	// V == Base * Op1, so return Op1
	Multiple = Op1;
	return true;
	}
	}

	// Otherwise try the same thing with the roles of the operands swapped.
	Value *Mul1 = nullptr;
	if (ComputeMultiple(Op1, Base, Mul1, LookThroughSExt, Depth+1)) {
	if (Constant *Op0C = dyn_cast<Constant>(Op0))
	if (Constant *MulC = dyn_cast<Constant>(Mul1)) {
	// Zero-extend whichever constant is narrower so both share one type.
	if (Op0C->getType()->getPrimitiveSizeInBits() <
	MulC->getType()->getPrimitiveSizeInBits())
	Op0C = ConstantExpr::getZExt(Op0C, MulC->getType());
	if (Op0C->getType()->getPrimitiveSizeInBits() >
	MulC->getType()->getPrimitiveSizeInBits())
	MulC = ConstantExpr::getZExt(MulC, Op0C->getType());

	// V == Base * (Mul1 * Op0), so return (Mul1 * Op0)
	Multiple = ConstantExpr::getMul(MulC, Op0C);
	return true;
	}

	if (ConstantInt *Mul1CI = dyn_cast<ConstantInt>(Mul1))
	if (Mul1CI->getValue() == 1) {
	// V == Base * Op0, so return Op0
	Multiple = Op0;
	return true;
	}
	}
	}
	}

	// We could not determine if V is a multiple of Base.
	return false;
	}

	/// Map the callee of call site ICS to an intrinsic ID.
	///
	/// If the callee is already an intrinsic, its own ID is returned. Otherwise,
	/// when TLI recognizes the callee as one of the math library functions listed
	/// in the switch below, the callee does not have local linkage, and the call
	/// only reads memory, the matching intrinsic is returned. In every other case
	/// (no directly called function, null TLI, unlisted function) the result is
	/// Intrinsic::not_intrinsic.
	Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
	const TargetLibraryInfo *TLI) {
	const Function *F = ICS.getCalledFunction();
	if (!F)
	return Intrinsic::not_intrinsic;

	if (F->isIntrinsic())
	return F->getIntrinsicID();

	// Without library info there is no way to recognize library calls.
	if (!TLI)
	return Intrinsic::not_intrinsic;

	LibFunc Func;
	// We're going to make assumptions on the semantics of the functions, check
	// that the target knows that it's available in this environment and it does
	// not have local linkage.
	// (F was already checked for null above, so the !F test here is redundant.)
	if (!F \|\| F->hasLocalLinkage() \|\| !TLI->getLibFunc(*F, Func))
	return Intrinsic::not_intrinsic;

	if (!ICS.onlyReadsMemory())
	return Intrinsic::not_intrinsic;

	// Otherwise check if we have a call to a function that can be turned into a
	// vector intrinsic.
	switch (Func) {
	default:
	break;
	case LibFunc_sin:
	case LibFunc_sinf:
	case LibFunc_sinl:
	return Intrinsic::sin;
	case LibFunc_cos:
	case LibFunc_cosf:
	case LibFunc_cosl:
	return Intrinsic::cos;
	case LibFunc_exp:
	case LibFunc_expf:
	case LibFunc_expl:
	return Intrinsic::exp;
	case LibFunc_exp2:
	case LibFunc_exp2f:
	case LibFunc_exp2l:
	return Intrinsic::exp2;
	case LibFunc_log:
	case LibFunc_logf:
	case LibFunc_logl:
	return Intrinsic::log;
	case LibFunc_log10:
	case LibFunc_log10f:
	case LibFunc_log10l:
	return Intrinsic::log10;
	case LibFunc_log2:
	case LibFunc_log2f:
	case LibFunc_log2l:
	return Intrinsic::log2;
	case LibFunc_fabs:
	case LibFunc_fabsf:
	case LibFunc_fabsl:
	return Intrinsic::fabs;
	case LibFunc_fmin:
	case LibFunc_fminf:
	case LibFunc_fminl:
	return Intrinsic::minnum;
	case LibFunc_fmax:
	case LibFunc_fmaxf:
	case LibFunc_fmaxl:
	return Intrinsic::maxnum;
	case LibFunc_copysign:
	case LibFunc_copysignf:
	case LibFunc_copysignl:
	return Intrinsic::copysign;
	case LibFunc_floor:
	case LibFunc_floorf:
	case LibFunc_floorl:
	return Intrinsic::floor;
	case LibFunc_ceil:
	case LibFunc_ceilf:
	case LibFunc_ceill:
	return Intrinsic::ceil;
	case LibFunc_trunc:
	case LibFunc_truncf:
	case LibFunc_truncl:
	return Intrinsic::trunc;
	case LibFunc_rint:
	case LibFunc_rintf:
	case LibFunc_rintl:
	return Intrinsic::rint;
	case LibFunc_nearbyint:
	case LibFunc_nearbyintf:
	case LibFunc_nearbyintl:
	return Intrinsic::nearbyint;
	case LibFunc_round:
	case LibFunc_roundf:
	case LibFunc_roundl:
	return Intrinsic::round;
	case LibFunc_pow:
	case LibFunc_powf:
	case LibFunc_powl:
	return Intrinsic::pow;
	case LibFunc_sqrt:
	case LibFunc_sqrtf:
	case LibFunc_sqrtl:
	// sqrt is only mapped to the intrinsic when the call carries the no-NaNs
	// flag; otherwise it is reported as not an intrinsic.
	if (ICS->hasNoNaNs())
	return Intrinsic::sqrt;
	return Intrinsic::not_intrinsic;
	}

	return Intrinsic::not_intrinsic;
	}

	/// Return true if we can prove that the specified FP value is never equal to
	/// -0.0.
	///
	/// A false result means "could not prove it", not "V may be -0.0"; in
	/// particular false is returned once Depth reaches MaxDepth.
	///
	/// NOTE: this function will need to be revisited when we support non-default
	/// rounding modes!
	///
	bool llvm::CannotBeNegativeZero(const Value V, const TargetLibraryInfo TLI,
	unsigned Depth) {
	// A floating-point constant can be inspected directly.
	if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V))
	return !CFP->getValueAPF().isNegZero();

	if (Depth == MaxDepth)
	return false; // Limit search depth.

	const Operator *I = dyn_cast<Operator>(V);
	if (!I) return false;

	// Check if the nsz fast-math flag is set
	if (const FPMathOperator *FPO = dyn_cast<FPMathOperator>(I))
	if (FPO->hasNoSignedZeros())
	return true;

	// (add x, 0.0) is guaranteed to return +0.0, not -0.0.
	// Only a constant in the second operand position is matched here.
	if (I->getOpcode() == Instruction::FAdd)
	if (ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(1)))
	if (CFP->isNullValue())
	return true;

	// sitofp and uitofp turn into +0.0 for zero.
	if (isa<SIToFPInst>(I) \|\| isa<UIToFPInst>(I))
	return true;

	if (const CallInst *CI = dyn_cast<CallInst>(I)) {
	Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
	switch (IID) {
	default:
	break;
	// sqrt(-0.0) = -0.0, no other negative results are possible.
	// So the answer for sqrt is the answer for its argument.
	case Intrinsic::sqrt:
	return CannotBeNegativeZero(CI->getArgOperand(0), TLI, Depth + 1);
	// fabs(x) != -0.0
	case Intrinsic::fabs:
	return true;
	}
	}

	return false;
	}

	/// Shared implementation behind CannotBeOrderedLessThanZero and
	/// SignBitMustBeZero. Returns true only when the property can be shown by
	/// inspecting \p V and, recursively, its operands; returns false when nothing
	/// can be proven, including once \p Depth reaches MaxDepth.
	///
	/// If \p SignBitOnly is true, test for a known 0 sign bit rather than a
	/// standard ordered compare. e.g. make -0.0 olt 0.0 be true because of the sign
	/// bit despite comparing equal.
	static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
	const TargetLibraryInfo *TLI,
	bool SignBitOnly,
	unsigned Depth) {
	// TODO: This function does not do the right thing when SignBitOnly is true
	// and we're lowering to a hypothetical IEEE 754-compliant-but-evil platform
	// which flips the sign bits of NaNs. See
	// https://llvm.org/bugs/show_bug.cgi?id=31702.

	// Constants: accept a clear sign bit, and also any zero unless only the sign
	// bit is being tested.
	if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
	return !CFP->getValueAPF().isNegative() \|\|
	(!SignBitOnly && CFP->getValueAPF().isZero());
	}

	if (Depth == MaxDepth)
	return false; // Limit search depth.

	const Operator *I = dyn_cast<Operator>(V);
	if (!I)
	return false;

	switch (I->getOpcode()) {
	default:
	break;
	// Unsigned integers are always nonnegative.
	case Instruction::UIToFP:
	return true;
	case Instruction::FMul:
	// x*x is always non-negative or a NaN.
	// When testing the sign bit, the no-NaNs flag is required as well.
	if (I->getOperand(0) == I->getOperand(1) &&
	(!SignBitOnly \|\| cast<FPMathOperator>(I)->hasNoNaNs()))
	return true;

	LLVM_FALLTHROUGH;
	case Instruction::FAdd:
	case Instruction::FDiv:
	case Instruction::FRem:
	// Both operands must have the property.
	return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
	Depth + 1) &&
	cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
	Depth + 1);
	case Instruction::Select:
	// Operands 1 and 2 are the two selected values; both must qualify.
	return cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
	Depth + 1) &&
	cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly,
	Depth + 1);
	case Instruction::FPExt:
	case Instruction::FPTrunc:
	// Widening/narrowing never change sign.
	return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
	Depth + 1);
	case Instruction::Call:
	// This is the last case label, so the declarations below are not jumped
	// over by any later label.
	const auto *CI = cast<CallInst>(I);
	Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
	switch (IID) {
	default:
	break;
	case Intrinsic::maxnum:
	// For maxnum it is enough that either operand qualifies.
	return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
	Depth + 1) \|\|
	cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
	Depth + 1);
	case Intrinsic::minnum:
	// For minnum both operands must qualify.
	return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
	Depth + 1) &&
	cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI, SignBitOnly,
	Depth + 1);
	case Intrinsic::exp:
	case Intrinsic::exp2:
	case Intrinsic::fabs:
	return true;

	case Intrinsic::sqrt:
	// sqrt(x) is always >= -0 or NaN. Moreover, sqrt(x) == -0 iff x == -0.
	if (!SignBitOnly)
	return true;
	// NOTE(review): Depth is not forwarded to CannotBeNegativeZero here, so
	// that query presumably starts from its default depth -- confirm.
	return CI->hasNoNaNs() && (CI->hasNoSignedZeros() \|\|
	CannotBeNegativeZero(CI->getOperand(0), TLI));

	case Intrinsic::powi:
	if (ConstantInt *Exponent = dyn_cast<ConstantInt>(I->getOperand(1))) {
	// powi(x,n) is non-negative if n is even.
	if (Exponent->getBitWidth() <= 64 && Exponent->getSExtValue() % 2u == 0)
	return true;
	}
	// TODO: This is not correct. Given that exp is an integer, here are the
	// ways that pow can return a negative value:
	//
	// pow(x, exp) --> negative if exp is odd and x is negative.
	// pow(-0, exp) --> -inf if exp is negative odd.
	// pow(-0, exp) --> -0 if exp is positive odd.
	// pow(-inf, exp) --> -0 if exp is negative odd.
	// pow(-inf, exp) --> -inf if exp is positive odd.
	//
	// Therefore, if !SignBitOnly, we can return true if x >= +0 or x is NaN,
	// but we must return false if x == -0. Unfortunately we do not currently
	// have a way of expressing this constraint. See details in
	// https://llvm.org/bugs/show_bug.cgi?id=31702.
	return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
	Depth + 1);

	case Intrinsic::fma:
	case Intrinsic::fmuladd:
	// x*x+y is non-negative if y is non-negative.
	// As with FMul, the sign-bit query also needs the no-NaNs flag.
	return I->getOperand(0) == I->getOperand(1) &&
	(!SignBitOnly \|\| cast<FPMathOperator>(I)->hasNoNaNs()) &&
	cannotBeOrderedLessThanZeroImpl(I->getOperand(2), TLI, SignBitOnly,
	Depth + 1);
	}
	break;
	}
	return false;
	}

	/// Public entry point for the ordered-compare form of the query: runs
	/// cannotBeOrderedLessThanZeroImpl on V with SignBitOnly = false, starting at
	/// depth 0.
	bool llvm::CannotBeOrderedLessThanZero(const Value *V,
	const TargetLibraryInfo *TLI) {
	return cannotBeOrderedLessThanZeroImpl(V, TLI, false, 0);
	}

	/// Public entry point for the sign-bit form of the query: runs
	/// cannotBeOrderedLessThanZeroImpl on V with SignBitOnly = true, starting at
	/// depth 0.
	bool llvm::SignBitMustBeZero(const Value V, const TargetLibraryInfo TLI) {
	return cannotBeOrderedLessThanZeroImpl(V, TLI, true, 0);
	}

	/// If the specified value can be set by repeating the same byte in memory,
	/// return the i8 value that it is represented with. This is
	/// true for all i8 values obviously, but is also true for i32 0, i32 -1,
	/// i16 0xF0F0, double 0.0 etc. If the value can't be handled with a repeated
	/// byte store (e.g. i16 0x1234), return null.
	Value llvm::isBytewiseValue(Value V) {
	// All byte-wide stores are splatable, even of arbitrary variables.
	if (V->getType()->isIntegerTy(8)) return V;

	// Handle 'null' ConstantArrayZero etc.
	// Any all-zero constant is represented by the i8 zero.
	if (Constant *C = dyn_cast<Constant>(V))
	if (C->isNullValue())
	return Constant::getNullValue(Type::getInt8Ty(V->getContext()));

	// Constant float and double values can be handled as integer values if the
	// corresponding integer value is "byteable". An important case is 0.0.
	if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
	if (CFP->getType()->isFloatTy())
	V = ConstantExpr::getBitCast(CFP, Type::getInt32Ty(V->getContext()));
	if (CFP->getType()->isDoubleTy())
	V = ConstantExpr::getBitCast(CFP, Type::getInt64Ty(V->getContext()));
	// Don't handle long double formats, which have strange constraints.
	// For those V is left unchanged and falls through the checks below.
	}

	// We can handle constant integers that are multiple of 8 bits.
	if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	if (CI->getBitWidth() % 8 == 0) {
	assert(CI->getBitWidth() > 8 && "8 bits should be handled above!");

	// Every byte of the value must be identical; the low byte is the answer.
	if (!CI->getValue().isSplat(8))
	return nullptr;
	return ConstantInt::get(V->getContext(), CI->getValue().trunc(8));
	}
	}

	// A ConstantDataArray/Vector is splatable if all its members are equal and
	// also splatable.
	if (ConstantDataSequential *CA = dyn_cast<ConstantDataSequential>(V)) {
	// Element 0 decides the byte value...
	Value *Elt = CA->getElementAsConstant(0);
	Value *Val = isBytewiseValue(Elt);
	if (!Val)
	return nullptr;

	// ...and every other element must be that same constant.
	for (unsigned I = 1, E = CA->getNumElements(); I != E; ++I)
	if (CA->getElementAsConstant(I) != Elt)
	return nullptr;

	return Val;
	}

	// Conceptually, we could handle things like:
	// %a = zext i8 %X to i16
	// %b = shl i16 %a, 8
	// %c = or i16 %a, %b
	// but until there is an example that actually needs this, it doesn't seem
	// worth worrying about.
	return nullptr;
	}


	// This is the recursive version of BuildSubAggregate. It takes a few different
	// arguments. Idxs is the index within the nested struct From that we are
	// looking at now (which is of type IndexedType). IdxSkip is the number of
	// indices from Idxs that should be left out when inserting into the resulting
	// struct. To is the result struct built so far, new insertvalue instructions
	// build on that.
	//
	// Returns the updated result value, or null if no inserted value could be
	// found for the position described by Idxs. Idxs is pushed to and popped from
	// during the recursion and has its original contents again on return.
	static Value BuildSubAggregate(Value From, Value* To, Type *IndexedType,
	SmallVectorImpl<unsigned> &Idxs,
	unsigned IdxSkip,
	Instruction *InsertBefore) {
	llvm::StructType *STy = dyn_cast<llvm::StructType>(IndexedType);
	if (STy) {
	// Save the original To argument so we can modify it
	Value *OrigTo = To;
	// General case, the type indexed by Idxs is a struct
	for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	// Process each struct element recursively
	Idxs.push_back(i);
	Value *PrevTo = To;
	To = BuildSubAggregate(From, To, STy->getElementType(i), Idxs, IdxSkip,
	InsertBefore);
	Idxs.pop_back();
	if (!To) {
	// Couldn't find any inserted value for this index? Cleanup
	// Walk back down the chain of insertvalues created so far, erasing each
	// one, until the value this call started with is reached.
	while (PrevTo != OrigTo) {
	InsertValueInst* Del = cast<InsertValueInst>(PrevTo);
	PrevTo = Del->getAggregateOperand();
	Del->eraseFromParent();
	}
	// Stop processing elements
	break;
	}
	}
	// If we successfully found a value for each of our subaggregates
	if (To)
	return To;
	}
	// Base case, the type indexed by Idxs is not a struct, or not all of
	// the struct's elements had a value that was inserted directly. In the latter
	// case, perhaps we can't determine each of the subelements individually, but
	// we might be able to find the complete struct somewhere.

	// Find the value that is at that particular spot
	Value *V = FindInsertedValue(From, Idxs);

	if (!V)
	return nullptr;

	// Insert the value in the new (sub) aggregate
	// The first IdxSkip indices are dropped because they address the
	// sub-aggregate itself rather than a position inside it.
	return llvm::InsertValueInst::Create(To, V, makeArrayRef(Idxs).slice(IdxSkip),
	"tmp", InsertBefore);
	}

	// This helper takes a nested struct and extracts a part of it (which is again a
	// struct) into a new value. For example, given the struct:
	// { a, { b, { c, d }, e } }
	// and the indices "1, 1" this returns
	// { c, d }.
	//
	// It does this by inserting an insertvalue for each element in the resulting
	// struct, as opposed to just inserting a single struct. This will only work if
	// each of the elements of the substruct are known (ie, inserted into From by an
	// insertvalue instruction somewhere).
	//
	// All inserted insertvalue instructions are inserted before InsertBefore,
	// which must not be null.
	static Value BuildSubAggregate(Value From, ArrayRef<unsigned> idx_range,
	Instruction *InsertBefore) {
	assert(InsertBefore && "Must have someplace to insert!");
	Type *IndexedType = ExtractValueInst::getIndexedType(From->getType(),
	idx_range);
	// Start from an undef of the sub-aggregate type and fill it in recursively.
	Value *To = UndefValue::get(IndexedType);
	SmallVector<unsigned, 10> Idxs(idx_range.begin(), idx_range.end());
	// The requested indices select the sub-aggregate, so all of them are skipped
	// when positions inside the result are addressed.
	unsigned IdxSkip = Idxs.size();

	return BuildSubAggregate(From, To, IndexedType, Idxs, IdxSkip, InsertBefore);
	}

	/// Given an aggregate and a sequence of indices, see if
	/// the scalar value indexed is already around as a register, for example if it
	/// were inserted directly into the aggregate.
	///
	/// If InsertBefore is not null, this function will duplicate (modified)
	/// insertvalues when a part of a nested struct is extracted.
	///
	/// Returns null when the value cannot be determined.
	Value llvm::FindInsertedValue(Value V, ArrayRef<unsigned> idx_range,
	Instruction *InsertBefore) {
	// Nothing to index? Just return V then (this is useful at the end of our
	// recursion).
	if (idx_range.empty())
	return V;
	// We have indices, so V should have an indexable type.
	assert((V->getType()->isStructTy() \|\| V->getType()->isArrayTy()) &&
	"Not looking at a struct or array?");
	assert(ExtractValueInst::getIndexedType(V->getType(), idx_range) &&
	"Invalid indices for type?");

	// Constant aggregate: peel off the first index and recurse on that element.
	if (Constant *C = dyn_cast<Constant>(V)) {
	C = C->getAggregateElement(idx_range[0]);
	if (!C) return nullptr;
	return FindInsertedValue(C, idx_range.slice(1), InsertBefore);
	}

	if (InsertValueInst *I = dyn_cast<InsertValueInst>(V)) {
	// Loop the indices for the insertvalue instruction in parallel with the
	// requested indices
	const unsigned *req_idx = idx_range.begin();
	for (const unsigned i = I->idx_begin(), e = I->idx_end();
	i != e; ++i, ++req_idx) {
	if (req_idx == idx_range.end()) {
	// We can't handle this without inserting insertvalues
	if (!InsertBefore)
	return nullptr;

	// The requested index identifies a part of a nested aggregate. Handle
	// this specially. For example,
	// %A = insertvalue { i32, {i32, i32 } } undef, i32 10, 1, 0
	// %B = insertvalue { i32, {i32, i32 } } %A, i32 11, 1, 1
	// %C = extractvalue {i32, { i32, i32 } } %B, 1
	// This can be changed into
	// %A = insertvalue {i32, i32 } undef, i32 10, 0
	// %C = insertvalue {i32, i32 } %A, i32 11, 1
	// which allows the unused 0,0 element from the nested struct to be
	// removed.
	return BuildSubAggregate(V, makeArrayRef(idx_range.begin(), req_idx),
	InsertBefore);
	}

	// This insert value inserts something else than what we are looking for.
	// See if the (aggregate) value inserted into has the value we are
	// looking for, then.
	if (req_idx != i)
	return FindInsertedValue(I->getAggregateOperand(), idx_range,
	InsertBefore);
	}
	// If we end up here, the indices of the insertvalue match with those
	// requested (though possibly only partially). Now we recursively look at
	// the inserted value, passing any remaining indices.
	return FindInsertedValue(I->getInsertedValueOperand(),
	makeArrayRef(req_idx, idx_range.end()),
	InsertBefore);
	}

	if (ExtractValueInst *I = dyn_cast<ExtractValueInst>(V)) {
	// If we're extracting a value from an aggregate that was extracted from
	// something else, we can extract from that something else directly instead.
	// However, we will need to chain I's indices with the requested indices.

	// Calculate the number of indices required
	unsigned size = I->getNumIndices() + idx_range.size();
	// Allocate some space to put the new indices in
	SmallVector<unsigned, 5> Idxs;
	Idxs.reserve(size);
	// Add indices from the extract value instruction
	Idxs.append(I->idx_begin(), I->idx_end());

	// Add requested indices
	Idxs.append(idx_range.begin(), idx_range.end());

	assert(Idxs.size() == size
	&& "Number of indices added not correct?");

	return FindInsertedValue(I->getAggregateOperand(), Idxs, InsertBefore);
	}
	// Otherwise, we don't know (such as, extracting from a function return value
	// or load instruction)
	return nullptr;
	}

	/// Analyze the specified pointer to see if it can be expressed as a base
	/// pointer plus a constant offset. Return the base and offset to the caller.
	///
	/// The walk looks through GEPs with an all-constant offset, bitcasts,
	/// addrspacecasts and non-interposable aliases, and stops at anything else
	/// (including vector-typed pointers). Offset receives the accumulated byte
	/// offset, sign-extended to 64 bits; it is 0 when nothing was stripped.
	Value llvm::GetPointerBaseWithConstantOffset(Value Ptr, int64_t &Offset,
	const DataLayout &DL) {
	unsigned BitWidth = DL.getPointerTypeSizeInBits(Ptr->getType());
	APInt ByteOffset(BitWidth, 0);

	// We walk up the defs but use a visited set to handle unreachable code. In
	// that case, we stop after accumulating the cycle once (not that it
	// matters).
	SmallPtrSet<Value *, 16> Visited;
	while (Visited.insert(Ptr).second) {
	if (Ptr->getType()->isVectorTy())
	break;

	if (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
	// If one of the values we have visited is an addrspacecast, then
	// the pointer type of this GEP may be different from the type
	// of the Ptr parameter which was passed to this function. This
	// means when we construct GEPOffset, we need to use the size
	// of GEP's pointer type rather than the size of the original
	// pointer type.
	// (Ptr is the GEP itself at this point, so Ptr->getType() below is the
	// GEP's pointer type.)
	APInt GEPOffset(DL.getPointerTypeSizeInBits(Ptr->getType()), 0);
	if (!GEP->accumulateConstantOffset(DL, GEPOffset))
	break;

	ByteOffset += GEPOffset.getSExtValue();

	Ptr = GEP->getPointerOperand();
	} else if (Operator::getOpcode(Ptr) == Instruction::BitCast \|\|
	Operator::getOpcode(Ptr) == Instruction::AddrSpaceCast) {
	// Casts contribute no offset; just step to the operand.
	Ptr = cast<Operator>(Ptr)->getOperand(0);
	} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
	// An interposable alias is not looked through.
	if (GA->isInterposable())
	break;
	Ptr = GA->getAliasee();
	} else {
	break;
	}
	}
	Offset = ByteOffset.getSExtValue();
	return Ptr;
	}

	/// Return true if \p GEP has exactly three operands, its source element type
	/// is an array of \p CharSize-bit integers, and its first index is the
	/// constant zero.
	bool llvm::isGEPBasedOnPointerToString(const GEPOperator *GEP,
	unsigned CharSize) {
	// Make sure the GEP has exactly three arguments.
	if (GEP->getNumOperands() != 3)
	return false;

	// Make sure the index-ee is a pointer to array of \p CharSize integers.
	ArrayType *AT = dyn_cast<ArrayType>(GEP->getSourceElementType());
	if (!AT \|\| !AT->getElementType()->isIntegerTy(CharSize))
	return false;

	// Check to make sure that the first operand of the GEP is an integer and
	// has value 0 so that we are sure we're indexing into the initializer.
	const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
	if (!FirstIdx \|\| !FirstIdx->isZero())
	return false;

	return true;
	}

	/// Describe the constant data that \p V points into.
	///
	/// \p V is followed through pointer casts and through GEPs accepted by
	/// isGEPBasedOnPointerToString whose array index is a ConstantInt; each such
	/// index is added to \p Offset. The base must be a constant GlobalVariable
	/// with a definitive initializer. On success returns true and fills \p Slice;
	/// Slice.Array is null when the initializer is all zeros. Returns false when
	/// any of these conditions fails or \p Offset is out of range.
	bool llvm::getConstantDataArrayInfo(const Value *V,
	ConstantDataArraySlice &Slice,
	unsigned ElementSize, uint64_t Offset) {
	assert(V);

	// Look through bitcast instructions and geps.
	V = V->stripPointerCasts();

	// If the value is a GEP instruction or constant expression, treat it as an
	// offset.
	if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
	// The GEP operator should be based on a pointer to string constant, and is
	// indexing into the string constant.
	if (!isGEPBasedOnPointerToString(GEP, ElementSize))
	return false;

	// If the second index isn't a ConstantInt, then this is a variable index
	// into the array. If this occurs, we can't say anything meaningful about
	// the string.
	uint64_t StartIdx = 0;
	if (const ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(2)))
	StartIdx = CI->getZExtValue();
	else
	return false;
	// Recurse on the GEP's base pointer with the index folded into Offset.
	return getConstantDataArrayInfo(GEP->getOperand(0), Slice, ElementSize,
	StartIdx + Offset);
	}

	// The GEP instruction, constant or instruction, must reference a global
	// variable that is a constant and is initialized. The referenced constant
	// initializer is the array that we'll use for optimization.
	const GlobalVariable *GV = dyn_cast<GlobalVariable>(V);
	if (!GV \|\| !GV->isConstant() \|\| !GV->hasDefinitiveInitializer())
	return false;

	const ConstantDataArray *Array;
	ArrayType *ArrayTy;
	if (GV->getInitializer()->isNullValue()) {
	Type *GVTy = GV->getValueType();
	if ( (ArrayTy = dyn_cast<ArrayType>(GVTy)) ) {
	// A zeroinitializer for the array; there is no ConstantDataArray.
	Array = nullptr;
	} else {
	// All-zero initializer of a non-array type: report it as a run of zero
	// elements whose count is the store size divided by the element size in
	// bytes.
	const DataLayout &DL = GV->getParent()->getDataLayout();
	uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy);
	uint64_t Length = SizeInBytes / (ElementSize / 8);
	// Here an Offset equal to Length is rejected.
	if (Length <= Offset)
	return false;

	Slice.Array = nullptr;
	Slice.Offset = 0;
	Slice.Length = Length - Offset;
	return true;
	}
	} else {
	// This must be a ConstantDataArray.
	Array = dyn_cast<ConstantDataArray>(GV->getInitializer());
	if (!Array)
	return false;
	ArrayTy = Array->getType();
	}
	if (!ArrayTy->getElementType()->isIntegerTy(ElementSize))
	return false;

	uint64_t NumElts = ArrayTy->getArrayNumElements();
	// Unlike the non-array case above, an Offset equal to NumElts is accepted
	// and yields a slice of length 0.
	if (Offset > NumElts)
	return false;

	Slice.Array = Array;
	Slice.Offset = Offset;
	Slice.Length = NumElts - Offset;
	return true;
	}

	/// This function extracts the constant string of 8-bit characters pointed to
	/// by V, starting Offset characters in. If successful, it returns true and
	/// returns the string in Str. If unsuccessful, it returns false.
	///
	/// When TrimAtNul is true, Str ends just before the first nul character (or
	/// spans the rest of the array if there is none).
	bool llvm::getConstantStringInfo(const Value *V, StringRef &Str,
	uint64_t Offset, bool TrimAtNul) {
	ConstantDataArraySlice Slice;
	if (!getConstantDataArrayInfo(V, Slice, 8, Offset))
	return false;

	// A null Array means the data is all zeros.
	if (Slice.Array == nullptr) {
	// Trimmed at the first nul, a run of zeros is the empty string.
	if (TrimAtNul) {
	Str = StringRef();
	return true;
	}
	// A single zero can be represented by the terminator of a literal "".
	if (Slice.Length == 1) {
	Str = StringRef("", 1);
	return true;
	}
	// We cannot instantiate a StringRef as we do not have an appropriate string
	// of 0s at hand.
	return false;
	}

	// Start out with the entire array in the StringRef.
	Str = Slice.Array->getAsString();
	// Skip over 'offset' bytes.
	Str = Str.substr(Slice.Offset);

	if (TrimAtNul) {
	// Trim off the \0 and anything after it. If the array is not nul
	// terminated, we just return the whole end of string. The client may know
	// some other way that the string is length-bound.
	Str = Str.substr(0, Str.find('\0'));
	}
	return true;
	}

	// These next two are very similar to the above, but also look through PHI
	// nodes.
	// TODO: See if we can integrate these two together.

	/// If we can compute the length of the string pointed to by
	/// the specified pointer, return 'len+1'. If we can't, return 0.
	///
	/// PHIs records the PHI nodes already being examined. Reaching one of them
	/// again yields ~0ULL, which the PHI and select handling below treat as "no
	/// information from this path" rather than as a length.
	static uint64_t GetStringLengthH(const Value *V,
	SmallPtrSetImpl<const PHINode*> &PHIs,
	unsigned CharSize) {
	// Look through noop bitcast instructions.
	V = V->stripPointerCasts();

	// If this is a PHI node, there are two cases: either we have already seen it
	// or we haven't.
	if (const PHINode *PN = dyn_cast<PHINode>(V)) {
	if (!PHIs.insert(PN).second)
	return ~0ULL; // already in the set.

	// If it was new, see if all the input strings are the same length.
	uint64_t LenSoFar = ~0ULL;
	for (Value *IncValue : PN->incoming_values()) {
	uint64_t Len = GetStringLengthH(IncValue, PHIs, CharSize);
	if (Len == 0) return 0; // Unknown length -> unknown.

	// This input only led back to a PHI already being examined; skip it.
	if (Len == ~0ULL) continue;

	if (Len != LenSoFar && LenSoFar != ~0ULL)
	return 0; // Disagree -> unknown.
	LenSoFar = Len;
	}

	// Success, all agree.
	return LenSoFar;
	}

	// strlen(select(c,x,y)) is only known when both arms are known and agree.
	// An arm that reports ~0ULL defers to the other arm.
	if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
	uint64_t Len1 = GetStringLengthH(SI->getTrueValue(), PHIs, CharSize);
	if (Len1 == 0) return 0;
	uint64_t Len2 = GetStringLengthH(SI->getFalseValue(), PHIs, CharSize);
	if (Len2 == 0) return 0;
	if (Len1 == ~0ULL) return Len2;
	if (Len2 == ~0ULL) return Len1;
	if (Len1 != Len2) return 0;
	return Len1;
	}

	// Otherwise, see if we can read the string.
	ConstantDataArraySlice Slice;
	if (!getConstantDataArrayInfo(V, Slice, CharSize))
	return 0;

	// All-zero data: the string is empty, so len+1 is 1.
	if (Slice.Array == nullptr)
	return 1;

	// Search for nul characters
	// If none is found, NullIndex ends up equal to Slice.Length.
	unsigned NullIndex = 0;
	for (unsigned E = Slice.Length; NullIndex < E; ++NullIndex) {
	if (Slice.Array->getElementAsInteger(Slice.Offset + NullIndex) == 0)
	break;
	}

	return NullIndex + 1;
	}

	/// If we can compute the length of the string pointed to by
	/// the specified pointer, return 'len+1'. If we can't, return 0.
	/// A non-pointer V always yields 0.
	uint64_t llvm::GetStringLength(const Value *V, unsigned CharSize) {
	if (!V->getType()->isPointerTy()) return 0;

	// Fresh PHI set for this query; the helper fills it in as it recurses.
	SmallPtrSet<const PHINode*, 32> PHIs;
	uint64_t Len = GetStringLengthH(V, PHIs, CharSize);
	// If Len is ~0ULL, we had an infinite phi cycle: this is dead code, so return
	// an empty string as a length.
	return Len == ~0ULL ? 1 : Len;
	}

	/// \brief \p PN defines a loop-variant pointer to an object. Check if the
	/// previous iteration of the loop was referring to the same object as \p PN.
	///
	/// Returns false only when the value coming from inside the loop is a load
	/// whose pointer operand is not loop-invariant; every other case, including
	/// a PHI without exactly two incoming values, returns true.
	static bool isSameUnderlyingObjectInLoop(const PHINode *PN,
	const LoopInfo *LI) {
	// Find the loop-defined value.
	Loop *L = LI->getLoopFor(PN->getParent());
	if (PN->getNumIncomingValues() != 2)
	return true;

	// Find the value from previous iteration.
	// Try incoming value 0 first; if it is not an instruction defined in loop L,
	// fall back to incoming value 1.
	auto *PrevValue = dyn_cast<Instruction>(PN->getIncomingValue(0));
	if (!PrevValue \|\| LI->getLoopFor(PrevValue->getParent()) != L)
	PrevValue = dyn_cast<Instruction>(PN->getIncomingValue(1));
	if (!PrevValue \|\| LI->getLoopFor(PrevValue->getParent()) != L)
	return true;

	// If a new pointer is loaded in the loop, the pointer references a different
	// object in every iteration. E.g.:
	// for (i)
	// int *p = a[i];
	// ...
	if (auto *Load = dyn_cast<LoadInst>(PrevValue))
	if (!L->isLoopInvariant(Load->getPointerOperand()))
	return false;
	return true;
	}

	/// Strip GEPs, bitcasts, addrspacecasts, non-interposable aliases, calls with
	/// a returned argument, and instructions that SimplifyInstruction can fold,
	/// and return the value that is left. At most MaxLookup steps are taken; a
	/// MaxLookup of 0 means no limit. A non-pointer V is returned unchanged.
	Value llvm::GetUnderlyingObject(Value V, const DataLayout &DL,
	unsigned MaxLookup) {
	if (!V->getType()->isPointerTy())
	return V;
	for (unsigned Count = 0; MaxLookup == 0 \|\| Count < MaxLookup; ++Count) {
	if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
	V = GEP->getPointerOperand();
	} else if (Operator::getOpcode(V) == Instruction::BitCast \|\|
	Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
	V = cast<Operator>(V)->getOperand(0);
	} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
	// An interposable alias is not looked through.
	if (GA->isInterposable())
	return V;
	V = GA->getAliasee();
	} else if (isa<AllocaInst>(V)) {
	// An alloca can't be further simplified.
	return V;
	} else {
	// A call that has a returned argument operand: continue with that
	// argument.
	if (auto CS = CallSite(V))
	if (Value *RV = CS.getReturnedArgOperand()) {
	V = RV;
	continue;
	}

	// See if InstructionSimplify knows any relevant tricks.
	if (Instruction *I = dyn_cast<Instruction>(V))
	// TODO: Acquire a DominatorTree and AssumptionCache and use them.
	if (Value *Simplified = SimplifyInstruction(I, {DL, I})) {
	V = Simplified;
	continue;
	}

	return V;
	}
	assert(V->getType()->isPointerTy() && "Unexpected operand type!");
	}
	return V;
	}

	/// Worklist version of GetUnderlyingObject that also looks through selects
	/// and PHI nodes, appending every distinct underlying object it reaches to
	/// Objects. When LI is provided, a loop-header PHI is not looked through if
	/// isSameUnderlyingObjectInLoop reports that it refers to a different object
	/// on each iteration; such a PHI contributes nothing to Objects.
	void llvm::GetUnderlyingObjects(Value V, SmallVectorImpl<Value > &Objects,
	const DataLayout &DL, LoopInfo *LI,
	unsigned MaxLookup) {
	SmallPtrSet<Value *, 4> Visited;
	SmallVector<Value *, 4> Worklist;
	Worklist.push_back(V);
	do {
	Value *P = Worklist.pop_back_val();
	P = GetUnderlyingObject(P, DL, MaxLookup);

	// Each underlying value is processed only once.
	if (!Visited.insert(P).second)
	continue;

	// A select contributes both of its arms.
	if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
	Worklist.push_back(SI->getTrueValue());
	Worklist.push_back(SI->getFalseValue());
	continue;
	}

	if (PHINode *PN = dyn_cast<PHINode>(P)) {
	// If this PHI changes the underlying object in every iteration of the
	// loop, don't look through it. Consider:
	// int **A;
	// for (i) {
	// Prev = Curr; // Prev = PHI (Prev_0, Curr)
	// Curr = A[i];
	// Prev, Curr;
	//
	// Prev is tracking Curr one iteration behind so they refer to different
	// underlying objects.
	if (!LI \|\| !LI->isLoopHeader(PN->getParent()) \|\|
	isSameUnderlyingObjectInLoop(PN, LI))
	for (Value *IncValue : PN->incoming_values())
	Worklist.push_back(IncValue);
	continue;
	}

	Objects.push_back(P);
	} while (!Worklist.empty());
	}

	+/// This is the function that does the work of looking through basic
	+/// ptrtoint+arithmetic+inttoptr sequences. It returns the ptrtoint operand if one is reached, otherwise the integer value where the walk stopped.
	+static const Value getUnderlyingObjectFromInt(const Value V) {
	+ do {
	+ if (const Operator *U = dyn_cast<Operator>(V)) {
	+ // If we find a ptrtoint, we can transfer control back to the
	+ // regular pointer walk in the caller by returning its pointer operand.
	+ if (U->getOpcode() == Instruction::PtrToInt)
	+ return U->getOperand(0);
	+ // If we find an add of a constant, a multiplied value, or a phi, it's
	+ // likely that the other operand will lead us to the base
	+ // object. We don't have to worry about the case where the
	+ // object address is somehow being computed by the multiply,
	+ // because our callers only care when the result is an
	+ // identifiable object.
	+ if (U->getOpcode() != Instruction::Add \|\|
	+ (!isa<ConstantInt>(U->getOperand(1)) &&
	+ Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
	+ !isa<PHINode>(U->getOperand(1))))
	+ return V;
	+ V = U->getOperand(0);
	+ } else {
	+ return V;
	+ }
	+ assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
	+ } while (true);
	+}
	+
	+/// Collect the underlying objects of \p V into \p Objects, layering support
	+/// for basic ptrtoint+arithmetic+inttoptr sequences on top of
	+/// GetUnderlyingObjects.
	+///
	+/// If any object reached is not an identified object, \p Objects is cleared
	+/// and the walk stops.
	+void llvm::getUnderlyingObjectsForCodeGen(const Value *V,
	+                                          SmallVectorImpl<Value *> &Objects,
	+                                          const DataLayout &DL) {
	+  SmallPtrSet<const Value *, 16> Seen;
	+  SmallVector<const Value *, 4> Pending(1, V);
	+  while (!Pending.empty()) {
	+    const Value *Root = Pending.pop_back_val();
	+
	+    SmallVector<Value *, 4> Bases;
	+    GetUnderlyingObjects(const_cast<Value *>(Root), Bases, DL);
	+
	+    for (Value *Base : Bases) {
	+      // Each base object is processed at most once.
	+      if (!Seen.insert(Base).second)
	+        continue;
	+
	+      // An inttoptr may hide a pointer behind integer arithmetic; if one is
	+      // found, queue it for another round instead of recording the cast.
	+      if (Operator::getOpcode(Base) == Instruction::IntToPtr) {
	+        const Value *Src =
	+            getUnderlyingObjectFromInt(cast<User>(Base)->getOperand(0));
	+        if (Src->getType()->isPointerTy()) {
	+          Pending.push_back(Src);
	+          continue;
	+        }
	+      }
	+
	+      // If GetUnderlyingObjects fails to find an identifiable object,
	+      // getUnderlyingObjectsForCodeGen also fails for safety.
	+      if (!isIdentifiedObject(Base)) {
	+        Objects.clear();
	+        return;
	+      }
	+      Objects.push_back(Base);
	+    }
	+  }
	+}
	+
	/// Return true if every user of \p V is a lifetime.start or lifetime.end
	/// intrinsic call. A value with no users at all also yields true.
	bool llvm::onlyUsedByLifetimeMarkers(const Value *V) {
	  for (const User *U : V->users()) {
	    const auto *Marker = dyn_cast<IntrinsicInst>(U);
	    if (!Marker)
	      return false;

	    switch (Marker->getIntrinsicID()) {
	    case Intrinsic::lifetime_start:
	    case Intrinsic::lifetime_end:
	      break;
	    default:
	      return false;
	    }
	  }
	  return true;
	}

	/// Return true if \p V may be executed speculatively.
	///
	/// Values that are not Operators, and operators with a constant operand that
	/// can trap, are rejected up front. Otherwise the answer depends on the
	/// opcode:
	///  - division/remainder: only with constant operands that exclude the
	///    undefined cases;
	///  - load: only if unordered, not in a thread- or address-sanitized
	///    function, and dereferenceable and aligned given \p CtxI and \p DT;
	///  - call: only if the callee is known and marked speculatable;
	///  - the listed effectful opcodes: never;
	///  - anything else: always.
	bool llvm::isSafeToSpeculativelyExecute(const Value *V,
	                                        const Instruction *CtxI,
	                                        const DominatorTree *DT) {
	  const Operator *Inst = dyn_cast<Operator>(V);
	  if (!Inst)
	    return false;

	  for (unsigned i = 0, e = Inst->getNumOperands(); i != e; ++i)
	    if (Constant *C = dyn_cast<Constant>(Inst->getOperand(i)))
	      if (C->canTrap())
	        return false;

	  switch (Inst->getOpcode()) {
	  default:
	    return true;
	  case Instruction::UDiv:
	  case Instruction::URem: {
	    // x / y is undefined if y == 0.
	    const APInt *V;
	    if (match(Inst->getOperand(1), m_APInt(V)))
	      return *V != 0;
	    return false;
	  }
	  case Instruction::SDiv:
	  case Instruction::SRem: {
	    // x / y is undefined if y == 0 or x == INT_MIN and y == -1
	    const APInt *Numerator, *Denominator;
	    if (!match(Inst->getOperand(1), m_APInt(Denominator)))
	      return false;
	    // We cannot hoist this division if the denominator is 0.
	    if (*Denominator == 0)
	      return false;
	    // It's safe to hoist if the denominator is not 0 or -1.
	    if (*Denominator != -1)
	      return true;
	    // At this point we know that the denominator is -1. It is safe to hoist
	    // as long as we know that the numerator is not INT_MIN.
	    if (match(Inst->getOperand(0), m_APInt(Numerator)))
	      return !Numerator->isMinSignedValue();
	    // The numerator might be MinSignedValue.
	    return false;
	  }
	  case Instruction::Load: {
	    const LoadInst *LI = cast<LoadInst>(Inst);
	    if (!LI->isUnordered() ||
	        // Speculative load may create a race that did not exist in the source.
	        LI->getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
	        // Speculative load may load data from dirty regions.
	        LI->getFunction()->hasFnAttribute(Attribute::SanitizeAddress))
	      return false;
	    const DataLayout &DL = LI->getModule()->getDataLayout();
	    return isDereferenceableAndAlignedPointer(LI->getPointerOperand(),
	                                              LI->getAlignment(), DL, CtxI, DT);
	  }
	  case Instruction::Call: {
	    auto *CI = cast<const CallInst>(Inst);
	    const Function *Callee = CI->getCalledFunction();

	    // The called function could have undefined behavior or side-effects, even
	    // if marked readnone nounwind.
	    return Callee && Callee->isSpeculatable();
	  }
	  case Instruction::VAArg:
	  case Instruction::Alloca:
	  case Instruction::Invoke:
	  case Instruction::PHI:
	  case Instruction::Store:
	  case Instruction::Ret:
	  case Instruction::Br:
	  case Instruction::IndirectBr:
	  case Instruction::Switch:
	  case Instruction::Unreachable:
	  case Instruction::Fence:
	  case Instruction::AtomicRMW:
	  case Instruction::AtomicCmpXchg:
	  case Instruction::LandingPad:
	  case Instruction::Resume:
	  case Instruction::CatchSwitch:
	  case Instruction::CatchPad:
	  case Instruction::CatchRet:
	  case Instruction::CleanupPad:
	  case Instruction::CleanupRet:
	    return false; // Misc instructions which have effects
	  }
	}

	/// Return true if \p I may read or write memory, or is not safe to execute
	/// speculatively.
	bool llvm::mayBeMemoryDependent(const Instruction &I) {
	  return I.mayReadOrWriteMemory() || !isSafeToSpeculativelyExecute(&I);
	}

	/// Return true if we know that the specified value is never null.
	///
	/// Only local facts about \p V itself are consulted: its kind (alloca,
	/// argument, global, load, call) and the attributes or metadata attached to
	/// it. \p V must have pointer type.
	bool llvm::isKnownNonNull(const Value *V) {
	  assert(V->getType()->isPointerTy() && "V must be pointer type");

	  // Alloca never returns null, malloc might.
	  if (isa<AllocaInst>(V)) return true;

	  // A byval, inalloca, or nonnull argument is never null.
	  if (const Argument *A = dyn_cast<Argument>(V))
	    return A->hasByValOrInAllocaAttr() || A->hasNonNullAttr();

	  // A global variable in address space 0 is non null unless extern weak
	  // or an absolute symbol reference. Other address spaces may have null as a
	  // valid address for a global, so we can't assume anything.
	  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
	    return !GV->isAbsoluteSymbolRef() && !GV->hasExternalWeakLinkage() &&
	           GV->getType()->getAddressSpace() == 0;

	  // A Load tagged with nonnull metadata is never null.
	  if (const LoadInst *LI = dyn_cast<LoadInst>(V))
	    return LI->getMetadata(LLVMContext::MD_nonnull) != nullptr;

	  // A call or invoke whose return value is marked nonnull.
	  if (auto CS = ImmutableCallSite(V))
	    if (CS.isReturnNonNull())
	      return true;

	  return false;
	}

	/// Return true if some use of \p V that dominates \p CtxI proves that \p V
	/// is non-null there.
	///
	/// Three kinds of evidence are accepted: \p V is passed to a nonnull
	/// parameter of a dominating call, \p V is compared against zero and the
	/// non-null edge of the branch on that compare dominates \p CtxI, or the
	/// compare feeds a dominating experimental_guard. At most
	/// DomConditionsMaxUses users of \p V are inspected.
	static bool isKnownNonNullFromDominatingCondition(const Value *V,
	                                                  const Instruction *CtxI,
	                                                  const DominatorTree *DT) {
	  assert(V->getType()->isPointerTy() && "V must be pointer type");
	  assert(!isa<ConstantData>(V) && "Did not expect ConstantPointerNull");
	  assert(CtxI && "Context instruction required for analysis");
	  assert(DT && "Dominator tree required for analysis");

	  unsigned NumUsesExplored = 0;
	  for (auto *U : V->users()) {
	    // Avoid massive lists
	    if (NumUsesExplored >= DomConditionsMaxUses)
	      break;
	    NumUsesExplored++;

	    // If the value is used as an argument to a call or invoke, then argument
	    // attributes may provide an answer about null-ness.
	    if (auto CS = ImmutableCallSite(U))
	      if (auto *CalledFunc = CS.getCalledFunction())
	        for (const Argument &Arg : CalledFunc->args())
	          if (CS.getArgOperand(Arg.getArgNo()) == V &&
	              Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI))
	            return true;

	    // Consider only compare instructions uniquely controlling a branch
	    CmpInst::Predicate Pred;
	    if (!match(const_cast<User *>(U),
	               m_c_ICmp(Pred, m_Specific(V), m_Zero())) ||
	        (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE))
	      continue;

	    for (auto *CmpU : U->users()) {
	      if (const BranchInst *BI = dyn_cast<BranchInst>(CmpU)) {
	        assert(BI->isConditional() && "uses a comparison!");

	        // For "V == null" the false successor is the non-null one; for
	        // "V != null" it is the true successor.
	        BasicBlock *NonNullSuccessor =
	            BI->getSuccessor(Pred == ICmpInst::ICMP_EQ ? 1 : 0);
	        BasicBlockEdge Edge(BI->getParent(), NonNullSuccessor);
	        if (Edge.isSingleEdge() && DT->dominates(Edge, CtxI->getParent()))
	          return true;
	      } else if (Pred == ICmpInst::ICMP_NE &&
	                 match(CmpU, m_Intrinsic<Intrinsic::experimental_guard>()) &&
	                 DT->dominates(cast<Instruction>(CmpU), CtxI)) {
	        return true;
	      }
	    }
	  }

	  return false;
	}

	/// Return true if \p V is known to be non-null at \p CtxI.
	///
	/// Null and undef constants are rejected first. Otherwise the local facts
	/// checked by isKnownNonNull are tried, and if both \p CtxI and \p DT are
	/// supplied, dominating conditions are consulted as well.
	bool llvm::isKnownNonNullAt(const Value *V, const Instruction *CtxI,
	                            const DominatorTree *DT) {
	  if (isa<ConstantPointerNull>(V) || isa<UndefValue>(V))
	    return false;

	  if (isKnownNonNull(V))
	    return true;

	  // The dominance-based check needs both a context and a dominator tree.
	  if (!CtxI || !DT)
	    return false;

	  return ::isKnownNonNullFromDominatingCondition(V, CtxI, DT);
	}

	/// Classify whether the unsigned multiplication \p LHS * \p RHS overflows,
	/// using only the known bits of the two operands.
	///
	/// \returns NeverOverflows, AlwaysOverflows, or MayOverflow when the known
	/// bits are not enough to decide.
	OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS,
	                                                   const Value *RHS,
	                                                   const DataLayout &DL,
	                                                   AssumptionCache *AC,
	                                                   const Instruction *CxtI,
	                                                   const DominatorTree *DT) {
	  // Multiplying n * m significant bits yields a result of n + m significant
	  // bits. If the total number of significant bits does not exceed the
	  // result bit width (minus 1), there is no overflow.
	  // This means if we have enough leading zero bits in the operands
	  // we can guarantee that the result does not overflow.
	  // Ref: "Hacker's Delight" by Henry Warren
	  unsigned BitWidth = LHS->getType()->getScalarSizeInBits();
	  KnownBits LHSKnown(BitWidth);
	  KnownBits RHSKnown(BitWidth);
	  computeKnownBits(LHS, LHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
	  computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT);
	  // Note that underestimating the number of zero bits gives a more
	  // conservative answer.
	  unsigned ZeroBits = LHSKnown.countMinLeadingZeros() +
	                      RHSKnown.countMinLeadingZeros();
	  // First handle the easy case: if we have enough zero bits there's
	  // definitely no overflow.
	  if (ZeroBits >= BitWidth)
	    return OverflowResult::NeverOverflows;

	  // Get the largest possible values for each operand.
	  APInt LHSMax = ~LHSKnown.Zero;
	  APInt RHSMax = ~RHSKnown.Zero;

	  // We know the multiply operation doesn't overflow if the maximum values for
	  // each operand will not overflow after we multiply them together.
	  bool MaxOverflow;
	  (void)LHSMax.umul_ov(RHSMax, MaxOverflow);
	  if (!MaxOverflow)
	    return OverflowResult::NeverOverflows;

	  // We know it always overflows if multiplying the smallest possible values for
	  // the operands also results in overflow.
	  bool MinOverflow;
	  (void)LHSKnown.One.umul_ov(RHSKnown.One, MinOverflow);
	  if (MinOverflow)
	    return OverflowResult::AlwaysOverflows;

	  return OverflowResult::MayOverflow;
	}

	/// Classify whether the unsigned addition \p LHS + \p RHS overflows, based
	/// solely on what is known about the top bit of each operand.
	///
	/// \returns AlwaysOverflows if both top bits are known set, NeverOverflows
	/// if both are known clear, and MayOverflow otherwise.
	OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
	                                                   const Value *RHS,
	                                                   const DataLayout &DL,
	                                                   AssumptionCache *AC,
	                                                   const Instruction *CxtI,
	                                                   const DominatorTree *DT) {
	  KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
	  // RHS is only worth analyzing once the top bit of LHS is known.
	  if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
	    KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);

	    if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
	      // The sign bit is set in both cases: this MUST overflow.
	      return OverflowResult::AlwaysOverflows;
	    }

	    if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
	      // The sign bit is clear in both cases: this CANNOT overflow.
	      return OverflowResult::NeverOverflows;
	    }
	  }

	  return OverflowResult::MayOverflow;
	}

	/// \brief Return true if we can prove that adding the two values of the
	/// knownbits will not overflow.
	/// Otherwise return false.
	///
	/// The proof looks at whether a carry can ripple into the sign bit once the
	/// sign bits themselves are masked off.
	static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
	                                    const KnownBits &RHSKnown) {
	  // Addition of two 2's complement numbers having opposite signs will never
	  // overflow.
	  if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
	      (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
	    return true;

	  // If either of the values is known to be non-negative, adding them can only
	  // overflow if the second is also non-negative, so we can assume that.
	  // Two non-negative numbers will only overflow if there is a carry to the
	  // sign bit, so we can check if even when the values are as big as possible
	  // there is no overflow to the sign bit.
	  if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
	    APInt MaxLHS = ~LHSKnown.Zero;
	    MaxLHS.clearSignBit();
	    APInt MaxRHS = ~RHSKnown.Zero;
	    MaxRHS.clearSignBit();
	    APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
	    return Result.isSignBitClear();
	  }

	  // If either of the values is known to be negative, adding them can only
	  // overflow if the second is also negative, so we can assume that.
	  // Two negative numbers will only overflow if there is no carry to the sign
	  // bit, so we can check if even when the values are as small as possible
	  // there is overflow to the sign bit.
	  if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
	    APInt MinLHS = LHSKnown.One;
	    MinLHS.clearSignBit();
	    APInt MinRHS = RHSKnown.One;
	    MinRHS.clearSignBit();
	    APInt Result = std::move(MinLHS) + std::move(MinRHS);
	    return Result.isSignBitSet();
	  }

	  // If we reached here it means that we know nothing about the sign bits.
	  // In this case we can't know if there will be an overflow, since by
	  // changing the sign bits any two values can be made to overflow.
	  return false;
	}

	/// Shared implementation behind the public computeOverflowForSignedAdd
	/// overloads.
	///
	/// \p Add is the add being analyzed and may be null when only the operands
	/// are available; the checks that need the add itself (its nsw flag and the
	/// known sign of its result) are skipped in that case.
	///
	/// \returns NeverOverflows when one of the proofs below succeeds, and
	/// MayOverflow otherwise.
	static OverflowResult computeOverflowForSignedAdd(const Value *LHS,
	                                                  const Value *RHS,
	                                                  const AddOperator *Add,
	                                                  const DataLayout &DL,
	                                                  AssumptionCache *AC,
	                                                  const Instruction *CxtI,
	                                                  const DominatorTree *DT) {
	  if (Add && Add->hasNoSignedWrap()) {
	    return OverflowResult::NeverOverflows;
	  }

	  // If LHS and RHS each have at least two sign bits, the addition will look
	  // like
	  //
	  // XX..... +
	  // YY.....
	  //
	  // If the carry into the most significant position is 0, X and Y can't both
	  // be 1 and therefore the carry out of the addition is also 0.
	  //
	  // If the carry into the most significant position is 1, X and Y can't both
	  // be 0 and therefore the carry out of the addition is also 1.
	  //
	  // Since the carry into the most significant position is always equal to
	  // the carry out of the addition, there is no signed overflow.
	  if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 &&
	      ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1)
	    return OverflowResult::NeverOverflows;

	  KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
	  KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);

	  if (checkRippleForSignedAdd(LHSKnown, RHSKnown))
	    return OverflowResult::NeverOverflows;

	  // The remaining code needs Add to be available. Early returns if not so.
	  if (!Add)
	    return OverflowResult::MayOverflow;

	  // If the sign of Add is the same as at least one of the operands, this add
	  // CANNOT overflow. This is particularly useful when the sum is
	  // @llvm.assume'ed non-negative rather than proved so from analyzing its
	  // operands.
	  bool LHSOrRHSKnownNonNegative =
	      (LHSKnown.isNonNegative() || RHSKnown.isNonNegative());
	  bool LHSOrRHSKnownNegative =
	      (LHSKnown.isNegative() || RHSKnown.isNegative());
	  if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) {
	    KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT);
	    if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) ||
	        (AddKnown.isNegative() && LHSOrRHSKnownNegative)) {
	      return OverflowResult::NeverOverflows;
	    }
	  }

	  return OverflowResult::MayOverflow;
	}

	/// Return true if every use of the arithmetic result of the
	/// *.with.overflow intrinsic \p II is guarded by a branch on its overflow
	/// bit, so that those uses only execute when no overflow occurred.
	///
	/// Returns false if \p II has any user other than an extractvalue.
	bool llvm::isOverflowIntrinsicNoWrap(const IntrinsicInst *II,
	                                     const DominatorTree &DT) {
	#ifndef NDEBUG
	  auto IID = II->getIntrinsicID();
	  assert((IID == Intrinsic::sadd_with_overflow ||
	          IID == Intrinsic::uadd_with_overflow ||
	          IID == Intrinsic::ssub_with_overflow ||
	          IID == Intrinsic::usub_with_overflow ||
	          IID == Intrinsic::smul_with_overflow ||
	          IID == Intrinsic::umul_with_overflow) &&
	         "Not an overflow intrinsic!");
	#endif

	  // Branches on the overflow bit (index 1) and extracts of the arithmetic
	  // result (index 0).
	  SmallVector<const BranchInst *, 2> GuardingBranches;
	  SmallVector<const ExtractValueInst *, 2> Results;

	  for (const User *U : II->users()) {
	    if (const auto *EVI = dyn_cast<ExtractValueInst>(U)) {
	      assert(EVI->getNumIndices() == 1 && "Obvious from CI's type");

	      if (EVI->getIndices()[0] == 0)
	        Results.push_back(EVI);
	      else {
	        assert(EVI->getIndices()[0] == 1 && "Obvious from CI's type");

	        for (const auto *U : EVI->users())
	          if (const auto *B = dyn_cast<BranchInst>(U)) {
	            assert(B->isConditional() && "How else is it using an i1?");
	            GuardingBranches.push_back(B);
	          }
	      }
	    } else {
	      // We are using the aggregate directly in a way we don't want to analyze
	      // here (storing it to a global, say).
	      return false;
	    }
	  }

	  auto AllUsesGuardedByBranch = [&](const BranchInst *BI) {
	    // Successor 1 is taken when the overflow bit is false.
	    BasicBlockEdge NoWrapEdge(BI->getParent(), BI->getSuccessor(1));
	    if (!NoWrapEdge.isSingleEdge())
	      return false;

	    // Check if all users of the add are provably no-wrap.
	    for (const auto *Result : Results) {
	      // If the extractvalue itself is not executed on overflow, then we don't
	      // need to check each use separately, since domination is transitive.
	      if (DT.dominates(NoWrapEdge, Result->getParent()))
	        continue;

	      for (auto &RU : Result->uses())
	        if (!DT.dominates(NoWrapEdge, RU))
	          return false;
	    }

	    return true;
	  };

	  return any_of(GuardingBranches, AllUsesGuardedByBranch);
	}


	/// Signed-add overflow query for an existing add: forwards the add's two
	/// operands together with the add itself to the file-local implementation.
	OverflowResult llvm::computeOverflowForSignedAdd(const AddOperator *Add,
	                                                 const DataLayout &DL,
	                                                 AssumptionCache *AC,
	                                                 const Instruction *CxtI,
	                                                 const DominatorTree *DT) {
	  const Value *Op0 = Add->getOperand(0);
	  const Value *Op1 = Add->getOperand(1);
	  return ::computeOverflowForSignedAdd(Op0, Op1, Add, DL, AC, CxtI, DT);
	}

	/// Signed-add overflow query for a bare operand pair: forwards to the
	/// file-local implementation with no add instruction attached.
	OverflowResult llvm::computeOverflowForSignedAdd(const Value *LHS,
	                                                 const Value *RHS,
	                                                 const DataLayout &DL,
	                                                 AssumptionCache *AC,
	                                                 const Instruction *CxtI,
	                                                 const DominatorTree *DT) {
	  const AddOperator *NoAdd = nullptr;
	  return ::computeOverflowForSignedAdd(LHS, RHS, NoAdd, DL, AC, CxtI, DT);
	}

	/// Return true if executing \p I is guaranteed to hand control to its
	/// successor instruction, i.e. it cannot trap, unwind, or otherwise fail to
	/// return, under the assumptions spelled out in the comments below.
	bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
	  // A memory operation returns normally if it isn't volatile. A volatile
	  // operation is allowed to trap.
	  //
	  // An atomic operation isn't guaranteed to return in a reasonable amount of
	  // time because it's possible for another thread to interfere with it for an
	  // arbitrary length of time, but programs aren't allowed to rely on that.
	  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
	    return !LI->isVolatile();
	  if (const StoreInst *SI = dyn_cast<StoreInst>(I))
	    return !SI->isVolatile();
	  if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
	    return !CXI->isVolatile();
	  if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
	    return !RMWI->isVolatile();
	  if (const MemIntrinsic *MII = dyn_cast<MemIntrinsic>(I))
	    return !MII->isVolatile();

	  // If there is no successor, then execution can't transfer to it.
	  if (const auto *CRI = dyn_cast<CleanupReturnInst>(I))
	    return !CRI->unwindsToCaller();
	  if (const auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I))
	    return !CatchSwitch->unwindsToCaller();
	  if (isa<ResumeInst>(I))
	    return false;
	  if (isa<ReturnInst>(I))
	    return false;
	  if (isa<UnreachableInst>(I))
	    return false;

	  // Calls can throw, or contain an infinite loop, or kill the process.
	  if (auto CS = ImmutableCallSite(I)) {
	    // Call sites that throw have implicit non-local control flow.
	    if (!CS.doesNotThrow())
	      return false;

	    // Non-throwing call sites can loop infinitely, call exit/pthread_exit
	    // etc. and thus not return. However, LLVM already assumes that
	    //
	    // - Thread exiting actions are modeled as writes to memory invisible to
	    // the program.
	    //
	    // - Loops that don't have side effects (side effects are volatile/atomic
	    // stores and IO) always terminate (see http://llvm.org/PR965).
	    // Furthermore IO itself is also modeled as writes to memory invisible to
	    // the program.
	    //
	    // We rely on those assumptions here, and use the memory effects of the call
	    // target as a proxy for checking that it always returns.

	    // FIXME: This isn't aggressive enough; a call which only writes to a global
	    // is guaranteed to return.
	    return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory() ||
	           match(I, m_Intrinsic<Intrinsic::assume>());
	  }

	  // Other instructions return normally.
	  return true;
	}

	/// Return true if \p I is proven to run on every iteration of \p L.
	///
	/// Only instructions in the loop header are considered, and only when every
	/// header instruction before \p I is guaranteed to transfer execution to
	/// its successor.
	bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
	                                                  const Loop *L) {
	  // The loop header is guaranteed to be executed for every iteration.
	  //
	  // FIXME: Relax this constraint to cover all basic blocks that are
	  // guaranteed to be executed at every iteration.
	  const auto *Header = L->getHeader();
	  if (I->getParent() != Header)
	    return false;

	  // Scan the header from the top: reaching I means nothing before it could
	  // have diverted control.
	  for (const Instruction &Cur : *Header) {
	    if (&Cur == I)
	      return true;
	    if (!isGuaranteedToTransferExecutionToSuccessor(&Cur))
	      return false;
	  }
	  llvm_unreachable("Instruction not contained in its own parent basic block.");
	}

	/// Return true if the opcode of \p I is one that yields full poison whenever
	/// any of its operands is full poison.
	bool llvm::propagatesFullPoison(const Instruction *I) {
	  switch (I->getOpcode()) {
	  // These operations all propagate poison unconditionally. Note that poison
	  // is not any particular value, so xor or subtraction of poison with
	  // itself still yields poison, not zero.
	  case Instruction::Add:
	  case Instruction::Sub:
	  case Instruction::Xor:
	  case Instruction::Trunc:
	  case Instruction::BitCast:
	  case Instruction::AddrSpaceCast:
	  case Instruction::Mul:
	  case Instruction::Shl:
	  case Instruction::GetElementPtr:
	  // For these operations, one bit of the input is replicated across
	  // multiple output bits. A replicated poison bit is still poison.
	  case Instruction::AShr:
	  case Instruction::SExt:
	  // Comparing poison with any value yields poison. This is why, for
	  // instance, x s< (x +nsw 1) can be folded to true.
	  case Instruction::ICmp:
	    return true;

	  default:
	    return false;
	  }
	}

	/// Return the operand of \p I that this analysis treats as guaranteed not to
	/// be full poison: the pointer operand of a load, store, cmpxchg or
	/// atomicrmw, or the divisor of an integer division or remainder.
	///
	/// \returns nullptr for every other opcode.
	const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) {
	  switch (I->getOpcode()) {
	  case Instruction::Store:
	    return cast<StoreInst>(I)->getPointerOperand();

	  case Instruction::Load:
	    return cast<LoadInst>(I)->getPointerOperand();

	  case Instruction::AtomicCmpXchg:
	    return cast<AtomicCmpXchgInst>(I)->getPointerOperand();

	  case Instruction::AtomicRMW:
	    return cast<AtomicRMWInst>(I)->getPointerOperand();

	  case Instruction::UDiv:
	  case Instruction::SDiv:
	  case Instruction::URem:
	  case Instruction::SRem:
	    // Operand 1 is the divisor.
	    return I->getOperand(1);

	  default:
	    return nullptr;
	  }
	}

	bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) {
	// We currently only look for uses of poison values within the same basic
	// block, as that makes it easier to guarantee that the uses will be
	// executed given that PoisonI is executed.
	//
	// FIXME: Expand this to consider uses beyond the same basic block. To do
	// this, look out for the distinction between post-dominance and strong
	// post-dominance.
	const BasicBlock *BB = PoisonI->getParent();

	// Set of instructions that we have proved will yield poison if PoisonI
	// does.
	SmallSet<const Value *, 16> YieldsPoison;
	SmallSet<const BasicBlock *, 4> Visited;
	YieldsPoison.insert(PoisonI);
	Visited.insert(PoisonI->getParent());

	BasicBlock::const_iterator Begin = PoisonI->getIterator(), End = BB->end();

	unsigned Iter = 0;
	while (Iter++ < MaxDepth) {
	for (auto &I : make_range(Begin, End)) {
	if (&I != PoisonI) {
	const Value *NotPoison = getGuaranteedNonFullPoisonOp(&I);
	if (NotPoison != nullptr && YieldsPoison.count(NotPoison))
	return true;
	if (!isGuaranteedToTransferExecutionToSuccessor(&I))
	return false;
	}

	// Mark poison that propagates from I through uses of I.
	if (YieldsPoison.count(&I)) {
	for (const User *User : I.users()) {
	const Instruction *UserI = cast<Instruction>(User);
	if (propagatesFullPoison(UserI))
	YieldsPoison.insert(User);
	}
	}
	}

	if (auto *NextBB = BB->getSingleSuccessor()) {
	if (Visited.insert(NextBB).second) {
	BB = NextBB;
	Begin = BB->getFirstNonPHI()->getIterator();
	End = BB->end();
	continue;
	}
	}

	break;
	};
	return false;
	}

	/// Return true if \p V cannot be NaN: either \p FMF carries the no-NaNs
	/// flag, or \p V is a floating-point constant that is not NaN.
	static bool isKnownNonNaN(const Value *V, FastMathFlags FMF) {
	  if (FMF.noNaNs())
	    return true;

	  const auto *CFP = dyn_cast<ConstantFP>(V);
	  return CFP && !CFP->isNaN();
	}

	/// Return true if \p V is a floating-point constant that is not zero.
	/// Anything that is not a ConstantFP yields false.
	static bool isKnownNonZero(const Value *V) {
	  const auto *CFP = dyn_cast<ConstantFP>(V);
	  return CFP && !CFP->isZero();
	}

	/// Match non-obvious integer minimum and maximum sequences.
	///
	/// Describes "select (icmp Pred CmpLHS, CmpRHS), TrueVal, FalseVal".
	/// \p LHS and \p RHS are unconditionally set to \p TrueVal and \p FalseVal;
	/// they are only meaningful when the returned flavor is not SPF_UNKNOWN.
	static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
	                                       Value *CmpLHS, Value *CmpRHS,
	                                       Value *TrueVal, Value *FalseVal,
	                                       Value *&LHS, Value *&RHS) {
	  // Assume success. If there's no match, callers should not use these anyway.
	  LHS = TrueVal;
	  RHS = FalseVal;

	  // Recognize variations of:
	  // CLAMP(v,l,h) ==> ((v) < (l) ? (l) : ((v) > (h) ? (h) : (v)))
	  const APInt *C1;
	  if (CmpRHS == TrueVal && match(CmpRHS, m_APInt(C1))) {
	    const APInt *C2;

	    // (X <s C1) ? C1 : SMIN(X, C2) ==> SMAX(SMIN(X, C2), C1)
	    if (match(FalseVal, m_SMin(m_Specific(CmpLHS), m_APInt(C2))) &&
	        C1->slt(*C2) && Pred == CmpInst::ICMP_SLT)
	      return {SPF_SMAX, SPNB_NA, false};

	    // (X >s C1) ? C1 : SMAX(X, C2) ==> SMIN(SMAX(X, C2), C1)
	    if (match(FalseVal, m_SMax(m_Specific(CmpLHS), m_APInt(C2))) &&
	        C1->sgt(*C2) && Pred == CmpInst::ICMP_SGT)
	      return {SPF_SMIN, SPNB_NA, false};

	    // (X <u C1) ? C1 : UMIN(X, C2) ==> UMAX(UMIN(X, C2), C1)
	    if (match(FalseVal, m_UMin(m_Specific(CmpLHS), m_APInt(C2))) &&
	        C1->ult(*C2) && Pred == CmpInst::ICMP_ULT)
	      return {SPF_UMAX, SPNB_NA, false};

	    // (X >u C1) ? C1 : UMAX(X, C2) ==> UMIN(UMAX(X, C2), C1)
	    if (match(FalseVal, m_UMax(m_Specific(CmpLHS), m_APInt(C2))) &&
	        C1->ugt(*C2) && Pred == CmpInst::ICMP_UGT)
	      return {SPF_UMIN, SPNB_NA, false};
	  }

	  // Everything below needs a strict signed comparison.
	  if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
	    return {SPF_UNKNOWN, SPNB_NA, false};

	  // Z = X -nsw Y
	  // (X >s Y) ? 0 : Z ==> (Z >s 0) ? 0 : Z ==> SMIN(Z, 0)
	  // (X <s Y) ? 0 : Z ==> (Z <s 0) ? 0 : Z ==> SMAX(Z, 0)
	  if (match(TrueVal, m_Zero()) &&
	      match(FalseVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
	    return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};

	  // Z = X -nsw Y
	  // (X >s Y) ? Z : 0 ==> (Z >s 0) ? Z : 0 ==> SMAX(Z, 0)
	  // (X <s Y) ? Z : 0 ==> (Z <s 0) ? Z : 0 ==> SMIN(Z, 0)
	  if (match(FalseVal, m_Zero()) &&
	      match(TrueVal, m_NSWSub(m_Specific(CmpLHS), m_Specific(CmpRHS))))
	    return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};

	  // The remaining patterns compare against a constant.
	  if (!match(CmpRHS, m_APInt(C1)))
	    return {SPF_UNKNOWN, SPNB_NA, false};

	  // An unsigned min/max can be written with a signed compare.
	  const APInt *C2;
	  if ((CmpLHS == TrueVal && match(FalseVal, m_APInt(C2))) ||
	      (CmpLHS == FalseVal && match(TrueVal, m_APInt(C2)))) {
	    // Is the sign bit set?
	    // (X <s 0) ? X : MAXVAL ==> (X >u MAXVAL) ? X : MAXVAL ==> UMAX
	    // (X <s 0) ? MAXVAL : X ==> (X >u MAXVAL) ? MAXVAL : X ==> UMIN
	    if (Pred == CmpInst::ICMP_SLT && *C1 == 0 && C2->isMaxSignedValue())
	      return {CmpLHS == TrueVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};

	    // Is the sign bit clear?
	    // (X >s -1) ? MINVAL : X ==> (X <u MINVAL) ? MINVAL : X ==> UMAX
	    // (X >s -1) ? X : MINVAL ==> (X <u MINVAL) ? X : MINVAL ==> UMIN
	    if (Pred == CmpInst::ICMP_SGT && C1->isAllOnesValue() &&
	        C2->isMinSignedValue())
	      return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
	  }

	  // Look through 'not' ops to find disguised signed min/max.
	  // (X >s C) ? ~X : ~C ==> (~X <s ~C) ? ~X : ~C ==> SMIN(~X, ~C)
	  // (X <s C) ? ~X : ~C ==> (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C)
	  if (match(TrueVal, m_Not(m_Specific(CmpLHS))) &&
	      match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2)
	    return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};

	  // (X >s C) ? ~C : ~X ==> (~X <s ~C) ? ~C : ~X ==> SMAX(~C, ~X)
	  // (X <s C) ? ~C : ~X ==> (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X)
	  if (match(FalseVal, m_Not(m_Specific(CmpLHS))) &&
	      match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2)
	    return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};

	  return {SPF_UNKNOWN, SPNB_NA, false};
	}

	/// Classify "select (cmp Pred CmpLHS, CmpRHS), TrueVal, FalseVal" as a
	/// min/max/abs-style pattern.
	///
	/// \p LHS and \p RHS start out as the compare operands and may be
	/// overwritten by matchMinMax when control reaches it. For floating-point
	/// predicates the result also records the NaN behavior the pattern needs
	/// and whether the comparison is treated as ordered.
	static SelectPatternResult matchSelectPattern(CmpInst::Predicate Pred,
	                                              FastMathFlags FMF,
	                                              Value *CmpLHS, Value *CmpRHS,
	                                              Value *TrueVal, Value *FalseVal,
	                                              Value *&LHS, Value *&RHS) {
	  LHS = CmpLHS;
	  RHS = CmpRHS;

	  // If the predicate is an "or-equal" (FP) predicate, then signed zeroes may
	  // return inconsistent results between implementations.
	  // (0.0 <= -0.0) ? 0.0 : -0.0 // Returns 0.0
	  // minNum(0.0, -0.0) // May return -0.0 or 0.0 (IEEE 754-2008 5.3.1)
	  // Therefore we behave conservatively and only proceed if at least one of the
	  // operands is known to not be zero, or if we don't care about signed zeroes.
	  switch (Pred) {
	  default: break;
	  case CmpInst::FCMP_OGE: case CmpInst::FCMP_OLE:
	  case CmpInst::FCMP_UGE: case CmpInst::FCMP_ULE:
	    if (!FMF.noSignedZeros() && !isKnownNonZero(CmpLHS) &&
	        !isKnownNonZero(CmpRHS))
	      return {SPF_UNKNOWN, SPNB_NA, false};
	  }

	  SelectPatternNaNBehavior NaNBehavior = SPNB_NA;
	  bool Ordered = false;

	  // When given one NaN and one non-NaN input:
	  // - maxnum/minnum (C99 fmaxf()/fminf()) return the non-NaN input.
	  // - A simple C99 (a < b ? a : b) construction will return 'b' (as the
	  // ordered comparison fails), which could be NaN or non-NaN.
	  // so here we discover exactly what NaN behavior is required/accepted.
	  if (CmpInst::isFPPredicate(Pred)) {
	    bool LHSSafe = isKnownNonNaN(CmpLHS, FMF);
	    bool RHSSafe = isKnownNonNaN(CmpRHS, FMF);

	    if (LHSSafe && RHSSafe) {
	      // Both operands are known non-NaN.
	      NaNBehavior = SPNB_RETURNS_ANY;
	    } else if (CmpInst::isOrdered(Pred)) {
	      // An ordered comparison will return false when given a NaN, so it
	      // returns the RHS.
	      Ordered = true;
	      if (LHSSafe)
	        // LHS is non-NaN, so if RHS is NaN then NaN will be returned.
	        NaNBehavior = SPNB_RETURNS_NAN;
	      else if (RHSSafe)
	        NaNBehavior = SPNB_RETURNS_OTHER;
	      else
	        // Completely unsafe.
	        return {SPF_UNKNOWN, SPNB_NA, false};
	    } else {
	      Ordered = false;
	      // An unordered comparison will return true when given a NaN, so it
	      // returns the LHS.
	      if (LHSSafe)
	        // LHS is non-NaN, so if RHS is NaN then non-NaN will be returned.
	        NaNBehavior = SPNB_RETURNS_OTHER;
	      else if (RHSSafe)
	        NaNBehavior = SPNB_RETURNS_NAN;
	      else
	        // Completely unsafe.
	        return {SPF_UNKNOWN, SPNB_NA, false};
	    }
	  }

	  // Canonicalize "cmp X, Y ? Y : X" into "cmp' Y, X ? Y : X" so that the
	  // select arms line up with the compare operands.
	  if (TrueVal == CmpRHS && FalseVal == CmpLHS) {
	    std::swap(CmpLHS, CmpRHS);
	    Pred = CmpInst::getSwappedPredicate(Pred);
	    if (NaNBehavior == SPNB_RETURNS_NAN)
	      NaNBehavior = SPNB_RETURNS_OTHER;
	    else if (NaNBehavior == SPNB_RETURNS_OTHER)
	      NaNBehavior = SPNB_RETURNS_NAN;
	    Ordered = !Ordered;
	  }

	  // ([if]cmp X, Y) ? X : Y
	  if (TrueVal == CmpLHS && FalseVal == CmpRHS) {
	    switch (Pred) {
	    default: return {SPF_UNKNOWN, SPNB_NA, false}; // Equality.
	    case ICmpInst::ICMP_UGT:
	    case ICmpInst::ICMP_UGE: return {SPF_UMAX, SPNB_NA, false};
	    case ICmpInst::ICMP_SGT:
	    case ICmpInst::ICMP_SGE: return {SPF_SMAX, SPNB_NA, false};
	    case ICmpInst::ICMP_ULT:
	    case ICmpInst::ICMP_ULE: return {SPF_UMIN, SPNB_NA, false};
	    case ICmpInst::ICMP_SLT:
	    case ICmpInst::ICMP_SLE: return {SPF_SMIN, SPNB_NA, false};
	    case FCmpInst::FCMP_UGT:
	    case FCmpInst::FCMP_UGE:
	    case FCmpInst::FCMP_OGT:
	    case FCmpInst::FCMP_OGE: return {SPF_FMAXNUM, NaNBehavior, Ordered};
	    case FCmpInst::FCMP_ULT:
	    case FCmpInst::FCMP_ULE:
	    case FCmpInst::FCMP_OLT:
	    case FCmpInst::FCMP_OLE: return {SPF_FMINNUM, NaNBehavior, Ordered};
	    }
	  }

	  const APInt *C1;
	  if (match(CmpRHS, m_APInt(C1))) {
	    if ((CmpLHS == TrueVal && match(FalseVal, m_Neg(m_Specific(CmpLHS)))) ||
	        (CmpLHS == FalseVal && match(TrueVal, m_Neg(m_Specific(CmpLHS))))) {

	      // ABS(X) ==> (X >s 0) ? X : -X and (X >s -1) ? X : -X
	      // NABS(X) ==> (X >s 0) ? -X : X and (X >s -1) ? -X : X
	      if (Pred == ICmpInst::ICMP_SGT && (*C1 == 0 || C1->isAllOnesValue())) {
	        return {(CmpLHS == TrueVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
	      }

	      // ABS(X) ==> (X <s 0) ? -X : X and (X <s 1) ? -X : X
	      // NABS(X) ==> (X <s 0) ? X : -X and (X <s 1) ? X : -X
	      if (Pred == ICmpInst::ICMP_SLT && (*C1 == 0 || *C1 == 1)) {
	        return {(CmpLHS == FalseVal) ? SPF_ABS : SPF_NABS, SPNB_NA, false};
	      }
	    }
	  }

	  return matchMinMax(Pred, CmpLHS, CmpRHS, TrueVal, FalseVal, LHS, RHS);
	}

	/// Try to express \p V2 in the source type of the cast \p V1, so that a
	/// select over both can be analyzed in that narrower or wider type.
	///
	/// \p CastOp receives the opcode of \p V1 as soon as \p V1 is known to be a
	/// cast, including on paths that go on to return nullptr.
	///
	/// \returns the operand of \p V2 if it is the same kind of cast from the
	/// same source type; a constant in the source type if \p V2 is a constant
	/// that survives the round trip through the cast unchanged; nullptr
	/// otherwise.
	static Value *lookThroughCast(CmpInst *CmpI, Value *V1, Value *V2,
	                              Instruction::CastOps *CastOp) {
	  auto *Cast1 = dyn_cast<CastInst>(V1);
	  if (!Cast1)
	    return nullptr;

	  *CastOp = Cast1->getOpcode();
	  Type *SrcTy = Cast1->getSrcTy();
	  if (auto *Cast2 = dyn_cast<CastInst>(V2)) {
	    // If V1 and V2 are both the same cast from the same type, look through V1.
	    if (*CastOp == Cast2->getOpcode() && SrcTy == Cast2->getSrcTy())
	      return Cast2->getOperand(0);
	    return nullptr;
	  }

	  auto *C = dyn_cast<Constant>(V2);
	  if (!C)
	    return nullptr;

	  // Apply the inverse of the cast to the constant. Extensions are only
	  // undone when the compare's signedness matches the kind of extension.
	  Constant *CastedTo = nullptr;
	  switch (*CastOp) {
	  case Instruction::ZExt:
	    if (CmpI->isUnsigned())
	      CastedTo = ConstantExpr::getTrunc(C, SrcTy);
	    break;
	  case Instruction::SExt:
	    if (CmpI->isSigned())
	      CastedTo = ConstantExpr::getTrunc(C, SrcTy, true);
	    break;
	  case Instruction::Trunc:
	    CastedTo = ConstantExpr::getIntegerCast(C, SrcTy, CmpI->isSigned());
	    break;
	  case Instruction::FPTrunc:
	    CastedTo = ConstantExpr::getFPExtend(C, SrcTy, true);
	    break;
	  case Instruction::FPExt:
	    CastedTo = ConstantExpr::getFPTrunc(C, SrcTy, true);
	    break;
	  case Instruction::FPToUI:
	    CastedTo = ConstantExpr::getUIToFP(C, SrcTy, true);
	    break;
	  case Instruction::FPToSI:
	    CastedTo = ConstantExpr::getSIToFP(C, SrcTy, true);
	    break;
	  case Instruction::UIToFP:
	    CastedTo = ConstantExpr::getFPToUI(C, SrcTy, true);
	    break;
	  case Instruction::SIToFP:
	    CastedTo = ConstantExpr::getFPToSI(C, SrcTy, true);
	    break;
	  default:
	    break;
	  }

	  if (!CastedTo)
	    return nullptr;

	  // Make sure the cast doesn't lose any information.
	  Constant *CastedBack =
	      ConstantExpr::getCast(*CastOp, CastedTo, C->getType(), true);
	  if (CastedBack != C)
	    return nullptr;

	  return CastedTo;
	}

	/// Classify the select \p V by delegating to the file-local
	/// ::matchSelectPattern overload.
	///
	/// \param V      Value to inspect. Anything that is not a SelectInst whose
	///               condition is a CmpInst yields SPF_UNKNOWN.
	/// \param LHS    Out parameter forwarded to ::matchSelectPattern.
	/// \param RHS    Out parameter forwarded to ::matchSelectPattern.
	/// \param CastOp Optional out parameter. When non-null and the compare operand
	///               type differs from the select arm type, the arms are looked
	///               through with lookThroughCast, which writes the cast opcode
	///               here.
	/// \returns the pattern found by ::matchSelectPattern, or
	/// {SPF_UNKNOWN, SPNB_NA, false} for non-selects, non-compare conditions and
	/// equality compares.
	SelectPatternResult llvm::matchSelectPattern(Value *V, Value *&LHS, Value *&RHS,
	                                             Instruction::CastOps *CastOp) {
	  SelectInst *SI = dyn_cast<SelectInst>(V);
	  if (!SI) return {SPF_UNKNOWN, SPNB_NA, false};

	  CmpInst *CmpI = dyn_cast<CmpInst>(SI->getCondition());
	  if (!CmpI) return {SPF_UNKNOWN, SPNB_NA, false};

	  CmpInst::Predicate Pred = CmpI->getPredicate();
	  Value *CmpLHS = CmpI->getOperand(0);
	  Value *CmpRHS = CmpI->getOperand(1);
	  Value *TrueVal = SI->getTrueValue();
	  Value *FalseVal = SI->getFalseValue();
	  // Fast-math flags are only read when the compare is a floating-point math
	  // operator; otherwise FMF stays default-constructed.
	  FastMathFlags FMF;
	  if (isa<FPMathOperator>(CmpI))
	    FMF = CmpI->getFastMathFlags();

	  // Bail out early.
	  if (CmpI->isEquality())
	    return {SPF_UNKNOWN, SPNB_NA, false};

	  // Deal with type mismatches.
	  if (CastOp && CmpLHS->getType() != TrueVal->getType()) {
	    // First try with the true arm as the cast, then with the false arm.
	    if (Value *C = lookThroughCast(CmpI, TrueVal, FalseVal, CastOp))
	      return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
	                                  cast<CastInst>(TrueVal)->getOperand(0), C,
	                                  LHS, RHS);
	    if (Value *C = lookThroughCast(CmpI, FalseVal, TrueVal, CastOp))
	      return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS,
	                                  C, cast<CastInst>(FalseVal)->getOperand(0),
	                                  LHS, RHS);
	  }
	  return ::matchSelectPattern(Pred, FMF, CmpLHS, CmpRHS, TrueVal, FalseVal,
	                              LHS, RHS);
	}

	/// Return true if "icmp Pred LHS RHS" is always true.
	static bool isTruePredicate(CmpInst::Predicate Pred,
	const Value LHS, const Value RHS,
	const DataLayout &DL, unsigned Depth,
	AssumptionCache AC, const Instruction CxtI,
	const DominatorTree *DT) {
	assert(!LHS->getType()->isVectorTy() && "TODO: extend to handle vectors!");
	if (ICmpInst::isTrueWhenEqual(Pred) && LHS == RHS)
	return true;

	switch (Pred) {
	default:
	return false;

	case CmpInst::ICMP_SLE: {
	const APInt *C;

	// LHS s<= LHS +_{nsw} C if C >= 0
	if (match(RHS, m_NSWAdd(m_Specific(LHS), m_APInt(C))))
	return !C->isNegative();
	return false;
	}

	case CmpInst::ICMP_ULE: {
	const APInt *C;

	// LHS u<= LHS +_{nuw} C for any C
	if (match(RHS, m_NUWAdd(m_Specific(LHS), m_APInt(C))))
	return true;

	// Match A to (X +_{nuw} CA) and B to (X +_{nuw} CB)
	auto MatchNUWAddsToSameValue = [&](const Value A, const Value B,
	const Value *&X,
	const APInt &CA, const APInt &CB) {
	if (match(A, m_NUWAdd(m_Value(X), m_APInt(CA))) &&
	match(B, m_NUWAdd(m_Specific(X), m_APInt(CB))))
	return true;

	// If X & C == 0 then (X \| C) == X +_{nuw} C
	if (match(A, m_Or(m_Value(X), m_APInt(CA))) &&
	match(B, m_Or(m_Specific(X), m_APInt(CB)))) {
	KnownBits Known(CA->getBitWidth());
	computeKnownBits(X, Known, DL, Depth + 1, AC, CxtI, DT);

	if (CA->isSubsetOf(Known.Zero) && CB->isSubsetOf(Known.Zero))
	return true;
	}

	return false;
	};

	const Value *X;
	const APInt CLHS, CRHS;
	if (MatchNUWAddsToSameValue(LHS, RHS, X, CLHS, CRHS))
	return CLHS->ule(*CRHS);

	return false;
	}
	}
	}

	/// Return true if "icmp Pred BLHS BRHS" is true whenever "icmp Pred
	/// ALHS ARHS" is true.  Otherwise, return None.
	///
	/// Handled predicates are the signed and unsigned less-than / less-or-equal
	/// forms. The implication is proven by showing BLHS <= ALHS and ARHS <= BRHS
	/// (with matching signedness) via isTruePredicate; the remaining parameters
	/// are forwarded to it unchanged. This function never returns false.
	static Optional<bool>
	isImpliedCondOperands(CmpInst::Predicate Pred, const Value *ALHS,
	                      const Value *ARHS, const Value *BLHS,
	                      const Value *BRHS, const DataLayout &DL,
	                      unsigned Depth, AssumptionCache *AC,
	                      const Instruction *CxtI, const DominatorTree *DT) {
	  switch (Pred) {
	  default:
	    return None;

	  case CmpInst::ICMP_SLT:
	  case CmpInst::ICMP_SLE:
	    if (isTruePredicate(CmpInst::ICMP_SLE, BLHS, ALHS, DL, Depth, AC, CxtI,
	                        DT) &&
	        isTruePredicate(CmpInst::ICMP_SLE, ARHS, BRHS, DL, Depth, AC, CxtI, DT))
	      return true;
	    return None;

	  case CmpInst::ICMP_ULT:
	  case CmpInst::ICMP_ULE:
	    if (isTruePredicate(CmpInst::ICMP_ULE, BLHS, ALHS, DL, Depth, AC, CxtI,
	                        DT) &&
	        isTruePredicate(CmpInst::ICMP_ULE, ARHS, BRHS, DL, Depth, AC, CxtI, DT))
	      return true;
	    return None;
	  }
	}

	/// Return true if the operands of the two compares match.  IsSwappedOps is true
	/// when the operands match, but are swapped.
	///
	/// Operands are compared by pointer identity. \p IsSwappedOps is always
	/// written, including when the function returns false.
	static bool isMatchingOps(const Value *ALHS, const Value *ARHS,
	                          const Value *BLHS, const Value *BRHS,
	                          bool &IsSwappedOps) {

	  bool IsMatchingOps = (ALHS == BLHS && ARHS == BRHS);
	  IsSwappedOps = (ALHS == BRHS && ARHS == BLHS);
	  return IsMatchingOps || IsSwappedOps;
	}

	/// Decide whether "icmp1 APred ALHS ARHS" settles "icmp2 BPred BLHS BRHS" when
	/// both compares use the same pair of operands.
	///
	/// \param IsSwappedOps True when the second compare has the operands in the
	///                     opposite order; its predicate is swapped first so the
	///                     two compares line up.
	/// \returns true if the first compare implies the second is true, false if it
	/// implies the second is false, and None when nothing can be inferred.
	static Optional<bool> isImpliedCondMatchingOperands(CmpInst::Predicate APred,
	                                                    const Value *ALHS,
	                                                    const Value *ARHS,
	                                                    CmpInst::Predicate BPred,
	                                                    const Value *BLHS,
	                                                    const Value *BRHS,
	                                                    bool IsSwappedOps) {
	  // Bring the second compare into the same operand order as the first.
	  if (IsSwappedOps) {
	    BPred = ICmpInst::getSwappedPredicate(BPred);
	    std::swap(BLHS, BRHS);
	  }

	  // With identical operands the answer depends on the predicates alone.
	  if (CmpInst::isImpliedTrueByMatchingCmp(APred, BPred)) {
	    return true;
	  }
	  if (CmpInst::isImpliedFalseByMatchingCmp(APred, BPred)) {
	    return false;
	  }
	  return None;
	}

	/// Return true if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS C2" is
	/// true.  Return false if "icmp1 APred ALHS C1" implies "icmp2 BPred BLHS
	/// C2" is false.  Otherwise, return None if we can't infer anything.
	///
	/// \p ALHS and \p BLHS must be the same value (asserted); the decision is
	/// made purely from the constant ranges described by the two predicates.
	static Optional<bool>
	isImpliedCondMatchingImmOperands(CmpInst::Predicate APred, const Value *ALHS,
	                                 const ConstantInt *C1,
	                                 CmpInst::Predicate BPred,
	                                 const Value *BLHS, const ConstantInt *C2) {
	  assert(ALHS == BLHS && "LHS operands must match.");
	  // DomCR: region built from the first compare. CR: region built from the
	  // second compare.
	  ConstantRange DomCR =
	      ConstantRange::makeExactICmpRegion(APred, C1->getValue());
	  ConstantRange CR =
	      ConstantRange::makeAllowedICmpRegion(BPred, C2->getValue());
	  ConstantRange Intersection = DomCR.intersectWith(CR);
	  ConstantRange Difference = DomCR.difference(CR);
	  // No overlap: the second compare cannot hold when the first does.
	  if (Intersection.isEmptySet())
	    return false;
	  // Nothing of DomCR lies outside CR: the second compare must hold.
	  if (Difference.isEmptySet())
	    return true;
	  return None;
	}

	/// Determine whether the i1 condition \p RHS is decided by the i1 condition
	/// \p LHS.
	///
	/// \param LHSIsFalse When set, \p LHS is known to be false rather than true.
	/// \param Depth      Current recursion depth; decomposing an 'and' / 'or'
	///                   LHS stops once it equals MaxDepth.
	/// \param DL, AC, CxtI, DT Forwarded to the helpers and recursive calls.
	/// \returns true / false when \p RHS is implied true / false, None when
	/// nothing can be inferred. None is also returned for mismatched types,
	/// vector conditions, and whenever \p RHS is not an icmp.
	Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
	                                        const DataLayout &DL, bool LHSIsFalse,
	                                        unsigned Depth, AssumptionCache *AC,
	                                        const Instruction *CxtI,
	                                        const DominatorTree *DT) {
	  // A mismatch occurs when we compare a scalar cmp to a vector cmp, for example.
	  if (LHS->getType() != RHS->getType())
	    return None;

	  Type *OpTy = LHS->getType();
	  assert(OpTy->isIntOrIntVectorTy(1));

	  // LHS ==> RHS by definition
	  if (LHS == RHS)
	    return !LHSIsFalse;

	  if (OpTy->isVectorTy())
	    // TODO: extending the code below to handle vectors
	    return None;
	  assert(OpTy->isIntegerTy(1) && "implied by above");

	  Value *BLHS, *BRHS;
	  ICmpInst::Predicate BPred;
	  // We expect the RHS to be an icmp.
	  if (!match(RHS, m_ICmp(BPred, m_Value(BLHS), m_Value(BRHS))))
	    return None;

	  Value *ALHS, *ARHS;
	  ICmpInst::Predicate APred;
	  // The LHS can be an 'or', 'and', or 'icmp'.
	  if (!match(LHS, m_ICmp(APred, m_Value(ALHS), m_Value(ARHS)))) {
	    // The remaining tests are all recursive, so bail out if we hit the limit.
	    if (Depth == MaxDepth)
	      return None;
	    // If the result of an 'or' is false, then we know both legs of the 'or' are
	    // false.  Similarly, if the result of an 'and' is true, then we know both
	    // legs of the 'and' are true.
	    if ((LHSIsFalse && match(LHS, m_Or(m_Value(ALHS), m_Value(ARHS)))) ||
	        (!LHSIsFalse && match(LHS, m_And(m_Value(ALHS), m_Value(ARHS))))) {
	      if (Optional<bool> Implication = isImpliedCondition(
	              ALHS, RHS, DL, LHSIsFalse, Depth + 1, AC, CxtI, DT))
	        return Implication;
	      if (Optional<bool> Implication = isImpliedCondition(
	              ARHS, RHS, DL, LHSIsFalse, Depth + 1, AC, CxtI, DT))
	        return Implication;
	      return None;
	    }
	    return None;
	  }
	  // All of the below logic assumes both LHS and RHS are icmps.
	  assert(isa<ICmpInst>(LHS) && isa<ICmpInst>(RHS) && "Expected icmps.");

	  // The rest of the logic assumes the LHS condition is true.  If that's not the
	  // case, invert the predicate to make it so.
	  if (LHSIsFalse)
	    APred = CmpInst::getInversePredicate(APred);

	  // Can we infer anything when the two compares have matching operands?
	  bool IsSwappedOps;
	  if (isMatchingOps(ALHS, ARHS, BLHS, BRHS, IsSwappedOps)) {
	    if (Optional<bool> Implication = isImpliedCondMatchingOperands(
	            APred, ALHS, ARHS, BPred, BLHS, BRHS, IsSwappedOps))
	      return Implication;
	    // No amount of additional analysis will infer the second condition, so
	    // early exit.
	    return None;
	  }

	  // Can we infer anything when the LHS operands match and the RHS operands are
	  // constants (not necessarily matching)?
	  if (ALHS == BLHS && isa<ConstantInt>(ARHS) && isa<ConstantInt>(BRHS)) {
	    if (Optional<bool> Implication = isImpliedCondMatchingImmOperands(
	            APred, ALHS, cast<ConstantInt>(ARHS), BPred, BLHS,
	            cast<ConstantInt>(BRHS)))
	      return Implication;
	    // No amount of additional analysis will infer the second condition, so
	    // early exit.
	    return None;
	  }

	  // Last resort: identical predicates with provably ordered operands.
	  if (APred == BPred)
	    return isImpliedCondOperands(APred, ALHS, ARHS, BLHS, BRHS, DL, Depth, AC,
	                                 CxtI, DT);

	  return None;
	}
	Index: head/contrib/llvm/lib/CodeGen/BranchFolding.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/BranchFolding.cpp (revision 322319)
	+++ head/contrib/llvm/lib/CodeGen/BranchFolding.cpp (revision 322320)
	@@ -1,2012 +1,2013 @@
	//===- BranchFolding.cpp - Fold machine code branch instructions ----------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass forwards branches to unconditional branches to make them branch
	// directly to the target block. This pass often results in dead MBB's, which
	// it then removes.
	//
	// Note that this pass must be run after register allocation, it cannot handle
	// SSA form. It also must handle virtual registers for targets that emit virtual
	// ISA (e.g. NVPTX).
	//
	//===----------------------------------------------------------------------===//

	#include "BranchFolding.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
	#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/TargetPassConfig.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/Function.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/BlockFrequency.h"
	#include "llvm/Support/BranchProbability.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <cassert>
	#include <cstddef>
	#include <iterator>
	#include <numeric>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "branch-folder"

	// Counters declared for this pass, each with a human-readable description.
	// Within this part of the file only NumTailMerge is visibly incremented (in
	// BranchFolder::ReplaceTailWithBranchTo).
	STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
	STATISTIC(NumBranchOpts, "Number of branches optimized");
	STATISTIC(NumTailMerge , "Number of block tails merged");
	STATISTIC(NumHoist , "Number of times common instructions are hoisted");
	STATISTIC(NumTailCalls, "Number of tail calls optimized");

	// Hidden tri-state "enable-tail-merge" flag, unset by default. The
	// BranchFolder constructor reads it: an explicit true/false overrides the
	// caller-supplied default, while the unset state keeps that default.
	static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
	cl::init(cl::BOU_UNSET), cl::Hidden);

	// Throttle for huge numbers of predecessors (compile speed problems)
	// Hidden option, default 150.
	static cl::opt<unsigned>
	TailMergeThreshold("tail-merge-threshold",
	cl::desc("Max number of predecessors to consider tail merging"),
	cl::init(150), cl::Hidden);

	// Heuristic for tail merging (and, inversely, tail duplication).
	// TODO: This should be replaced with a target query.
	// Hidden option, default 3. The BranchFolder constructor falls back to this
	// value when it is given a minimum tail length of 0.
	static cl::opt<unsigned>
	TailMergeSize("tail-merge-size",
	cl::desc("Min number of instructions to consider tail merging"),
	cl::init(3), cl::Hidden);

	namespace {

	/// BranchFolderPass - Wrap branch folder in a machine function pass.
	class BranchFolderPass : public MachineFunctionPass {
	public:
	// Pass identification; defined right after this namespace and exposed as
	// llvm::BranchFolderPassID.
	static char ID;

	explicit BranchFolderPass(): MachineFunctionPass(ID) {}

	// Builds a BranchFolder and runs it on MF; defined out of line below.
	bool runOnMachineFunction(MachineFunction &MF) override;

	// Declares the analyses that runOnMachineFunction later fetches with
	// getAnalysis<>(), then defers to the base class for the rest.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<MachineBlockFrequencyInfo>();
	AU.addRequired<MachineBranchProbabilityInfo>();
	AU.addRequired<TargetPassConfig>();
	MachineFunctionPass::getAnalysisUsage(AU);
	}
	};

	} // end anonymous namespace

	// Storage for the pass ID. The same object is published under the name
	// llvm::BranchFolderPassID through the reference defined on the next line.
	char BranchFolderPass::ID = 0;
	char &llvm::BranchFolderPassID = BranchFolderPass::ID;

	// Registers the pass with DEBUG_TYPE ("branch-folder") as its argument and
	// "Control Flow Optimizer" as its descriptive name.
	// NOTE(review): the two trailing 'false' flags are presumably the
	// CFG-only / is-analysis markers - confirm against the INITIALIZE_PASS macro.
	INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE,
	"Control Flow Optimizer", false, false)

	/// Run branch folding on \p MF.
	///
	/// Does nothing (returns false) when skipFunction() says the function should
	/// be skipped. Otherwise builds a BranchFolder with common-code hoisting
	/// enabled and returns whatever BranchFolder::OptimizeFunction reports.
	bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
	  if (skipFunction(*MF.getFunction()))
	    return false;

	  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
	  // TailMerge can create jump into if branches that make CFG irreducible for
	  // HW that requires structurized CFG.
	  bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
	                         PassConfig->getEnableTailMerge();
	  BranchFolder::MBFIWrapper MBBFreqInfo(
	      getAnalysis<MachineBlockFrequencyInfo>());
	  BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
	                      getAnalysis<MachineBranchProbabilityInfo>());
	  // MachineModuleInfo is optional: a null pointer is passed when unavailable.
	  return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
	                                 MF.getSubtarget().getRegisterInfo(),
	                                 getAnalysisIfAvailable<MachineModuleInfo>());
	}

	/// Set up a branch folder.
	///
	/// \param defaultEnableTailMerge Tail-merging setting used unless the
	///        "enable-tail-merge" flag was given explicitly.
	/// \param CommonHoist   Whether common-code hoisting is enabled.
	/// \param FreqInfo      Block-frequency wrapper, stored by the folder.
	/// \param ProbInfo      Branch-probability info, stored by the folder.
	/// \param MinTailLength Minimum common tail length; 0 selects the
	///        "tail-merge-size" option value instead.
	BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
	                           MBFIWrapper &FreqInfo,
	                           const MachineBranchProbabilityInfo &ProbInfo,
	                           unsigned MinTailLength)
	    : EnableHoistCommonCode(CommonHoist), MinCommonTailLength(MinTailLength),
	      MBBFreqInfo(FreqInfo), MBPI(ProbInfo) {
	  // An explicit command-line choice wins over the caller's default.
	  switch (FlagEnableTailMerge) {
	  case cl::BOU_TRUE:
	    EnableTailMerge = true;
	    break;
	  case cl::BOU_FALSE:
	    EnableTailMerge = false;
	    break;
	  case cl::BOU_UNSET:
	    EnableTailMerge = defaultEnableTailMerge;
	    break;
	  }

	  // A zero length means "use the command-line default".
	  if (!MinCommonTailLength)
	    MinCommonTailLength = TailMergeSize;
	}

	/// Delete \p MBB, which must have no predecessors (asserted), from its
	/// function and from the folder's bookkeeping (TriedMerging,
	/// FuncletMembership and, when present, the loop info).
	void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
	  assert(MBB->pred_empty() && "MBB must be dead!");
	  DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);

	  MachineFunction *ParentMF = MBB->getParent();

	  // Detach all outgoing edges, always peeling off the last successor.
	  while (!MBB->succ_empty()) {
	    MBB->removeSuccessor(std::prev(MBB->succ_end()));
	  }

	  // Forget the block so a recycled pointer is not mistaken for it later.
	  TriedMerging.erase(MBB);

	  // Take the block out of the function, then out of the side tables.
	  ParentMF->erase(MBB);
	  FuncletMembership.erase(MBB);
	  if (MLI) {
	    MLI->removeBlock(MBB);
	  }
	}

	/// Run the branch-folding optimizations on \p MF.
	///
	/// \param tii  Target instruction info; when null nothing is done and false is
	///             returned.
	/// \param tri  Target register info; consulted for live-in tracking.
	/// \param mmi  Stored in the MMI member.
	/// \param mli  Stored in the MLI member; other methods null-check it before
	///             use.
	/// \param AfterPlacement True when running after block placement; in that
	///             mode OptimizeBranches only runs in iterations where tail
	///             merging changed something.
	/// \returns true if the function was changed in any way, including CFG edge
	/// corrections and removal of dead jump tables.
	bool BranchFolder::OptimizeFunction(MachineFunction &MF,
	                                    const TargetInstrInfo *tii,
	                                    const TargetRegisterInfo *tri,
	                                    MachineModuleInfo *mmi,
	                                    MachineLoopInfo *mli, bool AfterPlacement) {
	  if (!tii) return false;

	  TriedMerging.clear();

	  MachineRegisterInfo &MRI = MF.getRegInfo();
	  AfterBlockPlacement = AfterPlacement;
	  TII = tii;
	  TRI = tri;
	  MMI = mmi;
	  MLI = mli;
	  this->MRI = &MRI;

	  // Live-ins are only maintained when both the function and the target track
	  // liveness; otherwise liveness is invalidated up front.
	  UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
	  if (!UpdateLiveIns)
	    MRI.invalidateLiveness();

	  // Fix CFG.  The later algorithms expect it to be right.
	  bool MadeChange = false;
	  for (MachineBasicBlock &MBB : MF) {
	    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
	    SmallVector<MachineOperand, 4> Cond;
	    if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, true))
	      MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
	  }

	  // Recalculate funclet membership.
	  FuncletMembership = getFuncletMembership(MF);

	  // Iterate the individual transformations until none of them fires.
	  bool MadeChangeThisIteration = true;
	  while (MadeChangeThisIteration) {
	    MadeChangeThisIteration    = TailMergeBlocks(MF);
	    // No need to clean up if tail merging does not change anything after the
	    // block placement.
	    if (!AfterBlockPlacement || MadeChangeThisIteration)
	      MadeChangeThisIteration |= OptimizeBranches(MF);
	    if (EnableHoistCommonCode)
	      MadeChangeThisIteration |= HoistCommonCode(MF);
	    MadeChange |= MadeChangeThisIteration;
	  }

	  // See if any jump tables have become dead as the code generator
	  // did its thing.
	  MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
	  if (!JTI)
	    return MadeChange;

	  // Walk the function to find jump tables that are live.
	  BitVector JTIsLive(JTI->getJumpTables().size());
	  for (const MachineBasicBlock &BB : MF) {
	    for (const MachineInstr &I : BB)
	      for (const MachineOperand &Op : I.operands()) {
	        if (!Op.isJTI()) continue;

	        // Remember that this JT is live.
	        JTIsLive.set(Op.getIndex());
	      }
	  }

	  // Finally, remove dead jump tables.  This happens when the
	  // indirect jump was unreachable (and thus deleted).
	  for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
	    if (!JTIsLive.test(i)) {
	      JTI->RemoveJumpTable(i);
	      MadeChange = true;
	    }

	  return MadeChange;
	}

	//===----------------------------------------------------------------------===//
	// Tail Merging of Blocks
	//===----------------------------------------------------------------------===//

	/// HashMachineInstr - Compute a hash value for MI and its operands.
	///
	/// The hash starts from the opcode and mixes in one term per operand, built
	/// from the operand's type and a cheaply obtainable payload (register number,
	/// immediate, block number, index or offset). Operand kinds not listed in the
	/// switch contribute only their type.
	static unsigned HashMachineInstr(const MachineInstr &MI) {
	  unsigned Hash = MI.getOpcode();
	  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
	    const MachineOperand &Op = MI.getOperand(i);

	    // Merge in bits from the operand if easy. We can't use MachineOperand's
	    // hash_code here because it's not deterministic and we sort by hash value
	    // later.
	    unsigned OperandHash = 0;
	    switch (Op.getType()) {
	    case MachineOperand::MO_Register:
	      OperandHash = Op.getReg();
	      break;
	    case MachineOperand::MO_Immediate:
	      OperandHash = Op.getImm();
	      break;
	    case MachineOperand::MO_MachineBasicBlock:
	      OperandHash = Op.getMBB()->getNumber();
	      break;
	    case MachineOperand::MO_FrameIndex:
	    case MachineOperand::MO_ConstantPoolIndex:
	    case MachineOperand::MO_JumpTableIndex:
	      OperandHash = Op.getIndex();
	      break;
	    case MachineOperand::MO_GlobalAddress:
	    case MachineOperand::MO_ExternalSymbol:
	      // Global address / external symbol are too hard, don't bother, but do
	      // pull in the offset.
	      OperandHash = Op.getOffset();
	      break;
	    default:
	      break;
	    }

	    // Combine payload and operand type, shifted by the operand position
	    // (masked to 0..31) so operand order influences the result.
	    Hash += ((OperandHash << 3) | Op.getType()) << (i & 31);
	  }
	  return Hash;
	}

	/// HashEndOfMBB - Hash the last instruction in the MBB.
	static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
	MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
	if (I == MBB.end())
	return 0;

	return HashMachineInstr(*I);
	}

	/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
	/// of instructions they actually have in common together at their end.  Return
	/// iterators for the first shared instruction in each block.
	///
	/// Debug-value pseudos are skipped while walking backwards and are not
	/// counted. Inline asm never counts as common, even when identical.
	///
	/// \param MBB1, MBB2 Blocks whose ends are compared.
	/// \param I1, I2     Out parameters: start of the common tail in each block.
	/// \returns the number of matching non-debug instructions.
	static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
	                                        MachineBasicBlock *MBB2,
	                                        MachineBasicBlock::iterator &I1,
	                                        MachineBasicBlock::iterator &I2) {
	  I1 = MBB1->end();
	  I2 = MBB2->end();

	  unsigned TailLen = 0;
	  while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
	    --I1; --I2;
	    // Skip debugging pseudos; necessary to avoid changing the code.
	    while (I1->isDebugValue()) {
	      if (I1==MBB1->begin()) {
	        while (I2->isDebugValue()) {
	          if (I2==MBB2->begin())
	            // I1==DBG at begin; I2==DBG at begin
	            return TailLen;
	          --I2;
	        }
	        ++I2;
	        // I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
	        return TailLen;
	      }
	      --I1;
	    }
	    // I1==first (untested) non-DBG preceding known match
	    while (I2->isDebugValue()) {
	      if (I2==MBB2->begin()) {
	        ++I1;
	        // I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
	        return TailLen;
	      }
	      --I2;
	    }
	    // I1, I2==first (untested) non-DBGs preceding known match
	    if (!I1->isIdenticalTo(*I2) ||
	        // FIXME: This check is dubious. It's used to get around a problem where
	        // people incorrectly expect inline asm directives to remain in the same
	        // relative order. This is untenable because normal compiler
	        // optimizations (like this one) may reorder and/or merge these
	        // directives.
	        I1->isInlineAsm()) {
	      // Mismatch: step both iterators back onto the last known match.
	      ++I1; ++I2;
	      break;
	    }
	    ++TailLen;
	  }
	  // Back past possible debugging pseudos at beginning of block.  This matters
	  // when one block differs from the other only by whether debugging pseudos
	  // are present at the beginning.  (This way, the various checks later for
	  // I1==MBB1->begin() work as expected.)
	  if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
	    --I2;
	    while (I2->isDebugValue()) {
	      if (I2 == MBB2->begin())
	        return TailLen;
	      --I2;
	    }
	    ++I2;
	  }
	  if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
	    --I1;
	    while (I1->isDebugValue()) {
	      if (I1 == MBB1->begin())
	        return TailLen;
	      --I1;
	    }
	    ++I1;
	  }
	  return TailLen;
	}

	/// Replace the instructions from \p OldInst onwards with a branch to
	/// \p NewDest by delegating to TargetInstrInfo::ReplaceTailWithBranchTo, and
	/// count the merge in NumTailMerge.
	///
	/// When live-in tracking is active, the live-in list of \p NewDest is cleared
	/// and recomputed.
	void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
	                                           MachineBasicBlock *NewDest) {
	  TII->ReplaceTailWithBranchTo(OldInst, NewDest);

	  if (UpdateLiveIns) {
	    NewDest->clearLiveIns();
	    // MRI is a pointer member (set from &MF.getRegInfo() in OptimizeFunction),
	    // so both arguments are dereferenced here.
	    computeLiveIns(LiveRegs, *MRI, *NewDest);
	  }

	  ++NumTailMerge;
	}

	/// Split \p CurMBB in two at \p BBI1.
	///
	/// The instructions from \p BBI1 to the end of \p CurMBB, together with all of
	/// CurMBB's successors, move into a new block inserted directly after CurMBB;
	/// CurMBB gets the new block as its only successor. Loop info (if present),
	/// block frequency, live-ins (if tracked) and funclet membership are updated
	/// for the new block.
	///
	/// \param BB IR basic block handed to CreateMachineBasicBlock for the new
	///           block.
	/// \returns the new block, or nullptr when the target reports that splitting
	/// at this position is not legal.
	MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
	                                            MachineBasicBlock::iterator BBI1,
	                                            const BasicBlock *BB) {
	  if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
	    return nullptr;

	  MachineFunction &MF = *CurMBB.getParent();

	  // Create the fall-through block.
	  MachineFunction::iterator MBBI = CurMBB.getIterator();
	  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB);
	  CurMBB.getParent()->insert(++MBBI, NewMBB);

	  // Move all the successors of this block to the specified block.
	  NewMBB->transferSuccessors(&CurMBB);

	  // Add an edge from CurMBB to NewMBB for the fall-through.
	  CurMBB.addSuccessor(NewMBB);

	  // Splice the code over.
	  NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());

	  // NewMBB belongs to the same loop as CurMBB.
	  if (MLI)
	    if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
	      ML->addBasicBlockToLoop(NewMBB, MLI->getBase());

	  // NewMBB inherits CurMBB's block frequency.
	  MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));

	  // MRI is a pointer member, so it is dereferenced along with the new block.
	  if (UpdateLiveIns)
	    computeLiveIns(LiveRegs, *MRI, *NewMBB);

	  // Add the new block to the funclet.
	  const auto &FuncletI = FuncletMembership.find(&CurMBB);
	  if (FuncletI != FuncletMembership.end()) {
	    // Copy the funclet number before inserting into the map.
	    auto n = FuncletI->second;
	    FuncletMembership[NewMBB] = n;
	  }

	  return NewMBB;
	}

	/// EstimateRuntime - Make a rough estimate for how long it will take to run
	/// the specified code.
	///
	/// Walks the range [I, E) and adds 10 per call, 2 per instruction that may
	/// load or store, and 1 for everything else. Debug values cost nothing.
	static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
	                                MachineBasicBlock::iterator E) {
	  unsigned Time = 0;
	  for (; I != E; ++I) {
	    if (I->isDebugValue())
	      continue;
	    if (I->isCall())
	      Time += 10;
	    else if (I->mayLoad() || I->mayStore())
	      Time += 2;
	    else
	      ++Time;
	  }
	  return Time;
	}

	// CurMBB needs to add an unconditional branch to SuccMBB (we removed these
	// branches temporarily for tail merging). In the case where CurMBB ends
	// with a conditional branch to the next block, optimize by reversing the
	// test and conditionally branching to SuccMBB instead.
	static void FixTail(MachineBasicBlock CurMBB, MachineBasicBlock SuccBB,
	const TargetInstrInfo *TII) {
	MachineFunction *MF = CurMBB->getParent();
	MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
	MachineBasicBlock TBB = nullptr, FBB = nullptr;
	SmallVector<MachineOperand, 4> Cond;
	DebugLoc dl = CurMBB->findBranchDebugLoc();
	if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
	MachineBasicBlock NextBB = &I;
	if (TBB == NextBB && !Cond.empty() && !FBB) {
	if (!TII->reverseBranchCondition(Cond)) {
	TII->removeBranch(*CurMBB);
	TII->insertBranch(*CurMBB, SuccBB, nullptr, Cond, dl);
	return;
	}
	}
	}
	TII->insertBranch(*CurMBB, SuccBB, nullptr,
	SmallVector<MachineOperand, 0>(), dl);
	}

	/// Order merge candidates by hash first and by block number second. Two
	/// distinct candidates are never expected to tie on both keys.
	bool
	BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
	  const auto ThisHash = getHash(), OtherHash = o.getHash();
	  if (ThisHash != OtherHash)
	    return ThisHash < OtherHash;

	  const auto ThisNum = getBlock()->getNumber(),
	             OtherNum = o.getBlock()->getNumber();
	  if (ThisNum != OtherNum)
	    return ThisNum < OtherNum;

	  // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
	  // an object with itself.
	#ifndef _GLIBCXX_DEBUG
	  llvm_unreachable("Predecessor appears twice");
	#else
	  return false;
	#endif
	}

	/// Frequency of \p MBB: the value recorded through setBlockFreq when there is
	/// one, otherwise whatever the wrapped MBFI reports.
	BlockFrequency
	BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
	  const auto Recorded = MergedBBFreq.find(MBB);
	  return Recorded != MergedBBFreq.end() ? Recorded->second
	                                        : MBFI.getBlockFreq(MBB);
	}

	void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
	BlockFrequency F) {
	MergedBBFreq[MBB] = F;
	}

	/// Print the frequency of \p MBB (as seen through getBlockFreq) to \p OS using
	/// the wrapped MBFI's printer, and return the stream it returns.
	raw_ostream &
	BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
	                                          const MachineBasicBlock *MBB) const {
	  const BlockFrequency Freq = getBlockFreq(MBB);
	  return MBFI.printBlockFreq(OS, Freq);
	}

	/// Print the given frequency to \p OS by forwarding to the wrapped MBFI, and
	/// return the stream it returns.
	raw_ostream &
	BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
	                                          const BlockFrequency Freq) const {
	  raw_ostream &Result = MBFI.printBlockFreq(OS, Freq);
	  return Result;
	}

	void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) {
	MBFI.view(Name, isSimple);
	}

	/// Entry frequency as reported by the wrapped MBFI.
	uint64_t
	BranchFolder::MBFIWrapper::getEntryFreq() const {
	  const uint64_t EntryFreq = MBFI.getEntryFreq();
	  return EntryFreq;
	}

	/// CountTerminators - Walk \p MBB backwards from its end and count the
	/// trailing terminators.
	///
	/// \param I Out parameter: left on the last non-terminator reached by the
	///          walk, or set to MBB->end() when the walk reaches the start of the
	///          block without meeting one.
	/// \returns the number of terminators seen.
	static unsigned CountTerminators(MachineBasicBlock *MBB,
	                                 MachineBasicBlock::iterator &I) {
	  unsigned NumTerms = 0;
	  for (I = MBB->end(); I != MBB->begin(); ++NumTerms) {
	    --I;
	    // Stop at the first instruction that is not a terminator.
	    if (!I->isTerminator())
	      return NumTerms;
	  }
	  // Ran off the front of the block: there is no non-terminator to report.
	  I = MBB->end();
	  return NumTerms;
	}

	/// A no successor, non-return block probably ends in unreachable and is cold.
	/// Also consider a block that ends in an indirect branch to be a return block,
	/// since many targets use plain indirect branches to return.
	///
	/// \returns false for any block with successors; true for an empty block
	/// without successors; otherwise true exactly when the last instruction is
	/// neither a return nor an indirect branch.
	static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) {
	  if (!MBB->succ_empty())
	    return false;
	  if (MBB->empty())
	    return true;
	  return !(MBB->back().isReturn() || MBB->back().isIndirectBranch());
	}

	/// ProfitableToMerge - Check if two machine basic blocks have a common tail
	/// and decide if it would be profitable to merge those tails. Return the
	/// length of the common tail and iterators to the first common instruction
	/// in each block.
	/// MBB1, MBB2 The blocks to check
	/// MinCommonTailLength Minimum size of tail block to be merged.
	/// CommonTailLen Out parameter to record the size of the shared tail between
	/// MBB1 and MBB2
	/// I1, I2 Iterator references that will be changed to point to the first
	/// instruction in the common tail shared by MBB1,MBB2
	/// SuccBB A common successor of MBB1, MBB2 which are in a canonical form
	/// relative to SuccBB
	/// PredBB The layout predecessor of SuccBB, if any.
	/// FuncletMembership map from block to funclet #.
	/// AfterPlacement True if we are merging blocks after layout. Stricter
	/// thresholds apply to prevent undoing tail-duplication.
	static bool
	ProfitableToMerge(MachineBasicBlock MBB1, MachineBasicBlock MBB2,
	unsigned MinCommonTailLength, unsigned &CommonTailLen,
	MachineBasicBlock::iterator &I1,
	MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
	MachineBasicBlock *PredBB,
	DenseMap<const MachineBasicBlock *, int> &FuncletMembership,
	bool AfterPlacement) {
	// It is never profitable to tail-merge blocks from two different funclets.
	if (!FuncletMembership.empty()) {
	auto Funclet1 = FuncletMembership.find(MBB1);
	assert(Funclet1 != FuncletMembership.end());
	auto Funclet2 = FuncletMembership.find(MBB2);
	assert(Funclet2 != FuncletMembership.end());
	if (Funclet1->second != Funclet2->second)
	return false;
	}

	CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
	if (CommonTailLen == 0)
	return false;
	DEBUG(dbgs() << "Common tail length of BB#" << MBB1->getNumber()
	<< " and BB#" << MBB2->getNumber() << " is " << CommonTailLen
	<< '\n');

	// It's almost always profitable to merge any number of non-terminator
	// instructions with the block that falls through into the common successor.
	// This is true only for a single successor. For multiple successors, we are
	// trading a conditional branch for an unconditional one.
	// TODO: Re-visit successor size for non-layout tail merging.
	if ((MBB1 == PredBB \|\| MBB2 == PredBB) &&
	(!AfterPlacement \|\| MBB1->succ_size() == 1)) {
	MachineBasicBlock::iterator I;
	unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
	if (CommonTailLen > NumTerms)
	return true;
	}

	// If these are identical non-return blocks with no successors, merge them.
	// Such blocks are typically cold calls to noreturn functions like abort, and
	// are unlikely to become a fallthrough target after machine block placement.
	// Tail merging these blocks is unlikely to create additional unconditional
	// branches, and will reduce the size of this cold code.
	if (I1 == MBB1->begin() && I2 == MBB2->begin() &&
	blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2))
	return true;

	// If one of the blocks can be completely merged and happens to be in
	// a position where the other could fall through into it, merge any number
	// of instructions, because it can be done without a branch.
	// TODO: If the blocks are not adjacent, move one of them so that they are?
	if (MBB1->isLayoutSuccessor(MBB2) && I2 == MBB2->begin())
	return true;
	if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
	return true;

	// If both blocks are identical and end in a branch, merge them unless they
	// both have a fallthrough predecessor and successor.
	// We can only do this after block placement because it depends on whether
	// there are fallthroughs, and we don't know until after layout.
	if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) {
	auto BothFallThrough = [](MachineBasicBlock *MBB) {
	if (MBB->succ_size() != 0 && !MBB->canFallThrough())
	return false;
	MachineFunction::iterator I(MBB);
	MachineFunction *MF = MBB->getParent();
	return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough();
	};
	if (!BothFallThrough(MBB1) \|\| !BothFallThrough(MBB2))
	return true;
	}

	// If both blocks have an unconditional branch temporarily stripped out,
	// count that as an additional common instruction for the following
	// heuristics. This heuristic is only accurate for single-succ blocks, so to
	// make sure that during layout merging and duplicating don't crash, we check
	// for that when merging during layout.
	unsigned EffectiveTailLen = CommonTailLen;
	if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
	(MBB1->succ_size() == 1 \|\| !AfterPlacement) &&
	!MBB1->back().isBarrier() &&
	!MBB2->back().isBarrier())
	++EffectiveTailLen;

	// Check if the common tail is long enough to be worthwhile.
	if (EffectiveTailLen >= MinCommonTailLength)
	return true;

	// If we are optimizing for code size, 2 instructions in common is enough if
	// we don't have to split a block. At worst we will be introducing 1 new
	// branch instruction, which is likely to be smaller than the 2
	// instructions that would be deleted in the merge.
	MachineFunction *MF = MBB1->getParent();
	return EffectiveTailLen >= 2 && MF->getFunction()->optForSize() &&
	(I1 == MBB1->begin() \|\| I2 == MBB2->begin());
	}

	/// Among the entries at the end of MergePotentials whose hash equals
	/// \p CurHash, find the pairs that ProfitableToMerge accepts and remember the
	/// group with the longest common tail in SameTails.
	///
	/// SameTails is cleared on entry. On return it holds the entry that achieved
	/// the longest tail followed by every partner sharing a tail of that length
	/// with it; it stays empty when no pair is profitable.
	///
	/// \param CurHash             Hash value identifying the candidate group.
	/// \param MinCommonTailLength Forwarded to ProfitableToMerge.
	/// \param SuccBB, PredBB      Forwarded to ProfitableToMerge.
	/// \returns the longest common tail length found, or 0 if none.
	unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
	unsigned MinCommonTailLength,
	MachineBasicBlock *SuccBB,
	MachineBasicBlock *PredBB) {
	unsigned maxCommonTailLength = 0U;
	SameTails.clear();
	MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
	// Entry that currently owns the longest tail; starts at the last element.
	MPIterator HighestMPIter = std::prev(MergePotentials.end());
	// Outer walk: backwards from the last element while the hash matches. The
	// first element (B) is never the outer candidate; it can only be a partner.
	for (MPIterator CurMPIter = std::prev(MergePotentials.end()),
	B = MergePotentials.begin();
	CurMPIter != B && CurMPIter->getHash() == CurHash; --CurMPIter) {
	// Inner walk: every earlier entry with the same hash is a possible partner.
	for (MPIterator I = std::prev(CurMPIter); I->getHash() == CurHash; --I) {
	unsigned CommonTailLen;
	if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
	MinCommonTailLength,
	CommonTailLen, TrialBBI1, TrialBBI2,
	SuccBB, PredBB,
	FuncletMembership,
	AfterBlockPlacement)) {
	// A strictly longer tail starts a fresh group headed by CurMPIter.
	if (CommonTailLen > maxCommonTailLength) {
	SameTails.clear();
	maxCommonTailLength = CommonTailLen;
	HighestMPIter = CurMPIter;
	SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
	}
	// Partners are only added to the group of the current record holder.
	if (HighestMPIter == CurMPIter &&
	CommonTailLen == maxCommonTailLength)
	SameTails.push_back(SameTailElt(I, TrialBBI2));
	}
	// Stop before the loop step would decrement past the first element.
	if (I == B)
	break;
	}
	}
	return maxCommonTailLength;
	}

	void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
	MachineBasicBlock *SuccBB,
	MachineBasicBlock *PredBB) {
	MPIterator CurMPIter, B;
	for (CurMPIter = std::prev(MergePotentials.end()),
	B = MergePotentials.begin();
	CurMPIter->getHash() == CurHash; --CurMPIter) {
	// Put the unconditional branch back, if we need one.
	MachineBasicBlock *CurMBB = CurMPIter->getBlock();
	if (SuccBB && CurMBB != PredBB)
	FixTail(CurMBB, SuccBB, TII);
	if (CurMPIter == B)
	break;
	}
	if (CurMPIter->getHash() != CurHash)
	CurMPIter++;
	MergePotentials.erase(CurMPIter, MergePotentials.end());
	}

	/// Split one of the blocks in SameTails so that the common tail ends up in
	/// a block of its own.
	///
	/// \param PredBB  Preferred block to split; updated to the new block when
	///                PredBB itself is the one that gets split.
	/// \param SuccBB  Common successor of the candidates, or null.
	/// \param maxCommonTailLength  Length of the common tail (debug output only).
	/// \param commonTailIndex  Set to the index in SameTails of the entry that
	///                was split; that entry then refers to the new tail block.
	/// \returns true on success, false if SplitMBBAt could not split the block.
	bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
	                                             MachineBasicBlock *SuccBB,
	                                             unsigned maxCommonTailLength,
	                                             unsigned &commonTailIndex) {
	  commonTailIndex = 0;
	  unsigned TimeEstimate = ~0U;
	  for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
	    // Use PredBB if possible; that doesn't require a new branch.
	    if (SameTails[i].getBlock() == PredBB) {
	      commonTailIndex = i;
	      break;
	    }
	    // Otherwise, make a (fairly bogus) choice based on estimate of
	    // how long it will take the various blocks to execute.
	    unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
	                                 SameTails[i].getTailStartPos());
	    if (t <= TimeEstimate) {
	      TimeEstimate = t;
	      commonTailIndex = i;
	    }
	  }

	  MachineBasicBlock::iterator BBI =
	      SameTails[commonTailIndex].getTailStartPos();
	  MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();

	  DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
	               << maxCommonTailLength);

	  // If the split block unconditionally falls-thru to SuccBB, it will be
	  // merged. In control flow terms it should then take SuccBB's name. e.g. If
	  // SuccBB is an inner loop, the common tail is still part of the inner loop.
	  const BasicBlock *BB = (SuccBB && MBB->succ_size() == 1) ?
	      SuccBB->getBasicBlock() : MBB->getBasicBlock();
	  // newMBB is a pointer: it is null-checked and dereferenced below.
	  MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
	  if (!newMBB) {
	    DEBUG(dbgs() << "... failed!");
	    return false;
	  }

	  SameTails[commonTailIndex].setBlock(newMBB);
	  SameTails[commonTailIndex].setTailStartPos(newMBB->begin());

	  // If we split PredBB, newMBB is the new predecessor.
	  if (PredBB == MBB)
	    PredBB = newMBB;

	  return true;
	}

	void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) {
	MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();

	std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size());
	for (unsigned int i = 0 ; i != SameTails.size() ; ++i) {
	if (i != commonTailIndex)
	NextCommonInsts[i] = SameTails[i].getTailStartPos();
	else {
	assert(SameTails[i].getTailStartPos() == MBB->begin() &&
	"MBB is not a common tail only block");
	}
	}

	for (auto &MI : *MBB) {
	if (MI.isDebugValue())
	continue;
	DebugLoc DL = MI.getDebugLoc();
	for (unsigned int i = 0 ; i < NextCommonInsts.size() ; i++) {
	if (i == commonTailIndex)
	continue;

	auto &Pos = NextCommonInsts[i];
	assert(Pos != SameTails[i].getBlock()->end() &&
	"Reached BB end within common tail");
	while (Pos->isDebugValue()) {
	++Pos;
	assert(Pos != SameTails[i].getBlock()->end() &&
	"Reached BB end within common tail");
	}
	assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
	DL = DILocation::getMergedLocation(DL, Pos->getDebugLoc());
	NextCommonInsts[i] = ++Pos;
	}
	MI.setDebugLoc(DL);
	}
	}

	/// Fold per-instruction information from a block's tail into the matching
	/// instructions of the common tail block \p MBBCommon.
	///
	/// Both blocks are walked backwards from their ends, skipping debug values.
	/// For each matching pair, memory operands are merged on instructions that
	/// may load or store, and an undef flag on a register operand of the common
	/// instruction is cleared when the other instruction's operand lacks it.
	///
	/// \param MBBIStartPos  First instruction of the tail in the block being
	///                      merged away.
	/// \param MBBCommon     The block that keeps the common tail.
	static void
	mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
	                MachineBasicBlock &MBBCommon) {
	  MachineBasicBlock *MBB = MBBIStartPos->getParent();
	  // Note CommonTailLen does not necessarily matches the size of
	  // the common BB nor all its instructions because of debug
	  // instructions differences.
	  unsigned CommonTailLen = 0;
	  for (auto E = MBB->end(); MBBIStartPos != E; ++MBBIStartPos)
	    ++CommonTailLen;

	  MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin();
	  MachineBasicBlock::reverse_iterator MBBIE = MBB->rend();
	  MachineBasicBlock::reverse_iterator MBBICommon = MBBCommon.rbegin();
	  MachineBasicBlock::reverse_iterator MBBIECommon = MBBCommon.rend();

	  while (CommonTailLen--) {
	    assert(MBBI != MBBIE && "Reached BB end within common tail length!");
	    // MBBIE is only read by the assert above; silence unused warnings.
	    (void)MBBIE;

	    // Debug values in this block still count toward CommonTailLen.
	    if (MBBI->isDebugValue()) {
	      ++MBBI;
	      continue;
	    }

	    while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
	      ++MBBICommon;

	    assert(MBBICommon != MBBIECommon &&
	           "Reached BB end within common tail length!");
	    assert(MBBICommon->isIdenticalTo(*MBBI) && "Expected matching MIIs!");

	    // Merge MMOs from memory operations in the common block.
	    if (MBBICommon->mayLoad() || MBBICommon->mayStore())
	      MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI));
	    // Drop undef flags if they aren't present in all merged instructions.
	    for (unsigned I = 0, E = MBBICommon->getNumOperands(); I != E; ++I) {
	      MachineOperand &MO = MBBICommon->getOperand(I);
	      if (MO.isReg() && MO.isUndef()) {
	        const MachineOperand &OtherMO = MBBI->getOperand(I);
	        if (!OtherMO.isUndef())
	          MO.setIsUndef(false);
	      }
	    }

	    ++MBBI;
	    ++MBBICommon;
	  }
	}

	/// See if any of the blocks in MergePotentials (which all have SuccBB as a
	/// successor, or all have no successor if it is null) can be tail-merged.
	///
	/// If there is a successor, any blocks in MergePotentials that are not
	/// tail-merged and are not immediately before SuccBB must have an
	/// unconditional branch to SuccBB added (but the predecessor/successor lists
	/// need no adjustment). The lone predecessor of SuccBB that falls through
	/// into SuccBB, if any, is given in PredBB.
	///
	/// \param MinCommonTailLength  Except for the special cases handled by
	///        ProfitableToMerge, tail-merge if there are at least this many
	///        instructions in common.
	/// \returns true if at least one merge was performed.
	bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
	                                      MachineBasicBlock *PredBB,
	                                      unsigned MinCommonTailLength) {
	  bool MadeChange = false;

	  DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
	        for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
	          dbgs() << "BB#" << MergePotentials[i].getBlock()->getNumber()
	                 << (i == e-1 ? "" : ", ");
	        dbgs() << "\n";
	        if (SuccBB) {
	          dbgs() << " with successor BB#" << SuccBB->getNumber() << '\n';
	          if (PredBB)
	            dbgs() << " which has fall-through from BB#"
	                   << PredBB->getNumber() << "\n";
	        }
	        dbgs() << "Looking for common tails of at least "
	               << MinCommonTailLength << " instruction"
	               << (MinCommonTailLength == 1 ? "" : "s") << '\n';
	       );

	  // Sort by hash value so that blocks with identical end sequences sort
	  // together.
	  array_pod_sort(MergePotentials.begin(), MergePotentials.end());

	  // Walk through equivalence sets looking for actual exact matches.
	  while (MergePotentials.size() > 1) {
	    unsigned CurHash = MergePotentials.back().getHash();

	    // Build SameTails, identifying the set of blocks with this hash code
	    // and with the maximum number of instructions in common.
	    unsigned maxCommonTailLength = ComputeSameTails(CurHash,
	                                                    MinCommonTailLength,
	                                                    SuccBB, PredBB);

	    // If we didn't find any pair that has at least MinCommonTailLength
	    // instructions in common, remove all blocks with this hash code and retry.
	    if (SameTails.empty()) {
	      RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
	      continue;
	    }

	    // If one of the blocks is the entire common tail (and not the entry
	    // block, which we can't jump to), we can treat all blocks with this same
	    // tail at once. Use PredBB if that is one of the possibilities, as that
	    // will not introduce any extra branches.
	    MachineBasicBlock *EntryBB =
	        &MergePotentials.front().getBlock()->getParent()->front();
	    // SameTails.size() serves as the "no block chosen yet" sentinel.
	    unsigned commonTailIndex = SameTails.size();
	    // If there are two blocks, check to see if one can be made to fall through
	    // into the other.
	    if (SameTails.size() == 2 &&
	        SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
	        SameTails[1].tailIsWholeBlock())
	      commonTailIndex = 1;
	    else if (SameTails.size() == 2 &&
	             SameTails[1].getBlock()->isLayoutSuccessor(
	                 SameTails[0].getBlock()) &&
	             SameTails[0].tailIsWholeBlock())
	      commonTailIndex = 0;
	    else {
	      // Otherwise just pick one, favoring the fall-through predecessor if
	      // there is one.
	      for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
	        MachineBasicBlock *MBB = SameTails[i].getBlock();
	        if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
	          continue;
	        if (MBB == PredBB) {
	          commonTailIndex = i;
	          break;
	        }
	        if (SameTails[i].tailIsWholeBlock())
	          commonTailIndex = i;
	      }
	    }

	    if (commonTailIndex == SameTails.size() ||
	        (SameTails[commonTailIndex].getBlock() == PredBB &&
	         !SameTails[commonTailIndex].tailIsWholeBlock())) {
	      // None of the blocks consist entirely of the common tail.
	      // Split a block so that one does.
	      if (!CreateCommonTailOnlyBlock(PredBB, SuccBB,
	                                     maxCommonTailLength, commonTailIndex)) {
	        RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
	        continue;
	      }
	    }

	    MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();

	    // Recompute common tail MBB's edge weights and block frequency.
	    setCommonTailEdgeWeights(*MBB);

	    // Merge debug locations across identical instructions for common tail.
	    MergeCommonTailDebugLocs(commonTailIndex);

	    // MBB is common tail. Adjust all other BB's to jump to this one.
	    // Traversal must be forwards so erases work.
	    DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
	                 << " for ");
	    for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
	      if (commonTailIndex == i)
	        continue;
	      DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
	                   << (i == e-1 ? "" : ", "));
	      // Merge operations (MMOs, undef flags)
	      mergeOperations(SameTails[i].getTailStartPos(), *MBB);
	      // Hack the end off BB i, making it jump to BB commonTailIndex instead.
	      ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
	      // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
	      MergePotentials.erase(SameTails[i].getMPIter());
	    }
	    DEBUG(dbgs() << "\n");
	    // We leave commonTailIndex in the worklist in case there are other blocks
	    // that match it with a smaller number of instructions.
	    MadeChange = true;
	  }
	  return MadeChange;
	}

	/// Look for tail-merging opportunities across the whole function.
	///
	/// Two groups of candidates are tried: blocks with no successors (only
	/// before block placement), and, for every block with at least two
	/// predecessors, the set of its predecessors.
	///
	/// \returns true if any merge was performed; false immediately when
	/// EnableTailMerge is off.
	bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
	  bool MadeChange = false;
	  if (!EnableTailMerge) return MadeChange;

	  // First find blocks with no successors.
	  // Block placement does not create new tail merging opportunities for these
	  // blocks.
	  if (!AfterBlockPlacement) {
	    MergePotentials.clear();
	    for (MachineBasicBlock &MBB : MF) {
	      if (MergePotentials.size() == TailMergeThreshold)
	        break;
	      if (!TriedMerging.count(&MBB) && MBB.succ_empty())
	        MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB));
	    }

	    // If this is a large problem, avoid visiting the same basic blocks
	    // multiple times.
	    if (MergePotentials.size() == TailMergeThreshold)
	      for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
	        TriedMerging.insert(MergePotentials[i].getBlock());

	    // See if we can do any tail merging on those.
	    if (MergePotentials.size() >= 2)
	      MadeChange |= TryTailMergeBlocks(nullptr, nullptr, MinCommonTailLength);
	  }

	  // Look at blocks (IBB) with multiple predecessors (PBB).
	  // We change each predecessor to a canonical form, by
	  // (1) temporarily removing any unconditional branch from the predecessor
	  // to IBB, and
	  // (2) alter conditional branches so they branch to the other block
	  // not IBB; this may require adding back an unconditional branch to IBB
	  // later, where there wasn't one coming in. E.g.
	  //   Bcc IBB
	  //   fallthrough to QBB
	  // here becomes
	  //   Bncc QBB
	  // with a conceptual B to IBB after that, which never actually exists.
	  // With those changes, we see whether the predecessors' tails match,
	  // and merge them if so. We change things out of canonical form and
	  // back to the way they were later in the process. (OptimizeBranches
	  // would undo some of this, but we can't use it, because we'd get into
	  // a compile-time infinite loop repeatedly doing and undoing the same
	  // transformations.)

	  for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
	       I != E; ++I) {
	    if (I->pred_size() < 2) continue;
	    SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
	    MachineBasicBlock *IBB = &*I;
	    MachineBasicBlock *PredBB = &*std::prev(I);
	    MergePotentials.clear();
	    // Only assigned and read when AfterBlockPlacement && MLI holds;
	    // initialized so it is never indeterminate.
	    MachineLoop *ML = nullptr;

	    // Bail if merging after placement and IBB is the loop header because
	    // -- If merging predecessors that belong to the same loop as IBB, the
	    // common tail of merged predecessors may become the loop top if block
	    // placement is called again and the predecessors may branch to this common
	    // tail and require more branches. This can be relaxed if
	    // MachineBlockPlacement::findBestLoopTop is more flexible.
	    // --If merging predecessors that do not belong to the same loop as IBB, the
	    // loop info of IBB's loop and the other loops may be affected. Calling the
	    // block placement again may make big change to the layout and eliminate the
	    // reason to do tail merging here.
	    if (AfterBlockPlacement && MLI) {
	      ML = MLI->getLoopFor(IBB);
	      if (ML && IBB == ML->getHeader())
	        continue;
	    }

	    for (MachineBasicBlock *PBB : I->predecessors()) {
	      if (MergePotentials.size() == TailMergeThreshold)
	        break;

	      if (TriedMerging.count(PBB))
	        continue;

	      // Skip blocks that loop to themselves, can't tail merge these.
	      if (PBB == IBB)
	        continue;

	      // Visit each predecessor only once.
	      if (!UniquePreds.insert(PBB).second)
	        continue;

	      // Skip blocks which may jump to a landing pad. Can't tail merge these.
	      if (PBB->hasEHPadSuccessor())
	        continue;

	      // After block placement, only consider predecessors that belong to the
	      // same loop as IBB. The reason is the same as above when skipping loop
	      // header.
	      if (AfterBlockPlacement && MLI)
	        if (ML != MLI->getLoopFor(PBB))
	          continue;

	      MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
	      SmallVector<MachineOperand, 4> Cond;
	      if (!TII->analyzeBranch(*PBB, TBB, FBB, Cond, true)) {
	        // Failing case: IBB is the target of a cbr, and we cannot reverse the
	        // branch.
	        SmallVector<MachineOperand, 4> NewCond(Cond);
	        if (!Cond.empty() && TBB == IBB) {
	          if (TII->reverseBranchCondition(NewCond))
	            continue;
	          // This is the QBB case described above
	          if (!FBB) {
	            auto Next = ++PBB->getIterator();
	            if (Next != MF.end())
	              FBB = &*Next;
	          }
	        }

	        // Failing case: the only way IBB can be reached from PBB is via
	        // exception handling. Happens for landing pads. Would be nice to have
	        // a bit in the edge so we didn't have to do all this.
	        if (IBB->isEHPad()) {
	          MachineFunction::iterator IP = ++PBB->getIterator();
	          MachineBasicBlock *PredNextBB = nullptr;
	          if (IP != MF.end())
	            PredNextBB = &*IP;
	          if (!TBB) {
	            if (IBB != PredNextBB) // fallthrough
	              continue;
	          } else if (FBB) {
	            if (TBB != IBB && FBB != IBB) // cbr then ubr
	              continue;
	          } else if (Cond.empty()) {
	            if (TBB != IBB) // ubr
	              continue;
	          } else {
	            if (TBB != IBB && IBB != PredNextBB) // cbr
	              continue;
	          }
	        }

	        // Remove the unconditional branch at the end, if any.
	        if (TBB && (Cond.empty() || FBB)) {
	          DebugLoc dl = PBB->findBranchDebugLoc();
	          TII->removeBranch(*PBB);
	          if (!Cond.empty())
	            // reinsert conditional branch only, for now
	            TII->insertBranch(*PBB, (TBB == IBB) ? FBB : TBB, nullptr,
	                              NewCond, dl);
	        }

	        MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(*PBB), PBB));
	      }
	    }

	    // If this is a large problem, avoid visiting the same basic blocks multiple
	    // times.
	    if (MergePotentials.size() == TailMergeThreshold)
	      for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
	        TriedMerging.insert(MergePotentials[i].getBlock());

	    if (MergePotentials.size() >= 2)
	      MadeChange |= TryTailMergeBlocks(IBB, PredBB, MinCommonTailLength);

	    // Reinsert an unconditional branch if needed. The 1 below can occur as a
	    // result of removing blocks in TryTailMergeBlocks.
	    PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks
	    if (MergePotentials.size() == 1 &&
	        MergePotentials.begin()->getBlock() != PredBB)
	      FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
	  }

	  return MadeChange;
	}

	/// Recompute the block frequency of the common tail block \p TailMBB and,
	/// when it has more than one successor, its successor probabilities.
	///
	/// The new block frequency is the sum of the frequencies of all blocks in
	/// SameTails. Each successor probability is that edge's aggregated frequency
	/// divided by the sum over all successor edges; probabilities are left
	/// untouched when that sum is zero.
	void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
	  SmallVector<BlockFrequency, 2> EdgeFreqLs(TailMBB.succ_size());
	  BlockFrequency AccumulatedMBBFreq;

	  // Aggregate edge frequency of successor edge j:
	  // edgeFreq(j) = sum (freq(bb) * edgeProb(bb, j)),
	  // where bb is a basic block that is in SameTails.
	  for (const auto &Src : SameTails) {
	    const MachineBasicBlock *SrcMBB = Src.getBlock();
	    BlockFrequency BlockFreq = MBBFreqInfo.getBlockFreq(SrcMBB);
	    AccumulatedMBBFreq += BlockFreq;

	    // It is not necessary to recompute edge weights if TailBB has less than two
	    // successors.
	    if (TailMBB.succ_size() <= 1)
	      continue;

	    auto EdgeFreq = EdgeFreqLs.begin();

	    // Accumulate into the slot the iterator points at (not the iterator
	    // itself): this block's frequency scaled by its edge probability.
	    for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
	         SuccI != SuccE; ++SuccI, ++EdgeFreq)
	      *EdgeFreq += BlockFreq * MBPI.getEdgeProbability(SrcMBB, *SuccI);
	  }

	  MBBFreqInfo.setBlockFreq(&TailMBB, AccumulatedMBBFreq);

	  if (TailMBB.succ_size() <= 1)
	    return;

	  auto SumEdgeFreq =
	      std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
	          .getFrequency();
	  auto EdgeFreq = EdgeFreqLs.begin();

	  // Skip normalization when the total is zero to avoid dividing by zero.
	  if (SumEdgeFreq > 0) {
	    for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
	         SuccI != SuccE; ++SuccI, ++EdgeFreq) {
	      auto Prob = BranchProbability::getBranchProbability(
	          EdgeFreq->getFrequency(), SumEdgeFreq);
	      TailMBB.setSuccProbability(SuccI, Prob);
	    }
	  }
	}

	//===----------------------------------------------------------------------===//
	// Branch Optimization
	//===----------------------------------------------------------------------===//

	/// Run OptimizeBlock over every block of \p MF except the first, removing
	/// blocks that end up with no predecessors.
	///
	/// Blocks are renumbered and FuncletMembership is recomputed first.
	///
	/// \returns true if any block was changed or removed.
	bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
	  bool MadeChange = false;

	  // Make sure blocks are numbered in order
	  MF.RenumberBlocks();
	  // Renumbering blocks alters funclet membership, recalculate it.
	  FuncletMembership = getFuncletMembership(MF);

	  for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
	       I != E; ) {
	    // Advance the iterator before touching the block so that removing the
	    // block below does not invalidate the loop position.
	    MachineBasicBlock *MBB = &*I++;
	    MadeChange |= OptimizeBlock(MBB);

	    // If it is dead, remove it.
	    if (MBB->pred_empty()) {
	      RemoveDeadBlock(MBB);
	      MadeChange = true;
	      ++NumDeadBlocks;
	    }
	  }

	  return MadeChange;
	}

	// A block holding nothing but debug info counts as empty; otherwise the
	// presence of debug info would affect codegen.
	static bool IsEmptyBlock(MachineBasicBlock *MBB) {
	  MachineBasicBlock::iterator FirstReal = MBB->getFirstNonDebugInstr();
	  return FirstReal == MBB->end();
	}

	// Returns true if the first non-debug instruction of MBB is a branch, so
	// that a block of debug info plus branches is treated like a block of
	// branches alone. MBB must contain at least one non-debug instruction.
	static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
	  MachineBasicBlock::iterator FirstReal = MBB->getFirstNonDebugInstr();
	  assert(FirstReal != MBB->end() && "empty block!");
	  return FirstReal->isBranch();
	}

	/// IsBetterFallthrough - Return true if it would be clearly better to
	/// fall-through to MBB1 than to fall through into MBB2. This has to return
	/// a strict ordering, returning true for both (MBB1,MBB2) and (MBB2,MBB1) will
	/// result in infinite loops.
	///
	/// Returns false when either block has no non-debug instruction.
	static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
	                                MachineBasicBlock *MBB2) {
	  // Right now, we use a simple heuristic. If MBB2 ends with a call, and
	  // MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
	  // optimize branches that branch to either a return block or an assert block
	  // into a fallthrough to the return.
	  MachineBasicBlock::iterator MBB1I = MBB1->getLastNonDebugInstr();
	  MachineBasicBlock::iterator MBB2I = MBB2->getLastNonDebugInstr();
	  if (MBB1I == MBB1->end() || MBB2I == MBB2->end())
	    return false;

	  // If there is a clear successor ordering we make sure that one block
	  // will fall through to the next
	  if (MBB1->isSuccessor(MBB2)) return true;
	  if (MBB2->isSuccessor(MBB1)) return false;

	  return MBB2I->isCall() && !MBB1I->isCall();
	}

	/// getBranchDebugLoc - Return the DebugLoc of the last non-debug instruction
	/// of \p MBB when that instruction is a branch; otherwise return an empty
	/// DebugLoc.
	static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
	  MachineBasicBlock::iterator Last = MBB.getLastNonDebugInstr();
	  // No real instruction, or the block does not end in a branch.
	  if (Last == MBB.end() || !Last->isBranch())
	    return DebugLoc();
	  return Last->getDebugLoc();
	}

	bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
	bool MadeChange = false;
	MachineFunction &MF = *MBB->getParent();
	ReoptimizeBlock:

	MachineFunction::iterator FallThrough = MBB->getIterator();
	++FallThrough;

	// Make sure MBB and FallThrough belong to the same funclet.
	bool SameFunclet = true;
	if (!FuncletMembership.empty() && FallThrough != MF.end()) {
	auto MBBFunclet = FuncletMembership.find(MBB);
	assert(MBBFunclet != FuncletMembership.end());
	auto FallThroughFunclet = FuncletMembership.find(&*FallThrough);
	assert(FallThroughFunclet != FuncletMembership.end());
	SameFunclet = MBBFunclet->second == FallThroughFunclet->second;
	}

	// If this block is empty, make everyone use its fall-through, not the block
	// explicitly. Landing pads should not do this since the landing-pad table
	// points to this block. Blocks with their addresses taken shouldn't be
	// optimized away.
	if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() &&
	SameFunclet) {
	// Dead block? Leave for cleanup later.
	if (MBB->pred_empty()) return MadeChange;

	if (FallThrough == MF.end()) {
	// TODO: Simplify preds to not branch here if possible!
	} else if (FallThrough->isEHPad()) {
	// Don't rewrite to a landing pad fallthough. That could lead to the case
	// where a BB jumps to more than one landing pad.
	// TODO: Is it ever worth rewriting predecessors which don't already
	// jump to a landing pad, and so can safely jump to the fallthrough?
	} else if (MBB->isSuccessor(&*FallThrough)) {
	// Rewrite all predecessors of the old block to go to the fallthrough
	// instead.
	while (!MBB->pred_empty()) {
	MachineBasicBlock Pred = (MBB->pred_end()-1);
	Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough);
	}
	// If MBB was the target of a jump table, update jump tables to go to the
	// fallthrough instead.
	if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
	MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough);
	MadeChange = true;
	}
	return MadeChange;
	}

	// Check to see if we can simplify the terminator of the block before this
	// one.
	MachineBasicBlock &PrevBB = *std::prev(MachineFunction::iterator(MBB));

	MachineBasicBlock PriorTBB = nullptr, PriorFBB = nullptr;
	SmallVector<MachineOperand, 4> PriorCond;
	bool PriorUnAnalyzable =
	TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
	if (!PriorUnAnalyzable) {
	// If the CFG for the prior block has extra edges, remove them.
	MadeChange \|= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
	!PriorCond.empty());

	// If the previous branch is conditional and both conditions go to the same
	// destination, remove the branch, replacing it with an unconditional one or
	// a fall-through.
	if (PriorTBB && PriorTBB == PriorFBB) {
	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	PriorCond.clear();
	if (PriorTBB != MBB)
	TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}

	// If the previous block unconditionally falls through to this block and
	// this block has no other predecessors, move the contents of this block
	// into the prior block. This doesn't usually happen when SimplifyCFG
	// has been used, but it can happen if tail merging splits a fall-through
	// predecessor of a block.
	// This has to check PrevBB->succ_size() because EH edges are ignored by
	// AnalyzeBranch.
	if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
	PrevBB.succ_size() == 1 &&
	!MBB->hasAddressTaken() && !MBB->isEHPad()) {
	DEBUG(dbgs() << "\nMerging into block: " << PrevBB
	<< "From MBB: " << *MBB);
	// Remove redundant DBG_VALUEs first.
	if (PrevBB.begin() != PrevBB.end()) {
	MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
	--PrevBBIter;
	MachineBasicBlock::iterator MBBIter = MBB->begin();
	// Check if DBG_VALUE at the end of PrevBB is identical to the
	// DBG_VALUE at the beginning of MBB.
	while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
	&& PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
	if (!MBBIter->isIdenticalTo(*PrevBBIter))
	break;
	MachineInstr &DuplicateDbg = *MBBIter;
	++MBBIter; -- PrevBBIter;
	DuplicateDbg.eraseFromParent();
	}
	}
	PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
	PrevBB.removeSuccessor(PrevBB.succ_begin());
	assert(PrevBB.succ_empty());
	PrevBB.transferSuccessors(MBB);
	MadeChange = true;
	return MadeChange;
	}

	// If the previous branch only branches to this block (conditional or
	// not) remove the branch.
	if (PriorTBB == MBB && !PriorFBB) {
	TII->removeBranch(PrevBB);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}

	// If the prior block branches somewhere else on the condition and here if
	// the condition is false, remove the uncond second branch.
	if (PriorFBB == MBB) {
	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}

	// If the prior block branches here on true and somewhere else on false, and
	// if the branch condition is reversible, reverse the branch to create a
	// fall-through.
	if (PriorTBB == MBB) {
	SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
	if (!TII->reverseBranchCondition(NewPriorCond)) {
	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}
	}

	// If this block has no successors (e.g. it is a return block or ends with
	// a call to a no-return function like abort or __cxa_throw) and if the pred
	// falls through into this block, and if it would otherwise fall through
	// into the block after this, move this block to the end of the function.
	//
	// We consider it more likely that execution will stay in the function (e.g.
	// due to loops) than it is to exit it. This asserts in loops etc, moving
	// the assert condition out of the loop body.
	if (MBB->succ_empty() && !PriorCond.empty() && !PriorFBB &&
	MachineFunction::iterator(PriorTBB) == FallThrough &&
	!MBB->canFallThrough()) {
	bool DoTransform = true;

	// We have to be careful that the succs of PredBB aren't both no-successor
	// blocks. If neither have successors and if PredBB is the second from
	// last block in the function, we'd just keep swapping the two blocks for
	// last. Only do the swap if one is clearly better to fall through than
	// the other.
	if (FallThrough == --MF.end() &&
	!IsBetterFallthrough(PriorTBB, MBB))
	DoTransform = false;

	if (DoTransform) {
	// Reverse the branch so we will fall through on the previous true cond.
	SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
	if (!TII->reverseBranchCondition(NewPriorCond)) {
	DEBUG(dbgs() << "\nMoving MBB: " << *MBB
	<< "To make fallthrough to: " << *PriorTBB << "\n");

	DebugLoc dl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl);

	// Move this block to the end of the function.
	MBB->moveAfter(&MF.back());
	MadeChange = true;
	++NumBranchOpts;
	return MadeChange;
	}
	}
	}
	}

	if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 &&
	MF.getFunction()->optForSize()) {
	// Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
	// direction, thereby defeating careful block placement and regressing
	// performance. Therefore, only consider this for optsize functions.
	MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
	if (TII->isUnconditionalTailCall(TailCall)) {
	MachineBasicBlock Pred = MBB->pred_begin();
	MachineBasicBlock PredTBB = nullptr, PredFBB = nullptr;
	SmallVector<MachineOperand, 4> PredCond;
	bool PredAnalyzable =
	!TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);

	- if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB) {
	+ if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB &&
	+ PredTBB != PredFBB) {
	// The predecessor has a conditional branch to this block which consists
	// of only a tail call. Try to fold the tail call into the conditional
	// branch.
	if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
	// TODO: It would be nice if analyzeBranch() could provide a pointer
	- // to the branch insturction so replaceBranchWithTailCall() doesn't
	+ // to the branch instruction so replaceBranchWithTailCall() doesn't
	// have to search for it.
	TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
	++NumTailCalls;
	Pred->removeSuccessor(MBB);
	MadeChange = true;
	return MadeChange;
	}
	}
	// If the predecessor is falling through to this block, we could reverse
	// the branch condition and fold the tail call into that. However, after
	// that we might have to re-arrange the CFG to fall through to the other
	// block and there is a high risk of regressing code size rather than
	// improving it.
	}
	}

	// Analyze the branch in the current block.
	MachineBasicBlock CurTBB = nullptr, CurFBB = nullptr;
	SmallVector<MachineOperand, 4> CurCond;
	bool CurUnAnalyzable =
	TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
	if (!CurUnAnalyzable) {
	// If the CFG for the prior block has extra edges, remove them.
	MadeChange \|= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());

	// If this is a two-way branch, and the FBB branches to this block, reverse
	// the condition so the single-basic-block loop is faster. Instead of:
	// Loop: xxx; jcc Out; jmp Loop
	// we want:
	// Loop: xxx; jncc Loop; jmp Out
	if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
	SmallVector<MachineOperand, 4> NewCond(CurCond);
	if (!TII->reverseBranchCondition(NewCond)) {
	DebugLoc dl = getBranchDebugLoc(*MBB);
	TII->removeBranch(*MBB);
	TII->insertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
	MadeChange = true;
	++NumBranchOpts;
	goto ReoptimizeBlock;
	}
	}

	// If this branch is the only thing in its block, see if we can forward
	// other blocks across it.
	if (CurTBB && CurCond.empty() && !CurFBB &&
	IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
	!MBB->hasAddressTaken() && !MBB->isEHPad()) {
	DebugLoc dl = getBranchDebugLoc(*MBB);
	// This block may contain just an unconditional branch. Because there can
	// be 'non-branch terminators' in the block, try removing the branch and
	// then seeing if the block is empty.
	TII->removeBranch(*MBB);
	// If the only things remaining in the block are debug info, remove these
	// as well, so this will behave the same as an empty block in non-debug
	// mode.
	if (IsEmptyBlock(MBB)) {
	// Make the block empty, losing the debug info (we could probably
	// improve this in some cases.)
	MBB->erase(MBB->begin(), MBB->end());
	}
	// If this block is just an unconditional branch to CurTBB, we can
	// usually completely eliminate the block. The only case we cannot
	// completely eliminate the block is when the block before this one
	// falls through into MBB and we can't understand the prior block's branch
	// condition.
	if (MBB->empty()) {
	bool PredHasNoFallThrough = !PrevBB.canFallThrough();
	if (PredHasNoFallThrough \|\| !PriorUnAnalyzable \|\|
	!PrevBB.isSuccessor(MBB)) {
	// If the prior block falls through into us, turn it into an
	// explicit branch to us to make updates simpler.
	if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
	PriorTBB != MBB && PriorFBB != MBB) {
	if (!PriorTBB) {
	assert(PriorCond.empty() && !PriorFBB &&
	"Bad branch analysis");
	PriorTBB = MBB;
	} else {
	assert(!PriorFBB && "Machine CFG out of date!");
	PriorFBB = MBB;
	}
	DebugLoc pdl = getBranchDebugLoc(PrevBB);
	TII->removeBranch(PrevBB);
	TII->insertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl);
	}

	// Iterate through all the predecessors, revectoring each in-turn.
	size_t PI = 0;
	bool DidChange = false;
	bool HasBranchToSelf = false;
	while(PI != MBB->pred_size()) {
	MachineBasicBlock PMBB = (MBB->pred_begin() + PI);
	if (PMBB == MBB) {
	// If this block has an uncond branch to itself, leave it.
	++PI;
	HasBranchToSelf = true;
	} else {
	DidChange = true;
	PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
	// If this change resulted in PMBB ending in a conditional
	// branch where both conditions go to the same destination,
	// change this to an unconditional branch (and fix the CFG).
	MachineBasicBlock NewCurTBB = nullptr, NewCurFBB = nullptr;
	SmallVector<MachineOperand, 4> NewCurCond;
	bool NewCurUnAnalyzable = TII->analyzeBranch(
	*PMBB, NewCurTBB, NewCurFBB, NewCurCond, true);
	if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
	DebugLoc pdl = getBranchDebugLoc(*PMBB);
	TII->removeBranch(*PMBB);
	NewCurCond.clear();
	TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl);
	MadeChange = true;
	++NumBranchOpts;
	PMBB->CorrectExtraCFGEdges(NewCurTBB, nullptr, false);
	}
	}
	}

	// Change any jumptables to go to the new MBB.
	if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
	MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
	if (DidChange) {
	++NumBranchOpts;
	MadeChange = true;
	if (!HasBranchToSelf) return MadeChange;
	}
	}
	}

	// Add the branch back if the block is more than just an uncond branch.
	TII->insertBranch(*MBB, CurTBB, nullptr, CurCond, dl);
	}
	}

	// If the prior block doesn't fall through into this block, and if this
	// block doesn't fall through into some other block, see if we can find a
	// place to move this block where a fall-through will happen.
	if (!PrevBB.canFallThrough()) {
	// Now we know that there was no fall-through into this block, check to
	// see if it has a fall-through into its successor.
	bool CurFallsThru = MBB->canFallThrough();

	if (!MBB->isEHPad()) {
	// Check all the predecessors of this block. If one of them has no fall
	// throughs, move this block right after it.
	for (MachineBasicBlock *PredBB : MBB->predecessors()) {
	// Analyze the branch at the end of the pred.
	MachineBasicBlock PredTBB = nullptr, PredFBB = nullptr;
	SmallVector<MachineOperand, 4> PredCond;
	if (PredBB != MBB && !PredBB->canFallThrough() &&
	!TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
	(!CurFallsThru \|\| !CurTBB \|\| !CurFBB) &&
	(!CurFallsThru \|\| MBB->getNumber() >= PredBB->getNumber())) {
	// If the current block doesn't fall through, just move it.
	// If the current block can fall through and does not end with a
	// conditional branch, we need to append an unconditional jump to
	// the (current) next block. To avoid a possible compile-time
	// infinite loop, move blocks only backward in this case.
	// Also, if there are already 2 branches here, we cannot add a third;
	// this means we have the case
	// Bcc next
	// B elsewhere
	// next:
	if (CurFallsThru) {
	MachineBasicBlock NextBB = &std::next(MBB->getIterator());
	CurCond.clear();
	TII->insertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc());
	}
	MBB->moveAfter(PredBB);
	MadeChange = true;
	goto ReoptimizeBlock;
	}
	}
	}

	if (!CurFallsThru) {
	// Check all successors to see if we can move this block before it.
	for (MachineBasicBlock *SuccBB : MBB->successors()) {
	// Analyze the branch at the end of the block before the succ.
	MachineFunction::iterator SuccPrev = --SuccBB->getIterator();

	// If this block doesn't already fall-through to that successor, and if
	// the succ doesn't already have a block that can fall through into it,
	// and if the successor isn't an EH destination, we can arrange for the
	// fallthrough to happen.
	if (SuccBB != MBB && &*SuccPrev != MBB &&
	!SuccPrev->canFallThrough() && !CurUnAnalyzable &&
	!SuccBB->isEHPad()) {
	MBB->moveBefore(SuccBB);
	MadeChange = true;
	goto ReoptimizeBlock;
	}
	}

	// Okay, there is no really great place to put this block. If, however,
	// the block before this one would be a fall-through if this block were
	// removed, move this block to the end of the function. There is no real
	// advantage in "falling through" to an EH block, so we don't want to
	// perform this transformation for that case.
	//
	// Also, Windows EH introduced the possibility of an arbitrary number of
	// successors to a given block. The analyzeBranch call does not consider
	// exception handling and so we can get in a state where a block
	// containing a call is followed by multiple EH blocks that would be
	// rotated infinitely at the end of the function if the transformation
	// below were performed for EH "FallThrough" blocks. Therefore, even if
	// that appears not to be happening anymore, we should assume that it is
	// possible and not remove the "!FallThrough()->isEHPad" condition below.
	MachineBasicBlock PrevTBB = nullptr, PrevFBB = nullptr;
	SmallVector<MachineOperand, 4> PrevCond;
	if (FallThrough != MF.end() &&
	!FallThrough->isEHPad() &&
	!TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
	PrevBB.isSuccessor(&*FallThrough)) {
	MBB->moveAfter(&MF.back());
	MadeChange = true;
	return MadeChange;
	}
	}
	}

	return MadeChange;
	}

	//===----------------------------------------------------------------------===//
	// Hoist Common Code
	//===----------------------------------------------------------------------===//

	/// Run HoistCommonCodeInSuccs on every basic block of \p MF.
	///
	/// \returns true if any block was changed.
	bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
	  bool MadeChange = false;
	  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
	    // Step the iterator first, then hand the block it pointed at to the
	    // hoisting routine.
	    MachineBasicBlock *MBB = &*I++;
	    MadeChange |= HoistCommonCodeInSuccs(MBB);
	  }

	  return MadeChange;
	}

	/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
	/// its 'true' successor.
	///
	/// \returns the first successor of \p BB that is not \p TrueBB, or nullptr if
	/// every successor equals \p TrueBB (or there are none).
	static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
	                                         MachineBasicBlock *TrueBB) {
	  for (MachineBasicBlock *SuccBB : BB->successors())
	    if (SuccBB != TrueBB)
	      return SuccBB;
	  return nullptr;
	}

	template <class Container>
	static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI,
	Container &Set) {
	if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
	Set.insert(*AI);
	} else {
	Set.insert(Reg);
	}
	}

	/// findHoistingInsertPosAndDeps - Find the location to move common instructions
	/// in successors to. The location is usually just before the terminator,
	/// however if the terminator is a conditional branch and its previous
	/// instruction is the flag setting instruction, the previous instruction is
	/// the preferred location. This function also gathers uses and defs of the
	/// instructions from the insertion point to the end of the block. The data is
	/// used by HoistCommonCodeInSuccs to ensure safety.
	///
	/// \param Uses  filled with registers (and aliases) read from the insertion
	///              point to the end of \p MBB.
	/// \param Defs  filled with registers (and aliases) written in that range.
	/// \returns the insertion point, or MBB->end() if hoisting should not be
	///          attempted.
	static
	MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
	                                                  const TargetInstrInfo *TII,
	                                                  const TargetRegisterInfo *TRI,
	                                                  SmallSet<unsigned,4> &Uses,
	                                                  SmallSet<unsigned,4> &Defs) {
	  MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
	  if (!TII->isUnpredicatedTerminator(*Loc))
	    return MBB->end();

	  for (const MachineOperand &MO : Loc->operands()) {
	    if (!MO.isReg())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;
	    if (MO.isUse()) {
	      addRegAndItsAliases(Reg, TRI, Uses);
	    } else {
	      if (!MO.isDead())
	        // Don't try to hoist code in the rare case the terminator defines a
	        // register that is later used.
	        return MBB->end();

	      // If the terminator defines a register, make sure we don't hoist
	      // the instruction whose def might be clobbered by the terminator.
	      addRegAndItsAliases(Reg, TRI, Defs);
	    }
	  }

	  if (Uses.empty())
	    return Loc;
	  if (Loc == MBB->begin())
	    return MBB->end();

	  // The terminator is probably a conditional branch, try not to separate the
	  // branch from condition setting instruction.
	  MachineBasicBlock::iterator PI =
	    skipDebugInstructionsBackward(std::prev(Loc), MBB->begin());

	  bool IsDef = false;
	  for (const MachineOperand &MO : PI->operands()) {
	    // If PI has a regmask operand, it is probably a call. Separate away.
	    if (MO.isRegMask())
	      return Loc;
	    if (!MO.isReg() || MO.isUse())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;
	    if (Uses.count(Reg)) {
	      IsDef = true;
	      break;
	    }
	  }
	  if (!IsDef)
	    // The condition setting instruction is not just before the conditional
	    // branch.
	    return Loc;

	  // Be conservative, don't insert instruction above something that may have
	  // side-effects. And since it's potentially bad to separate flag setting
	  // instruction from the conditional branch, just abort the optimization
	  // completely.
	  // Also avoid moving code above predicated instruction since it's hard to
	  // reason about register liveness with predicated instruction.
	  bool DontMoveAcrossStore = true;
	  if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
	    return MBB->end();


	  // Find out what registers are live. Note this routine is ignoring other live
	  // registers which are only used by instructions in successor blocks.
	  for (const MachineOperand &MO : PI->operands()) {
	    if (!MO.isReg())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;
	    if (MO.isUse()) {
	      addRegAndItsAliases(Reg, TRI, Uses);
	    } else {
	      if (Uses.erase(Reg)) {
	        if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
	            Uses.erase(*SubRegs); // Use sub-registers to be conservative
	        }
	      }
	      addRegAndItsAliases(Reg, TRI, Defs);
	    }
	  }

	  return PI;
	}

	/// Hoist the run of identical instructions found at the start of both
	/// successors of \p MBB's conditional branch up into \p MBB.
	///
	/// Nothing is done (and false is returned) when the branch cannot be analyzed
	/// or is not conditional, when the false successor cannot be determined, when
	/// either successor has more than one predecessor, when no insertion point is
	/// found, or when the successors share no hoistable leading instructions.
	/// On success the live-in lists of both successors are updated for the hoisted
	/// defs and kills.
	///
	/// \returns true if at least one instruction was hoisted.
	bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
	  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
	  SmallVector<MachineOperand, 4> Cond;
	  if (TII->analyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
	    return false;

	  if (!FBB) FBB = findFalseBlock(MBB, TBB);
	  if (!FBB)
	    // Malformed bcc? True and false blocks are the same?
	    return false;

	  // Restrict the optimization to cases where MBB is the only predecessor,
	  // it is an obvious win.
	  if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
	    return false;

	  // Find a suitable position to hoist the common instructions to. Also figure
	  // out which registers are used or defined by instructions from the insertion
	  // point to the end of the block.
	  SmallSet<unsigned, 4> Uses, Defs;
	  MachineBasicBlock::iterator Loc =
	    findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
	  if (Loc == MBB->end())
	    return false;

	  bool HasDups = false;
	  SmallVector<unsigned, 4> LocalDefs, LocalKills;
	  SmallSet<unsigned, 4> ActiveDefsSet, AllDefsSet;
	  MachineBasicBlock::iterator TIB = TBB->begin();
	  MachineBasicBlock::iterator FIB = FBB->begin();
	  MachineBasicBlock::iterator TIE = TBB->end();
	  MachineBasicBlock::iterator FIE = FBB->end();
	  while (TIB != TIE && FIB != FIE) {
	    // Skip dbg_value instructions. These do not count.
	    TIB = skipDebugInstructionsForward(TIB, TIE);
	    FIB = skipDebugInstructionsForward(FIB, FIE);
	    if (TIB == TIE || FIB == FIE)
	      break;

	    if (!TIB->isIdenticalTo(*FIB, MachineInstr::CheckKillDead))
	      break;

	    if (TII->isPredicated(*TIB))
	      // Hard to reason about register liveness with predicated instruction.
	      break;

	    bool IsSafe = true;
	    for (MachineOperand &MO : TIB->operands()) {
	      // Don't attempt to hoist instructions with register masks.
	      if (MO.isRegMask()) {
	        IsSafe = false;
	        break;
	      }
	      if (!MO.isReg())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg)
	        continue;
	      if (MO.isDef()) {
	        if (Uses.count(Reg)) {
	          // Avoid clobbering a register that's used by the instruction at
	          // the point of insertion.
	          IsSafe = false;
	          break;
	        }

	        if (Defs.count(Reg) && !MO.isDead()) {
	          // Don't hoist the instruction if the def would be clobber by the
	          // instruction at the point insertion. FIXME: This is overly
	          // conservative. It should be possible to hoist the instructions
	          // in BB2 in the following example:
	          // BB1:
	          // r1, eflag = op1 r2, r3
	          // brcc eflag
	          //
	          // BB2:
	          // r1 = op2, ...
	          //    = op3, r1<kill>
	          IsSafe = false;
	          break;
	        }
	      } else if (!ActiveDefsSet.count(Reg)) {
	        if (Defs.count(Reg)) {
	          // Use is defined by the instruction at the point of insertion.
	          IsSafe = false;
	          break;
	        }

	        if (MO.isKill() && Uses.count(Reg))
	          // Kills a register that's read by the instruction at the point of
	          // insertion. Remove the kill marker.
	          MO.setIsKill(false);
	      }
	    }
	    if (!IsSafe)
	      break;

	    bool DontMoveAcrossStore = true;
	    if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
	      break;

	    // Remove kills from ActiveDefsSet, these registers had short live ranges.
	    for (const MachineOperand &MO : TIB->operands()) {
	      if (!MO.isReg() || !MO.isUse() || !MO.isKill())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg)
	        continue;
	      if (!AllDefsSet.count(Reg)) {
	        LocalKills.push_back(Reg);
	        continue;
	      }
	      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	        for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
	          ActiveDefsSet.erase(*AI);
	      } else {
	        ActiveDefsSet.erase(Reg);
	      }
	    }

	    // Track local defs so we can update liveins.
	    for (const MachineOperand &MO : TIB->operands()) {
	      if (!MO.isReg() || !MO.isDef() || MO.isDead())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
	        continue;
	      LocalDefs.push_back(Reg);
	      addRegAndItsAliases(Reg, TRI, ActiveDefsSet);
	      addRegAndItsAliases(Reg, TRI, AllDefsSet);
	    }

	    HasDups = true;
	    ++TIB;
	    ++FIB;
	  }

	  if (!HasDups)
	    return false;

	  // Move the common prefix out of TBB into MBB and drop the duplicate copy
	  // from FBB.
	  MBB->splice(Loc, TBB, TBB->begin(), TIB);
	  FBB->erase(FBB->begin(), FIB);

	  // Update livein's.
	  bool ChangedLiveIns = false;
	  for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
	    unsigned Def = LocalDefs[i];
	    if (ActiveDefsSet.count(Def)) {
	      TBB->addLiveIn(Def);
	      FBB->addLiveIn(Def);
	      ChangedLiveIns = true;
	    }
	  }
	  for (unsigned K : LocalKills) {
	    TBB->removeLiveIn(K);
	    FBB->removeLiveIn(K);
	    ChangedLiveIns = true;
	  }

	  if (ChangedLiveIns) {
	    TBB->sortUniqueLiveIns();
	    FBB->sortUniqueLiveIns();
	  }

	  ++NumHoist;
	  return true;
	}
	Index: head/contrib/llvm/lib/CodeGen/MachineFunction.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/MachineFunction.cpp (revision 322319)
	+++ head/contrib/llvm/lib/CodeGen/MachineFunction.cpp (revision 322320)
	@@ -1,1006 +1,1020 @@
	//===-- MachineFunction.cpp -----------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Collect native machine code information for a function. This allows
	// target-specific information about the generated code to be stored with each
	// function.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/MachineConstantPool.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/PseudoSourceValue.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/ModuleSlotTracker.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/GraphWriter.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetFrameLowering.h"
	#include "llvm/Target/TargetLowering.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	using namespace llvm;

	#define DEBUG_TYPE "codegen"

	// Hidden command-line option "align-all-functions", initialised to 0. When it
	// is non-zero, MachineFunction::init() stores its value as the function
	// alignment in place of the target-derived one.
	static cl::opt<unsigned>
	AlignAllFunctions("align-all-functions",
	cl::desc("Force the alignment of all functions."),
	cl::init(0), cl::Hidden);

	/// Translate a MachineFunctionProperties::Property enumerator into the
	/// spelling used when properties are printed.
	static const char *getPropertyName(MachineFunctionProperties::Property Prop) {
	  using P = MachineFunctionProperties::Property;
	  switch (Prop) {
	  case P::FailedISel:
	    return "FailedISel";
	  case P::IsSSA:
	    return "IsSSA";
	  case P::Legalized:
	    return "Legalized";
	  case P::NoPHIs:
	    return "NoPHIs";
	  case P::NoVRegs:
	    return "NoVRegs";
	  case P::RegBankSelected:
	    return "RegBankSelected";
	  case P::Selected:
	    return "Selected";
	  case P::TracksLiveness:
	    return "TracksLiveness";
	  }
	  // Reached only for a value that matches none of the cases above.
	  llvm_unreachable("Invalid machine function property");
	}

	/// Write the name of every property whose bit is set to \p OS, with ", "
	/// between consecutive names.
	void MachineFunctionProperties::print(raw_ostream &OS) const {
	  bool PrintedAny = false;
	  for (BitVector::size_type Idx = 0; Idx < Properties.size(); ++Idx) {
	    if (Properties[Idx]) {
	      // Only names after the first one are preceded by the separator.
	      if (PrintedAny)
	        OS << ", ";
	      OS << getPropertyName(static_cast<Property>(Idx));
	      PrintedAny = true;
	    }
	  }
	}

	//===----------------------------------------------------------------------===//
	// MachineFunction implementation
	//===----------------------------------------------------------------------===//

	// Out-of-line virtual method: the (empty) destructor is defined here in the
	// .cpp file rather than inline in the class.
	MachineFunctionInfo::~MachineFunctionInfo() {}

	/// List-traits hook for disposing of a block: the block is handed to the
	/// DeleteMachineBasicBlock method of the function returned by getParent().
	void ilist_alloc_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
	  MachineFunction *Owner = MBB->getParent();
	  Owner->DeleteMachineBasicBlock(MBB);
	}

	/// Stack alignment to use for \p Fn: the function's own StackAlignment
	/// attribute value when it carries one, otherwise the alignment reported by
	/// the subtarget's frame lowering.
	static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
	                                           const Function *Fn) {
	  const bool HasExplicitAlign = Fn->hasFnAttribute(Attribute::StackAlignment);
	  return HasExplicitAlign ? Fn->getFnStackAlignment()
	                          : STI->getFrameLowering()->getStackAlignment();
	}

	/// Construct the machine-level representation for IR function \p F.
	///
	/// The subtarget is looked up from \p TM for \p F and the MC context is taken
	/// from \p mmi; \p FunctionNum is stored as this function's number. All
	/// remaining state is set up by init().
	MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
	unsigned FunctionNum, MachineModuleInfo &mmi)
	: Fn(F), Target(TM), STI(TM.getSubtargetImpl(*F)), Ctx(mmi.getContext()),
	MMI(mmi) {
	FunctionNumber = FunctionNum;
	init();
	}

	/// Set up the per-function state: properties, register info, frame info,
	/// constant pool, function alignment, and (for funclet-based EH
	/// personalities) the WinEH info. JumpTableInfo and MFInfo start out null.
	void MachineFunction::init() {
	  // Assume the function starts in SSA form with correct liveness.
	  Properties.set(MachineFunctionProperties::Property::IsSSA);
	  Properties.set(MachineFunctionProperties::Property::TracksLiveness);
	  if (STI->getRegisterInfo())
	    RegInfo = new (Allocator) MachineRegisterInfo(this);
	  else
	    RegInfo = nullptr;

	  MFInfo = nullptr;
	  // We can realign the stack if the target supports it and the user hasn't
	  // explicitly asked us not to.
	  bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
	                      !Fn->hasFnAttribute("no-realign-stack");
	  FrameInfo = new (Allocator) MachineFrameInfo(
	      getFnStackAlignment(STI, Fn), /*StackRealignable=*/CanRealignSP,
	      /*ForceRealign=*/CanRealignSP &&
	          Fn->hasFnAttribute(Attribute::StackAlignment));

	  if (Fn->hasFnAttribute(Attribute::StackAlignment))
	    FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment());

	  ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
	  Alignment = STI->getTargetLowering()->getMinFunctionAlignment();

	  // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
	  // FIXME: Use Function::optForSize().
	  if (!Fn->hasFnAttribute(Attribute::OptimizeForSize))
	    Alignment = std::max(Alignment,
	                         STI->getTargetLowering()->getPrefFunctionAlignment());

	  // A non-zero -align-all-functions value overrides whatever was computed.
	  if (AlignAllFunctions)
	    Alignment = AlignAllFunctions;

	  JumpTableInfo = nullptr;

	  if (isFuncletEHPersonality(classifyEHPersonality(
	          Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr))) {
	    WinEHInfo = new (Allocator) WinEHFuncInfo();
	  }

	  assert(Target.isCompatibleDataLayout(getDataLayout()) &&
	         "Can't create a MachineFunction using a Module with a "
	         "Target-incompatible DataLayout attached\n");

	  PSVManager = llvm::make_unique<PseudoSourceValueManager>();
	}

	/// Destructor: all teardown is delegated to clear().
	MachineFunction::~MachineFunction() {
	clear();
	}

	/// Tear down everything this function owns: reset the properties, erase all
	/// basic blocks, clear the recyclers and variable debug info, then run the
	/// destructor of each allocator-backed sub-object and hand its memory back to
	/// the allocator. RegInfo, MFInfo, JumpTableInfo and WinEHInfo are optional
	/// and only destroyed when non-null; FrameInfo and ConstantPool are destroyed
	/// unconditionally.
	void MachineFunction::clear() {
	Properties.reset();
	// Don't call destructors on MachineInstr and MachineOperand. All of their
	// memory comes from the BumpPtrAllocator which is about to be purged.
	//
	// Do call MachineBasicBlock destructors, it contains std::vectors.
	for (iterator I = begin(), E = end(); I != E; I = BasicBlocks.erase(I))
	I->Insts.clearAndLeakNodesUnsafely();

	InstructionRecycler.clear(Allocator);
	OperandRecycler.clear(Allocator);
	BasicBlockRecycler.clear(Allocator);
	VariableDbgInfos.clear();
	if (RegInfo) {
	RegInfo->~MachineRegisterInfo();
	Allocator.Deallocate(RegInfo);
	}
	if (MFInfo) {
	MFInfo->~MachineFunctionInfo();
	Allocator.Deallocate(MFInfo);
	}

	FrameInfo->~MachineFrameInfo();
	Allocator.Deallocate(FrameInfo);

	ConstantPool->~MachineConstantPool();
	Allocator.Deallocate(ConstantPool);

	if (JumpTableInfo) {
	JumpTableInfo->~MachineJumpTableInfo();
	Allocator.Deallocate(JumpTableInfo);
	}

	if (WinEHInfo) {
	WinEHInfo->~WinEHFuncInfo();
	Allocator.Deallocate(WinEHInfo);
	}
	}

	/// Return the DataLayout of the module that contains the IR function.
	const DataLayout &MachineFunction::getDataLayout() const {
	  const Module *ParentModule = Fn->getParent();
	  return ParentModule->getDataLayout();
	}

	/// Return this function's JumpTableInfo, allocating it on first use.
	///
	/// \p EntryKind is converted to MachineJumpTableInfo::JTEntryKind and is only
	/// consulted when a new object has to be created.
	MachineJumpTableInfo *MachineFunction::
	getOrCreateJumpTableInfo(unsigned EntryKind) {
	  if (!JumpTableInfo) {
	    auto Kind = static_cast<MachineJumpTableInfo::JTEntryKind>(EntryKind);
	    JumpTableInfo = new (Allocator) MachineJumpTableInfo(Kind);
	  }
	  return JumpTableInfo;
	}

	/// Should we be emitting segmented stack stuff for the function
	bool MachineFunction::shouldSplitStack() const {
	return getFunction()->hasFnAttribute("split-stack");
	}

	/// This discards all of the MachineBasicBlock numbers and recomputes them.
	/// This guarantees that the MBB numbers are sequential, dense, and match the
	/// ordering of the blocks within the function. If a specific MachineBasicBlock
	/// is specified, only that block and those after it are renumbered.
	///
	/// Passing nullptr for \p MBB renumbers from the first block. For an empty
	/// function the numbering table is simply cleared.
	void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
	if (empty()) { MBBNumbering.clear(); return; }
	MachineFunction::iterator MBBI, E = end();
	if (MBB == nullptr)
	MBBI = begin();
	else
	MBBI = MBB->getIterator();

	// Figure out the block number this should have.
	// When starting mid-function, continue from the preceding block's number.
	unsigned BlockNo = 0;
	if (MBBI != begin())
	BlockNo = std::prev(MBBI)->getNumber() + 1;

	for (; MBBI != E; ++MBBI, ++BlockNo) {
	// Blocks that already carry the right number are left untouched.
	if (MBBI->getNumber() != (int)BlockNo) {
	// Remove use of the old number.
	if (MBBI->getNumber() != -1) {
	assert(MBBNumbering[MBBI->getNumber()] == &*MBBI &&
	"MBB number mismatch!");
	MBBNumbering[MBBI->getNumber()] = nullptr;
	}

	// If BlockNo is already taken, set that block's number to -1.
	if (MBBNumbering[BlockNo])
	MBBNumbering[BlockNo]->setNumber(-1);

	MBBNumbering[BlockNo] = &*MBBI;
	MBBI->setNumber(BlockNo);
	}
	}

	// Okay, all the blocks are renumbered. If we have compactified the block
	// numbering, shrink MBBNumbering now.
	assert(BlockNo <= MBBNumbering.size() && "Mismatch!");
	MBBNumbering.resize(BlockNo);
	}

	/// Allocate a new MachineInstr. Use this instead of `new MachineInstr'.
	///
	/// Storage is obtained from the instruction recycler and the instruction is
	/// constructed in place from \p MCID, \p DL and \p NoImp.
	MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
	                                                  const DebugLoc &DL,
	                                                  bool NoImp) {
	  MachineInstr *Storage = InstructionRecycler.Allocate<MachineInstr>(Allocator);
	  return new (Storage) MachineInstr(*this, MCID, DL, NoImp);
	}

	/// Create a new MachineInstr which is a copy of the 'Orig' instruction,
	/// identical in all ways except the instruction has no parent, prev, or next.
	///
	/// Storage comes from the instruction recycler, as in CreateMachineInstr.
	MachineInstr *
	MachineFunction::CloneMachineInstr(const MachineInstr *Orig) {
	  return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
	             MachineInstr(*this, *Orig);
	}

	/// Delete the given MachineInstr.
	///
	/// This function also serves as the MachineInstr destructor - the real
	/// ~MachineInstr() destructor must be empty.
	void
	MachineFunction::DeleteMachineInstr(MachineInstr *MI) {
	// Strip it for parts. The operand array and the MI object itself are
	// independently recyclable.
	if (MI->Operands)
	deallocateOperandArray(MI->CapOperands, MI->Operands);
	// Don't call ~MachineInstr() which must be trivial anyway because
	// ~MachineFunction drops whole lists of MachineInstrs without calling their
	// destructors.
	InstructionRecycler.Deallocate(Allocator, MI);
	}

	/// Allocate a new MachineBasicBlock. Use this instead of
	/// `new MachineBasicBlock'.
	///
	/// Storage is obtained from the basic-block recycler and the block is
	/// constructed in place for this function and \p bb.
	MachineBasicBlock *
	MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) {
	  MachineBasicBlock *Storage =
	      BasicBlockRecycler.Allocate<MachineBasicBlock>(Allocator);
	  return new (Storage) MachineBasicBlock(*this, bb);
	}

	/// Delete the given MachineBasicBlock.
	///
	/// \p MBB must belong to this function (asserted). Its destructor is run
	/// explicitly and the storage is then returned to the basic-block recycler.
	void
	MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
	assert(MBB->getParent() == this && "MBB parent mismatch!");
	MBB->~MachineBasicBlock();
	BasicBlockRecycler.Deallocate(Allocator, MBB);
	}

	/// Allocate a MachineMemOperand from this function's allocator, forwarding
	/// every argument unchanged to the MachineMemOperand constructor.
	MachineMemOperand *MachineFunction::getMachineMemOperand(
	MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
	unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
	SyncScope::ID SSID, AtomicOrdering Ordering,
	AtomicOrdering FailureOrdering) {
	return new (Allocator)
	MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges,
	SSID, Ordering, FailureOrdering);
	}

	/// Allocate a MachineMemOperand derived from \p MMO: same pointer (IR value
	/// if present, pseudo value otherwise) displaced by \p Offset, with size
	/// \p Size. Flags, base alignment, sync scope and both orderings are copied
	/// from \p MMO; the AA info is default-constructed and no ranges are attached.
	MachineMemOperand *
	MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
	                                      int64_t Offset, uint64_t Size) {
	  const int64_t NewOffset = MMO->getOffset() + Offset;
	  MachinePointerInfo NewPtrInfo =
	      MMO->getValue() ? MachinePointerInfo(MMO->getValue(), NewOffset)
	                      : MachinePointerInfo(MMO->getPseudoValue(), NewOffset);

	  return new (Allocator)
	      MachineMemOperand(NewPtrInfo, MMO->getFlags(), Size,
	                        MMO->getBaseAlignment(), AAMDNodes(), nullptr,
	                        MMO->getSyncScopeID(), MMO->getOrdering(),
	                        MMO->getFailureOrdering());
	}

	+MachineMemOperand *
	+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
	+ const AAMDNodes &AAInfo) { // Allocates a copy of MMO whose AA info is replaced by AAInfo.
	+ MachinePointerInfo MPI = MMO->getValue() ?
	+ MachinePointerInfo(MMO->getValue(), MMO->getOffset()) :
	+ MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset()); // IR value if present, else the pseudo value; offset unchanged.
	+
	+ return new (Allocator)
	+ MachineMemOperand(MPI, MMO->getFlags(), MMO->getSize(),
	+ MMO->getBaseAlignment(), AAInfo,
	+ MMO->getRanges(), MMO->getSyncScopeID(),
	+ MMO->getOrdering(), MMO->getFailureOrdering());
	+}
	+
	/// Allocate room for \p Num MachineMemOperand pointers from this function's
	/// allocator and return an iterator to the first slot.
	MachineInstr::mmo_iterator
	MachineFunction::allocateMemRefsArray(unsigned long Num) {
	  MachineInstr::mmo_iterator Slots =
	      Allocator.Allocate<MachineMemOperand *>(Num);
	  return Slots;
	}

	/// Build a new memref array holding only the load-side operands found in
	/// [Begin, End).
	///
	/// Operands that are pure loads are reused as-is. Operands that are both load
	/// and store are cloned with the MOStore flag cleared; the clone is created
	/// without range metadata. \returns the [begin, end) pair of the new array.
	std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
	MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin,
	                                    MachineInstr::mmo_iterator End) {
	  // Count the number of load mem refs.
	  unsigned Num = 0;
	  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
	    if ((*I)->isLoad())
	      ++Num;

	  // Allocate a new array and populate it with the load information.
	  MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
	  unsigned Index = 0;
	  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
	    if ((*I)->isLoad()) {
	      if (!(*I)->isStore())
	        // Reuse the MMO.
	        Result[Index] = *I;
	      else {
	        // Clone the MMO and unset the store flag.
	        MachineMemOperand *JustLoad =
	          getMachineMemOperand((*I)->getPointerInfo(),
	                               (*I)->getFlags() & ~MachineMemOperand::MOStore,
	                               (*I)->getSize(), (*I)->getBaseAlignment(),
	                               (*I)->getAAInfo(), nullptr,
	                               (*I)->getSyncScopeID(), (*I)->getOrdering(),
	                               (*I)->getFailureOrdering());
	        Result[Index] = JustLoad;
	      }
	      ++Index;
	    }
	  }
	  return std::make_pair(Result, Result + Num);
	}

	/// Build a new memref array holding only the store-side operands found in
	/// [Begin, End).
	///
	/// Operands that are pure stores are reused as-is. Operands that are both
	/// load and store are cloned with the MOLoad flag cleared; the clone is
	/// created without range metadata. \returns the [begin, end) pair of the new
	/// array.
	std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
	MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
	                                     MachineInstr::mmo_iterator End) {
	  // Count the number of store mem refs.
	  unsigned Num = 0;
	  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
	    if ((*I)->isStore())
	      ++Num;

	  // Allocate a new array and populate it with the store information.
	  MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
	  unsigned Index = 0;
	  for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
	    if ((*I)->isStore()) {
	      if (!(*I)->isLoad())
	        // Reuse the MMO.
	        Result[Index] = *I;
	      else {
	        // Clone the MMO and unset the load flag.
	        MachineMemOperand *JustStore =
	          getMachineMemOperand((*I)->getPointerInfo(),
	                               (*I)->getFlags() & ~MachineMemOperand::MOLoad,
	                               (*I)->getSize(), (*I)->getBaseAlignment(),
	                               (*I)->getAAInfo(), nullptr,
	                               (*I)->getSyncScopeID(), (*I)->getOrdering(),
	                               (*I)->getFailureOrdering());
	        Result[Index] = JustStore;
	      }
	      ++Index;
	    }
	  }
	  return std::make_pair(Result, Result + Num);
	}

	/// Copy \p Name into storage obtained from this function's allocator, append
	/// a terminating NUL, and return a pointer to the copy.
	const char *MachineFunction::createExternalSymbolName(StringRef Name) {
	  const size_t Length = Name.size();
	  char *Buffer = Allocator.Allocate<char>(Length + 1);
	  std::copy(Name.begin(), Name.end(), Buffer);
	  Buffer[Length] = 0;
	  return Buffer;
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void MachineFunction::dump() const {
	print(dbgs());
	}
	#endif

	/// Return the name of the underlying IR function, which must be non-null
	/// (asserted).
	StringRef MachineFunction::getName() const {
	  assert(getFunction() && "No function!");
	  const Function *F = getFunction();
	  return F->getName();
	}

	/// Write a textual listing of this function to \p OS: a header line with the
	/// name and properties, frame info, jump tables (if any), the constant pool,
	/// the function live-ins (if any), every basic block, and a trailer line.
	/// \p Indexes is passed through to each block's print.
	void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
	  // Header: function name followed by its properties.
	  OS << "# Machine code for function " << getName() << ": ";
	  getProperties().print(OS);
	  OS << '\n';

	  // Frame info and constant pool are printed unconditionally; the jump table
	  // info only when it has been allocated.
	  FrameInfo->print(*this, OS);
	  if (JumpTableInfo)
	    JumpTableInfo->print(OS);
	  ConstantPool->print(OS);

	  const TargetRegisterInfo *TRI = getSubtarget().getRegisterInfo();

	  if (RegInfo && !RegInfo->livein_empty()) {
	    OS << "Function Live Ins: ";
	    bool NeedComma = false;
	    MachineRegisterInfo::livein_iterator LI = RegInfo->livein_begin();
	    const MachineRegisterInfo::livein_iterator LE = RegInfo->livein_end();
	    for (; LI != LE; ++LI) {
	      if (NeedComma)
	        OS << ", ";
	      OS << PrintReg(LI->first, TRI);
	      // The second register of the pair is printed only when non-zero.
	      if (LI->second)
	        OS << " in " << PrintReg(LI->second, TRI);
	      NeedComma = true;
	    }
	    OS << '\n';
	  }

	  ModuleSlotTracker MST(getFunction()->getParent());
	  MST.incorporateFunction(*getFunction());
	  for (const auto &BB : *this) {
	    OS << '\n';
	    BB.print(OS, MST, Indexes);
	  }

	  OS << "\n# End machine code for function " << getName() << ".\n\n";
	}

	namespace llvm {
	// DOT-graph customisation used when a MachineFunction's CFG is rendered
	// (see the ViewGraph calls in viewCFG / viewCFGOnly).
	template<>
	struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {

	DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}

	// Title of the graph: "CFG for '<function name>' function".
	static std::string getGraphName(const MachineFunction *F) {
	return ("CFG for '" + F->getName() + "' function").str();
	}

	// Label for one node. In simple mode this is "BB#<number>" plus, when the
	// block has an IR basic block, ": <its name>"; otherwise it is the block's
	// full printed form.
	std::string getNodeLabel(const MachineBasicBlock *Node,
	const MachineFunction *Graph) {
	std::string OutStr;
	{
	raw_string_ostream OSS(OutStr);

	if (isSimple()) {
	OSS << "BB#" << Node->getNumber();
	if (const BasicBlock *BB = Node->getBasicBlock())
	OSS << ": " << BB->getName();
	} else
	Node->print(OSS);
	}

	// Drop a leading newline, if any.
	if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());

	// Process string output to make it nicer...
	// Each newline character is replaced by the two characters '\' and 'l'.
	for (unsigned i = 0; i != OutStr.length(); ++i)
	if (OutStr[i] == '\n') { // Left justify
	OutStr[i] = '\\';
	OutStr.insert(OutStr.begin()+i+1, 'l');
	}
	return OutStr;
	}
	};
	}

	/// Display this function's CFG via ViewGraph, using "mf" + the function name
	/// as the graph name. When NDEBUG is defined the body instead prints a notice
	/// to errs() that the feature is unavailable.
	void MachineFunction::viewCFG() const
	{
	#ifndef NDEBUG
	ViewGraph(this, "mf" + getName());
	#else
	errs() << "MachineFunction::viewCFG is only available in debug builds on "
	<< "systems with Graphviz or gv!\n";
	#endif // NDEBUG
	}

	/// Same as viewCFG() except that ViewGraph is called with an extra `true`
	/// argument. NOTE(review): presumably this selects the "simple" node labels
	/// produced by getNodeLabel -- confirm against ViewGraph's signature. When
	/// NDEBUG is defined the body instead prints a notice to errs().
	void MachineFunction::viewCFGOnly() const
	{
	#ifndef NDEBUG
	ViewGraph(this, "mf" + getName(), true);
	#else
	errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
	<< "systems with Graphviz or gv!\n";
	#endif // NDEBUG
	}

	/// Add the specified physical register as a live-in value and
	/// create a corresponding virtual register for it.
	///
	/// If \p PReg already has a live-in virtual register, that register is
	/// returned and no new one is created. \returns the virtual register
	/// associated with \p PReg.
	unsigned MachineFunction::addLiveIn(unsigned PReg,
	                                    const TargetRegisterClass *RC) {
	  MachineRegisterInfo &MRI = getRegInfo();
	  unsigned VReg = MRI.getLiveInVirtReg(PReg);
	  if (VReg) {
	    const TargetRegisterClass *VRegRC = MRI.getRegClass(VReg);
	    (void)VRegRC;
	    // A physical register can be added several times.
	    // Between two calls, the register class of the related virtual register
	    // may have been constrained to match some operation constraints.
	    // In that case, check that the current register class includes the
	    // physical register and is a sub class of the specified RC.
	    assert((VRegRC == RC || (VRegRC->contains(PReg) &&
	                             RC->hasSubClassEq(VRegRC))) &&
	           "Register class mismatch!");
	    return VReg;
	  }
	  VReg = MRI.createVirtualRegister(RC);
	  MRI.addLiveIn(PReg, VReg);
	  return VReg;
	}

	/// Return the MCSymbol for the specified non-empty jump table.
	/// The symbol is named <prefix>JTI<function number>_<JTI>; the prefix is the
	/// DataLayout's linker-private global prefix when \p isLinkerPrivate is set
	/// and its private global prefix otherwise.
	MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
	                                        bool isLinkerPrivate) const {
	  const DataLayout &Layout = getDataLayout();
	  assert(JumpTableInfo && "No jump tables");
	  assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");

	  StringRef Prefix;
	  if (isLinkerPrivate)
	    Prefix = Layout.getLinkerPrivateGlobalPrefix();
	  else
	    Prefix = Layout.getPrivateGlobalPrefix();

	  SmallString<60> SymName;
	  raw_svector_ostream NameOS(SymName);
	  NameOS << Prefix << "JTI" << getFunctionNumber() << '_' << JTI;
	  return Ctx.getOrCreateSymbol(SymName);
	}

	/// Return a function-local symbol to represent the PIC base. Its name is the
	/// DataLayout's private global prefix, then the function number, then "$pb".
	MCSymbol *MachineFunction::getPICBaseSymbol() const {
	  // The Twine is built and consumed inside one full expression.
	  return Ctx.getOrCreateSymbol(
	      Twine(getDataLayout().getPrivateGlobalPrefix()) +
	      Twine(getFunctionNumber()) + "$pb");
	}

	/// \name Exception Handling
	/// \{

	/// Return the LandingPadInfo whose LandingPadBlock is \p LandingPad,
	/// appending a new record to LandingPads when none exists yet.
	LandingPadInfo &
	MachineFunction::getOrCreateLandingPadInfo(MachineBasicBlock *LandingPad) {
	  for (LandingPadInfo &Existing : LandingPads)
	    if (Existing.LandingPadBlock == LandingPad)
	      return Existing;

	  LandingPads.push_back(LandingPadInfo(LandingPad));
	  return LandingPads.back();
	}

	void MachineFunction::addInvoke(MachineBasicBlock *LandingPad,
	MCSymbol BeginLabel, MCSymbol EndLabel) {
	LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
	LP.BeginLabels.push_back(BeginLabel);
	LP.EndLabels.push_back(EndLabel);
	}

	/// Create a temporary symbol, store it as the LandingPadLabel of the
	/// landing-pad record for \p LandingPad, and return it.
	MCSymbol *MachineFunction::addLandingPad(MachineBasicBlock *LandingPad) {
	  MCSymbol *LandingPadLabel = Ctx.createTempSymbol();
	  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
	  LP.LandingPadLabel = LandingPadLabel;
	  return LandingPadLabel;
	}

	/// Append the type id of every element of \p TyInfo to the TypeIds of the
	/// landing-pad record for \p LandingPad, walking \p TyInfo from its last
	/// element to its first.
	void MachineFunction::addCatchTypeInfo(MachineBasicBlock *LandingPad,
	                                       ArrayRef<const GlobalValue *> TyInfo) {
	  LandingPadInfo &PadInfo = getOrCreateLandingPadInfo(LandingPad);
	  unsigned Remaining = TyInfo.size();
	  while (Remaining != 0) {
	    --Remaining;
	    PadInfo.TypeIds.push_back(getTypeIDFor(TyInfo[Remaining]));
	  }
	}

	/// Translate \p TyInfo into type ids (in order), obtain a filter id for that
	/// list, and append the filter id to the TypeIds of the landing-pad record
	/// for \p LandingPad.
	void MachineFunction::addFilterTypeInfo(MachineBasicBlock *LandingPad,
	                                        ArrayRef<const GlobalValue *> TyInfo) {
	  LandingPadInfo &PadInfo = getOrCreateLandingPadInfo(LandingPad);

	  std::vector<unsigned> FilterTypeIds;
	  FilterTypeIds.reserve(TyInfo.size());
	  for (const GlobalValue *TI : TyInfo)
	    FilterTypeIds.push_back(getTypeIDFor(TI));

	  PadInfo.TypeIds.push_back(getFilterIDFor(FilterTypeIds));
	}

	void MachineFunction::tidyLandingPads(DenseMap<MCSymbol, uintptr_t> LPMap) {
	for (unsigned i = 0; i != LandingPads.size(); ) {
	LandingPadInfo &LandingPad = LandingPads[i];
	if (LandingPad.LandingPadLabel &&
	!LandingPad.LandingPadLabel->isDefined() &&
	(!LPMap \|\| (*LPMap)[LandingPad.LandingPadLabel] == 0))
	LandingPad.LandingPadLabel = nullptr;

	// Special case: we should emit LPs with null LP MBB. This indicates
	// "nounwind" case.
	if (!LandingPad.LandingPadLabel && LandingPad.LandingPadBlock) {
	LandingPads.erase(LandingPads.begin() + i);
	continue;
	}

	for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
	MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
	MCSymbol *EndLabel = LandingPad.EndLabels[j];
	if ((BeginLabel->isDefined() \|\|
	(LPMap && (*LPMap)[BeginLabel] != 0)) &&
	(EndLabel->isDefined() \|\|
	(LPMap && (*LPMap)[EndLabel] != 0))) continue;

	LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
	LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
	--j;
	--e;
	}

	// Remove landing pads with no try-ranges.
	if (LandingPads[i].BeginLabels.empty()) {
	LandingPads.erase(LandingPads.begin() + i);
	continue;
	}

	// If there is no landing pad, ensure that the list of typeids is empty.
	// If the only typeid is a cleanup, this is the same as having no typeids.
	if (!LandingPad.LandingPadBlock \|\|
	(LandingPad.TypeIds.size() == 1 && !LandingPad.TypeIds[0]))
	LandingPad.TypeIds.clear();
	++i;
	}
	}

	/// Append type id 0 to the TypeIds of the landing-pad record for
	/// \p LandingPad.
	void MachineFunction::addCleanup(MachineBasicBlock *LandingPad) {
	  getOrCreateLandingPadInfo(LandingPad).TypeIds.push_back(0);
	}

	/// Append an SEHHandler with FilterOrFinally = \p Filter and
	/// RecoverBA = \p RecoverBA to the landing-pad record for \p LandingPad.
	void MachineFunction::addSEHCatchHandler(MachineBasicBlock *LandingPad,
	                                         const Function *Filter,
	                                         const BlockAddress *RecoverBA) {
	  LandingPadInfo &PadInfo = getOrCreateLandingPadInfo(LandingPad);

	  SEHHandler CatchHandler;
	  CatchHandler.RecoverBA = RecoverBA;
	  CatchHandler.FilterOrFinally = Filter;

	  PadInfo.SEHHandlers.push_back(CatchHandler);
	}

	/// Append an SEHHandler with FilterOrFinally = \p Cleanup and a null
	/// RecoverBA to the landing-pad record for \p LandingPad.
	void MachineFunction::addSEHCleanupHandler(MachineBasicBlock *LandingPad,
	                                           const Function *Cleanup) {
	  LandingPadInfo &PadInfo = getOrCreateLandingPadInfo(LandingPad);

	  SEHHandler CleanupHandler;
	  CleanupHandler.RecoverBA = nullptr;
	  CleanupHandler.FilterOrFinally = Cleanup;

	  PadInfo.SEHHandlers.push_back(CleanupHandler);
	}

	/// Append all of \p Sites to the LPadToCallSiteMap entry for \p Sym.
	void MachineFunction::setCallSiteLandingPad(MCSymbol *Sym,
	                                            ArrayRef<unsigned> Sites) {
	  auto &CallSites = LPadToCallSiteMap[Sym];
	  CallSites.append(Sites.begin(), Sites.end());
	}

	/// Return the 1-based position of \p TI in TypeInfos, appending it first
	/// when it is not present yet.
	unsigned MachineFunction::getTypeIDFor(const GlobalValue *TI) {
	  unsigned Index = 0;
	  const unsigned Count = TypeInfos.size();
	  while (Index != Count) {
	    if (TypeInfos[Index] == TI)
	      return Index + 1;
	    ++Index;
	  }

	  TypeInfos.push_back(TI);
	  return TypeInfos.size();
	}

	int MachineFunction::getFilterIDFor(std::vector<unsigned> &TyIds) {
	// If the new filter coincides with the tail of an existing filter, then
	// re-use the existing filter. Folding filters more than this requires
	// re-ordering filters and/or their elements - probably not worth it.
	for (std::vector<unsigned>::iterator I = FilterEnds.begin(),
	E = FilterEnds.end(); I != E; ++I) {
	unsigned i = *I, j = TyIds.size();

	while (i && j)
	if (FilterIds[--i] != TyIds[--j])
	goto try_next;

	if (!j)
	// The new filter coincides with range [i, end) of the existing filter.
	return -(1 + i);

	try_next:;
	}

	// Add the new filter.
	int FilterID = -(1 + FilterIds.size());
	FilterIds.reserve(FilterIds.size() + TyIds.size() + 1);
	FilterIds.insert(FilterIds.end(), TyIds.begin(), TyIds.end());
	FilterEnds.push_back(FilterIds.size());
	FilterIds.push_back(0); // terminator
	return FilterID;
	}

	void llvm::addLandingPadInfo(const LandingPadInst &I, MachineBasicBlock &MBB) {
	MachineFunction &MF = *MBB.getParent();
	if (const auto *PF = dyn_cast<Function>(
	I.getParent()->getParent()->getPersonalityFn()->stripPointerCasts()))
	MF.getMMI().addPersonality(PF);

	if (I.isCleanup())
	MF.addCleanup(&MBB);

	// FIXME: New EH - Add the clauses in reverse order. This isn't 100% correct,
	// but we need to do it this way because of how the DWARF EH emitter
	// processes the clauses.
	for (unsigned i = I.getNumClauses(); i != 0; --i) {
	Value *Val = I.getClause(i - 1);
	if (I.isCatch(i - 1)) {
	MF.addCatchTypeInfo(&MBB,
	dyn_cast<GlobalValue>(Val->stripPointerCasts()));
	} else {
	// Add filters in a list.
	Constant *CVal = cast<Constant>(Val);
	SmallVector<const GlobalValue *, 4> FilterList;
	for (User::op_iterator II = CVal->op_begin(), IE = CVal->op_end();
	II != IE; ++II)
	FilterList.push_back(cast<GlobalValue>((*II)->stripPointerCasts()));

	MF.addFilterTypeInfo(&MBB, FilterList);
	}
	}
	}

	/// \}

	//===----------------------------------------------------------------------===//
	// MachineJumpTableInfo implementation
	//===----------------------------------------------------------------------===//

	/// Return the size of each entry in the jump table, selected by the entry
	/// kind: 0 for inline tables, 4 for the 32-bit encodings, 8 for
	/// EK_GPRel64BlockAddress, and the DataLayout pointer size for
	/// EK_BlockAddress.
	unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const {
	  switch (getEntryKind()) {
	  case MachineJumpTableInfo::EK_Inline:
	    return 0;
	  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
	  case MachineJumpTableInfo::EK_LabelDifference32:
	  case MachineJumpTableInfo::EK_Custom32:
	    return 4;
	  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
	    return 8;
	  case MachineJumpTableInfo::EK_BlockAddress:
	    // The entry is just the address of a block.
	    return TD.getPointerSize();
	  }
	  llvm_unreachable("Unknown jump table encoding!");
	}

	/// Return the alignment of each entry in the jump table, selected by the
	/// entry kind: 1 for inline tables, the ABI alignment of a 32-bit integer for
	/// the 32-bit encodings, that of a 64-bit integer for
	/// EK_GPRel64BlockAddress, and the pointer ABI alignment for
	/// EK_BlockAddress.
	unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const {
	  switch (getEntryKind()) {
	  case MachineJumpTableInfo::EK_Inline:
	    return 1;
	  case MachineJumpTableInfo::EK_GPRel32BlockAddress:
	  case MachineJumpTableInfo::EK_LabelDifference32:
	  case MachineJumpTableInfo::EK_Custom32:
	    return TD.getABIIntegerTypeAlignment(32);
	  case MachineJumpTableInfo::EK_GPRel64BlockAddress:
	    return TD.getABIIntegerTypeAlignment(64);
	  case MachineJumpTableInfo::EK_BlockAddress:
	    // The entry is just the address of a block.
	    return TD.getPointerABIAlignment();
	  }
	  llvm_unreachable("Unknown jump table encoding!");
	}

	/// Create a new jump table entry in the jump table info and return its
	/// index. \p DestBBs must not be empty.
	unsigned MachineJumpTableInfo::createJumpTableIndex(
	    const std::vector<MachineBasicBlock *> &DestBBs) {
	  assert(!DestBBs.empty() && "Cannot create an empty jump table!");
	  const unsigned NewIndex = JumpTables.size();
	  JumpTables.push_back(MachineJumpTableEntry(DestBBs));
	  return NewIndex;
	}

	/// If Old is the target of any jump tables, update the jump tables to branch
	/// to New instead.
	///
	/// \returns true if at least one jump table entry was rewritten.
	bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old,
	                                                  MachineBasicBlock *New) {
	  assert(Old != New && "Not making a change?");
	  bool MadeChange = false;
	  // Accumulate the per-table results; previously they were discarded and
	  // this function always reported "no change".
	  for (size_t i = 0, e = JumpTables.size(); i != e; ++i)
	    MadeChange |= ReplaceMBBInJumpTable(i, Old, New);
	  return MadeChange;
	}

	/// If Old is a target of the jump tables, update the jump table to branch to
	/// New instead.
	bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx,
	MachineBasicBlock *Old,
	MachineBasicBlock *New) {
	assert(Old != New && "Not making a change?");
	bool MadeChange = false;
	MachineJumpTableEntry &JTE = JumpTables[Idx];
	for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j)
	if (JTE.MBBs[j] == Old) {
	JTE.MBBs[j] = New;
	MadeChange = true;
	}
	return MadeChange;
	}

	/// Write all jump tables to \p OS: a "Jump Tables:" header line, then for
	/// each table its index and the numbers of its target blocks, then a single
	/// trailing newline. Nothing is written when there are no jump tables.
	void MachineJumpTableInfo::print(raw_ostream &OS) const {
	  if (JumpTables.empty())
	    return;

	  OS << "Jump Tables:\n";

	  unsigned TableIdx = 0;
	  for (const MachineJumpTableEntry &Table : JumpTables) {
	    OS << " jt#" << TableIdx << ": ";
	    for (const MachineBasicBlock *Target : Table.MBBs)
	      OS << " BB#" << Target->getNumber();
	    ++TableIdx;
	  }

	  OS << '\n';
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void MachineJumpTableInfo::dump() const { print(dbgs()); }
	#endif


	//===----------------------------------------------------------------------===//
	// MachineConstantPool implementation
	//===----------------------------------------------------------------------===//

	// Intentionally empty out-of-line definition.
	void MachineConstantPoolValue::anchor() {}

	/// Return the type of the stored value, read from whichever member of Val
	/// is active for this entry.
	Type *MachineConstantPoolEntry::getType() const {
	  return isMachineConstantPoolEntry() ? Val.MachineCPVal->getType()
	                                      : Val.ConstVal->getType();
	}

	/// Machine constant-pool entries always report true; plain constants defer
	/// to Constant::needsRelocation().
	bool MachineConstantPoolEntry::needsRelocation() const {
	  return isMachineConstantPoolEntry() || Val.ConstVal->needsRelocation();
	}

	/// Choose the SectionKind for this entry: read-only-with-relocations when the
	/// entry needs relocation, a mergeable-constant kind when its alloc size is
	/// 4, 8, 16 or 32, and plain read-only otherwise.
	SectionKind
	MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
	  if (needsRelocation())
	    return SectionKind::getReadOnlyWithRel();

	  const uint64_t AllocSize = DL->getTypeAllocSize(getType());
	  if (AllocSize == 4)
	    return SectionKind::getMergeableConst4();
	  if (AllocSize == 8)
	    return SectionKind::getMergeableConst8();
	  if (AllocSize == 16)
	    return SectionKind::getMergeableConst16();
	  if (AllocSize == 32)
	    return SectionKind::getMergeableConst32();
	  return SectionKind::getReadOnly();
	}

	/// Delete every MachineConstantPoolValue referenced from Constants or from
	/// MachineCPVsSharingEntries, each exactly once.
	MachineConstantPool::~MachineConstantPool() {
	  // A constant may be a member of both Constants and MachineCPVsSharingEntries,
	  // so keep track of which we've deleted to avoid double deletions.
	  DenseSet<MachineConstantPoolValue *> Freed;
	  for (MachineConstantPoolEntry &Entry : Constants) {
	    if (!Entry.isMachineConstantPoolEntry())
	      continue;
	    Freed.insert(Entry.Val.MachineCPVal);
	    delete Entry.Val.MachineCPVal;
	  }

	  for (MachineConstantPoolValue *Shared : MachineCPVsSharingEntries)
	    if (!Freed.count(Shared))
	      delete Shared;
	}

	/// Test whether the given two constants can be allocated the same constant pool
	/// entry.
	static bool CanShareConstantPoolEntry(const Constant A, const Constant B,
	const DataLayout &DL) {
	// Handle the trivial case quickly.
	if (A == B) return true;

	// If they have the same type but weren't the same constant, quickly
	// reject them.
	if (A->getType() == B->getType()) return false;

	// We can't handle structs or arrays.
	if (isa<StructType>(A->getType()) \|\| isa<ArrayType>(A->getType()) \|\|
	isa<StructType>(B->getType()) \|\| isa<ArrayType>(B->getType()))
	return false;

	// For now, only support constants with the same size.
	uint64_t StoreSize = DL.getTypeStoreSize(A->getType());
	if (StoreSize != DL.getTypeStoreSize(B->getType()) \|\| StoreSize > 128)
	return false;

	Type IntTy = IntegerType::get(A->getContext(), StoreSize8);

	// Try constant folding a bitcast of both instructions to an integer. If we
	// get two identical ConstantInt's, then we are good to share them. We use
	// the constant folding APIs to do this so that we get the benefit of
	// DataLayout.
	if (isa<PointerType>(A->getType()))
	A = ConstantFoldCastOperand(Instruction::PtrToInt,
	const_cast<Constant *>(A), IntTy, DL);
	else if (A->getType() != IntTy)
	A = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(A),
	IntTy, DL);
	if (isa<PointerType>(B->getType()))
	B = ConstantFoldCastOperand(Instruction::PtrToInt,
	const_cast<Constant *>(B), IntTy, DL);
	else if (B->getType() != IntTy)
	B = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(B),
	IntTy, DL);

	return A == B;
	}

	/// Create a new entry in the constant pool or return an existing one.
	/// User must specify the log2 of the minimum required alignment for the object.
	unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
	unsigned Alignment) {
	assert(Alignment && "Alignment must be specified!");
	if (Alignment > PoolAlignment) PoolAlignment = Alignment;

	// Check to see if we already have this constant.
	//
	// FIXME, this could be made much more efficient for large constant pools.
	for (unsigned i = 0, e = Constants.size(); i != e; ++i)
	if (!Constants[i].isMachineConstantPoolEntry() &&
	CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, DL)) {
	if ((unsigned)Constants[i].getAlignment() < Alignment)
	Constants[i].Alignment = Alignment;
	return i;
	}

	Constants.push_back(MachineConstantPoolEntry(C, Alignment));
	return Constants.size()-1;
	}

	/// Return the pool index for machine constant-pool value \p V.
	///
	/// \p V is asked (getExistingMachineCPValue) whether the pool already holds
	/// a matching value. If so, \p V is recorded in MachineCPVsSharingEntries
	/// and the existing index is returned; otherwise a new entry is appended.
	unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
	                                                   unsigned Alignment) {
	  assert(Alignment && "Alignment must be specified!");
	  if (PoolAlignment < Alignment)
	    PoolAlignment = Alignment;

	  // Check to see if we already have this constant.
	  //
	  // FIXME, this could be made much more efficient for large constant pools.
	  const int Existing = V->getExistingMachineCPValue(this, Alignment);
	  if (Existing == -1) {
	    Constants.push_back(MachineConstantPoolEntry(V, Alignment));
	    return Constants.size() - 1;
	  }

	  MachineCPVsSharingEntries.insert(V);
	  return (unsigned)Existing;
	}

	/// Write the constant pool to \p OS: a "Constant Pool:" header, then one line
	/// per entry with its index, its value and its alignment. Nothing is written
	/// when the pool is empty.
	void MachineConstantPool::print(raw_ostream &OS) const {
	  if (Constants.empty()) return;

	  OS << "Constant Pool:\n";
	  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
	    OS << " cp#" << i << ": ";
	    if (Constants[i].isMachineConstantPoolEntry())
	      Constants[i].Val.MachineCPVal->print(OS);
	    else
	      Constants[i].Val.ConstVal->printAsOperand(OS, /*PrintType=*/false);
	    OS << ", align=" << Constants[i].getAlignment();
	    OS << "\n";
	  }
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void MachineConstantPool::dump() const { print(dbgs()); }
	#endif
	Index: head/contrib/llvm/lib/CodeGen/MachineInstr.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/MachineInstr.cpp (revision 322319)
	+++ head/contrib/llvm/lib/CodeGen/MachineInstr.cpp (revision 322320)
	@@ -1,2391 +1,2389 @@
	//===- lib/CodeGen/MachineInstr.cpp ---------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Methods common to all machine instructions.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/FoldingSet.h"
	#include "llvm/ADT/Hashing.h"
	#include "llvm/ADT/None.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallString.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/Loads.h"
	#include "llvm/Analysis/MemoryLocation.h"
	#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineInstrBundle.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/PseudoSourceValue.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DebugInfoMetadata.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/InstrTypes.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Metadata.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/ModuleSlotTracker.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCInstrDesc.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/LowLevelTypeImpl.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetIntrinsicInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <algorithm>
	#include <cassert>
	#include <cstddef>
	#include <cstdint>
	#include <cstring>
	#include <iterator>
	#include <utility>

	using namespace llvm;

	// Hidden command-line option "print-whole-regmask" (initial value: true).
	// MachineOperand::print consults it when listing the registers of a regmask
	// operand: when it is false, only a limited number of registers are listed
	// and the remainder is summarised as "and N more...".
	static cl::opt<bool> PrintWholeRegMask(
	"print-whole-regmask",
	cl::desc("Print the full contents of regmask operands in IR dumps"),
	cl::init(true), cl::Hidden);

	//===----------------------------------------------------------------------===//
	// MachineOperand Implementation
	//===----------------------------------------------------------------------===//

	void MachineOperand::setReg(unsigned Reg) {
	if (getReg() == Reg) return; // No change.

	// Otherwise, we have to change the register. If this operand is embedded
	// into a machine function, we need to update the old and new register's
	// use/def lists.
	if (MachineInstr *MI = getParent())
	if (MachineBasicBlock *MBB = MI->getParent())
	if (MachineFunction *MF = MBB->getParent()) {
	MachineRegisterInfo &MRI = MF->getRegInfo();
	MRI.removeRegOperandFromUseList(this);
	SmallContents.RegNo = Reg;
	MRI.addRegOperandToUseList(this);
	return;
	}

	// Otherwise, just change the register, no problem. :)
	SmallContents.RegNo = Reg;
	}

	/// Replace this operand's register with virtual register \p Reg. When both
	/// \p SubIdx and the operand's current sub-register index are non-zero they
	/// are composed via \p TRI; a non-zero resulting index is then installed.
	void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx,
	                                  const TargetRegisterInfo &TRI) {
	  assert(TargetRegisterInfo::isVirtualRegister(Reg));

	  if (SubIdx)
	    if (unsigned CurrentSubIdx = getSubReg())
	      SubIdx = TRI.composeSubRegIndices(SubIdx, CurrentSubIdx);

	  setReg(Reg);
	  if (SubIdx)
	    setSubReg(SubIdx);
	}

	/// Replace this operand's register with physical register \p Reg. If the
	/// operand has a sub-register index, \p Reg is first narrowed to that
	/// sub-register, the index is cleared, and the undef flag is cleared on defs.
	void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
	  assert(TargetRegisterInfo::isPhysicalRegister(Reg));

	  if (unsigned CurrentSubIdx = getSubReg()) {
	    // Note that TRI.getSubReg() may return 0 if the sub-register doesn't
	    // exist. That won't happen in legal code.
	    Reg = TRI.getSubReg(Reg, CurrentSubIdx);
	    setSubReg(0);
	    if (isDef())
	      setIsUndef(false);
	  }

	  setReg(Reg);
	}

	/// Change a def to a use, or a use to a def.
	///
	/// Does nothing when the flag already equals \p Val. When the operand is
	/// embedded in a machine function, it is removed from the use list before
	/// the flag changes and added back afterwards.
	void MachineOperand::setIsDef(bool Val) {
	  assert(isReg() && "Wrong MachineOperand accessor");
	  assert((!Val || !isDebug()) && "Marking a debug operation as def");
	  if (IsDef == Val)
	    return;
	  // MRI may keep uses and defs in different list positions.
	  if (MachineInstr *MI = getParent())
	    if (MachineBasicBlock *MBB = MI->getParent())
	      if (MachineFunction *MF = MBB->getParent()) {
	        MachineRegisterInfo &MRI = MF->getRegInfo();
	        MRI.removeRegOperandFromUseList(this);
	        IsDef = Val;
	        MRI.addRegOperandToUseList(this);
	        return;
	      }
	  IsDef = Val;
	}

	// If this operand is currently a register operand, and if this is in a
	// function, deregister the operand from the register's use/def list.
	// Operands that are not registers, or not on a use list, are left alone.
	void MachineOperand::removeRegFromUses() {
	  if (!isReg() || !isOnRegUseList())
	    return;

	  if (MachineInstr *MI = getParent()) {
	    if (MachineBasicBlock *MBB = MI->getParent()) {
	      if (MachineFunction *MF = MBB->getParent())
	        MF->getRegInfo().removeRegOperandFromUseList(this);
	    }
	  }
	}

	/// ChangeToImmediate - Replace this operand with a new immediate operand of
	/// the specified value.  If an operand is known to be an immediate already,
	/// the setImm method should be used.
	///
	/// A tied register operand must not be converted (asserted).
	void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
	  assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");

	  // Drop any register use-list membership before changing the kind.
	  removeRegFromUses();

	  OpKind = MO_Immediate;
	  Contents.ImmVal = ImmVal;
	}

	/// Replace this operand with a floating-point immediate operand holding
	/// \p FPImm. A tied register operand must not be converted (asserted).
	void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm) {
	  assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");

	  // Drop any register use-list membership before changing the kind.
	  removeRegFromUses();

	  OpKind = MO_FPImmediate;
	  Contents.CFP = FPImm;
	}

	/// Replace this operand with an external-symbol operand named \p SymName,
	/// with offset 0 and the given \p TargetFlags. A tied register operand must
	/// not be converted (asserted).
	void MachineOperand::ChangeToES(const char *SymName, unsigned char TargetFlags) {
	  assert((!isReg() || !isTied()) &&
	         "Cannot change a tied operand into an external symbol");

	  // Drop any register use-list membership before changing the kind.
	  removeRegFromUses();

	  OpKind = MO_ExternalSymbol;
	  Contents.OffsetedInfo.Val.SymbolName = SymName;
	  setOffset(0); // Offset is always 0.
	  setTargetFlags(TargetFlags);
	}

	/// Replace this operand with an MCSymbol operand referring to \p Sym.
	/// A tied register operand must not be converted (asserted).
	void MachineOperand::ChangeToMCSymbol(MCSymbol *Sym) {
	  assert((!isReg() || !isTied()) &&
	         "Cannot change a tied operand into an MCSymbol");

	  // Drop any register use-list membership before changing the kind.
	  removeRegFromUses();

	  OpKind = MO_MCSymbol;
	  Contents.Sym = Sym;
	}

	/// Replace this operand with a frame-index operand for index \p Idx.
	/// A tied register operand must not be converted (asserted).
	void MachineOperand::ChangeToFrameIndex(int Idx) {
	  assert((!isReg() || !isTied()) &&
	         "Cannot change a tied operand into a FrameIndex");

	  // Drop any register use-list membership before changing the kind.
	  removeRegFromUses();

	  OpKind = MO_FrameIndex;
	  setIndex(Idx);
	}

	/// ChangeToRegister - Replace this operand with a new register operand of
	/// the specified value. If an operand is known to be an register already,
	/// the setReg method should be used.
	void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
	bool isKill, bool isDead, bool isUndef,
	bool isDebug) {
	MachineRegisterInfo *RegInfo = nullptr;
	if (MachineInstr *MI = getParent())
	if (MachineBasicBlock *MBB = MI->getParent())
	if (MachineFunction *MF = MBB->getParent())
	RegInfo = &MF->getRegInfo();
	// If this operand is already a register operand, remove it from the
	// register's use/def lists.
	bool WasReg = isReg();
	if (RegInfo && WasReg)
	RegInfo->removeRegOperandFromUseList(this);

	// Change this to a register and set the reg#.
	OpKind = MO_Register;
	SmallContents.RegNo = Reg;
	SubReg_TargetFlags = 0;
	IsDef = isDef;
	IsImp = isImp;
	IsKill = isKill;
	IsDead = isDead;
	IsUndef = isUndef;
	IsInternalRead = false;
	IsEarlyClobber = false;
	IsDebug = isDebug;
	// Ensure isOnRegUseList() returns false.
	Contents.Reg.Prev = nullptr;
	// Preserve the tie when the operand was already a register.
	if (!WasReg)
	TiedTo = 0;

	// If this operand is embedded in a function, add the operand to the
	// register's use/def list.
	if (RegInfo)
	RegInfo->addRegOperandToUseList(this);
	}

	/// isIdenticalTo - Return true if this operand is identical to the specified
	/// operand. Note that this should stay in sync with the hash_value overload
	/// below.
	bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
	if (getType() != Other.getType() \|\|
	getTargetFlags() != Other.getTargetFlags())
	return false;

	switch (getType()) {
	case MachineOperand::MO_Register:
	return getReg() == Other.getReg() && isDef() == Other.isDef() &&
	getSubReg() == Other.getSubReg();
	case MachineOperand::MO_Immediate:
	return getImm() == Other.getImm();
	case MachineOperand::MO_CImmediate:
	return getCImm() == Other.getCImm();
	case MachineOperand::MO_FPImmediate:
	return getFPImm() == Other.getFPImm();
	case MachineOperand::MO_MachineBasicBlock:
	return getMBB() == Other.getMBB();
	case MachineOperand::MO_FrameIndex:
	return getIndex() == Other.getIndex();
	case MachineOperand::MO_ConstantPoolIndex:
	case MachineOperand::MO_TargetIndex:
	return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
	case MachineOperand::MO_JumpTableIndex:
	return getIndex() == Other.getIndex();
	case MachineOperand::MO_GlobalAddress:
	return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
	case MachineOperand::MO_ExternalSymbol:
	return strcmp(getSymbolName(), Other.getSymbolName()) == 0 &&
	getOffset() == Other.getOffset();
	case MachineOperand::MO_BlockAddress:
	return getBlockAddress() == Other.getBlockAddress() &&
	getOffset() == Other.getOffset();
	case MachineOperand::MO_RegisterMask:
	case MachineOperand::MO_RegisterLiveOut: {
	// Shallow compare of the two RegMasks
	const uint32_t *RegMask = getRegMask();
	const uint32_t *OtherRegMask = Other.getRegMask();
	if (RegMask == OtherRegMask)
	return true;

	// Calculate the size of the RegMask
	const MachineFunction *MF = getParent()->getParent()->getParent();
	const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
	unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;

	// Deep compare of the two RegMasks
	return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask);
	}
	case MachineOperand::MO_MCSymbol:
	return getMCSymbol() == Other.getMCSymbol();
	case MachineOperand::MO_CFIIndex:
	return getCFIIndex() == Other.getCFIIndex();
	case MachineOperand::MO_Metadata:
	return getMetadata() == Other.getMetadata();
	case MachineOperand::MO_IntrinsicID:
	return getIntrinsicID() == Other.getIntrinsicID();
	case MachineOperand::MO_Predicate:
	return getPredicate() == Other.getPredicate();
	}
	llvm_unreachable("Invalid machine operand type");
	}

	// Note: this must stay exactly in sync with isIdenticalTo above.
	//
	// Operands that isIdenticalTo() treats as equal must hash equally. That is
	// why external symbols are hashed by name content (isIdenticalTo uses
	// strcmp) and register masks by mask content (isIdenticalTo deep-compares
	// them) rather than by pointer value.
	hash_code llvm::hash_value(const MachineOperand &MO) {
	  switch (MO.getType()) {
	  case MachineOperand::MO_Register:
	    // Register operands don't have target flags.
	    return hash_combine(MO.getType(), MO.getReg(), MO.getSubReg(), MO.isDef());
	  case MachineOperand::MO_Immediate:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
	  case MachineOperand::MO_CImmediate:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm());
	  case MachineOperand::MO_FPImmediate:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm());
	  case MachineOperand::MO_MachineBasicBlock:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB());
	  case MachineOperand::MO_FrameIndex:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
	  case MachineOperand::MO_ConstantPoolIndex:
	  case MachineOperand::MO_TargetIndex:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(),
	                        MO.getOffset());
	  case MachineOperand::MO_JumpTableIndex:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
	  case MachineOperand::MO_ExternalSymbol:
	    // Wrap the name in a StringRef so the characters are hashed, not the
	    // address of the string.
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
	                        StringRef(MO.getSymbolName()));
	  case MachineOperand::MO_GlobalAddress:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(),
	                        MO.getOffset());
	  case MachineOperand::MO_BlockAddress:
	    return hash_combine(MO.getType(), MO.getTargetFlags(),
	                        MO.getBlockAddress(), MO.getOffset());
	  case MachineOperand::MO_RegisterMask:
	  case MachineOperand::MO_RegisterLiveOut: {
	    // The mask length is derived from the target's register count, which is
	    // only reachable through the enclosing function.
	    const MachineFunction *MF = nullptr;
	    if (const MachineInstr *MI = MO.getParent())
	      if (const MachineBasicBlock *MBB = MI->getParent())
	        MF = MBB->getParent();
	    // Mask size unknown: hash only what is certainly equal for identical
	    // operands.
	    if (!MF)
	      return hash_combine(MO.getType(), MO.getTargetFlags());

	    const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
	    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
	    const uint32_t *RegMask = MO.getRegMask();
	    return hash_combine(MO.getType(), MO.getTargetFlags(),
	                        hash_combine_range(RegMask, RegMask + RegMaskSize));
	  }
	  case MachineOperand::MO_Metadata:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
	  case MachineOperand::MO_MCSymbol:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
	  case MachineOperand::MO_CFIIndex:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCFIIndex());
	  case MachineOperand::MO_IntrinsicID:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
	  case MachineOperand::MO_Predicate:
	    return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate());
	  }
	  llvm_unreachable("Invalid machine operand type");
	}

	/// Convenience overload: print through a ModuleSlotTracker constructed from
	/// a null pointer, forwarding all other arguments unchanged.
	void MachineOperand::print(raw_ostream &OS, const TargetRegisterInfo *TRI,
	                           const TargetIntrinsicInfo *IntrinsicInfo) const {
	  ModuleSlotTracker NullTracker(nullptr);
	  print(OS, NullTracker, TRI, IntrinsicInfo);
	}

	void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
	const TargetRegisterInfo *TRI,
	const TargetIntrinsicInfo *IntrinsicInfo) const {
	switch (getType()) {
	case MachineOperand::MO_Register:
	OS << PrintReg(getReg(), TRI, getSubReg());

	if (isDef() \|\| isKill() \|\| isDead() \|\| isImplicit() \|\| isUndef() \|\|
	isInternalRead() \|\| isEarlyClobber() \|\| isTied()) {
	OS << '<';
	bool NeedComma = false;
	if (isDef()) {
	if (NeedComma) OS << ',';
	if (isEarlyClobber())
	OS << "earlyclobber,";
	if (isImplicit())
	OS << "imp-";
	OS << "def";
	NeedComma = true;
	// <def,read-undef> only makes sense when getSubReg() is set.
	// Don't clutter the output otherwise.
	if (isUndef() && getSubReg())
	OS << ",read-undef";
	} else if (isImplicit()) {
	OS << "imp-use";
	NeedComma = true;
	}

	if (isKill()) {
	if (NeedComma) OS << ',';
	OS << "kill";
	NeedComma = true;
	}
	if (isDead()) {
	if (NeedComma) OS << ',';
	OS << "dead";
	NeedComma = true;
	}
	if (isUndef() && isUse()) {
	if (NeedComma) OS << ',';
	OS << "undef";
	NeedComma = true;
	}
	if (isInternalRead()) {
	if (NeedComma) OS << ',';
	OS << "internal";
	NeedComma = true;
	}
	if (isTied()) {
	if (NeedComma) OS << ',';
	OS << "tied";
	if (TiedTo != 15)
	OS << unsigned(TiedTo - 1);
	}
	OS << '>';
	}
	break;
	case MachineOperand::MO_Immediate:
	OS << getImm();
	break;
	case MachineOperand::MO_CImmediate:
	getCImm()->getValue().print(OS, false);
	break;
	case MachineOperand::MO_FPImmediate:
	if (getFPImm()->getType()->isFloatTy()) {
	OS << getFPImm()->getValueAPF().convertToFloat();
	} else if (getFPImm()->getType()->isHalfTy()) {
	APFloat APF = getFPImm()->getValueAPF();
	bool Unused;
	APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused);
	OS << "half " << APF.convertToFloat();
	} else if (getFPImm()->getType()->isFP128Ty()) {
	APFloat APF = getFPImm()->getValueAPF();
	SmallString<16> Str;
	getFPImm()->getValueAPF().toString(Str);
	OS << "quad " << Str;
	} else if (getFPImm()->getType()->isX86_FP80Ty()) {
	APFloat APF = getFPImm()->getValueAPF();
	OS << "x86_fp80 0xK";
	APInt API = APF.bitcastToAPInt();
	OS << format_hex_no_prefix(API.getHiBits(16).getZExtValue(), 4,
	/Upper=/true);
	OS << format_hex_no_prefix(API.getLoBits(64).getZExtValue(), 16,
	/Upper=/true);
	} else {
	OS << getFPImm()->getValueAPF().convertToDouble();
	}
	break;
	case MachineOperand::MO_MachineBasicBlock:
	OS << "<BB#" << getMBB()->getNumber() << ">";
	break;
	case MachineOperand::MO_FrameIndex:
	OS << "<fi#" << getIndex() << '>';
	break;
	case MachineOperand::MO_ConstantPoolIndex:
	OS << "<cp#" << getIndex();
	if (getOffset()) OS << "+" << getOffset();
	OS << '>';
	break;
	case MachineOperand::MO_TargetIndex:
	OS << "<ti#" << getIndex();
	if (getOffset()) OS << "+" << getOffset();
	OS << '>';
	break;
	case MachineOperand::MO_JumpTableIndex:
	OS << "<jt#" << getIndex() << '>';
	break;
	case MachineOperand::MO_GlobalAddress:
	OS << "<ga:";
	getGlobal()->printAsOperand(OS, /PrintType=/false, MST);
	if (getOffset()) OS << "+" << getOffset();
	OS << '>';
	break;
	case MachineOperand::MO_ExternalSymbol:
	OS << "<es:" << getSymbolName();
	if (getOffset()) OS << "+" << getOffset();
	OS << '>';
	break;
	case MachineOperand::MO_BlockAddress:
	OS << '<';
	getBlockAddress()->printAsOperand(OS, /PrintType=/false, MST);
	if (getOffset()) OS << "+" << getOffset();
	OS << '>';
	break;
	case MachineOperand::MO_RegisterMask: {
	unsigned NumRegsInMask = 0;
	unsigned NumRegsEmitted = 0;
	OS << "<regmask";
	for (unsigned i = 0; i < TRI->getNumRegs(); ++i) {
	unsigned MaskWord = i / 32;
	unsigned MaskBit = i % 32;
	if (getRegMask()[MaskWord] & (1 << MaskBit)) {
	if (PrintWholeRegMask \|\| NumRegsEmitted <= 10) {
	OS << " " << PrintReg(i, TRI);
	NumRegsEmitted++;
	}
	NumRegsInMask++;
	}
	}
	if (NumRegsEmitted != NumRegsInMask)
	OS << " and " << (NumRegsInMask - NumRegsEmitted) << " more...";
	OS << ">";
	break;
	}
	case MachineOperand::MO_RegisterLiveOut:
	OS << "<regliveout>";
	break;
	case MachineOperand::MO_Metadata:
	OS << '<';
	getMetadata()->printAsOperand(OS, MST);
	OS << '>';
	break;
	case MachineOperand::MO_MCSymbol:
	OS << "<MCSym=" << *getMCSymbol() << '>';
	break;
	case MachineOperand::MO_CFIIndex:
	OS << "<call frame instruction>";
	break;
	case MachineOperand::MO_IntrinsicID: {
	Intrinsic::ID ID = getIntrinsicID();
	if (ID < Intrinsic::num_intrinsics)
	OS << "<intrinsic:@" << Intrinsic::getName(ID, None) << '>';
	else if (IntrinsicInfo)
	OS << "<intrinsic:@" << IntrinsicInfo->getName(ID) << '>';
	else
	OS << "<intrinsic:" << ID << '>';
	break;
	}
	case MachineOperand::MO_Predicate: {
	auto Pred = static_cast<CmpInst::Predicate>(getPredicate());
	OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred")
	<< CmpInst::getPredicateName(Pred) << '>';
	break;
	}
	}
	if (unsigned TF = getTargetFlags())
	OS << "[TF=" << TF << ']';
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void MachineOperand::dump() const {
	dbgs() << *this << '\n';
	}
	#endif

	//===----------------------------------------------------------------------===//
	// MachineMemOperand Implementation
	//===----------------------------------------------------------------------===//

	/// getAddrSpace - Return the LLVM IR address space number that this pointer
	/// points into. Returns 0 when V is null or holds a PseudoSourceValue, since
	/// there is no IR pointer type to query in those cases.
	unsigned MachinePointerInfo::getAddrSpace() const {
	  if (V.isNull() || V.is<const PseudoSourceValue *>())
	    return 0;
	  return cast<PointerType>(V.get<const Value *>()->getType())
	      ->getAddressSpace();
	}

	/// isDereferenceable - Return true if V is always dereferenceable for
	/// Offset + Size bytes.
	///
	/// Returns false when V does not hold an IR Value or when that Value is
	/// null; otherwise defers to isDereferenceableAndAlignedPointer with an
	/// alignment argument of 1.
	///
	/// NOTE(review): the lines prefixed with '-' and '+' below are unified-diff
	/// markers, not C++. The '+' form passes DL.getPointerSizeInBits() as the
	/// APInt width where the '-' form passed DL.getPointerSize().
	/// NOTE(review): the BasePtr declaration looks like it lost its pointer
	/// declarators (compare `V.is<const Value*>()` and the nullptr test) --
	/// confirm against upstream.
	bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
	const DataLayout &DL) const {
	// Only an IR Value can be checked; anything else held in V is rejected.
	if (!V.is<const Value*>())
	return false;

	const Value BasePtr = V.get<const Value>();
	if (BasePtr == nullptr)
	return false;

	- return isDereferenceableAndAlignedPointer(BasePtr, 1,
	- APInt(DL.getPointerSize(),
	- Offset + Size),
	- DL);
	+ return isDereferenceableAndAlignedPointer(
	+ BasePtr, 1, APInt(DL.getPointerSizeInBits(), Offset + Size), DL);
	}

	/// getConstantPool - Build a MachinePointerInfo around the constant-pool
	/// value handed out by \p MF's PSV manager.
	MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) {
	  auto &&PSVMgr = MF.getPSVManager();
	  return MachinePointerInfo(PSVMgr.getConstantPool());
	}

	/// getFixedStack - Build a MachinePointerInfo that refers to the specified
	/// FrameIndex \p FI, at byte offset \p Offset, using the fixed-stack value
	/// handed out by \p MF's PSV manager.
	MachinePointerInfo MachinePointerInfo::getFixedStack(MachineFunction &MF,
	                                                     int FI, int64_t Offset) {
	  auto &&PSVMgr = MF.getPSVManager();
	  return MachinePointerInfo(PSVMgr.getFixedStack(FI), Offset);
	}

	/// Build a MachinePointerInfo around the jump-table value handed out by
	/// \p MF's PSV manager.
	MachinePointerInfo MachinePointerInfo::getJumpTable(MachineFunction &MF) {
	  auto &&PSVMgr = MF.getPSVManager();
	  return MachinePointerInfo(PSVMgr.getJumpTable());
	}

	/// Build a MachinePointerInfo around the GOT value handed out by \p MF's
	/// PSV manager.
	MachinePointerInfo MachinePointerInfo::getGOT(MachineFunction &MF) {
	  auto &&PSVMgr = MF.getPSVManager();
	  return MachinePointerInfo(PSVMgr.getGOT());
	}

	/// Build a MachinePointerInfo around the stack value handed out by \p MF's
	/// PSV manager, at byte offset \p Offset.
	MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF,
	                                                int64_t Offset) {
	  auto &&PSVMgr = MF.getPSVManager();
	  return MachinePointerInfo(PSVMgr.getStack(), Offset);
	}

	/// Construct a memory operand.
	///
	/// \p ptrinfo is the pointer description, \p f the flags (must include load
	/// and/or store), \p s the size and \p a the base alignment (asserted to be
	/// a power of two). The alignment is stored as Log2_32(a) + 1. The sync
	/// scope and both atomic orderings are stored in AtomicInfo, with asserts
	/// that each value reads back unchanged.
	MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
	                                     uint64_t s, unsigned int a,
	                                     const AAMDNodes &AAInfo,
	                                     const MDNode *Ranges,
	                                     SyncScope::ID SSID,
	                                     AtomicOrdering Ordering,
	                                     AtomicOrdering FailureOrdering)
	    : PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlignLog2(Log2_32(a) + 1),
	      AAInfo(AAInfo), Ranges(Ranges) {
	  assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue *>() ||
	          isa<PointerType>(PtrInfo.V.get<const Value *>()->getType())) &&
	         "invalid pointer value");
	  assert(getBaseAlignment() == a && "Alignment is not a power of 2!");
	  assert((isLoad() || isStore()) && "Not a load/store!");

	  // Each field is narrower than its source type; read it back to make sure
	  // nothing was lost.
	  AtomicInfo.SSID = static_cast<unsigned>(SSID);
	  assert(getSyncScopeID() == SSID && "Value truncated");
	  AtomicInfo.Ordering = static_cast<unsigned>(Ordering);
	  assert(getOrdering() == Ordering && "Value truncated");
	  AtomicInfo.FailureOrdering = static_cast<unsigned>(FailureOrdering);
	  assert(getFailureOrdering() == FailureOrdering && "Value truncated");
	}

	/// Profile - Feed the identifying fields of this memory operand into \p ID:
	/// offset, size, opaque value pointer, flags and base alignment, in that
	/// order.
	void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
	  const auto Off = getOffset();
	  ID.AddInteger(Off);
	  ID.AddInteger(Size);

	  const auto Opaque = getOpaqueValue();
	  ID.AddPointer(Opaque);

	  const auto FlagBits = getFlags();
	  ID.AddInteger(FlagBits);

	  const auto BaseAlign = getBaseAlignment();
	  ID.AddInteger(BaseAlign);
	}

	/// Adopt the base alignment and pointer info of \p MMO when its base
	/// alignment is at least as large as ours. Flags and size must already match
	/// (asserted); the Value and Offset may differ due to CSE.
	void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
	  assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
	  assert(MMO->getSize() == getSize() && "Size mismatch!");

	  // Nothing to gain from a strictly weaker alignment.
	  if (MMO->getBaseAlignment() < getBaseAlignment())
	    return;

	  BaseAlignLog2 = Log2_32(MMO->getBaseAlignment()) + 1;
	  // The new alignment may not be applicable with the old base and offset, so
	  // take those from MMO as well.
	  PtrInfo = MMO->PtrInfo;
	}

	/// getAlignment - Return the minimum known alignment in bytes of the
	/// actual memory reference, computed as MinAlign of the base alignment and
	/// the offset.
	uint64_t MachineMemOperand::getAlignment() const {
	  const auto BaseAlign = getBaseAlignment();
	  const auto Off = getOffset();
	  return MinAlign(BaseAlign, Off);
	}

	/// Print this memory operand to \p OS using a ModuleSlotTracker that was
	/// constructed with a null argument.
	void MachineMemOperand::print(raw_ostream &OS) const {
	  ModuleSlotTracker NoModuleMST(nullptr);
	  print(OS, NoModuleMST);
	}
	/// Print a textual description of this memory operand to \p OS.
	///
	/// Output order: "Volatile " if volatile, "LD"/"ST", the size, the bracketed
	/// address (value, address space, base alignment, offset), the reference
	/// alignment, TBAA / alias.scope / noalias metadata, then the flag suffixes.
	/// \p MST is used when printing IR values and metadata operands.
	void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const {
	  assert((isLoad() || isStore()) &&
	         "SV has to be a load, store or both.");

	  if (isVolatile())
	    OS << "Volatile ";

	  if (isLoad())
	    OS << "LD";
	  if (isStore())
	    OS << "ST";
	  OS << getSize();

	  // Print the address information.
	  OS << "[";
	  if (const Value *V = getValue())
	    V->printAsOperand(OS, /*PrintType=*/false, MST);
	  else if (const PseudoSourceValue *PSV = getPseudoValue())
	    PSV->printCustom(OS);
	  else
	    OS << "<unknown>";

	  unsigned AS = getAddrSpace();
	  if (AS != 0)
	    OS << "(addrspace=" << AS << ')';

	  // If the alignment of the memory reference itself differs from the alignment
	  // of the base pointer, print the base alignment explicitly, next to the base
	  // pointer.
	  if (getBaseAlignment() != getAlignment())
	    OS << "(align=" << getBaseAlignment() << ")";

	  if (getOffset() != 0)
	    OS << "+" << getOffset();
	  OS << "]";

	  // Print the alignment of the reference.
	  if (getBaseAlignment() != getAlignment() || getBaseAlignment() != getSize())
	    OS << "(align=" << getAlignment() << ")";

	  // Print TBAA info.
	  if (const MDNode *TBAAInfo = getAAInfo().TBAA) {
	    OS << "(tbaa=";
	    if (TBAAInfo->getNumOperands() > 0)
	      TBAAInfo->getOperand(0)->printAsOperand(OS, MST);
	    else
	      OS << "<unknown>";
	    OS << ")";
	  }

	  // Print AA scope info.
	  if (const MDNode *ScopeInfo = getAAInfo().Scope) {
	    OS << "(alias.scope=";
	    if (ScopeInfo->getNumOperands() > 0) {
	      for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) {
	        ScopeInfo->getOperand(i)->printAsOperand(OS, MST);
	        if (i != ie - 1)
	          OS << ",";
	      }
	    } else {
	      OS << "<unknown>";
	    }
	    OS << ")";
	  }

	  // Print AA noalias scope info.
	  if (const MDNode *NoAliasInfo = getAAInfo().NoAlias) {
	    OS << "(noalias=";
	    if (NoAliasInfo->getNumOperands() > 0) {
	      for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) {
	        NoAliasInfo->getOperand(i)->printAsOperand(OS, MST);
	        if (i != ie - 1)
	          OS << ",";
	      }
	    } else {
	      OS << "<unknown>";
	    }
	    OS << ")";
	  }

	  if (isNonTemporal())
	    OS << "(nontemporal)";
	  if (isDereferenceable())
	    OS << "(dereferenceable)";
	  if (isInvariant())
	    OS << "(invariant)";
	  if (getFlags() & MOTargetFlag1)
	    OS << "(flag1)";
	  if (getFlags() & MOTargetFlag2)
	    OS << "(flag2)";
	  if (getFlags() & MOTargetFlag3)
	    OS << "(flag3)";
	}

	//===----------------------------------------------------------------------===//
	// MachineInstr Implementation
	//===----------------------------------------------------------------------===//

	/// Append one implicit register operand for every entry in MCID's implicit
	/// def and implicit use lists. Each list is walked until a zero entry is
	/// reached; defs are added before uses.
	void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
	  if (MCID->ImplicitDefs)
	    for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs;
	         ++ImpDefs)
	      addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true));
	  if (MCID->ImplicitUses)
	    for (const MCPhysReg *ImpUses = MCID->getImplicitUses(); *ImpUses;
	         ++ImpUses)
	      addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true));
	}

	/// MachineInstr ctor - Create a MachineInstr described by \p tid. Operand
	/// storage is reserved for the descriptor's explicit operands plus its
	/// implicit defs and uses. Unless \p NoImp is set, the implicit operands are
	/// added immediately.
	MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
	                           DebugLoc dl, bool NoImp)
	    : MCID(&tid), debugLoc(std::move(dl)) {
	  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");

	  // Reserve space for the expected number of operands, if there are any.
	  const unsigned ExpectedOps = MCID->getNumOperands() +
	                               MCID->getNumImplicitDefs() +
	                               MCID->getNumImplicitUses();
	  if (ExpectedOps != 0) {
	    CapOperands = OperandCapacity::get(ExpectedOps);
	    Operands = MF.allocateOperandArray(CapOperands);
	  }

	  if (NoImp)
	    return;
	  addImplicitDefUseOperands(MF);
	}

	/// MachineInstr ctor - Copy \p MI: same descriptor, memrefs array, debug
	/// location, operands (re-added one by one through addOperand) and flags.
	MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
	    : MCID(&MI.getDesc()), NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
	      debugLoc(MI.getDebugLoc()) {
	  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");

	  // Size the operand array for the source instruction's operand count.
	  CapOperands = OperandCapacity::get(MI.getNumOperands());
	  Operands = MF.allocateOperandArray(CapOperands);

	  // Copy operands in order.
	  for (unsigned Idx = 0, End = MI.getNumOperands(); Idx != End; ++Idx)
	    addOperand(MF, MI.getOperand(Idx));

	  // Copy all the sensible flags.
	  setFlags(MI.Flags);
	}

	/// getRegInfo - Return the MachineRegisterInfo of the function containing
	/// this instruction, or null when the instruction has no parent block.
	MachineRegisterInfo *MachineInstr::getRegInfo() {
	  MachineBasicBlock *Block = getParent();
	  if (!Block)
	    return nullptr;
	  return &Block->getParent()->getRegInfo();
	}

	/// RemoveRegOperandsFromUseLists - Unlink every register operand of this
	/// instruction from its use list in \p MRI. The operands are required to
	/// already be on their use lists.
	void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
	  for (MachineOperand &Opnd : operands()) {
	    if (!Opnd.isReg())
	      continue;
	    MRI.removeRegOperandFromUseList(&Opnd);
	  }
	}

	/// AddRegOperandsToUseLists - Add every register operand of this
	/// instruction to its use list in \p MRI. The operands are required not to
	/// be on their use lists yet.
	void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) {
	  for (MachineOperand &Opnd : operands()) {
	    if (!Opnd.isReg())
	      continue;
	    MRI.addRegOperandToUseList(&Opnd);
	  }
	}

	void MachineInstr::addOperand(const MachineOperand &Op) {
	MachineBasicBlock *MBB = getParent();
	assert(MBB && "Use MachineInstrBuilder to add operands to dangling instrs");
	MachineFunction *MF = MBB->getParent();
	assert(MF && "Use MachineInstrBuilder to add operands to dangling instrs");
	addOperand(*MF, Op);
	}

	/// Move NumOps MachineOperands from Src to Dst, with support for overlapping
	/// ranges. If MRI is non-null also update use-def chains.
	static void moveOperands(MachineOperand Dst, MachineOperand Src,
	unsigned NumOps, MachineRegisterInfo *MRI) {
	if (MRI)
	return MRI->moveOperands(Dst, Src, NumOps);

	// MachineOperand is a trivially copyable type so we can just use memmove.
	std::memmove(Dst, Src, NumOps * sizeof(MachineOperand));
	}

	/// addOperand - Add the specified operand to the instruction. If it is an
	/// implicit operand, it is added to the end of the operand list. If it is
	/// an explicit operand it is added at the end of the explicit operand list
	/// (before the first implicit operand).
	///
	/// \p MF supplies the allocator for the operand array, which is grown when
	/// it is full. Register operands are linked into the MRI use lists when the
	/// instruction is in a basic block, and explicit register operands pick up
	/// the TIED_TO and EARLY_CLOBBER constraints from MCID.
	void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
	  assert(MCID && "Cannot add operands before providing an instr descriptor");

	  // Check if we're adding one of our existing operands.
	  if (&Op >= Operands && &Op < Operands + NumOperands) {
	    // This is unusual: MI->addOperand(MI->getOperand(i)).
	    // If adding Op requires reallocating or moving existing operands around,
	    // the Op reference could go stale. Support it by copying Op.
	    MachineOperand CopyOp(Op);
	    return addOperand(MF, CopyOp);
	  }

	  // Find the insert location for the new operand. Implicit registers go at
	  // the end, everything else goes before the implicit regs.
	  //
	  // FIXME: Allow mixed explicit and implicit operands on inline asm.
	  // InstrEmitter::EmitSpecialNode() is marking inline asm clobbers as
	  // implicit-defs, but they must not be moved around. See the FIXME in
	  // InstrEmitter.cpp.
	  unsigned OpNo = getNumOperands();
	  bool isImpReg = Op.isReg() && Op.isImplicit();
	  if (!isImpReg && !isInlineAsm()) {
	    while (OpNo && Operands[OpNo - 1].isReg() &&
	           Operands[OpNo - 1].isImplicit()) {
	      --OpNo;
	      assert(!Operands[OpNo].isTied() && "Cannot move tied operands");
	    }
	  }

	#ifndef NDEBUG
	  bool isMetaDataOp = Op.getType() == MachineOperand::MO_Metadata;
	  // OpNo now points as the desired insertion point. Unless this is a variadic
	  // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
	  // RegMask operands go between the explicit and implicit operands.
	  assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
	          OpNo < MCID->getNumOperands() || isMetaDataOp) &&
	         "Trying to add an operand to a machine instr that is already done!");
	#endif

	  MachineRegisterInfo *MRI = getRegInfo();

	  // Determine if the Operands array needs to be reallocated.
	  // Save the old capacity and operand array.
	  OperandCapacity OldCap = CapOperands;
	  MachineOperand *OldOperands = Operands;
	  if (!OldOperands || OldCap.getSize() == getNumOperands()) {
	    CapOperands = OldOperands ? OldCap.getNext() : OldCap.get(1);
	    Operands = MF.allocateOperandArray(CapOperands);
	    // Move the operands before the insertion point.
	    if (OpNo)
	      moveOperands(Operands, OldOperands, OpNo, MRI);
	  }

	  // Move the operands following the insertion point.
	  if (OpNo != NumOperands)
	    moveOperands(Operands + OpNo + 1, OldOperands + OpNo, NumOperands - OpNo,
	                 MRI);
	  ++NumOperands;

	  // Deallocate the old operand array.
	  if (OldOperands != Operands && OldOperands)
	    MF.deallocateOperandArray(OldCap, OldOperands);

	  // Copy Op into place. It still needs to be inserted into the MRI use lists.
	  MachineOperand *NewMO = new (Operands + OpNo) MachineOperand(Op);
	  NewMO->ParentMI = this;

	  // When adding a register operand, tell MRI about it.
	  if (NewMO->isReg()) {
	    // Ensure isOnRegUseList() returns false, regardless of Op's status.
	    NewMO->Contents.Reg.Prev = nullptr;
	    // Ignore existing ties. This is not a property that can be copied.
	    NewMO->TiedTo = 0;
	    // Add the new operand to MRI, but only for instructions in an MBB.
	    if (MRI)
	      MRI->addRegOperandToUseList(NewMO);
	    // The MCID operand information isn't accurate until we start adding
	    // explicit operands. The implicit operands are added first, then the
	    // explicits are inserted before them.
	    if (!isImpReg) {
	      // Tie uses to defs as indicated in MCInstrDesc.
	      if (NewMO->isUse()) {
	        int DefIdx = MCID->getOperandConstraint(OpNo, MCOI::TIED_TO);
	        if (DefIdx != -1)
	          tieOperands(DefIdx, OpNo);
	      }
	      // If the register operand is flagged as early, mark the operand as such.
	      if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
	        NewMO->setIsEarlyClobber(true);
	    }
	  }
	}

	/// RemoveOperand - Erase an operand from an instruction, leaving it with one
	/// fewer operand than it started with.
	///
	void MachineInstr::RemoveOperand(unsigned OpNo) {
	assert(OpNo < getNumOperands() && "Invalid operand number");
	untieRegOperand(OpNo);

	#ifndef NDEBUG
	// Moving tied operands would break the ties.
	for (unsigned i = OpNo + 1, e = getNumOperands(); i != e; ++i)
	if (Operands[i].isReg())
	assert(!Operands[i].isTied() && "Cannot move tied operands");
	#endif

	MachineRegisterInfo *MRI = getRegInfo();
	if (MRI && Operands[OpNo].isReg())
	MRI->removeRegOperandFromUseList(Operands + OpNo);

	// Don't call the MachineOperand destructor. A lot of this code depends on
	// MachineOperand having a trivial destructor anyway, and adding a call here
	// wouldn't make it 'destructor-correct'.

	if (unsigned N = NumOperands - 1 - OpNo)
	moveOperands(Operands + OpNo, Operands + OpNo + 1, N, MRI);
	--NumOperands;
	}

	/// addMemOperand - Append \p MO to this instruction's memrefs by allocating
	/// a new array one element larger from \p MF, copying the existing entries
	/// and installing the result with setMemRefs.
	/// This function should be used only occasionally. The setMemRefs function
	/// is the primary method for setting up a MachineInstr's MemRefs list.
	void MachineInstr::addMemOperand(MachineFunction &MF,
	                                 MachineMemOperand *MO) {
	  mmo_iterator Existing = MemRefs;
	  const unsigned ExistingCount = NumMemRefs;
	  const unsigned GrownCount = NumMemRefs + 1;

	  mmo_iterator Grown = MF.allocateMemRefsArray(GrownCount);
	  std::copy(Existing, Existing + ExistingCount, Grown);
	  Grown[GrownCount - 1] = MO;

	  setMemRefs(Grown, Grown + GrownCount);
	}

	/// Check to see if the MMOs pointed to by the two MemRefs arrays are
	/// identical: same number of entries, and each pair of MachineMemOperands
	/// at the same position compares equal.
	static bool hasIdenticalMMOs(const MachineInstr &MI1, const MachineInstr &MI2) {
	  auto I1 = MI1.memoperands_begin(), E1 = MI1.memoperands_end();
	  auto I2 = MI2.memoperands_begin(), E2 = MI2.memoperands_end();
	  if ((E1 - I1) != (E2 - I2))
	    return false;
	  for (; I1 != E1; ++I1, ++I2) {
	    // Compare the memory operands themselves, not the array positions:
	    // comparing the iterators would report a mismatch for any two distinct
	    // arrays regardless of their contents.
	    if (**I1 != **I2)
	      return false;
	  }
	  return true;
	}

	/// Compute a memrefs array describing both this instruction and \p Other.
	///
	/// Returns (nullptr, 0) when either instruction has no memrefs or when the
	/// combined count does not fit in a uint8_t. Returns this instruction's own
	/// array when both have identical MMOs. Otherwise returns a newly allocated
	/// array holding this instruction's memrefs followed by Other's.
	std::pair<MachineInstr::mmo_iterator, unsigned>
	MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {

	  // If either of the incoming memrefs are empty, we must be conservative and
	  // treat this as if we've exhausted our space for memrefs and dropped them.
	  if (memoperands_empty() || Other.memoperands_empty())
	    return std::make_pair(nullptr, 0);

	  // If both instructions have identical memrefs, we don't need to merge them.
	  // Since many instructions have a single memref, and we tend to merge things
	  // like pairs of loads from the same location, this catches a large number of
	  // cases in practice.
	  if (hasIdenticalMMOs(*this, Other))
	    return std::make_pair(MemRefs, NumMemRefs);

	  // TODO: consider uniquing elements within the operand lists to reduce
	  // space usage and fall back to conservative information less often.
	  size_t CombinedNumMemRefs = NumMemRefs + Other.NumMemRefs;

	  // If we don't have enough room to store this many memrefs, be conservative
	  // and drop them. Otherwise, we'd fail asserts when trying to add them to
	  // the new instruction.
	  if (CombinedNumMemRefs != uint8_t(CombinedNumMemRefs))
	    return std::make_pair(nullptr, 0);

	  MachineFunction *MF = getParent()->getParent();
	  mmo_iterator MemBegin = MF->allocateMemRefsArray(CombinedNumMemRefs);
	  mmo_iterator MemEnd = std::copy(memoperands_begin(), memoperands_end(),
	                                  MemBegin);
	  MemEnd = std::copy(Other.memoperands_begin(), Other.memoperands_end(),
	                     MemEnd);
	  assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs &&
	         "missing memrefs");

	  return std::make_pair(MemBegin, CombinedNumMemRefs);
	}

	/// Query descriptor flag bits \p Mask across the bundle headed by this
	/// instruction (asserted to be a bundle header).
	///
	/// With AnyInBundle, returns true as soon as one instruction has the bits
	/// set. With AllInBundle, returns false as soon as a non-BUNDLE instruction
	/// lacks them. After the last instruction the result is
	/// `Type == AllInBundle`.
	bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
	  assert(!isBundledWithPred() && "Must be called on bundle header");
	  MachineBasicBlock::const_instr_iterator MII = getIterator();
	  while (true) {
	    const bool HasProperty = (MII->getDesc().getFlags() & Mask) != 0;
	    if (HasProperty && Type == AnyInBundle)
	      return true;
	    if (!HasProperty && Type == AllInBundle && !MII->isBundle())
	      return false;
	    // This was the last instruction in the bundle.
	    if (!MII->isBundledWithSucc())
	      return Type == AllInBundle;
	    ++MII;
	  }
	}

	/// Return true if this instruction is identical to \p Other under the
	/// comparison policy \p Check.
	///
	/// Opcode and operand count must match. For bundles, the bundled
	/// instructions are compared pairwise and both bundles must end together.
	/// Non-register operands must be identical. Register defs are skipped for
	/// IgnoreDefs; for IgnoreVRegDefs only defs involving a physical register
	/// must name the same register; otherwise operands must be identical and,
	/// for CheckKillDead, agree on dead (defs) or kill (uses). Two DBG_VALUEs
	/// that both have a DebugLoc must have the same one.
	bool MachineInstr::isIdenticalTo(const MachineInstr &Other,
	                                 MICheckType Check) const {
	  // If opcodes or number of operands are not the same then the two
	  // instructions are obviously not identical.
	  if (Other.getOpcode() != getOpcode() ||
	      Other.getNumOperands() != getNumOperands())
	    return false;

	  if (isBundle()) {
	    // We have passed the test above that both instructions have the same
	    // opcode, so we know that both instructions are bundles here. Let's compare
	    // MIs inside the bundle.
	    assert(Other.isBundle() && "Expected that both instructions are bundles.");
	    MachineBasicBlock::const_instr_iterator I1 = getIterator();
	    MachineBasicBlock::const_instr_iterator I2 = Other.getIterator();
	    // Loop until we analysed the last instruction inside at least one of the
	    // bundles.
	    while (I1->isBundledWithSucc() && I2->isBundledWithSucc()) {
	      ++I1;
	      ++I2;
	      if (!I1->isIdenticalTo(*I2, Check))
	        return false;
	    }
	    // If we've reached the end of just one of the two bundles, but not both,
	    // the instructions are not identical.
	    if (I1->isBundledWithSucc() || I2->isBundledWithSucc())
	      return false;
	  }

	  // Check operands to make sure they match.
	  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	    const MachineOperand &MO = getOperand(i);
	    const MachineOperand &OMO = Other.getOperand(i);
	    if (!MO.isReg()) {
	      if (!MO.isIdenticalTo(OMO))
	        return false;
	      continue;
	    }

	    // Clients may or may not want to ignore defs when testing for equality.
	    // For example, machine CSE pass only cares about finding common
	    // subexpressions, so it's safe to ignore virtual register defs.
	    if (MO.isDef()) {
	      if (Check == IgnoreDefs)
	        continue;
	      else if (Check == IgnoreVRegDefs) {
	        if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
	            TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
	          if (MO.getReg() != OMO.getReg())
	            return false;
	      } else {
	        if (!MO.isIdenticalTo(OMO))
	          return false;
	        if (Check == CheckKillDead && MO.isDead() != OMO.isDead())
	          return false;
	      }
	    } else {
	      if (!MO.isIdenticalTo(OMO))
	        return false;
	      if (Check == CheckKillDead && MO.isKill() != OMO.isKill())
	        return false;
	    }
	  }
	  // If DebugLoc does not match then two dbg.values are not identical.
	  if (isDebugValue())
	    if (getDebugLoc() && Other.getDebugLoc() &&
	        getDebugLoc() != Other.getDebugLoc())
	      return false;
	  return true;
	}

	/// Detach this instruction from its parent basic block via
	/// MachineBasicBlock::remove and return that call's result. The
	/// instruction must be in a block (asserted).
	MachineInstr *MachineInstr::removeFromParent() {
	  MachineBasicBlock *Block = getParent();
	  assert(Block && "Not embedded in a basic block!");
	  return Block->remove(this);
	}

	/// Detach this single instruction from its parent basic block via
	/// MachineBasicBlock::remove_instr and return that call's result. The
	/// instruction must be in a block (asserted).
	MachineInstr *MachineInstr::removeFromBundle() {
	  MachineBasicBlock *Block = getParent();
	  assert(Block && "Not embedded in a basic block!");
	  return Block->remove_instr(this);
	}

	void MachineInstr::eraseFromParent() {
	assert(getParent() && "Not embedded in a basic block!");
	getParent()->erase(this);
	}

	void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() {
	assert(getParent() && "Not embedded in a basic block!");
	MachineBasicBlock *MBB = getParent();
	MachineFunction *MF = MBB->getParent();
	assert(MF && "Not embedded in a function!");

	MachineInstr MI = (MachineInstr )this;
	MachineRegisterInfo &MRI = MF->getRegInfo();

	for (const MachineOperand &MO : MI->operands()) {
	if (!MO.isReg() \|\| !MO.isDef())
	continue;
	unsigned Reg = MO.getReg();
	if (!TargetRegisterInfo::isVirtualRegister(Reg))
	continue;
	MRI.markUsesInDebugValueAsUndef(Reg);
	}
	MI->eraseFromParent();
	}

	void MachineInstr::eraseFromBundle() {
	assert(getParent() && "Not embedded in a basic block!");
	getParent()->erase_instr(this);
	}

	/// getNumExplicitOperands - Returns the number of non-implicit operands.
	///
	/// For non-variadic instructions this is the descriptor's operand count.
	/// For variadic ones, every operand past that count which is not an
	/// implicit register is counted as well.
	unsigned MachineInstr::getNumExplicitOperands() const {
	  unsigned NumOperands = MCID->getNumOperands();
	  if (!MCID->isVariadic())
	    return NumOperands;

	  for (unsigned i = NumOperands, e = getNumOperands(); i != e; ++i) {
	    const MachineOperand &MO = getOperand(i);
	    if (!MO.isReg() || !MO.isImplicit())
	      NumOperands++;
	  }
	  return NumOperands;
	}

	void MachineInstr::bundleWithPred() {
	assert(!isBundledWithPred() && "MI is already bundled with its predecessor");
	setFlag(BundledPred);
	MachineBasicBlock::instr_iterator Pred = getIterator();
	--Pred;
	assert(!Pred->isBundledWithSucc() && "Inconsistent bundle flags");
	Pred->setFlag(BundledSucc);
	}

	void MachineInstr::bundleWithSucc() {
	assert(!isBundledWithSucc() && "MI is already bundled with its successor");
	setFlag(BundledSucc);
	MachineBasicBlock::instr_iterator Succ = getIterator();
	++Succ;
	assert(!Succ->isBundledWithPred() && "Inconsistent bundle flags");
	Succ->setFlag(BundledPred);
	}

	void MachineInstr::unbundleFromPred() {
	assert(isBundledWithPred() && "MI isn't bundled with its predecessor");
	clearFlag(BundledPred);
	MachineBasicBlock::instr_iterator Pred = getIterator();
	--Pred;
	assert(Pred->isBundledWithSucc() && "Inconsistent bundle flags");
	Pred->clearFlag(BundledSucc);
	}

	void MachineInstr::unbundleFromSucc() {
	assert(isBundledWithSucc() && "MI isn't bundled with its successor");
	clearFlag(BundledSucc);
	MachineBasicBlock::instr_iterator Succ = getIterator();
	++Succ;
	assert(Succ->isBundledWithPred() && "Inconsistent bundle flags");
	Succ->clearFlag(BundledPred);
	}

	/// Return true if this is an inline asm instruction whose extra-info operand
	/// has the Extra_IsAlignStack bit set; false for anything else.
	bool MachineInstr::isStackAligningInlineAsm() const {
	  if (!isInlineAsm())
	    return false;
	  const unsigned ExtraBits = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	  return (ExtraBits & InlineAsm::Extra_IsAlignStack) != 0;
	}

	/// Return the asm dialect of this inline asm instruction, derived from
	/// whether the Extra_AsmDialect bit is set in the extra-info operand. Must
	/// only be called on inline asm (asserted).
	InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
	  assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
	  const unsigned ExtraBits = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	  const bool DialectBitSet = (ExtraBits & InlineAsm::Extra_AsmDialect) != 0;
	  return InlineAsm::AsmDialect(DialectBitSet);
	}

	/// Find the flag operand that governs inline asm operand \p OpIdx.
	///
	/// Walks the operand groups starting at MIOp_FirstOperand; each group is one
	/// immediate flag word followed by the registers it announces. Returns the
	/// index of the flag word whose group contains \p OpIdx and, when \p GroupNo
	/// is non-null, stores the group number there. Returns -1 for the fixed
	/// leading operands, when a non-immediate is found where a flag word is
	/// expected, or when no group contains \p OpIdx.
	int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
	                                       unsigned *GroupNo) const {
	  assert(isInlineAsm() && "Expected an inline asm instruction");
	  assert(OpIdx < getNumOperands() && "OpIdx out of range");

	  // Ignore queries about the initial operands.
	  if (OpIdx < InlineAsm::MIOp_FirstOperand)
	    return -1;

	  unsigned Group = 0;
	  unsigned FlagPos = InlineAsm::MIOp_FirstOperand;
	  const unsigned End = getNumOperands();
	  while (FlagPos < End) {
	    const MachineOperand &FlagMO = getOperand(FlagPos);
	    // If we reach the implicit register operands, stop looking.
	    if (!FlagMO.isImm())
	      return -1;

	    const unsigned GroupLen =
	        1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
	    if (FlagPos + GroupLen > OpIdx) {
	      if (GroupNo)
	        *GroupNo = Group;
	      return FlagPos;
	    }

	    ++Group;
	    FlagPos += GroupLen;
	  }
	  return -1;
	}

	/// Return the metadata of operand 2 cast to DILocalVariable. Must only be
	/// called on a DBG_VALUE (asserted).
	const DILocalVariable *MachineInstr::getDebugVariable() const {
	  assert(isDebugValue() && "not a DBG_VALUE");
	  const MachineOperand &VarMO = getOperand(2);
	  return cast<DILocalVariable>(VarMO.getMetadata());
	}

	/// Return the metadata of operand 3 cast to DIExpression. Must only be
	/// called on a DBG_VALUE (asserted).
	const DIExpression *MachineInstr::getDebugExpression() const {
	  assert(isDebugValue() && "not a DBG_VALUE");
	  const MachineOperand &ExprMO = getOperand(3);
	  return cast<DIExpression>(ExprMO.getMetadata());
	}

	/// Return the register class constraint for operand \p OpIdx, or null if
	/// there is none.
	///
	/// For ordinary instructions the answer comes from TII->getRegClass on the
	/// descriptor. For inline asm, non-register operands have no constraint; a
	/// tied use takes the constraint of its def; the constraint is then read
	/// from the group's flag word, and registers of a memory operand group get
	/// the target's pointer register class. The instruction must be in a block
	/// inside a function (asserted).
	const TargetRegisterClass*
	MachineInstr::getRegClassConstraint(unsigned OpIdx,
	                                    const TargetInstrInfo *TII,
	                                    const TargetRegisterInfo *TRI) const {
	  assert(getParent() && "Can't have an MBB reference here!");
	  assert(getParent()->getParent() && "Can't have an MF reference here!");
	  const MachineFunction &MF = *getParent()->getParent();

	  // Most opcodes have fixed constraints in their MCInstrDesc.
	  if (!isInlineAsm())
	    return TII->getRegClass(getDesc(), OpIdx, TRI, MF);

	  if (!getOperand(OpIdx).isReg())
	    return nullptr;

	  // For tied uses on inline asm, get the constraint from the def.
	  unsigned DefIdx;
	  if (getOperand(OpIdx).isUse() && isRegTiedToDefOperand(OpIdx, &DefIdx))
	    OpIdx = DefIdx;

	  // Inline asm stores register class constraints in the flag word.
	  int FlagIdx = findInlineAsmFlagIdx(OpIdx);
	  if (FlagIdx < 0)
	    return nullptr;

	  unsigned Flag = getOperand(FlagIdx).getImm();
	  unsigned RCID;
	  if ((InlineAsm::getKind(Flag) == InlineAsm::Kind_RegUse ||
	       InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDef ||
	       InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDefEarlyClobber) &&
	      InlineAsm::hasRegClassConstraint(Flag, RCID))
	    return TRI->getRegClass(RCID);

	  // Assume that all registers in a memory operand are pointers.
	  if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
	    return TRI->getPointerRegClass(MF);

	  return nullptr;
	}

	/// Narrow \p CurRC by the constraints that operands referring to \p Reg
	/// impose. With \p ExploreBundle set, every operand in the bundle is
	/// visited; otherwise only this instruction's operands. Iteration stops as
	/// soon as CurRC becomes null, and the final CurRC is returned.
	const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
	    unsigned Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII,
	    const TargetRegisterInfo *TRI, bool ExploreBundle) const {
	  // Check every operands inside the bundle if we have
	  // been asked to.
	  if (ExploreBundle)
	    for (ConstMIBundleOperands OpndIt(*this); OpndIt.isValid() && CurRC;
	         ++OpndIt)
	      CurRC = OpndIt->getParent()->getRegClassConstraintEffectForVRegImpl(
	          OpndIt.getOperandNo(), Reg, CurRC, TII, TRI);
	  else
	    // Otherwise, just check the current operands.
	    for (unsigned i = 0, e = NumOperands; i < e && CurRC; ++i)
	      CurRC = getRegClassConstraintEffectForVRegImpl(i, Reg, CurRC, TII, TRI);
	  return CurRC;
	}

	/// Apply the constraint of operand \p OpIdx to \p CurRC if that operand is a
	/// register operand for \p Reg; otherwise return \p CurRC unchanged.
	/// \p CurRC must be non-null (asserted).
	const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVRegImpl(
	    unsigned OpIdx, unsigned Reg, const TargetRegisterClass *CurRC,
	    const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
	  assert(CurRC && "Invalid initial register class");
	  // Check if Reg is constrained by some of its use/def from MI.
	  const MachineOperand &MO = getOperand(OpIdx);
	  if (!MO.isReg() || MO.getReg() != Reg)
	    return CurRC;
	  // If yes, accumulate the constraints through the operand.
	  return getRegClassConstraintEffect(OpIdx, CurRC, TII, TRI);
	}

	/// Combine \p CurRC with the register class constraint of register operand
	/// \p OpIdx and return the result.
	///
	/// With a sub-register index: the matching super-register class when the
	/// operand has a constraint, else the sub-class of CurRC supporting that
	/// sub-register. Without one: the common sub-class of CurRC and the
	/// constraint, or CurRC unchanged when there is no constraint.
	const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect(
	    unsigned OpIdx, const TargetRegisterClass *CurRC,
	    const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
	  const TargetRegisterClass *OpRC = getRegClassConstraint(OpIdx, TII, TRI);
	  const MachineOperand &MO = getOperand(OpIdx);
	  assert(MO.isReg() &&
	         "Cannot get register constraints for non-register operand");
	  assert(CurRC && "Invalid initial register class");
	  if (unsigned SubIdx = MO.getSubReg()) {
	    if (OpRC)
	      CurRC = TRI->getMatchingSuperRegClass(CurRC, OpRC, SubIdx);
	    else
	      CurRC = TRI->getSubClassWithSubReg(CurRC, SubIdx);
	  } else if (OpRC)
	    CurRC = TRI->getCommonSubClass(CurRC, OpRC);
	  return CurRC;
	}

	/// Return the number of instructions inside the MI bundle, not counting the
	/// header instruction.
	unsigned MachineInstr::getBundleSize() const {
	MachineBasicBlock::const_instr_iterator I = getIterator();
	unsigned Size = 0;
	while (I->isBundledWithSucc()) {
	++Size;
	++I;
	}
	return Size;
	}

	/// Returns true if the MachineInstr has an implicit-use operand of exactly
	/// the given register (not considering sub/super-registers).
	bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
	for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	const MachineOperand &MO = getOperand(i);
	if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
	return true;
	}
	return false;
	}

	/// findRegisterUseOperandIdx() - Returns the index of the first operand that
	/// is a use of the specific register, or -1 if it is not found. It further
	/// tightens the search criteria to a use that kills the register if isKill
	/// is true. When \p TRI is given and both registers are physical, a use of a
	/// register for which TRI->isSubRegister(MOReg, Reg) holds also matches.
	int MachineInstr::findRegisterUseOperandIdx(
	    unsigned Reg, bool isKill, const TargetRegisterInfo *TRI) const {
	  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	    const MachineOperand &MO = getOperand(i);
	    if (!MO.isReg() || !MO.isUse())
	      continue;
	    unsigned MOReg = MO.getReg();
	    if (!MOReg)
	      continue;
	    if (MOReg == Reg || (TRI && TargetRegisterInfo::isPhysicalRegister(MOReg) &&
	                         TargetRegisterInfo::isPhysicalRegister(Reg) &&
	                         TRI->isSubRegister(MOReg, Reg)))
	      if (!isKill || MO.isKill())
	        return i;
	  }
	  return -1;
	}

	/// readsWritesVirtualRegister - Return a pair of bools (reads, writes)
	/// indicating if this instruction reads or writes Reg. This also considers
	/// partial defines.
	///
	/// If \p Ops is non-null, the index of every operand that refers to \p Reg is
	/// appended to it.
	std::pair<bool,bool>
	MachineInstr::readsWritesVirtualRegister(unsigned Reg,
	                                         SmallVectorImpl<unsigned> *Ops) const {
	  bool PartDef = false; // Partial redefine.
	  bool FullDef = false; // Full define.
	  bool Use = false;

	  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	    const MachineOperand &MO = getOperand(i);
	    if (!MO.isReg() || MO.getReg() != Reg)
	      continue;
	    if (Ops)
	      Ops->push_back(i);
	    if (MO.isUse())
	      // An undef use does not count as a read.
	      Use |= !MO.isUndef();
	    else if (MO.getSubReg() && !MO.isUndef())
	      // A partial <def,undef> doesn't count as reading the register.
	      PartDef = true;
	    else
	      FullDef = true;
	  }
	  // A partial redefine uses Reg unless there is also a full define.
	  return std::make_pair(Use || (PartDef && !FullDef), PartDef || FullDef);
	}

	/// findRegisterDefOperandIdx() - Returns the operand index that is a def of
	/// the specified register or -1 if it is not found. If isDead is true, defs
	/// that are not dead are skipped.
	///
	/// When \p TRI is non-null and both registers are physical, a def operand
	/// also matches if TRI->regsOverlap(MOReg, Reg) holds (\p Overlap set) or if
	/// TRI->isSubRegister(MOReg, Reg) holds (\p Overlap clear). With \p Overlap
	/// set and a physical \p Reg, a regmask operand that clobbers \p Reg is
	/// returned as well, regardless of \p isDead.
	int
	MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
	                                        const TargetRegisterInfo *TRI) const {
	  bool isPhys = TargetRegisterInfo::isPhysicalRegister(Reg);
	  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	    const MachineOperand &MO = getOperand(i);
	    // Accept regmask operands when Overlap is set.
	    // Ignore them when looking for a specific def operand (Overlap == false).
	    if (isPhys && Overlap && MO.isRegMask() && MO.clobbersPhysReg(Reg))
	      return i;
	    if (!MO.isReg() || !MO.isDef())
	      continue;
	    unsigned MOReg = MO.getReg();
	    bool Found = (MOReg == Reg);
	    if (!Found && TRI && isPhys &&
	        TargetRegisterInfo::isPhysicalRegister(MOReg)) {
	      if (Overlap)
	        Found = TRI->regsOverlap(MOReg, Reg);
	      else
	        Found = TRI->isSubRegister(MOReg, Reg);
	    }
	    if (Found && (!isDead || MO.isDead()))
	      return i;
	  }
	  return -1;
	}

	/// findFirstPredOperandIdx() - Find the index of the first operand in the
	/// operand list that is used to represent the predicate. It returns -1 if
	/// none is found.
	int MachineInstr::findFirstPredOperandIdx() const {
	// Don't call MCID.findFirstPredOperandIdx() because this variant
	// is sometimes called on an instruction that's not yet complete, and
	// so the number of operands is less than the MCID indicates. In
	// particular, the PTX target does this.
	const MCInstrDesc &MCID = getDesc();
	if (MCID.isPredicable()) {
	for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
	if (MCID.OpInfo[i].isPredicate())
	return i;
	}

	return -1;
	}

	// MachineOperand::TiedTo is 4 bits wide, so 15 is the largest value it can
	// hold. tieOperands() stores TiedMax as a saturated marker when the real
	// index does not fit, and findTiedOperandIdx() then searches for the
	// partner operand.
	const unsigned TiedMax = 15;

	/// tieOperands - Mark operands at DefIdx and UseIdx as tied to each other.
	///
	/// Use and def operands can be tied together, indicated by a non-zero TiedTo
	/// field. TiedTo can have these values:
	///
	/// 0: Operand is not tied to anything.
	/// 1 to TiedMax-1: Tied to getOperand(TiedTo-1).
	/// TiedMax: Tied to an operand >= TiedMax-1.
	///
	/// The tied def must be one of the first TiedMax operands on a normal
	/// instruction. INLINEASM instructions allow more tied defs.
	///
	void MachineInstr::tieOperands(unsigned DefIdx, unsigned UseIdx) {
	MachineOperand &DefMO = getOperand(DefIdx);
	MachineOperand &UseMO = getOperand(UseIdx);
	assert(DefMO.isDef() && "DefIdx must be a def operand");
	assert(UseMO.isUse() && "UseIdx must be a use operand");
	assert(!DefMO.isTied() && "Def is already tied to another use");
	assert(!UseMO.isTied() && "Use is already tied to another def");

	if (DefIdx < TiedMax)
	UseMO.TiedTo = DefIdx + 1;
	else {
	// Inline asm can use the group descriptors to find tied operands, but on
	// normal instruction, the tied def must be within the first TiedMax
	// operands.
	assert(isInlineAsm() && "DefIdx out of range");
	UseMO.TiedTo = TiedMax;
	}

	// UseIdx can be out of range, we'll search for it in findTiedOperandIdx().
	DefMO.TiedTo = std::min(UseIdx + 1, TiedMax);
	}

	/// Given the index of a tied register operand, find the operand it is tied to.
	/// Defs are tied to uses and vice versa. Returns the index of the tied operand
	/// which must exist.
	///
	/// Three cases are handled in turn: a TiedTo value below TiedMax encodes the
	/// partner index directly; a saturated value on a normal instruction is
	/// resolved by a search over the operands from TiedMax-1 onwards; a saturated
	/// value on inline asm is resolved from the operand group descriptor flags.
	unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
	const MachineOperand &MO = getOperand(OpIdx);
	assert(MO.isTied() && "Operand isn't tied");

	// Normally TiedTo is in range.
	if (MO.TiedTo < TiedMax)
	return MO.TiedTo - 1;

	// Uses on normal instructions can be out of range.
	if (!isInlineAsm()) {
	// Normal tied defs must be in the 0..TiedMax-1 range.
	// A use carrying the saturated value is therefore tied to def TiedMax-1.
	if (MO.isUse())
	return TiedMax - 1;
	// MO is a def. Search for the tied use.
	// The scan starts at TiedMax-1 and looks for the use whose TiedTo names
	// this def (OpIdx + 1).
	for (unsigned i = TiedMax - 1, e = getNumOperands(); i != e; ++i) {
	const MachineOperand &UseMO = getOperand(i);
	if (UseMO.isReg() && UseMO.isUse() && UseMO.TiedTo == OpIdx + 1)
	return i;
	}
	llvm_unreachable("Can't find tied use");
	}

	// Now deal with inline asm by parsing the operand group descriptor flags.
	// Find the beginning of each operand group.
	// GroupIdx[g] is the operand index of the flag word that opens group g.
	SmallVector<unsigned, 8> GroupIdx;
	// Group containing OpIdx; ~0u until that group has been reached.
	unsigned OpIdxGroup = ~0u;
	// Assigned in the loop body before the increment expression reads it.
	unsigned NumOps;
	for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
	i += NumOps) {
	const MachineOperand &FlagMO = getOperand(i);
	assert(FlagMO.isImm() && "Invalid tied operand on inline asm");
	unsigned CurGroup = GroupIdx.size();
	GroupIdx.push_back(i);
	// The group spans the flag word plus the registers it describes.
	NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
	// OpIdx belongs to this operand group.
	if (OpIdx > i && OpIdx < i + NumOps)
	OpIdxGroup = CurGroup;
	unsigned TiedGroup;
	if (!InlineAsm::isUseOperandTiedToDef(FlagMO.getImm(), TiedGroup))
	continue;
	// Operands in this group are tied to operands in TiedGroup which must be
	// earlier. Find the number of operands between the two groups.
	unsigned Delta = i - GroupIdx[TiedGroup];

	// OpIdx is a use tied to TiedGroup.
	if (OpIdxGroup == CurGroup)
	return OpIdx - Delta;

	// OpIdx is a def tied to this use group.
	if (OpIdxGroup == TiedGroup)
	return OpIdx + Delta;
	}
	llvm_unreachable("Invalid tied operand on inline asm");
	}

	/// clearKillInfo - Clears kill flags on all operands.
	///
	void MachineInstr::clearKillInfo() {
	for (MachineOperand &MO : operands()) {
	if (MO.isReg() && MO.isUse())
	MO.setIsKill(false);
	}
	}

	/// Replace every register operand that names \p FromReg with \p ToReg.
	///
	/// For a physical \p ToReg, a non-zero \p SubIdx is first resolved through
	/// RegInfo.getSubReg() and each matching operand is rewritten with
	/// substPhysReg(). Otherwise each matching operand is rewritten with
	/// substVirtReg(), which receives \p SubIdx unchanged.
	void MachineInstr::substituteRegister(unsigned FromReg,
	                                      unsigned ToReg,
	                                      unsigned SubIdx,
	                                      const TargetRegisterInfo &RegInfo) {
	  if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
	    if (SubIdx)
	      ToReg = RegInfo.getSubReg(ToReg, SubIdx);
	    for (MachineOperand &MO : operands()) {
	      if (!MO.isReg() || MO.getReg() != FromReg)
	        continue;
	      MO.substPhysReg(ToReg, RegInfo);
	    }
	  } else {
	    for (MachineOperand &MO : operands()) {
	      if (!MO.isReg() || MO.getReg() != FromReg)
	        continue;
	      MO.substVirtReg(ToReg, SubIdx, RegInfo);
	    }
	  }
	}

	/// isSafeToMove - Return true if it is safe to move this instruction. If
	/// SawStore is set to true, it means that there is a store (or call) between
	/// the instruction's location and its intended destination.
	///
	/// \p SawStore is also an output: it is set to true when this instruction
	/// itself may store, is a call, or is a load with an ordered memory
	/// reference.
	bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
	  // Ignore stuff that we obviously can't move.
	  //
	  // Treat volatile loads as stores. This is not strictly necessary for
	  // volatiles, but it is required for atomic loads. It is not allowed to move
	  // a load across an atomic load with Ordering > Monotonic.
	  if (mayStore() || isCall() ||
	      (mayLoad() && hasOrderedMemoryRef())) {
	    SawStore = true;
	    return false;
	  }

	  if (isPosition() || isDebugValue() || isTerminator() ||
	      hasUnmodeledSideEffects())
	    return false;

	  // See if this instruction does a load. If so, we have to guarantee that the
	  // loaded value doesn't change between the load and its intended
	  // destination. The check for isDereferenceableInvariantLoad gives the
	  // target the chance to classify the load as always returning a constant,
	  // e.g. a constant pool load.
	  if (mayLoad() && !isDereferenceableInvariantLoad(AA))
	    // Otherwise, this is a real load. If there is a store between the load and
	    // end of block, we can't move it.
	    return !SawStore;

	  return true;
	}

	/// Return true if the memory accessed by this instruction and by \p Other may
	/// overlap. False is returned only when neither instruction may store, when
	/// the target reports the accesses as trivially disjoint, or when \p AA
	/// answers NoAlias. With a null \p AA, several memory operands on either
	/// side, or a memory operand that has no IR value, the answer is
	/// conservatively true. \p UseTBAA selects whether the operands' AA metadata
	/// is passed to the query.
	bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other,
	                            bool UseTBAA) {
	  const MachineFunction *MF = getParent()->getParent();
	  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();

	  // If neither instruction stores to memory, they can't alias in any
	  // meaningful way, even if they read from the same address.
	  if (!mayStore() && !Other.mayStore())
	    return false;

	  // Let the target decide if memory accesses cannot possibly overlap.
	  if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA))
	    return false;

	  if (!AA)
	    return true;

	  // FIXME: Need to handle multiple memory operands to support all targets.
	  if (!hasOneMemOperand() || !Other.hasOneMemOperand())
	    return true;

	  // Each side has exactly one memory operand here; memoperands_begin() is
	  // dereferenced to fetch it.
	  MachineMemOperand *MMOa = *memoperands_begin();
	  MachineMemOperand *MMOb = *Other.memoperands_begin();

	  if (!MMOa->getValue() || !MMOb->getValue())
	    return true;

	  // The following interface to AA is fashioned after DAGCombiner::isAlias
	  // and operates with MachineMemOperand offset with some important
	  // assumptions:
	  // - LLVM fundamentally assumes flat address spaces.
	  // - MachineOperand offset can only result from legalization and
	  // cannot affect queries other than the trivial case of overlap
	  // checking.
	  // - These offsets never wrap and never step outside
	  // of allocated objects.
	  // - There should never be any negative offsets here.
	  //
	  // FIXME: Modify API to hide this math from "user"
	  // FIXME: Even before we go to AA we can reason locally about some
	  // memory objects. It can save compile time, and possibly catch some
	  // corner cases not currently covered.

	  assert((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
	  assert((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");

	  // Sizes handed to AA are measured from the smaller of the two offsets, so
	  // the operand with the larger offset gets a correspondingly larger extent.
	  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
	  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
	  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;

	  AliasResult AAResult =
	      AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
	                               UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
	                MemoryLocation(MMOb->getValue(), Overlapb,
	                               UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));

	  return (AAResult != NoAlias);
	}

	/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
	/// or volatile memory reference, or if the information describing the memory
	/// reference is not available. Return false if it is known to have no ordered
	/// memory references.
	bool MachineInstr::hasOrderedMemoryRef() const {
	// An instruction known never to access memory won't have a volatile access.
	if (!mayStore() &&
	!mayLoad() &&
	!isCall() &&
	!hasUnmodeledSideEffects())
	return false;

	// Otherwise, if the instruction has no memory reference information,
	// conservatively assume it wasn't preserved.
	if (memoperands_empty())
	return true;

	// Check if any of our memory operands are ordered.
	return llvm::any_of(memoperands(), [](const MachineMemOperand *MMO) {
	return !MMO->isUnordered();
	});
	}

	/// isDereferenceableInvariantLoad - True when this instruction is a load that
	/// never traps and reads a location whose value is invariant across a run of
	/// this function. Every memory operand has to qualify; anything unknown is
	/// answered with false.
	bool MachineInstr::isDereferenceableInvariantLoad(AliasAnalysis *AA) const {
	  // Something that does not load cannot be an invariant load.
	  if (!mayLoad())
	    return false;

	  // With the memoperands lost there is nothing to reason about; be
	  // conservative.
	  if (memoperands_empty())
	    return false;

	  const MachineFrameInfo &MFI = getParent()->getParent()->getFrameInfo();

	  for (MachineMemOperand *MMO : memoperands()) {
	    if (MMO->isVolatile() || MMO->isStore())
	      return false;

	    if (MMO->isInvariant() && MMO->isDereferenceable())
	      continue;

	    // A load from a constant PseudoSourceValue is invariant.
	    const PseudoSourceValue *PSV = MMO->getPseudoValue();
	    if (PSV && PSV->isConstant(&MFI))
	      continue;

	    // If an AliasAnalysis is available, ask it whether the memory is
	    // constant.
	    const Value *V = MMO->getValue();
	    if (V && AA &&
	        AA->pointsToConstantMemory(
	            MemoryLocation(V, MMO->getSize(), MMO->getAAInfo())))
	      continue;

	    // Nothing proved this operand invariant.
	    return false;
	  }

	  // Every memory operand qualified.
	  return true;
	}

	/// isConstantValuePHI - For a PHI whose incoming values are all the same
	/// register, return that register. Returns 0 for a non-PHI or when the
	/// incoming registers differ.
	unsigned MachineInstr::isConstantValuePHI() const {
	  if (!isPHI())
	    return 0;
	  assert(getNumOperands() >= 3 &&
	         "It's illegal to have a PHI without source operands");

	  // Incoming registers sit at the odd operand indices 1, 3, 5, ...
	  const unsigned FirstReg = getOperand(1).getReg();
	  const unsigned NumOps = getNumOperands();
	  unsigned Idx = 3;
	  while (Idx < NumOps) {
	    if (getOperand(Idx).getReg() != FirstReg)
	      return 0;
	    Idx += 2;
	  }
	  return FirstReg;
	}

	/// True when the instruction has the UnmodeledSideEffects property, or when
	/// it is inline asm whose extra-info operand has Extra_HasSideEffects set.
	bool MachineInstr::hasUnmodeledSideEffects() const {
	  if (hasProperty(MCID::UnmodeledSideEffects))
	    return true;
	  if (!isInlineAsm())
	    return false;

	  const unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	  return (ExtraInfo & InlineAsm::Extra_HasSideEffects) != 0;
	}

	/// True when this instruction may store, is a call, or has unmodeled side
	/// effects.
	bool MachineInstr::isLoadFoldBarrier() const {
	  return mayStore() || isCall() || hasUnmodeledSideEffects();
	}

	/// allDefsAreDead - Return true if all the defs of this instruction are dead.
	/// An instruction with no register def operands also yields true.
	///
	bool MachineInstr::allDefsAreDead() const {
	  for (const MachineOperand &MO : operands()) {
	    // Only register defs are of interest.
	    if (!MO.isReg() || MO.isUse())
	      continue;
	    if (!MO.isDead())
	      return false;
	  }
	  return true;
	}

	/// copyImplicitOps - Copy implicit register operands from specified
	/// instruction to this instruction. Regmask operands are copied as well.
	/// Only operands of \p MI beyond the count given by its descriptor are
	/// examined.
	void MachineInstr::copyImplicitOps(MachineFunction &MF,
	                                   const MachineInstr &MI) {
	  for (unsigned i = MI.getDesc().getNumOperands(), e = MI.getNumOperands();
	       i != e; ++i) {
	    const MachineOperand &MO = MI.getOperand(i);
	    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
	      addOperand(MF, MO);
	  }
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void MachineInstr::dump() const {
	dbgs() << " ";
	print(dbgs());
	}
	#endif

	/// Convenience overload: builds a ModuleSlotTracker for the enclosing module
	/// (or for a null module when this instruction has no parent block or
	/// function) and forwards to the overload that takes a tracker.
	void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc,
	                         const TargetInstrInfo *TII) const {
	  const Module *M = nullptr;
	  const MachineBasicBlock *MBB = getParent();
	  const MachineFunction *MF = MBB ? MBB->getParent() : nullptr;
	  if (MF)
	    M = MF->getFunction()->getParent();

	  ModuleSlotTracker MST(M);
	  print(OS, MST, SkipOpers, SkipDebugLoc, TII);
	}

	/// Print a textual form of this instruction to \p OS.
	///
	/// Output order: the leading explicit register defs followed by " = ", the
	/// opcode name ("UNKNOWN" when no TargetInstrInfo is available), the
	/// remaining operands, then after a single ';' the optional "flags:",
	/// "mem:", virtual register class and debug location sections, and finally a
	/// newline.
	///
	/// \param MST           slot tracker handed to the operand printers.
	/// \param SkipOpers     return right after the opcode name; no newline is
	///                      written in that case.
	/// \param SkipDebugLoc  return before the "dbg:" section and the newline,
	///                      unless the DBG_VALUE branch applies.
	/// \param TII           used for the opcode name; when null it is taken from
	///                      the parent function's subtarget if there is one.
	void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
	                         bool SkipOpers, bool SkipDebugLoc,
	                         const TargetInstrInfo *TII) const {
	  // We can be a bit tidier if we know the MachineFunction.
	  const MachineFunction *MF = nullptr;
	  const TargetRegisterInfo *TRI = nullptr;
	  const MachineRegisterInfo *MRI = nullptr;
	  const TargetIntrinsicInfo *IntrinsicInfo = nullptr;

	  if (const MachineBasicBlock *MBB = getParent()) {
	    MF = MBB->getParent();
	    if (MF) {
	      MRI = &MF->getRegInfo();
	      TRI = MF->getSubtarget().getRegisterInfo();
	      if (!TII)
	        TII = MF->getSubtarget().getInstrInfo();
	      IntrinsicInfo = MF->getTarget().getIntrinsicInfo();
	    }
	  }

	  // Save a list of virtual registers.
	  SmallVector<unsigned, 8> VirtRegs;

	  // Print explicitly defined operands on the left of an assignment syntax.
	  unsigned StartOp = 0, e = getNumOperands();
	  for (; StartOp < e && getOperand(StartOp).isReg() &&
	         getOperand(StartOp).isDef() &&
	         !getOperand(StartOp).isImplicit();
	       ++StartOp) {
	    if (StartOp != 0) OS << ", ";
	    getOperand(StartOp).print(OS, MST, TRI, IntrinsicInfo);
	    unsigned Reg = getOperand(StartOp).getReg();
	    if (TargetRegisterInfo::isVirtualRegister(Reg)) {
	      VirtRegs.push_back(Reg);
	      LLT Ty = MRI ? MRI->getType(Reg) : LLT{};
	      if (Ty.isValid())
	        OS << '(' << Ty << ')';
	    }
	  }

	  if (StartOp != 0)
	    OS << " = ";

	  // Print the opcode name.
	  if (TII)
	    OS << TII->getName(getOpcode());
	  else
	    OS << "UNKNOWN";

	  if (SkipOpers)
	    return;

	  // Print the rest of the operands.
	  bool FirstOp = true;
	  // Index of the next inline asm operand descriptor; ~0u when not inline asm.
	  unsigned AsmDescOp = ~0u;
	  unsigned AsmOpCount = 0;

	  if (isInlineAsm() && e >= InlineAsm::MIOp_FirstOperand) {
	    // Print asm string.
	    OS << " ";
	    getOperand(InlineAsm::MIOp_AsmString).print(OS, MST, TRI);

	    // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
	    unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
	    if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
	      OS << " [sideeffect]";
	    if (ExtraInfo & InlineAsm::Extra_MayLoad)
	      OS << " [mayload]";
	    if (ExtraInfo & InlineAsm::Extra_MayStore)
	      OS << " [maystore]";
	    if (ExtraInfo & InlineAsm::Extra_IsConvergent)
	      OS << " [isconvergent]";
	    if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
	      OS << " [alignstack]";
	    if (getInlineAsmDialect() == InlineAsm::AD_ATT)
	      OS << " [attdialect]";
	    if (getInlineAsmDialect() == InlineAsm::AD_Intel)
	      OS << " [inteldialect]";

	    StartOp = AsmDescOp = InlineAsm::MIOp_FirstOperand;
	    FirstOp = false;
	  }

	  for (unsigned i = StartOp; i != e; ++i) {
	    const MachineOperand &MO = getOperand(i);

	    if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
	      VirtRegs.push_back(MO.getReg());

	    if (FirstOp) FirstOp = false; else OS << ",";
	    OS << " ";
	    // Descriptor information exists only for the first NumOperands operands.
	    if (i < getDesc().NumOperands) {
	      const MCOperandInfo &MCOI = getDesc().OpInfo[i];
	      if (MCOI.isPredicate())
	        OS << "pred:";
	      if (MCOI.isOptionalDef())
	        OS << "opt:";
	    }
	    if (isDebugValue() && MO.isMetadata()) {
	      // Pretty print DBG_VALUE instructions.
	      auto *DIV = dyn_cast<DILocalVariable>(MO.getMetadata());
	      if (DIV && !DIV->getName().empty())
	        OS << "!\"" << DIV->getName() << '\"';
	      else
	        MO.print(OS, MST, TRI);
	    } else if (TRI && (isInsertSubreg() || isRegSequence() ||
	                       (isSubregToReg() && i == 3)) && MO.isImm()) {
	      OS << TRI->getSubRegIndexName(MO.getImm());
	    } else if (i == AsmDescOp && MO.isImm()) {
	      // Pretty print the inline asm operand descriptor.
	      OS << '$' << AsmOpCount++;
	      unsigned Flag = MO.getImm();
	      switch (InlineAsm::getKind(Flag)) {
	      case InlineAsm::Kind_RegUse: OS << ":[reguse"; break;
	      case InlineAsm::Kind_RegDef: OS << ":[regdef"; break;
	      case InlineAsm::Kind_RegDefEarlyClobber: OS << ":[regdef-ec"; break;
	      case InlineAsm::Kind_Clobber: OS << ":[clobber"; break;
	      case InlineAsm::Kind_Imm: OS << ":[imm"; break;
	      case InlineAsm::Kind_Mem: OS << ":[mem"; break;
	      default: OS << ":[??" << InlineAsm::getKind(Flag); break;
	      }

	      unsigned RCID = 0;
	      if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
	          InlineAsm::hasRegClassConstraint(Flag, RCID)) {
	        if (TRI) {
	          OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
	        } else
	          OS << ":RC" << RCID;
	      }

	      if (InlineAsm::isMemKind(Flag)) {
	        unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
	        switch (MCID) {
	        case InlineAsm::Constraint_es: OS << ":es"; break;
	        case InlineAsm::Constraint_i: OS << ":i"; break;
	        case InlineAsm::Constraint_m: OS << ":m"; break;
	        case InlineAsm::Constraint_o: OS << ":o"; break;
	        case InlineAsm::Constraint_v: OS << ":v"; break;
	        case InlineAsm::Constraint_Q: OS << ":Q"; break;
	        case InlineAsm::Constraint_R: OS << ":R"; break;
	        case InlineAsm::Constraint_S: OS << ":S"; break;
	        case InlineAsm::Constraint_T: OS << ":T"; break;
	        case InlineAsm::Constraint_Um: OS << ":Um"; break;
	        case InlineAsm::Constraint_Un: OS << ":Un"; break;
	        case InlineAsm::Constraint_Uq: OS << ":Uq"; break;
	        case InlineAsm::Constraint_Us: OS << ":Us"; break;
	        case InlineAsm::Constraint_Ut: OS << ":Ut"; break;
	        case InlineAsm::Constraint_Uv: OS << ":Uv"; break;
	        case InlineAsm::Constraint_Uy: OS << ":Uy"; break;
	        case InlineAsm::Constraint_X: OS << ":X"; break;
	        case InlineAsm::Constraint_Z: OS << ":Z"; break;
	        case InlineAsm::Constraint_ZC: OS << ":ZC"; break;
	        case InlineAsm::Constraint_Zy: OS << ":Zy"; break;
	        default: OS << ":?"; break;
	        }
	      }

	      unsigned TiedTo = 0;
	      if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
	        OS << " tiedto:$" << TiedTo;

	      OS << ']';

	      // Compute the index of the next operand descriptor.
	      AsmDescOp += 1 + InlineAsm::getNumOperandRegisters(Flag);
	    } else
	      MO.print(OS, MST, TRI);
	  }

	  // The trailing sections share one ';' separator, written by whichever
	  // section prints first.
	  bool HaveSemi = false;
	  const unsigned PrintableFlags = FrameSetup | FrameDestroy;
	  if (Flags & PrintableFlags) {
	    if (!HaveSemi) {
	      OS << ";";
	      HaveSemi = true;
	    }
	    OS << " flags: ";

	    if (Flags & FrameSetup)
	      OS << "FrameSetup";

	    if (Flags & FrameDestroy)
	      OS << "FrameDestroy";
	  }

	  if (!memoperands_empty()) {
	    if (!HaveSemi) {
	      OS << ";";
	      HaveSemi = true;
	    }

	    OS << " mem:";
	    for (mmo_iterator I = memoperands_begin(), E = memoperands_end();
	         I != E; ++I) {
	      (*I)->print(OS, MST);
	      if (std::next(I) != E)
	        OS << " ";
	    }
	  }

	  // Print the regclass of any virtual registers encountered.
	  // MRI and TRI are both set only when the parent function is known, so TRI
	  // is non-null inside this branch.
	  if (MRI && !VirtRegs.empty()) {
	    if (!HaveSemi) {
	      OS << ";";
	      HaveSemi = true;
	    }
	    for (unsigned i = 0; i != VirtRegs.size(); ++i) {
	      const RegClassOrRegBank &RC = MRI->getRegClassOrRegBank(VirtRegs[i]);
	      if (!RC)
	        continue;
	      // Generic virtual registers do not have register classes.
	      if (RC.is<const RegisterBank *>())
	        OS << " " << RC.get<const RegisterBank *>()->getName();
	      else
	        OS << " "
	           << TRI->getRegClassName(RC.get<const TargetRegisterClass *>());
	      OS << ':' << PrintReg(VirtRegs[i]);
	      // Fold later registers of the same class/bank into this group and
	      // drop them from the list. j advances only when nothing was erased.
	      for (unsigned j = i+1; j != VirtRegs.size();) {
	        if (MRI->getRegClassOrRegBank(VirtRegs[j]) != RC) {
	          ++j;
	          continue;
	        }
	        if (VirtRegs[i] != VirtRegs[j])
	          OS << "," << PrintReg(VirtRegs[j]);
	        VirtRegs.erase(VirtRegs.begin()+j);
	      }
	    }
	  }

	  // Print debug location information.
	  if (isDebugValue() && getOperand(e - 2).isMetadata()) {
	    if (!HaveSemi)
	      OS << ";";
	    auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata());
	    OS << " line no:" << DV->getLine();
	    if (auto *InlinedAt = debugLoc->getInlinedAt()) {
	      DebugLoc InlinedAtDL(InlinedAt);
	      if (InlinedAtDL && MF) {
	        OS << " inlined @[ ";
	        InlinedAtDL.print(OS);
	        OS << " ]";
	      }
	    }
	    if (isIndirectDebugValue())
	      OS << " indirect";
	  } else if (SkipDebugLoc) {
	    return;
	  } else if (debugLoc && MF) {
	    if (!HaveSemi)
	      OS << ";";
	    OS << " dbg:";
	    debugLoc.print(OS);
	  }

	  OS << '\n';
	}

	/// Mark \p IncomingReg as killed by this instruction.
	///
	/// The first non-undef, non-debug use of exactly \p IncomingReg gets its
	/// kill flag set, unless it is already killed or is a physical register use
	/// tied to a def. For a physical register with aliases, an existing kill of
	/// a super-register makes this a no-op, and existing kills of sub-registers
	/// are trimmed. If no matching use exists and \p AddIfNotFound is set, an
	/// implicit killed use operand is appended.
	///
	/// \returns true if the register is (now) covered by a kill or the tied-def
	/// exception applied; false if nothing matched and nothing was added.
	bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
	                                     const TargetRegisterInfo *RegInfo,
	                                     bool AddIfNotFound) {
	  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
	  bool hasAliases = isPhysReg &&
	      MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
	  bool Found = false;
	  SmallVector<unsigned,4> DeadOps;
	  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	    MachineOperand &MO = getOperand(i);
	    if (!MO.isReg() || !MO.isUse() || MO.isUndef())
	      continue;

	    // DEBUG_VALUE nodes do not contribute to code generation and should
	    // always be ignored. Failure to do so may result in trying to modify
	    // KILL flags on DEBUG_VALUE nodes.
	    if (MO.isDebug())
	      continue;

	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;

	    if (Reg == IncomingReg) {
	      if (!Found) {
	        if (MO.isKill())
	          // The register is already marked kill.
	          return true;
	        if (isPhysReg && isRegTiedToDefOperand(i))
	          // Two-address uses of physregs must not be marked kill.
	          return true;
	        MO.setIsKill();
	        Found = true;
	      }
	    } else if (hasAliases && MO.isKill() &&
	               TargetRegisterInfo::isPhysicalRegister(Reg)) {
	      // A super-register kill already exists.
	      if (RegInfo->isSuperRegister(IncomingReg, Reg))
	        return true;
	      if (RegInfo->isSubRegister(IncomingReg, Reg))
	        DeadOps.push_back(i);
	    }
	  }

	  // Trim unneeded kill operands. Indices were collected in ascending order
	  // and are processed from the back, so removals do not shift pending ones.
	  while (!DeadOps.empty()) {
	    unsigned OpIdx = DeadOps.back();
	    if (getOperand(OpIdx).isImplicit())
	      RemoveOperand(OpIdx);
	    else
	      getOperand(OpIdx).setIsKill(false);
	    DeadOps.pop_back();
	  }

	  // If not found, this means an alias of one of the operands is killed. Add a
	  // new implicit operand if required.
	  if (!Found && AddIfNotFound) {
	    addOperand(MachineOperand::CreateReg(IncomingReg,
	                                         false /*IsDef*/,
	                                         true /*IsImp*/,
	                                         true /*IsKill*/));
	    return true;
	  }
	  return Found;
	}

	/// Clear the kill flag on every killed use operand whose register equals
	/// \p Reg. For a physical \p Reg with a non-null \p RegInfo, operands whose
	/// register overlaps \p Reg are cleared as well.
	void MachineInstr::clearRegisterKills(unsigned Reg,
	                                      const TargetRegisterInfo *RegInfo) {
	  // Overlap queries are made only for physical registers.
	  if (!TargetRegisterInfo::isPhysicalRegister(Reg))
	    RegInfo = nullptr;
	  for (MachineOperand &MO : operands()) {
	    if (!MO.isReg() || !MO.isUse() || !MO.isKill())
	      continue;
	    unsigned OpReg = MO.getReg();
	    if ((RegInfo && RegInfo->regsOverlap(Reg, OpReg)) || Reg == OpReg)
	      MO.setIsKill(false);
	  }
	}

	/// Mark \p Reg as dead on this instruction.
	///
	/// Every def operand of exactly \p Reg gets its dead flag set. For a
	/// physical register with aliases, an existing dead def of a super-register
	/// makes the function return true immediately, and existing dead defs of
	/// sub-registers are trimmed. If no def of \p Reg exists and
	/// \p AddIfNotFound is set, an implicit dead def operand is appended.
	///
	/// \returns true if a def was marked, a dead super-register def exists, or
	/// an operand was added; false otherwise.
	bool MachineInstr::addRegisterDead(unsigned Reg,
	                                   const TargetRegisterInfo *RegInfo,
	                                   bool AddIfNotFound) {
	  bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg);
	  bool hasAliases = isPhysReg &&
	      MCRegAliasIterator(Reg, RegInfo, false).isValid();
	  bool Found = false;
	  SmallVector<unsigned,4> DeadOps;
	  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
	    MachineOperand &MO = getOperand(i);
	    if (!MO.isReg() || !MO.isDef())
	      continue;
	    unsigned MOReg = MO.getReg();
	    if (!MOReg)
	      continue;

	    if (MOReg == Reg) {
	      MO.setIsDead();
	      Found = true;
	    } else if (hasAliases && MO.isDead() &&
	               TargetRegisterInfo::isPhysicalRegister(MOReg)) {
	      // There exists a super-register that's marked dead.
	      if (RegInfo->isSuperRegister(Reg, MOReg))
	        return true;
	      if (RegInfo->isSubRegister(Reg, MOReg))
	        DeadOps.push_back(i);
	    }
	  }

	  // Trim unneeded dead operands. Indices were collected in ascending order
	  // and are processed from the back, so removals do not shift pending ones.
	  while (!DeadOps.empty()) {
	    unsigned OpIdx = DeadOps.back();
	    if (getOperand(OpIdx).isImplicit())
	      RemoveOperand(OpIdx);
	    else
	      getOperand(OpIdx).setIsDead(false);
	    DeadOps.pop_back();
	  }

	  // If not found, this means an alias of one of the operands is dead. Add a
	  // new implicit operand if required.
	  if (Found || !AddIfNotFound)
	    return Found;

	  addOperand(MachineOperand::CreateReg(Reg,
	                                       true /*IsDef*/,
	                                       true /*IsImp*/,
	                                       false /*IsKill*/,
	                                       true /*IsDead*/));
	  return true;
	}

	/// Clear the dead flag on every def operand of exactly \p Reg.
	void MachineInstr::clearRegisterDeads(unsigned Reg) {
	  for (MachineOperand &MO : operands()) {
	    if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
	      continue;
	    MO.setIsDead(false);
	  }
	}

	/// Set the undef flag to \p IsUndef on every def operand of \p Reg that has
	/// a non-zero sub-register index. Defs without a sub-register index are left
	/// untouched.
	void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) {
	  for (MachineOperand &MO : operands()) {
	    if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg ||
	        MO.getSubReg() == 0)
	      continue;
	    MO.setIsUndef(IsUndef);
	  }
	}

	/// Ensure this instruction defines \p Reg, appending an implicit def operand
	/// if no suitable def exists yet.
	///
	/// For a physical register the existing def is looked up with
	/// findRegisterDefOperand(Reg, false, RegInfo). Otherwise a def operand of
	/// exactly \p Reg with no sub-register index is required.
	void MachineInstr::addRegisterDefined(unsigned Reg,
	                                      const TargetRegisterInfo *RegInfo) {
	  if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	    MachineOperand *MO = findRegisterDefOperand(Reg, false, RegInfo);
	    if (MO)
	      return;
	  } else {
	    for (const MachineOperand &MO : operands()) {
	      if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
	          MO.getSubReg() == 0)
	        return;
	    }
	  }
	  addOperand(MachineOperand::CreateReg(Reg,
	                                       true /*IsDef*/,
	                                       true /*IsImp*/));
	}

	/// Mark every physical register def that overlaps none of \p UsedRegs as
	/// dead. If the instruction has a regmask operand, an implicit def is
	/// additionally ensured for each register in \p UsedRegs.
	void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
	                                         const TargetRegisterInfo &TRI) {
	  bool HasRegMask = false;
	  for (MachineOperand &MO : operands()) {
	    if (MO.isRegMask()) {
	      HasRegMask = true;
	      continue;
	    }
	    if (!MO.isReg() || !MO.isDef()) continue;
	    unsigned Reg = MO.getReg();
	    if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
	    // If there are no uses, including partial uses, the def is dead.
	    if (llvm::none_of(UsedRegs,
	                      [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
	      MO.setIsDead();
	  }

	  // This is a call with a register mask operand.
	  // Mask clobbers are always dead, so add defs for the non-dead defines.
	  if (HasRegMask)
	    for (unsigned UsedReg : UsedRegs)
	      addRegisterDefined(UsedReg, &TRI);
	}

	unsigned
	MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
	// Build up a buffer of hash code components.
	SmallVector<size_t, 8> HashComponents;
	HashComponents.reserve(MI->getNumOperands() + 1);
	HashComponents.push_back(MI->getOpcode());
	for (const MachineOperand &MO : MI->operands()) {
	if (MO.isReg() && MO.isDef() &&
	TargetRegisterInfo::isVirtualRegister(MO.getReg()))
	continue; // Skip virtual register defs.

	HashComponents.push_back(hash_value(MO));
	}
	return hash_combine_range(HashComponents.begin(), HashComponents.end());
	}

	/// Report \p Msg as an error for this instruction. When the instruction
	/// belongs to a function, the error goes through the module's context along
	/// with a source location cookie (0 if none is found); otherwise
	/// report_fatal_error() is called.
	void MachineInstr::emitError(StringRef Msg) const {
	  // Search the operands back to front for a metadata node whose first
	  // operand is a ConstantInt; its value is the location cookie.
	  unsigned LocCookie = 0;
	  for (unsigned i = getNumOperands(); i != 0; --i) {
	    const MachineOperand &MO = getOperand(i-1);
	    if (!MO.isMetadata())
	      continue;
	    const MDNode *LocMD = MO.getMetadata();
	    if (!LocMD || LocMD->getNumOperands() == 0)
	      continue;
	    const ConstantInt *CI =
	        mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0));
	    if (!CI)
	      continue;
	    LocCookie = CI->getZExtValue();
	    break;
	  }

	  const MachineBasicBlock *MBB = getParent();
	  const MachineFunction *MF = MBB ? MBB->getParent() : nullptr;
	  if (MF)
	    return MF->getMMI().getModule()->getContext().emitError(LocCookie, Msg);
	  report_fatal_error(Msg);
	}

	/// Build an instruction from \p MCID with debug-value operands: the register
	/// \p Reg, then either the immediate \p Offset (indirect form) or register 0
	/// (direct form), then the \p Variable and \p Expr metadata. The instruction
	/// is not inserted into any block by this overload.
	///
	/// \p Variable must be a DILocalVariable and \p Expr a valid DIExpression;
	/// \p Offset must be 0 for the direct form.
	MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
	                                  const MCInstrDesc &MCID, bool IsIndirect,
	                                  unsigned Reg, unsigned Offset,
	                                  const MDNode *Variable, const MDNode *Expr) {
	  assert(isa<DILocalVariable>(Variable) && "not a variable");
	  assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
	  assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");
	  if (IsIndirect)
	    return BuildMI(MF, DL, MCID)
	        .addReg(Reg, RegState::Debug)
	        .addImm(Offset)
	        .addMetadata(Variable)
	        .addMetadata(Expr);
	  else {
	    assert(Offset == 0 && "A direct address cannot have an offset.");
	    return BuildMI(MF, DL, MCID)
	        .addReg(Reg, RegState::Debug)
	        .addReg(0U, RegState::Debug)
	        .addMetadata(Variable)
	        .addMetadata(Expr);
	  }
	}

	/// Same as the MachineFunction form, but the new instruction is also
	/// inserted into \p BB before \p I.
	MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
	                                  MachineBasicBlock::iterator I,
	                                  const DebugLoc &DL, const MCInstrDesc &MCID,
	                                  bool IsIndirect, unsigned Reg,
	                                  unsigned Offset, const MDNode *Variable,
	                                  const MDNode *Expr) {
	  assert(isa<DILocalVariable>(Variable) && "not a variable");
	  assert(cast<DIExpression>(Expr)->isValid() && "not an expression");

	  // Create the instruction in the block's function, then place it.
	  MachineFunction &ParentMF = *BB.getParent();
	  MachineInstr *NewMI =
	      BuildMI(ParentMF, DL, MCID, IsIndirect, Reg, Offset, Variable, Expr);
	  BB.insert(I, NewMI);
	  return MachineInstrBuilder(ParentMF, NewMI);
	}

	/// Create, before \p I in \p BB, a copy of the debug value \p Orig whose
	/// location is the frame index \p FrameIndex. The variable, debug location
	/// and descriptor are taken from \p Orig.
	MachineInstr *llvm::buildDbgValueForSpill(MachineBasicBlock &BB,
	                                          MachineBasicBlock::iterator I,
	                                          const MachineInstr &Orig,
	                                          int FrameIndex) {
	  const MDNode *Var = Orig.getDebugVariable();
	  const auto *Expr = cast_or_null<DIExpression>(Orig.getDebugExpression());
	  const bool IsIndirect = Orig.isIndirectDebugValue();

	  // Only an indirect debug value carries an offset in operand 1.
	  uint64_t Offset = 0;
	  if (IsIndirect)
	    Offset = Orig.getOperand(1).getImm();

	  DebugLoc DL = Orig.getDebugLoc();
	  assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
	         "Expected inlined-at fields to agree");

	  // A DBG_VALUE that already described a memory location needs an extra
	  // DW_OP_deref. For a register location, switching to a memory/indirect
	  // location is enough.
	  if (IsIndirect)
	    Expr = DIExpression::prepend(Expr, DIExpression::WithDeref);

	  return BuildMI(BB, I, DL, Orig.getDesc())
	      .addFrameIndex(FrameIndex)
	      .addImm(Offset)
	      .addMetadata(Var)
	      .addMetadata(Expr);
	}
	Index: head/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp (revision 322319)
	+++ head/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp (revision 322320)
	@@ -1,1486 +1,1427 @@
	//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file This implements the ScheduleDAGInstrs class, which implements
	/// re-scheduling of MachineInstrs.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/CodeGen/ScheduleDAGInstrs.h"
	#include "llvm/ADT/IntEqClasses.h"
	#include "llvm/ADT/MapVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/SparseSet.h"
	#include "llvm/ADT/iterator_range.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/LiveIntervalAnalysis.h"
	#include "llvm/CodeGen/LivePhysRegs.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBundle.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/PseudoSourceValue.h"
	#include "llvm/CodeGen/RegisterPressure.h"
	#include "llvm/CodeGen/ScheduleDAG.h"
	#include "llvm/CodeGen/ScheduleDFS.h"
	#include "llvm/CodeGen/SlotIndexes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/LaneBitmask.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/Format.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <algorithm>
	#include <cassert>
	#include <iterator>
	#include <string>
	#include <utility>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "machine-scheduler"

	// Command-line override for the use of alias analysis during DAG
	// construction. When the option is not given, buildSchedGraph() falls back to
	// the subtarget's useAA() setting.
	static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
	cl::ZeroOrMore, cl::init(false),
	cl::desc("Enable use of AA during MI DAG construction"));

	// Passed to MachineInstr::mayAlias() when chain dependencies are added.
	static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
	cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));

	// Note: the two options below might be used in tuning compile time vs
	// output quality. Setting HugeRegion so large that it will never be
	// reached means best-effort, but may be slow.

	// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
	// together hold this many SUs, a reduction of maps will be done.
	static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
	cl::init(1000), cl::desc("The limit to use while constructing the DAG "
	"prior to scheduling, at which point a trade-off "
	"is made to avoid excessive compile time."));

	// Number of nodes removed per reduction; read through getReductionSize(),
	// which substitutes HugeRegion / 2 when the option was not given.
	static cl::opt<unsigned> ReductionSize(
	"dag-maps-reduction-size", cl::Hidden,
	cl::desc("A huge scheduling region will have maps reduced by this many "
	"nodes at a time. Defaults to HugeRegion / 2."));

	/// Returns how many nodes a map reduction should remove: the value of
	/// -dag-maps-reduction-size when given on the command line, otherwise half of
	/// HugeRegion.
	static unsigned getReductionSize() {
	  const bool UserProvided = ReductionSize.getNumOccurrences() != 0;
	  if (UserProvided)
	    return ReductionSize;
	  // No explicit request: reduce a huge region by half of its elements.
	  return HugeRegion / 2;
	}

	/// Prints the node numbers of the SUnits in \p L to dbgs() as a brace-enclosed,
	/// comma-separated list. The body is compiled out unless asserts are enabled
	/// or LLVM_ENABLE_DUMP is defined.
	static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
	#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
	  dbgs() << "{ ";
	  for (const SUnit *su : L) {
	    dbgs() << "SU(" << su->NodeNum << ")";
	    // No separator after the entry that equals the last list element.
	    if (su != L.back())
	      dbgs() << ", ";
	  }
	  dbgs() << "}\n";
	#endif
	}

	/// Constructs the DAG builder for function \p mf. \p mli and
	/// \p RemoveKillFlags are stored as-is; the frame info is taken from \p mf.
	/// UnknownValue is initialised to an undef value of void type; buildSchedGraph()
	/// uses it as the map key for memory accesses with no known underlying object.
	ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
	const MachineLoopInfo *mli,
	bool RemoveKillFlags)
	: ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
	RemoveKillFlags(RemoveKillFlags),
	UnknownValue(UndefValue::get(
	Type::getVoidTy(mf.getFunction()->getContext()))) {
	DbgValues.clear();

	// Initialise the scheduling model from the function's subtarget.
	const TargetSubtargetInfo &ST = mf.getSubtarget();
	SchedModel.init(ST.getSchedModel(), &ST, TII);
	}

	-/// This is the function that does the work of looking through basic
	-/// ptrtoint+arithmetic+inttoptr sequences.
	-static const Value *getUnderlyingObjectFromInt(const Value *V) {
	- do {
	- if (const Operator *U = dyn_cast<Operator>(V)) {
	- // If we find a ptrtoint, we can transfer control back to the
	- // regular getUnderlyingObjectFromInt.
	- if (U->getOpcode() == Instruction::PtrToInt)
	- return U->getOperand(0);
	- // If we find an add of a constant, a multiplied value, or a phi, it's
	- // likely that the other operand will lead us to the base
	- // object. We don't have to worry about the case where the
	- // object address is somehow being computed by the multiply,
	- // because our callers only care when the result is an
	- // identifiable object.
	- if (U->getOpcode() != Instruction::Add ||
	- (!isa<ConstantInt>(U->getOperand(1)) &&
	- Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
	- !isa<PHINode>(U->getOperand(1))))
	- return V;
	- V = U->getOperand(0);
	- } else {
	- return V;
	- }
	- assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
	- } while (true);
	-}
	-
	-/// This is a wrapper around GetUnderlyingObjects and adds support for basic
	-/// ptrtoint+arithmetic+inttoptr sequences.
	-static void getUnderlyingObjects(const Value *V,
	- SmallVectorImpl<Value *> &Objects,
	- const DataLayout &DL) {
	- SmallPtrSet<const Value *, 16> Visited;
	- SmallVector<const Value *, 4> Working(1, V);
	- do {
	- V = Working.pop_back_val();
	-
	- SmallVector<Value *, 4> Objs;
	- GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
	-
	- for (Value *V : Objs) {
	- if (!Visited.insert(V).second)
	- continue;
	- if (Operator::getOpcode(V) == Instruction::IntToPtr) {
	- const Value *O =
	- getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
	- if (O->getType()->isPointerTy()) {
	- Working.push_back(O);
	- continue;
	- }
	- }
	- Objects.push_back(const_cast<Value *>(V));
	- }
	- } while (!Working.empty());
	-}
	-
	/// Collects into \p Objects the underlying objects (PseudoSourceValues or IR
	/// Values, each paired with a may-alias flag) referenced by the memory operands
	/// of \p MI. If any memory operand cannot be handled -- it is volatile, it is a
	/// PseudoSourceValue rejected by the checks below, or it has neither a pseudo
	/// value nor an IR value -- \p Objects is cleared so the caller sees an empty
	/// vector.
	static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
	const MachineFrameInfo &MFI,
	UnderlyingObjectsVector &Objects,
	const DataLayout &DL) {
	// Returns false as soon as one memory operand is not analyzable; entries
	// pushed to Objects before that point are discarded by the caller below.
	auto allMMOsOkay = [&]() {
	for (const MachineMemOperand *MMO : MI->memoperands()) {
	if (MMO->isVolatile())
	return false;

	if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
	// Function that contain tail calls don't have unique PseudoSourceValue
	// objects. Two PseudoSourceValues might refer to the same or
	// overlapping locations. The client code calling this function assumes
	// this is not the case. So return a conservative answer of no known
	// object.
	if (MFI.hasTailCall())
	return false;

	// For now, ignore PseudoSourceValues which may alias LLVM IR values
	// because the code that uses this function has no way to cope with
	// such aliases.
	if (PSV->isAliased(&MFI))
	return false;

	bool MayAlias = PSV->mayAlias(&MFI);
	Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
	} else if (const Value *V = MMO->getValue()) {
	SmallVector<Value *, 4> Objs;
	- getUnderlyingObjects(V, Objs, DL);
	+ getUnderlyingObjectsForCodeGen(V, Objs, DL);

	// IR values are always recorded with the may-alias flag set.
	for (Value *V : Objs) {
	- if (!isIdentifiedObject(V))
	- return false;
	-
	+ assert(isIdentifiedObject(V));
	Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
	}
	} else
	return false;
	}
	return true;
	};

	if (!allMMOsOkay())
	Objects.clear();
	}

	/// Records \p bb as the block currently being processed.
	void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
	  this->BB = bb;
	}

	void ScheduleDAGInstrs::finishBlock() {
	// Subclasses should no longer refer to the old block.
	BB = nullptr;
	}

	/// Records the scheduling region [\p begin, \p end) and its instruction count
	/// \p regioninstrs. \p bb must be the block passed to startBlock().
	void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
	                                    MachineBasicBlock::iterator begin,
	                                    MachineBasicBlock::iterator end,
	                                    unsigned regioninstrs) {
	  assert(bb == BB && "startBlock should set BB");
	  NumRegionInstrs = regioninstrs;
	  RegionEnd = end;
	  RegionBegin = begin;
	}

	/// Called when a scheduling region is left; intentionally empty here.
	void ScheduleDAGInstrs::exitRegion() {
	  // Nothing to do.
	}

	void ScheduleDAGInstrs::addSchedBarrierDeps() {
	MachineInstr ExitMI = RegionEnd != BB->end() ? &RegionEnd : nullptr;
	ExitSU.setInstr(ExitMI);
	// Add dependencies on the defs and uses of the instruction.
	if (ExitMI) {
	for (const MachineOperand &MO : ExitMI->operands()) {
	if (!MO.isReg() \|\| MO.isDef()) continue;
	unsigned Reg = MO.getReg();
	if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
	} else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
	addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
	}
	}
	}
	if (!ExitMI \|\| (!ExitMI->isCall() && !ExitMI->isBarrier())) {
	// For others, e.g. fallthrough, conditional branch, assume the exit
	// uses all the registers that are livein to the successor blocks.
	for (const MachineBasicBlock *Succ : BB->successors()) {
	for (const auto &LI : Succ->liveins()) {
	if (!Uses.contains(LI.PhysReg))
	Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
	}
	}
	}
	}

	/// The operand at \p OperIdx of SU's instruction defines a physical register.
	/// For every recorded use of that register or one of its aliases, adds a
	/// dependence edge from \p SU to the using SUnit.
	void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
	  MachineInstr *DefMI = SU->getInstr();
	  const MachineOperand &DefMO = DefMI->getOperand(OperIdx);
	  assert(DefMO.isDef() && "expect physreg def");

	  // The subtarget gets a chance to adjust every edge created below.
	  const TargetSubtargetInfo &Subtarget = MF.getSubtarget();

	  for (MCRegAliasIterator AliasIt(DefMO.getReg(), TRI, true);
	       AliasIt.isValid(); ++AliasIt) {
	    if (!Uses.contains(*AliasIt))
	      continue;
	    for (Reg2SUnitsMap::iterator UseIt = Uses.find(*AliasIt);
	         UseIt != Uses.end(); ++UseIt) {
	      SUnit *UserSU = UseIt->SU;
	      if (UserSU == SU)
	        continue;

	      // A negative operand index (as used for the ExitSU entries) yields an
	      // artificial edge; a real operand yields a data edge on the alias.
	      const int UserOpIdx = UseIt->OpIdx;
	      const bool IsRealUse = UserOpIdx >= 0;

	      // Set the hasPhysRegDefs only for physreg defs that have a use within
	      // the scheduling region.
	      if (IsRealUse)
	        SU->hasPhysRegDefs = true;

	      MachineInstr *UserMI = IsRealUse ? UserSU->getInstr() : nullptr;
	      SDep Edge = IsRealUse ? SDep(SU, SDep::Data, *AliasIt)
	                            : SDep(SU, SDep::Artificial);

	      // Latency comes from operand def/use information first, then the
	      // target may perform its own adjustments.
	      Edge.setLatency(SchedModel.computeOperandLatency(DefMI, OperIdx, UserMI,
	                                                       UserOpIdx));
	      Subtarget.adjustSchedDependency(SU, UserSU, Edge);
	      UserSU->addPred(Edge);
	    }
	  }
	}

	/// \brief Adds register dependencies (data, anti, and output) from this SUnit
	/// to following instructions in the same scheduling region that depend on the
	/// physical register referenced at OperIdx.
	///
	/// Afterwards the operand is recorded in the Uses list (for a use) or in the
	/// Defs list (for a def) so that instructions visited later can find it.
	void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
	  MachineInstr *MI = SU->getInstr();
	  MachineOperand &MO = MI->getOperand(OperIdx);
	  unsigned Reg = MO.getReg();
	  // We do not need to track any dependencies for constant registers.
	  if (MRI.isConstantPhysReg(Reg))
	    return;

	  // Optionally add output and anti dependencies. For anti
	  // dependencies we use a latency of 0 because for a multi-issue
	  // target we want to allow the defining instruction to issue
	  // in the same cycle as the using instruction.
	  // TODO: Using a latency of 1 here for output dependencies assumes
	  // there's no cost for reusing registers.
	  SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
	  for (MCRegAliasIterator Alias(Reg, TRI, true); Alias.isValid(); ++Alias) {
	    if (!Defs.contains(*Alias))
	      continue;
	    for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
	      SUnit *DefSU = I->SU;
	      if (DefSU == &ExitSU)
	        continue;
	      // Skip the output edge only when both this def and the later def of
	      // the alias are dead.
	      if (DefSU != SU &&
	          (Kind != SDep::Output || !MO.isDead() ||
	           !DefSU->getInstr()->registerDefIsDead(*Alias))) {
	        if (Kind == SDep::Anti)
	          DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
	        else {
	          SDep Dep(SU, Kind, /*Reg=*/*Alias);
	          Dep.setLatency(
	              SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
	          DefSU->addPred(Dep);
	        }
	      }
	    }
	  }

	  if (!MO.isDef()) {
	    SU->hasPhysRegUses = true;
	    // Either insert a new Reg2SUnits entry with an empty SUnits list, or
	    // retrieve the existing SUnits list for this register's uses.
	    // Push this SUnit on the use list.
	    Uses.insert(PhysRegSUOper(SU, OperIdx, Reg));
	    if (RemoveKillFlags)
	      MO.setIsKill(false);
	  } else {
	    addPhysRegDataDeps(SU, OperIdx);

	    // clear this register's use list
	    if (Uses.contains(Reg))
	      Uses.eraseAll(Reg);

	    if (!MO.isDead()) {
	      Defs.eraseAll(Reg);
	    } else if (SU->isCall) {
	      // Calls will not be reordered because of chain dependencies (see
	      // below). Since call operands are dead, calls may continue to be added
	      // to the DefList making dependence checking quadratic in the size of
	      // the block. Instead, we leave only one call at the back of the
	      // DefList.
	      Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
	      Reg2SUnitsMap::iterator B = P.first;
	      Reg2SUnitsMap::iterator I = P.second;
	      // Walk the range backwards, erasing trailing call entries until a
	      // non-call entry or the start of the range is reached.
	      for (bool isBegin = I == B; !isBegin; /* empty */) {
	        isBegin = (--I) == B;
	        if (!I->SU->isCall)
	          break;
	        I = Defs.erase(I);
	      }
	    }

	    // Defs are pushed in the order they are visited and never reordered.
	    Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
	  }
	}

	/// Returns the lane mask touched by register operand \p MO: all lanes when the
	/// register class has no disjunct subregisters, the class lane mask when the
	/// operand has no subregister index, and the subregister's lane mask otherwise.
	LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
	{
	  const unsigned Reg = MO.getReg();
	  const TargetRegisterClass &RegClass = *MRI.getRegClass(Reg);

	  // Lane masks are only worth tracking with interesting subregisters.
	  if (RegClass.HasDisjunctSubRegs) {
	    const unsigned SubIdx = MO.getSubReg();
	    if (SubIdx != 0)
	      return TRI->getSubRegIndexLaneMask(SubIdx);
	    return RegClass.getLaneMask();
	  }
	  return LaneBitmask::getAll();
	}

	/// Adds register output and data dependencies from this SUnit to instructions
	/// that occur later in the same scheduling region if they read from or write to
	/// the virtual register defined at OperIdx.
	///
	/// TODO: Hoist loop induction variable increments. This has to be
	/// reevaluated. Generally, IV scheduling should be done before coalescing.
	void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
	  MachineInstr *MI = SU->getInstr();
	  MachineOperand &MO = MI->getOperand(OperIdx);
	  unsigned Reg = MO.getReg();

	  LaneBitmask DefLaneMask;
	  LaneBitmask KillLaneMask;
	  if (TrackLaneMasks) {
	    bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
	    DefLaneMask = getLaneMaskForMO(MO);
	    // If we have a <read-undef> flag, none of the lane values comes from an
	    // earlier instruction.
	    KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;

	    // Clear undef flag, we'll re-add it later once we know which subregister
	    // Def is first.
	    MO.setIsUndef(false);
	  } else {
	    DefLaneMask = LaneBitmask::getAll();
	    KillLaneMask = LaneBitmask::getAll();
	  }

	  if (MO.isDead()) {
	    assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
	           "Dead defs should have no uses");
	  } else {
	    // Add data dependence to all uses we found so far.
	    const TargetSubtargetInfo &ST = MF.getSubtarget();
	    for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
	         E = CurrentVRegUses.end(); I != E; /*empty*/) {
	      LaneBitmask LaneMask = I->LaneMask;
	      // Ignore uses of other lanes.
	      if ((LaneMask & KillLaneMask).none()) {
	        ++I;
	        continue;
	      }

	      if ((LaneMask & DefLaneMask).any()) {
	        SUnit *UseSU = I->SU;
	        MachineInstr *Use = UseSU->getInstr();
	        SDep Dep(SU, SDep::Data, Reg);
	        Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
	                                                        I->OperandIndex));
	        ST.adjustSchedDependency(SU, UseSU, Dep);
	        UseSU->addPred(Dep);
	      }

	      LaneMask &= ~KillLaneMask;
	      // If we found a Def for all lanes of this use, remove it from the list.
	      if (LaneMask.any()) {
	        I->LaneMask = LaneMask;
	        ++I;
	      } else
	        I = CurrentVRegUses.erase(I);
	    }
	  }

	  // Shortcut: Singly defined vregs do not have output/anti dependencies.
	  if (MRI.hasOneDef(Reg))
	    return;

	  // Add output dependence to the next nearest defs of this vreg.
	  //
	  // Unless this definition is dead, the output dependence should be
	  // transitively redundant with antidependencies from this definition's
	  // uses. We're conservative for now until we have a way to guarantee the uses
	  // are not eliminated sometime during scheduling. The output dependence edge
	  // is also useful if output latency exceeds def-use latency.
	  LaneBitmask LaneMask = DefLaneMask;
	  for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
	                                     CurrentVRegDefs.end())) {
	    // Ignore defs for other lanes.
	    if ((V2SU.LaneMask & LaneMask).none())
	      continue;
	    // Add an output dependence.
	    SUnit *DefSU = V2SU.SU;
	    // Ignore additional defs of the same lanes in one instruction. This can
	    // happen because lanemasks are shared for targets with too many
	    // subregisters. We also use some representation tricks/hacks where we
	    // add super-register defs/uses, to imply that although we only access parts
	    // of the reg we care about the full one.
	    if (DefSU == SU)
	      continue;
	    SDep Dep(SU, SDep::Output, Reg);
	    Dep.setLatency(
	        SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
	    DefSU->addPred(Dep);

	    // Update current definition. This can get tricky if the def was about a
	    // bigger lanemask before. We then have to shrink it and create a new
	    // VReg2SUnit for the non-overlapping part.
	    LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
	    LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
	    V2SU.SU = SU;
	    V2SU.LaneMask = OverlapMask;
	    if (NonOverlapMask.any())
	      CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
	  }
	  // If there was no CurrentVRegDefs entry for some lanes yet, create one.
	  if (LaneMask.any())
	    CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
	}

	/// \brief Records the virtual-register use at \p OperIdx of \p SU so that a
	/// data dependence can be added once the defining instruction is visited, and
	/// adds anti-dependence edges from \p SU to the already-seen (later in program
	/// order) defs of the same register whose lanes overlap the use.
	///
	/// TODO: Handle ExitSU "uses" properly.
	void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
	  const MachineOperand &UseMO = SU->getInstr()->getOperand(OperIdx);
	  const unsigned Reg = UseMO.getReg();

	  // Without lane tracking every use is treated as touching all lanes.
	  LaneBitmask UseLanes = LaneBitmask::getAll();
	  if (TrackLaneMasks)
	    UseLanes = getLaneMaskForMO(UseMO);

	  // Remember the use. Data dependencies will be added when we find the def.
	  CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, UseLanes, OperIdx, SU));

	  // Add antidependences to the following defs of the vreg.
	  for (VReg2SUnit &DefEntry : make_range(CurrentVRegDefs.find(Reg),
	                                         CurrentVRegDefs.end())) {
	    const bool LanesOverlap = (DefEntry.LaneMask & UseLanes).any();
	    // Defs of unrelated lanes and defs by this very SUnit get no edge.
	    if (LanesOverlap && DefEntry.SU != SU)
	      DefEntry.SU->addPred(SDep(SU, SDep::Anti, Reg));
	  }
	}

	/// Returns true if MI is an instruction we are unable to reason about
	/// (like a call or something with unmodeled side effects).
	static inline bool isGlobalMemoryObject(AliasAnalysis AA, MachineInstr MI) {
	return MI->isCall() \|\| MI->hasUnmodeledSideEffects() \|\|
	(MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
	}

	void ScheduleDAGInstrs::addChainDependency (SUnit SUa, SUnit SUb,
	unsigned Latency) {
	if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
	SDep Dep(SUa, SDep::MayAliasMem);
	Dep.setLatency(Latency);
	SUb->addPred(Dep);
	}
	}

	/// \brief Creates an SUnit for each real instruction, numbered in top-down
	/// topological order. The instruction order A < B, implies that no edge exists
	/// from B to A.
	///
	/// Map each real instruction to its SUnit.
	///
	/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
	/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
	/// instead of pointers.
	///
	/// MachineScheduler relies on initSUnits numbering the nodes by their order in
	/// the original instruction list.
	void ScheduleDAGInstrs::initSUnits() {
	// We'll be allocating one SUnit for each real instruction in the region,
	// which is contained within a basic block.
	SUnits.reserve(NumRegionInstrs);

	for (MachineInstr &MI : make_range(RegionBegin, RegionEnd)) {
	if (MI.isDebugValue())
	continue;

	SUnit *SU = newSUnit(&MI);
	MISUnitMap[&MI] = SU;

	SU->isCall = MI.isCall();
	SU->isCommutable = MI.isCommutable();

	// Assign the Latency field of SU using target-provided information.
	SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());

	// If this SUnit uses a reserved or unbuffered resource, mark it as such.
	//
	// Reserved resources block an instruction from issuing and stall the
	// entire pipeline. These are identified by BufferSize=0.
	//
	// Unbuffered resources prevent execution of subsequent instructions that
	// require the same resources. This is used for in-order execution pipelines
	// within an out-of-order core. These are identified by BufferSize=1.
	if (SchedModel.hasInstrSchedModel()) {
	const MCSchedClassDesc *SC = getSchedClass(SU);
	for (const MCWriteProcResEntry &PRE :
	make_range(SchedModel.getWriteProcResBegin(SC),
	SchedModel.getWriteProcResEnd(SC))) {
	switch (SchedModel.getProcResource(PRE.ProcResourceIdx)->BufferSize) {
	case 0:
	SU->hasReservedResource = true;
	break;
	case 1:
	SU->isUnbuffered = true;
	break;
	default:
	break;
	}
	}
	}
	}
	}

	/// A MapVector from ValueType keys to SULists that additionally keeps a running
	/// count of the SUs stored across all of its lists.
	class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
	  /// Total number of SUs currently held in the map.
	  unsigned NumNodes = 0;

	  /// 1 for loads, 0 for stores. (see comment in SUList)
	  unsigned TrueMemOrderLatency;

	public:
	  Value2SUsMap(unsigned lat = 0) : TrueMemOrderLatency(lat) {}

	  /// Deliberately unusable: going through operator[] and push_back() would
	  /// bypass the NumNodes bookkeeping done by insert().
	  ValueType &operator[](const SUList &Key) {
	    llvm_unreachable("Don't use. Use insert() instead.");
	  }

	  /// Appends SU to the list mapped from V (creating the list if needed) and
	  /// bumps the node count.
	  inline void insert(SUnit *SU, ValueType V) {
	    SUList &List = MapVector::operator[](V);
	    List.push_back(SU);
	    ++NumNodes;
	  }

	  /// Empties the list mapped from V, if there is one, keeping the key itself.
	  inline void clearList(ValueType V) {
	    iterator Itr = find(V);
	    if (Itr == end())
	      return;

	    SUList &List = Itr->second;
	    assert(NumNodes >= List.size());
	    NumNodes -= List.size();
	    List.clear();
	  }

	  /// Removes every key and list and resets the node count.
	  void clear() {
	    MapVector<ValueType, SUList>::clear();
	    NumNodes = 0;
	  }

	  /// Number of SUs in the map (not the number of keys).
	  inline unsigned size() const { return NumNodes; }

	  /// Recounts the SUs from scratch, e.g. after lists were edited directly.
	  void reComputeSize() {
	    NumNodes = 0;
	    for (auto &Entry : *this)
	      NumNodes += Entry.second.size();
	  }

	  inline unsigned getTrueMemOrderLatency() const {
	    return TrueMemOrderLatency;
	  }

	  void dump();
	};

	/// Adds chain dependencies between \p SU and the SUs of every list in
	/// \p Val2SUsMap, using the map's memory-order latency.
	void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
	                                             Value2SUsMap &Val2SUsMap) {
	  for (auto &Entry : Val2SUsMap) {
	    SUList &List = Entry.second;
	    addChainDependencies(SU, List, Val2SUsMap.getTrueMemOrderLatency());
	  }
	}

	/// Adds chain dependencies between \p SU and the SUs in the list mapped from
	/// \p V in \p Val2SUsMap. Does nothing when \p V has no entry.
	void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
	                                             Value2SUsMap &Val2SUsMap,
	                                             ValueType V) {
	  Value2SUsMap::iterator Pos = Val2SUsMap.find(V);
	  if (Pos == Val2SUsMap.end())
	    return;

	  addChainDependencies(SU, Pos->second,
	                       Val2SUsMap.getTrueMemOrderLatency());
	}

	/// Adds a barrier edge from BarrierChain to every SU stored in \p map, then
	/// empties \p map. BarrierChain must be set.
	void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
	  assert(BarrierChain != nullptr);

	  for (auto &Entry : map)
	    for (SUnit *Node : Entry.second)
	      Node->addPredBarrier(BarrierChain);

	  map.clear();
	}

	/// For each list in \p map, adds a barrier edge from BarrierChain to the
	/// leading SUs whose NodeNum is greater than BarrierChain's, removes those SUs
	/// (and BarrierChain itself if it follows them) from the list, and finally
	/// drops empty lists and recomputes the map's node count.
	void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
	  assert(BarrierChain != nullptr);

	  // Go through all lists of SUs.
	  for (auto &Entry : map) {
	    SUList &List = Entry.second;
	    SUList::iterator Cut = List.begin();
	    const SUList::iterator ListEnd = List.end();

	    // Advance until BarrierChain or any instruction above it is reached.
	    while (Cut != ListEnd && (*Cut)->NodeNum > BarrierChain->NodeNum) {
	      (*Cut)->addPredBarrier(BarrierChain);
	      ++Cut;
	    }

	    // Remove also the BarrierChain from list if present.
	    if (Cut != ListEnd && *Cut == BarrierChain)
	      ++Cut;

	    // Remove all SUs that are now successors of BarrierChain.
	    if (Cut != List.begin())
	      List.erase(List.begin(), Cut);
	  }

	  // Remove all entries with empty su lists.
	  map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
	    return (mapEntry.second.empty()); });

	  // Recompute the size of the map (NumNodes).
	  map.reComputeSize();
	}

	/// Builds the scheduling DAG for the current region.
	///
	/// Creates the SUnits, then walks the region's instructions from bottom to
	/// top adding register dependencies (data, anti, output) and memory (chain)
	/// dependencies. DBG_VALUE instructions get no SUnit; they are recorded in
	/// DbgValues / FirstDbgValue instead.
	///
	/// \param AA             Alias analysis; only used for dependencies when
	///                       enabled by -enable-aa-sched-mi or the subtarget.
	/// \param RPTracker      If non-null, receded over each visited instruction.
	/// \param PDiffs         If non-null, initialised and filled per instruction
	///                       (only inside the RPTracker path).
	/// \param LIS            Dereferenced when both RPTracker and TrackLaneMasks
	///                       are set.
	/// \param TrackLaneMasks Whether vreg dependencies are tracked per lane.
	void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
	                                        RegPressureTracker *RPTracker,
	                                        PressureDiffs *PDiffs,
	                                        LiveIntervals *LIS,
	                                        bool TrackLaneMasks) {
	  const TargetSubtargetInfo &ST = MF.getSubtarget();
	  bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
	                                                       : ST.useAA();
	  AAForDep = UseAA ? AA : nullptr;

	  BarrierChain = nullptr;

	  this->TrackLaneMasks = TrackLaneMasks;
	  MISUnitMap.clear();
	  ScheduleDAG::clearDAG();

	  // Create an SUnit for each real instruction.
	  initSUnits();

	  if (PDiffs)
	    PDiffs->init(SUnits.size());

	  // We build scheduling units by walking a block's instruction list
	  // from bottom to top.

	  // Each MIs' memory operand(s) is analyzed to a list of underlying
	  // objects. The SU is then inserted in the SUList(s) mapped from the
	  // Value(s). Each Value thus gets mapped to lists of SUs depending
	  // on it, stores and loads kept separately. Two SUs are trivially
	  // non-aliasing if they both depend on only identified Values and do
	  // not share any common Value.
	  Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);

	  // Certain memory accesses are known to not alias any SU in Stores
	  // or Loads, and have therefore their own 'NonAlias'
	  // domain. E.g. spill / reload instructions never alias LLVM I/R
	  // Values. It would be nice to assume that this type of memory
	  // accesses always have a proper memory operand modelling, and are
	  // therefore never unanalyzable, but this is conservatively not
	  // done.
	  Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);

	  // Remove any stale debug info; sometimes BuildSchedGraph is called again
	  // without emitting the info from the previous call.
	  DbgValues.clear();
	  FirstDbgValue = nullptr;

	  assert(Defs.empty() && Uses.empty() &&
	         "Only BuildGraph should update Defs/Uses");
	  Defs.setUniverse(TRI->getNumRegs());
	  Uses.setUniverse(TRI->getNumRegs());

	  assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
	  assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
	  unsigned NumVirtRegs = MRI.getNumVirtRegs();
	  CurrentVRegDefs.setUniverse(NumVirtRegs);
	  CurrentVRegUses.setUniverse(NumVirtRegs);

	  // Model data dependencies between instructions being scheduled and the
	  // ExitSU.
	  addSchedBarrierDeps();

	  // Walk the list of instructions, from bottom moving up.
	  MachineInstr *DbgMI = nullptr;
	  for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
	       MII != MIE; --MII) {
	    MachineInstr &MI = *std::prev(MII);
	    // Pair a DBG_VALUE seen on the previous iteration with the instruction
	    // that precedes it.
	    if (DbgMI) {
	      DbgValues.push_back(std::make_pair(DbgMI, &MI));
	      DbgMI = nullptr;
	    }

	    if (MI.isDebugValue()) {
	      DbgMI = &MI;
	      continue;
	    }
	    SUnit *SU = MISUnitMap[&MI];
	    assert(SU && "No SUnit mapped to this MI");

	    if (RPTracker) {
	      RegisterOperands RegOpers;
	      RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
	      if (TrackLaneMasks) {
	        SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
	        RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
	      }
	      if (PDiffs != nullptr)
	        PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);

	      RPTracker->recedeSkipDebugValues();
	      assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
	      RPTracker->recede(RegOpers);
	    }

	    assert(
	        (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
	        "Cannot schedule terminators or labels!");

	    // Add register-based dependencies (data, anti, and output).
	    // For some instructions (calls, returns, inline-asm, etc.) there can
	    // be explicit uses and implicit defs, in which case the use will appear
	    // on the operand list before the def. Do two passes over the operand
	    // list to make sure that defs are processed before any uses.
	    bool HasVRegDef = false;
	    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
	      const MachineOperand &MO = MI.getOperand(j);
	      if (!MO.isReg() || !MO.isDef())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	        addPhysRegDeps(SU, j);
	      } else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
	        HasVRegDef = true;
	        addVRegDefDeps(SU, j);
	      }
	    }
	    // Now process all uses.
	    for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
	      const MachineOperand &MO = MI.getOperand(j);
	      // Only look at use operands.
	      // We do not need to check for MO.readsReg() here because subsequent
	      // subregister defs will get output dependence edges and need no
	      // additional use dependencies.
	      if (!MO.isReg() || !MO.isUse())
	        continue;
	      unsigned Reg = MO.getReg();
	      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
	        addPhysRegDeps(SU, j);
	      } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
	        addVRegUseDeps(SU, j);
	      }
	    }

	    // If we haven't seen any uses in this scheduling region, create a
	    // dependence edge to ExitSU to model the live-out latency. This is required
	    // for vreg defs with no in-region use, and prefetches with no vreg def.
	    //
	    // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
	    // check currently relies on being called before adding chain deps.
	    if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
	      SDep Dep(SU, SDep::Artificial);
	      Dep.setLatency(SU->Latency - 1);
	      ExitSU.addPred(Dep);
	    }

	    // Add memory dependencies (Note: isStoreToStackSlot and
	    // isLoadFromStackSLot are not usable after stack slots are lowered to
	    // actual addresses).

	    // This is a barrier event that acts as a pivotal node in the DAG.
	    if (isGlobalMemoryObject(AA, &MI)) {

	      // Become the barrier chain.
	      if (BarrierChain)
	        BarrierChain->addPredBarrier(SU);
	      BarrierChain = SU;

	      DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
	            << BarrierChain->NodeNum << ").\n";);

	      // Add dependencies against everything below it and clear maps.
	      addBarrierChain(Stores);
	      addBarrierChain(Loads);
	      addBarrierChain(NonAliasStores);
	      addBarrierChain(NonAliasLoads);

	      continue;
	    }

	    // If it's not a store or a variant load, we're done.
	    if (!MI.mayStore() &&
	        !(MI.mayLoad() && !MI.isDereferenceableInvariantLoad(AA)))
	      continue;

	    // Always add dependency edge to BarrierChain if present.
	    if (BarrierChain)
	      BarrierChain->addPredBarrier(SU);

	    // Find the underlying objects for MI. The Objs vector is either
	    // empty, or filled with the Values of memory locations which this
	    // SU depends on. An empty vector means the memory location is
	    // unknown, and may alias anything.
	    UnderlyingObjectsVector Objs;
	    getUnderlyingObjectsForInstr(&MI, MFI, Objs, MF.getDataLayout());

	    if (MI.mayStore()) {
	      if (Objs.empty()) {
	        // An unknown store depends on all stores and loads.
	        addChainDependencies(SU, Stores);
	        addChainDependencies(SU, NonAliasStores);
	        addChainDependencies(SU, Loads);
	        addChainDependencies(SU, NonAliasLoads);

	        // Map this store to 'UnknownValue'.
	        Stores.insert(SU, UnknownValue);
	      } else {
	        // Add precise dependencies against all previously seen memory
	        // accesses mapped to the same Value(s).
	        for (const UnderlyingObject &UnderlObj : Objs) {
	          ValueType V = UnderlObj.getValue();
	          bool ThisMayAlias = UnderlObj.mayAlias();

	          // Add dependencies to previous stores and loads mapped to V.
	          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
	          addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
	        }
	        // Update the store map after all chains have been added to avoid adding
	        // self-loop edge if multiple underlying objects are present.
	        for (const UnderlyingObject &UnderlObj : Objs) {
	          ValueType V = UnderlObj.getValue();
	          bool ThisMayAlias = UnderlObj.mayAlias();

	          // Map this store to V.
	          (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
	        }
	        // The store may have dependencies to unanalyzable loads and
	        // stores.
	        addChainDependencies(SU, Loads, UnknownValue);
	        addChainDependencies(SU, Stores, UnknownValue);
	      }
	    } else { // SU is a load.
	      if (Objs.empty()) {
	        // An unknown load depends on all stores.
	        addChainDependencies(SU, Stores);
	        addChainDependencies(SU, NonAliasStores);

	        Loads.insert(SU, UnknownValue);
	      } else {
	        for (const UnderlyingObject &UnderlObj : Objs) {
	          ValueType V = UnderlObj.getValue();
	          bool ThisMayAlias = UnderlObj.mayAlias();

	          // Add precise dependencies against all previously seen stores
	          // mapping to the same Value(s).
	          addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);

	          // Map this load to V.
	          (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
	        }
	        // The load may have dependencies to unanalyzable stores.
	        addChainDependencies(SU, Stores, UnknownValue);
	      }
	    }

	    // Reduce maps if they grow huge.
	    if (Stores.size() + Loads.size() >= HugeRegion) {
	      DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
	      reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
	    }
	    if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
	      DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
	      reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
	    }
	  }

	  // A DBG_VALUE at the very top of the region has no preceding instruction
	  // to be paired with.
	  if (DbgMI)
	    FirstDbgValue = DbgMI;

	  Defs.clear();
	  Uses.clear();
	  CurrentVRegDefs.clear();
	  CurrentVRegUses.clear();
	}

	/// Stream-insertion overload for a PseudoSourceValue pointer: writes the
	/// value's custom textual form via printCustom() and returns the stream so
	/// that insertions can be chained.
	raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue *PSV) {
	  PSV->printCustom(OS);
	  return OS;
	}

	/// Print every entry of this map to dbgs() as "<key> : <SU list>".
	///
	/// A key holding an IR Value is printed as an operand, except that an
	/// UndefValue key is printed as "Unknown". A key holding a PseudoSourceValue
	/// is printed through its stream operator. Any other key kind is a
	/// programming error.
	void ScheduleDAGInstrs::Value2SUsMap::dump() {
	  for (auto &Itr : *this) {
	    if (Itr.first.is<const Value*>()) {
	      // The key is a pointer union; fetch the Value pointer it holds.
	      const Value *V = Itr.first.get<const Value*>();
	      if (isa<UndefValue>(V))
	        dbgs() << "Unknown";
	      else
	        V->printAsOperand(dbgs());
	    }
	    else if (Itr.first.is<const PseudoSourceValue*>())
	      dbgs() << Itr.first.get<const PseudoSourceValue*>();
	    else
	      llvm_unreachable("Unknown Value type.");

	    dbgs() << " : ";
	    dumpSUList(Itr.second);
	  }
	}

	/// Reduce the given pair of memory-node maps once they have grown too large.
	///
	/// The NodeNums of all SUs held in both maps are collected and sorted. The SU
	/// with the lowest NodeNum among the last N of them is chosen as the
	/// candidate barrier chain, and insertBarrierChain() is then applied to both
	/// maps.
	///
	/// \param stores map of storing SUs.
	/// \param loads  map of loading SUs.
	/// \param N      number of trailing (highest NodeNum) entries to fold away;
	///               asserted not to exceed the number of collected NodeNums.
	void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
	                                              Value2SUsMap &loads, unsigned N) {
	  DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
	        stores.dump();
	        dbgs() << "Loading SUnits:\n";
	        loads.dump());

	  // Insert all SU's NodeNums into a vector and sort it.
	  std::vector<unsigned> NodeNums;
	  NodeNums.reserve(stores.size() + loads.size());
	  for (auto &I : stores)
	    for (auto *SU : I.second)
	      NodeNums.push_back(SU->NodeNum);
	  for (auto &I : loads)
	    for (auto *SU : I.second)
	      NodeNums.push_back(SU->NodeNum);
	  std::sort(NodeNums.begin(), NodeNums.end());

	  // The N last elements in NodeNums will be removed, and the SU with
	  // the lowest NodeNum of them will become the new BarrierChain to
	  // let the not yet seen SUs have a dependency to the removed SUs.
	  assert(N <= NodeNums.size());
	  // Dereference the iterator to obtain the NodeNum, then index SUnits with it.
	  SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
	  if (BarrierChain) {
	    // The aliasing and non-aliasing maps reduce independently of each
	    // other, but share a common BarrierChain. Check if the
	    // newBarrierChain is above the former one. If it is not, it may
	    // introduce a loop to use newBarrierChain, so keep the old one.
	    if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
	      BarrierChain->addPredBarrier(newBarrierChain);
	      BarrierChain = newBarrierChain;
	      DEBUG(dbgs() << "Inserting new barrier chain: SU("
	            << BarrierChain->NodeNum << ").\n";);
	    }
	    else
	      DEBUG(dbgs() << "Keeping old barrier chain: SU("
	            << BarrierChain->NodeNum << ").\n";);
	  }
	  else
	    BarrierChain = newBarrierChain;

	  insertBarrierChain(stores);
	  insertBarrierChain(loads);

	  DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
	        stores.dump();
	        dbgs() << "Loading SUnits:\n";
	        loads.dump());
	}

	/// Set or clear the kill flag on every register operand of \p MI that reads
	/// a register.
	///
	/// An operand is flagged as a kill exactly when LiveRegs reports its register
	/// as available. When \p addToLiveRegs is true, each visited register is
	/// added to \p LiveRegs afterwards.
	static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
	                        MachineInstr &MI, bool addToLiveRegs) {
	  for (MachineOperand &MO : MI.operands()) {
	    // Only register operands that are read can carry a kill flag.
	    if (!MO.isReg() || !MO.readsReg())
	      continue;
	    unsigned Reg = MO.getReg();
	    if (!Reg)
	      continue;

	    // Things that are available after the instruction are killed by it.
	    bool IsKill = LiveRegs.available(MRI, Reg);
	    MO.setIsKill(IsKill);
	    if (addToLiveRegs)
	      LiveRegs.addReg(Reg);
	  }
	}

	/// Recompute the kill flags on the register operands of the instructions in
	/// \p MBB.
	///
	/// LiveRegs is seeded with the block's live-outs and the block is then walked
	/// from its last instruction to its first. For each non-debug instruction the
	/// defined registers (and registers clobbered by register masks) are removed
	/// from the live set before toggleKills() updates the kill flags.
	void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
	DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n');

	LiveRegs.init(*TRI);
	LiveRegs.addLiveOuts(MBB);

	// Examine block from end to start...
	for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
	if (MI.isDebugValue())
	continue;

	// Update liveness. Registers that are defed but not used in this
	// instruction are now dead. Mark register and all subregs as they
	// are completely defined.
	for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
	const MachineOperand &MO = *O;
	if (MO.isReg()) {
	if (!MO.isDef())
	continue;
	unsigned Reg = MO.getReg();
	if (!Reg)
	continue;
	LiveRegs.removeReg(Reg);
	} else if (MO.isRegMask()) {
	LiveRegs.removeRegsInMask(MO);
	}
	}

	// If there is a bundle header fix it up first.
	if (!MI.isBundled()) {
	toggleKills(MRI, LiveRegs, MI, true);
	} else {
	MachineBasicBlock::instr_iterator First = MI.getIterator();
	if (MI.isBundle()) {
	// The header's kill flags are updated without adding its registers to
	// LiveRegs; the bundled instructions below add them.
	toggleKills(MRI, LiveRegs, MI, false);
	++First;
	}
	// Some targets make the (questionable) assumption that the instructions
	// inside the bundle are ordered and consequently only the last use of
	// a register inside the bundle can kill it.
	MachineBasicBlock::instr_iterator I = std::next(First);
	// Advance I to the last instruction of the bundle, then walk backwards.
	while (I->isBundledWithSucc())
	++I;
	do {
	if (!I->isDebugValue())
	toggleKills(MRI, LiveRegs, *I, true);
	--I;
	} while(I != First);
	}
	}
	}

	void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
	// Cannot completely remove virtual function even in release mode.
	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	SU->getInstr()->dump();
	#endif
	}

	/// Build the label shown for \p SU when the DAG is rendered as a graph.
	///
	/// \returns "<entry>" for EntrySU, "<exit>" for ExitSU, and otherwise the
	/// printed form of the node's machine instruction with the second print()
	/// argument (SkipOpers) set to true.
	std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
	  std::string s;
	  raw_string_ostream oss(s);
	  if (SU == &EntrySU)
	    oss << "<entry>";
	  else if (SU == &ExitSU)
	    oss << "<exit>";
	  else
	    SU->getInstr()->print(oss, /*SkipOpers=*/true);
	  return oss.str();
	}

	/// Return the basic block label, prefixed with "dag.". The name is not
	/// necessarily unique because a block contains multiple scheduling regions,
	/// but that is fine for visualization.
	std::string ScheduleDAGInstrs::getDAGName() const {
	  std::string Name("dag.");
	  Name += BB->getFullName();
	  return Name;
	}

	//===----------------------------------------------------------------------===//
	// SchedDFSResult Implementation
	//===----------------------------------------------------------------------===//

	namespace llvm {

	/// Internal state used to compute SchedDFSResult.
	///
	/// The visit* callbacks are driven by a depth-first traversal over data
	/// edges; finalize() then writes the per-node subtree IDs, the per-tree data
	/// and the inter-tree connections into the SchedDFSResult.
	class SchedDFSImpl {
	  SchedDFSResult &R;

	  /// Join DAG nodes into equivalence classes by their subtree.
	  IntEqClasses SubtreeClasses;
	  /// List PredSU, SuccSU pairs that represent data edges between subtrees.
	  std::vector<std::pair<const SUnit *, const SUnit *>> ConnectionPairs;

	  struct RootData {
	    unsigned NodeID;
	    unsigned ParentNodeID;  ///< Parent node (member of the parent subtree).
	    unsigned SubInstrCount = 0; ///< Instr count in this tree only, not
	                                /// children.

	    RootData(unsigned id): NodeID(id),
	                           ParentNodeID(SchedDFSResult::InvalidSubtreeID) {}

	    unsigned getSparseSetIndex() const { return NodeID; }
	  };

	  SparseSet<RootData> RootSet;

	public:
	  SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
	    RootSet.setUniverse(R.DFSNodeData.size());
	  }

	  /// Returns true if this node has been visited by the DFS traversal.
	  ///
	  /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
	  /// ID. Later, SubtreeID is updated but remains valid.
	  bool isVisited(const SUnit *SU) const {
	    return R.DFSNodeData[SU->NodeNum].SubtreeID
	      != SchedDFSResult::InvalidSubtreeID;
	  }

	  /// Initializes this node's instruction count. We don't need to flag the node
	  /// visited until visitPostorder because the DAG cannot have cycles.
	  void visitPreorder(const SUnit *SU) {
	    R.DFSNodeData[SU->NodeNum].InstrCount =
	      SU->getInstr()->isTransient() ? 0 : 1;
	  }

	  /// Called once for each node after all predecessors are visited. Revisit this
	  /// node's predecessors and potentially join them now that we know the ILP of
	  /// the other predecessors.
	  void visitPostorderNode(const SUnit *SU) {
	    // Mark this node as the root of a subtree. It may be joined with its
	    // successors later.
	    R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
	    RootData RData(SU->NodeNum);
	    RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;

	    // If any predecessors are still in their own subtree, they either cannot be
	    // joined or are large enough to remain separate. If this parent node's
	    // total instruction count is not greater than a child subtree by at least
	    // the subtree limit, then try to join it now since splitting subtrees is
	    // only useful if multiple high-pressure paths are possible.
	    unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
	    for (const SDep &PredDep : SU->Preds) {
	      if (PredDep.getKind() != SDep::Data)
	        continue;
	      unsigned PredNum = PredDep.getSUnit()->NodeNum;
	      if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
	        joinPredSubtree(PredDep, SU, /*CheckLimit=*/false);

	      // Either link or merge the TreeData entry from the child to the parent.
	      if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
	        // If the predecessor's parent is invalid, this is a tree edge and the
	        // current node is the parent.
	        if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
	          RootSet[PredNum].ParentNodeID = SU->NodeNum;
	      }
	      else if (RootSet.count(PredNum)) {
	        // The predecessor is not a root, but is still in the root set. This
	        // must be the new parent that it was just joined to. Note that
	        // RootSet[PredNum].ParentNodeID may either be invalid or may still be
	        // set to the original parent.
	        RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
	        RootSet.erase(PredNum);
	      }
	    }
	    RootSet[SU->NodeNum] = RData;
	  }

	  /// \brief Called once for each tree edge after calling visitPostOrderNode on
	  /// the predecessor. Increment the parent node's instruction count and
	  /// preemptively join this subtree to its parent's if it is small enough.
	  void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
	    R.DFSNodeData[Succ->NodeNum].InstrCount
	      += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
	    joinPredSubtree(PredDep, Succ);
	  }

	  /// Adds a connection for cross edges.
	  void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
	    ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
	  }

	  /// Sets each node's subtree ID to the representative ID and record
	  /// connections between trees.
	  void finalize() {
	    SubtreeClasses.compress();
	    R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
	    assert(SubtreeClasses.getNumClasses() == RootSet.size()
	           && "number of roots should match trees");
	    for (const RootData &Root : RootSet) {
	      unsigned TreeID = SubtreeClasses[Root.NodeID];
	      if (Root.ParentNodeID != SchedDFSResult::InvalidSubtreeID)
	        R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[Root.ParentNodeID];
	      R.DFSTreeData[TreeID].SubInstrCount = Root.SubInstrCount;
	      // Note that SubInstrCount may be greater than InstrCount if we joined
	      // subtrees across a cross edge. InstrCount will be attributed to the
	      // original parent, while SubInstrCount will be attributed to the joined
	      // parent.
	    }
	    R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
	    R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
	    DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
	    for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
	      R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
	      DEBUG(dbgs() << " SU(" << Idx << ") in tree "
	            << R.DFSNodeData[Idx].SubtreeID << '\n');
	    }
	    for (const std::pair<const SUnit *, const SUnit *> &P : ConnectionPairs) {
	      unsigned PredTree = SubtreeClasses[P.first->NodeNum];
	      unsigned SuccTree = SubtreeClasses[P.second->NodeNum];
	      // Edges whose endpoints ended up in the same tree connect nothing.
	      if (PredTree == SuccTree)
	        continue;
	      unsigned Depth = P.first->getDepth();
	      addConnection(PredTree, SuccTree, Depth);
	      addConnection(SuccTree, PredTree, Depth);
	    }
	  }

	protected:
	  /// Joins the predecessor subtree with the successor that is its DFS parent.
	  /// Applies some heuristics before joining.
	  ///
	  /// \returns true if the predecessor was joined, false if it was already
	  /// joined elsewhere or a heuristic rejected the join.
	  bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
	                       bool CheckLimit = true) {
	    assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");

	    // Check if the predecessor is already joined.
	    const SUnit *PredSU = PredDep.getSUnit();
	    unsigned PredNum = PredSU->NodeNum;
	    if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
	      return false;

	    // Four is the magic number of successors before a node is considered a
	    // pinch point.
	    unsigned NumDataSucs = 0;
	    for (const SDep &SuccDep : PredSU->Succs) {
	      if (SuccDep.getKind() == SDep::Data) {
	        if (++NumDataSucs >= 4)
	          return false;
	      }
	    }
	    if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
	      return false;
	    R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
	    SubtreeClasses.join(Succ->NodeNum, PredNum);
	    return true;
	  }

	  /// Called by finalize() to record a connection between trees.
	  ///
	  /// The connection is recorded on \p FromTree and then on each of its
	  /// ancestors in turn; the walk stops early at the first tree that already
	  /// has a connection to \p ToTree, whose level is raised to \p Depth if
	  /// that is larger. A zero \p Depth records nothing.
	  void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
	    if (!Depth)
	      return;

	    do {
	      SmallVectorImpl<SchedDFSResult::Connection> &Connections =
	        R.SubtreeConnections[FromTree];
	      for (SchedDFSResult::Connection &C : Connections) {
	        if (C.TreeID == ToTree) {
	          C.Level = std::max(C.Level, Depth);
	          return;
	        }
	      }
	      Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
	      FromTree = R.DFSTreeData[FromTree].ParentTreeID;
	    } while (FromTree != SchedDFSResult::InvalidSubtreeID);
	  }
	};

	} // end namespace llvm

	namespace {

	/// Explicit stack for a reverse (predecessor-directed) depth-first walk of
	/// the DAG. Each frame pairs a node with an iterator into that node's
	/// predecessor list marking the next edge to examine.
	class SchedDAGReverseDFS {
	  std::vector<std::pair<const SUnit *, SUnit::const_pred_iterator>> DFSStack;

	public:
	  /// True once every frame has been popped.
	  bool isComplete() const { return DFSStack.empty(); }

	  /// Push a frame for \p SU, positioned at its first predecessor edge.
	  void follow(const SUnit *SU) {
	    DFSStack.emplace_back(SU, SU->Preds.begin());
	  }

	  /// Step the top frame to its next predecessor edge.
	  void advance() { ++DFSStack.back().second; }

	  /// Pop the top frame. Returns the edge of the new top frame that was most
	  /// recently stepped over, or nullptr when the stack became empty.
	  const SDep *backtrack() {
	    DFSStack.pop_back();
	    if (DFSStack.empty())
	      return nullptr;
	    return std::prev(DFSStack.back().second);
	  }

	  /// Node of the top frame.
	  const SUnit *getCurr() const { return DFSStack.back().first; }

	  /// Current predecessor-edge position of the top frame.
	  SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }

	  /// End of the predecessor list of the top frame's node.
	  SUnit::const_pred_iterator getPredEnd() const {
	    return getCurr()->Preds.end();
	  }
	};

	} // end anonymous namespace

	/// Returns true if \p SU has at least one data-dependence successor edge
	/// whose target is not a boundary node.
	static bool hasDataSucc(const SUnit *SU) {
	  for (const SDep &Edge : SU->Succs) {
	    if (Edge.getKind() != SDep::Data)
	      continue;
	    if (Edge.getSUnit()->isBoundaryNode())
	      continue;
	    return true;
	  }
	  return false;
	}

	/// Computes an ILP metric for all nodes in the subDAG reachable via depth-first
	/// search from this root.
	///
	/// Every SU in \p SUnits that is not yet visited and has no data successor
	/// (other than boundary nodes) is used as the root of a reverse DFS over data
	/// edges. Only the bottom-up direction is implemented.
	void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
	  if (!IsBottomUp)
	    llvm_unreachable("Top-down ILP metric is unimplemnted");

	  SchedDFSImpl Impl(*this);
	  for (const SUnit &SU : SUnits) {
	    // Skip nodes already reached from another root, and nodes that are not
	    // roots because something still consumes their data.
	    if (Impl.isVisited(&SU) || hasDataSucc(&SU))
	      continue;

	    SchedDAGReverseDFS DFS;
	    Impl.visitPreorder(&SU);
	    DFS.follow(&SU);
	    while (true) {
	      // Traverse the leftmost path as far as possible.
	      while (DFS.getPred() != DFS.getPredEnd()) {
	        const SDep &PredDep = *DFS.getPred();
	        DFS.advance();
	        // Ignore non-data edges.
	        if (PredDep.getKind() != SDep::Data
	            || PredDep.getSUnit()->isBoundaryNode()) {
	          continue;
	        }
	        // An already visited edge is a cross edge, assuming an acyclic DAG.
	        if (Impl.isVisited(PredDep.getSUnit())) {
	          Impl.visitCrossEdge(PredDep, DFS.getCurr());
	          continue;
	        }
	        Impl.visitPreorder(PredDep.getSUnit());
	        DFS.follow(PredDep.getSUnit());
	      }
	      // Visit the top of the stack in postorder and backtrack.
	      const SUnit *Child = DFS.getCurr();
	      const SDep *PredDep = DFS.backtrack();
	      Impl.visitPostorderNode(Child);
	      if (PredDep)
	        Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
	      if (DFS.isComplete())
	        break;
	    }
	  }
	  Impl.finalize();
	}

	/// Notification that the root of subtree \p SubtreeID has just been
	/// scheduled. Each subtree connected to it has its recorded connection level
	/// raised to the level of that connection (never lowered), so that the
	/// nearest connected subtrees can be prioritized.
	void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
	  for (const Connection &C : SubtreeConnections[SubtreeID]) {
	    auto &ConnectLevel = SubtreeConnectLevels[C.TreeID];
	    ConnectLevel = std::max(ConnectLevel, C.Level);
	    DEBUG(dbgs() << " Tree: " << C.TreeID
	          << " @" << SubtreeConnectLevels[C.TreeID] << '\n');
	  }
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	LLVM_DUMP_METHOD void ILPValue::print(raw_ostream &OS) const {
	OS << InstrCount << " / " << Length << " = ";
	if (!Length)
	OS << "BADILP";
	else
	OS << format("%g", ((double)InstrCount / Length));
	}

	LLVM_DUMP_METHOD void ILPValue::dump() const {
	dbgs() << *this << '\n';
	}

	namespace llvm {

	LLVM_DUMP_METHOD
	raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
	Val.print(OS);
	return OS;
	}

	} // end namespace llvm

	#endif
	Index: head/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 322319)
	+++ head/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp (revision 322320)
	@@ -1,9828 +1,9837 @@
	//===-- SelectionDAGBuilder.cpp - Selection-DAG building ------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This implements routines for translating from LLVM IR into SelectionDAG IR.
	//
	//===----------------------------------------------------------------------===//

	#include "SelectionDAGBuilder.h"
	#include "SDNodeDbgValue.h"
	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/BranchProbabilityInfo.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/Loads.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/Analysis.h"
	#include "llvm/CodeGen/FastISel.h"
	#include "llvm/CodeGen/FunctionLoweringInfo.h"
	#include "llvm/CodeGen/GCMetadata.h"
	#include "llvm/CodeGen/GCStrategy.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
	#include "llvm/CodeGen/StackMaps.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/ConstantRange.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Statepoint.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetFrameLowering.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetIntrinsicInfo.h"
	#include "llvm/Target/TargetLowering.h"
	#include "llvm/Target/TargetOptions.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <algorithm>
	#include <utility>
	using namespace llvm;

	#define DEBUG_TYPE "isel"

	/// LimitFloatPrecision - Generate low-precision inline sequences for
	/// some float libcalls (6, 8 or 12 bits).
	static unsigned LimitFloatPrecision;

	static cl::opt<unsigned, true>
	LimitFPPrecision("limit-float-precision",
	cl::desc("Generate low-precision inline sequences "
	"for some float libcalls"),
	cl::location(LimitFloatPrecision),
	cl::init(0));
	// Limit the width of DAG chains. This is important in general to prevent
	// DAG-based analysis from blowing up. For example, alias analysis and
	// load clustering may not complete in reasonable time. It is difficult to
	// recognize and avoid this situation within each individual analysis, and
	// future analyses are likely to have the same behavior. Limiting DAG width is
	// the safe approach and will be especially important with global DAGs.
	//
	// MaxParallelChains default is arbitrarily high to avoid affecting
	// optimization, but could be lowered to improve compile time. Any ld-ld-st-st
	// sequence over this should have been converted to llvm.memcpy by the
	// frontend. It is easy to induce this behavior with .ll code such as:
	// %buffer = alloca [4096 x i8]
	// %data = load [4096 x i8]* %argPtr
	// store [4096 x i8] %data, [4096 x i8]* %buffer
	static const unsigned MaxParallelChains = 64;

	+// True if the Value passed requires ABI mangling as it is a parameter to a
	+// function or a return value from a function which is not an intrinsic.
	+//
	+// Concretely: returns true for a ReturnInst, and for any CallInst that is
	+// neither an inline asm statement nor a direct call to an intrinsic. Indirect
	+// calls therefore count as ABI copies. A null V yields false.
	+static bool isABIRegCopy(const Value * V) {
	+  const bool IsRetInst = V && isa<ReturnInst>(V);
	+  const bool IsCallInst = V && isa<CallInst>(V);
	+  const bool IsInLineAsm =
	+      IsCallInst && static_cast<const CallInst *>(V)->isInlineAsm();
	+  const bool IsIndirectFunctionCall =
	+      IsCallInst && !IsInLineAsm &&
	+      !static_cast<const CallInst *>(V)->getCalledFunction();
	+  // It is possible that the call instruction is an inline asm statement or an
	+  // indirect function call in which case the return value of
	+  // getCalledFunction() would be nullptr.
	+  const bool IsInstrinsicCall =
	+      IsCallInst && !IsInLineAsm && !IsIndirectFunctionCall &&
	+      static_cast<const CallInst *>(V)->getCalledFunction()->getIntrinsicID() !=
	+          Intrinsic::not_intrinsic;
	+
	+  return IsRetInst || (IsCallInst && (!IsInLineAsm && !IsInstrinsicCall));
	+}
	+
	static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
	const SDValue *Parts, unsigned NumParts,
	MVT PartVT, EVT ValueVT, const Value *V,
	bool IsABIRegCopy);

	/// getCopyFromParts - Create a value that contains the specified legal parts
	/// combined into the value they represent. If the parts combine to a type
	/// larger than ValueVT then AssertOp can be used to specify whether the extra
	/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
	/// (ISD::AssertSext).
	///
	/// Vector values are delegated to getCopyFromPartsVector(). For scalars, the
	/// parts are first assembled into a single value, which is then truncated,
	/// extended, rounded or bitcast as needed to produce \p ValueVT.
	///
	/// \param Parts        array of \p NumParts values, each of type \p PartVT.
	/// \param V            IR value the copy originates from; forwarded to
	///                     recursive calls and to the vector path.
	/// \param IsABIRegCopy forwarded to the vector path only.
	static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
	                                const SDValue *Parts, unsigned NumParts,
	                                MVT PartVT, EVT ValueVT, const Value *V,
	                                Optional<ISD::NodeType> AssertOp = None,
	                                bool IsABIRegCopy = false) {
	  if (ValueVT.isVector())
	    return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
	                                  PartVT, ValueVT, V, IsABIRegCopy);

	  assert(NumParts > 0 && "No parts to assemble!");
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Val = Parts[0];

	  if (NumParts > 1) {
	    // Assemble the value from multiple parts.
	    if (ValueVT.isInteger()) {
	      unsigned PartBits = PartVT.getSizeInBits();
	      unsigned ValueBits = ValueVT.getSizeInBits();

	      // Assemble the power of 2 part.
	      unsigned RoundParts = NumParts & (NumParts - 1) ?
	        1 << Log2_32(NumParts) : NumParts;
	      unsigned RoundBits = PartBits * RoundParts;
	      EVT RoundVT = RoundBits == ValueBits ?
	        ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
	      SDValue Lo, Hi;

	      EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);

	      if (RoundParts > 2) {
	        Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
	                              PartVT, HalfVT, V);
	        Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
	                              RoundParts / 2, PartVT, HalfVT, V);
	      } else {
	        Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
	        Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
	      }

	      if (DAG.getDataLayout().isBigEndian())
	        std::swap(Lo, Hi);

	      Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);

	      if (RoundParts < NumParts) {
	        // Assemble the trailing non-power-of-2 part.
	        unsigned OddParts = NumParts - RoundParts;
	        EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
	        Hi = getCopyFromParts(DAG, DL,
	                              Parts + RoundParts, OddParts, PartVT, OddVT, V);

	        // Combine the round and odd parts.
	        Lo = Val;
	        if (DAG.getDataLayout().isBigEndian())
	          std::swap(Lo, Hi);
	        EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	        Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
	        Hi =
	            DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
	                        DAG.getConstant(Lo.getValueSizeInBits(), DL,
	                                        TLI.getPointerTy(DAG.getDataLayout())));
	        Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
	        Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
	      }
	    } else if (PartVT.isFloatingPoint()) {
	      // FP split into multiple FP parts (for ppcf128)
	      assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 &&
	             "Unexpected split");
	      SDValue Lo, Hi;
	      Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
	      Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
	      if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
	        std::swap(Lo, Hi);
	      Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
	    } else {
	      // FP split into integer parts (soft fp)
	      assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
	             !PartVT.isVector() && "Unexpected split");
	      EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	      Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V);
	    }
	  }

	  // There is now one part, held in Val. Correct it to match ValueVT.
	  // PartEVT is the type of the register class that holds the value.
	  // ValueVT is the type of the inline asm operation.
	  EVT PartEVT = Val.getValueType();

	  if (PartEVT == ValueVT)
	    return Val;

	  if (PartEVT.isInteger() && ValueVT.isFloatingPoint() &&
	      ValueVT.bitsLT(PartEVT)) {
	    // For an FP value in an integer part, we need to truncate to the right
	    // width first.
	    PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	    Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val);
	  }

	  // Handle types that have the same size.
	  if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
	    return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

	  // Handle types with different sizes.
	  if (PartEVT.isInteger() && ValueVT.isInteger()) {
	    if (ValueVT.bitsLT(PartEVT)) {
	      // For a truncate, see if we have any information to
	      // indicate whether the truncated bits will always be
	      // zero or sign-extension.
	      if (AssertOp.hasValue())
	        Val = DAG.getNode(*AssertOp, DL, PartEVT, Val,
	                          DAG.getValueType(ValueVT));
	      return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	    }
	    return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
	  }

	  if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
	    // FP_ROUND's are always exact here.
	    if (ValueVT.bitsLT(Val.getValueType()))
	      return DAG.getNode(
	          ISD::FP_ROUND, DL, ValueVT, Val,
	          DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));

	    return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
	  }

	  llvm_unreachable("Unknown mismatch!");
	}

	/// Emit \p ErrMsg as an error through \p Ctx.
	///
	/// When \p V is an instruction the error is attached to it; if that
	/// instruction is a call to inline asm, a hint about a possibly invalid
	/// vector constraint is appended to the message. When \p V is null or is not
	/// an instruction, the bare message is emitted without a location.
	static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
	                                              const Twine &ErrMsg) {
	  const Instruction *I = dyn_cast_or_null<Instruction>(V);
	  // Test the cast result rather than V: a non-null V that is not an
	  // Instruction leaves I null, and dyn_cast<> below must not be handed a
	  // null pointer.
	  if (!I)
	    return Ctx.emitError(ErrMsg);

	  const char *AsmError = ", possible invalid constraint for vector type";
	  if (const CallInst *CI = dyn_cast<CallInst>(I))
	    if (isa<InlineAsm>(CI->getCalledValue()))
	      return Ctx.emitError(I, ErrMsg + AsmError);

	  return Ctx.emitError(I, ErrMsg);
	}

	/// getCopyFromPartsVector - Create a vector value of type ValueVT that
	/// contains the specified legal parts combined into the value they represent.
	///
	/// Multiple parts are first assembled into intermediate operands and joined
	/// with BUILD_VECTOR or CONCAT_VECTORS; the single resulting value is then
	/// converted to ValueVT (subvector extract, bitcast, extend/truncate, or a
	/// one-element build vector). If no conversion applies, an error is
	/// diagnosed against V and an UNDEF of type ValueVT is returned.
	///
	/// IsABIRegCopy selects getVectorTypeBreakdownForCallingConv() instead of
	/// getVectorTypeBreakdown() for computing how the vector is split.
	static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
	const SDValue *Parts, unsigned NumParts,
	MVT PartVT, EVT ValueVT, const Value *V,
	bool IsABIRegCopy) {
	assert(ValueVT.isVector() && "Not a vector value");
	assert(NumParts > 0 && "No parts to assemble!");
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Val = Parts[0];

	// Handle a multi-element vector.
	if (NumParts > 1) {
	EVT IntermediateVT;
	MVT RegisterVT;
	unsigned NumIntermediates;
	unsigned NumRegs;

	// Ask the target how ValueVT is broken down; the query fills in
	// IntermediateVT, NumIntermediates and RegisterVT.
	if (IsABIRegCopy) {
	NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
	*DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates,
	RegisterVT);
	} else {
	NumRegs =
	TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
	NumIntermediates, RegisterVT);
	}

	assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
	NumParts = NumRegs; // Silence a compiler warning.
	assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
	assert(RegisterVT.getSizeInBits() ==
	Parts[0].getSimpleValueType().getSizeInBits() &&
	"Part type sizes don't match!");

	// Assemble the parts into intermediate operands.
	SmallVector<SDValue, 8> Ops(NumIntermediates);
	if (NumIntermediates == NumParts) {
	// If the register was not expanded, truncate or copy the value,
	// as appropriate.
	for (unsigned i = 0; i != NumParts; ++i)
	Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
	PartVT, IntermediateVT, V);
	} else if (NumParts > 0) {
	// If the intermediate type was expanded, build the intermediate
	// operands from the parts.
	assert(NumParts % NumIntermediates == 0 &&
	"Must expand into a divisible number of parts!");
	unsigned Factor = NumParts / NumIntermediates;
	for (unsigned i = 0; i != NumIntermediates; ++i)
	Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
	PartVT, IntermediateVT, V);
	}

	// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
	// intermediate operands.
	EVT BuiltVectorTy =
	EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
	(IntermediateVT.isVector()
	? IntermediateVT.getVectorNumElements() * NumParts
	: NumIntermediates));
	Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
	: ISD::BUILD_VECTOR,
	DL, BuiltVectorTy, Ops);
	}

	// There is now one part, held in Val. Correct it to match ValueVT.
	EVT PartEVT = Val.getValueType();

	if (PartEVT == ValueVT)
	return Val;

	if (PartEVT.isVector()) {
	// If the element type of the source/dest vectors are the same, but the
	// parts vector has more elements than the value vector, then we have a
	// vector widening case (e.g. <2 x float> -> <4 x float>). Extract the
	// elements we want.
	if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
	assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
	"Cannot narrow, it would be a lossy transformation");
	return DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
	}

	// Vector/Vector bitcast.
	if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
	return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

	assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
	"Cannot handle this kind of promotion");
	// Promoted vector extract
	return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);

	}

	// From here on the part is not a vector.
	// Trivial bitcast if the types are the same size and the destination
	// vector type is legal.
	if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() &&
	TLI.isTypeLegal(ValueVT))
	return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);

	if (ValueVT.getVectorNumElements() != 1) {
	// Certain ABIs require that vectors are passed as integers. For vectors
	// that are the same size, this is an obvious bitcast.
	if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits()) {
	return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
	} else if (ValueVT.getSizeInBits() < PartEVT.getSizeInBits()) {
	// Bitcast Val back the original type and extract the corresponding
	// vector we want.
	unsigned Elts = PartEVT.getSizeInBits() / ValueVT.getScalarSizeInBits();
	EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(),
	ValueVT.getVectorElementType(), Elts);
	Val = DAG.getBitcast(WiderVecType, Val);
	return DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
	}

	// No conversion applies: report the problem and return UNDEF.
	diagnosePossiblyInvalidConstraint(
	*DAG.getContext(), V, "non-trivial scalar-to-vector conversion");
	return DAG.getUNDEF(ValueVT);
	}

	// Handle cases such as i8 -> <1 x i1>
	EVT ValueSVT = ValueVT.getVectorElementType();
	if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
	Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
	: DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);

	return DAG.getBuildVector(ValueVT, DL, Val);
	}

	/// Forward declaration: getCopyToParts (below) hands vector values off to
	/// getCopyToPartsVector, whose definition follows it in this file.
	static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
	SDValue Val, SDValue *Parts, unsigned NumParts,
	MVT PartVT, const Value *V, bool IsABIRegCopy);

	/// getCopyToParts - Create a series of nodes that contain the specified value
	/// split into legal parts. If the parts contain more bits than Val, then, for
	/// integers, ExtendKind can be used to specify how to generate the extra bits.
	///
	/// Vector values are forwarded to getCopyToPartsVector. A scalar Val is first
	/// extended, truncated or bitcast until it is exactly NumParts * PartBits
	/// wide, and is then split into Parts[0 .. NumParts-1]. On a big-endian data
	/// layout the parts are reversed before returning.
	///
	/// \param Parts        Output array; NumParts entries are written.
	/// \param NumParts     Number of parts to produce; 0 is a no-op.
	/// \param PartVT       Type of every part (asserted to be legal).
	/// \param V            IR value passed to diagnosePossiblyInvalidConstraint
	///                     and to the helper calls.
	/// \param ExtendKind   Node type used to widen an integer value.
	/// \param IsABIRegCopy Only forwarded to getCopyToPartsVector.
	static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
	                           SDValue *Parts, unsigned NumParts, MVT PartVT,
	                           const Value *V,
	                           ISD::NodeType ExtendKind = ISD::ANY_EXTEND,
	                           bool IsABIRegCopy = false) {
	  EVT ValueVT = Val.getValueType();

	  // Handle the vector case separately.
	  if (ValueVT.isVector())
	    return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V,
	                                IsABIRegCopy);

	  unsigned PartBits = PartVT.getSizeInBits();
	  unsigned OrigNumParts = NumParts;
	  assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
	         "Copying to an illegal type!");

	  if (NumParts == 0)
	    return;

	  assert(!ValueVT.isVector() && "Vector case handled elsewhere");
	  EVT PartEVT = PartVT;
	  if (PartEVT == ValueVT) {
	    assert(NumParts == 1 && "No-op copy with multiple parts!");
	    Parts[0] = Val;
	    return;
	  }

	  if (NumParts * PartBits > ValueVT.getSizeInBits()) {
	    // If the parts cover more bits than the value has, promote the value.
	    if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
	      assert(NumParts == 1 && "Do not know what to promote to!");
	      Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
	    } else {
	      if (ValueVT.isFloatingPoint()) {
	        // FP values need to be bitcast, then extended if they are being put
	        // into a larger container.
	        ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	        Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
	      }
	      assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
	             ValueVT.isInteger() &&
	             "Unknown mismatch!");
	      ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	      Val = DAG.getNode(ExtendKind, DL, ValueVT, Val);
	      if (PartVT == MVT::x86mmx)
	        Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	    }
	  } else if (PartBits == ValueVT.getSizeInBits()) {
	    // Different types of the same size.
	    assert(NumParts == 1 && PartEVT != ValueVT);
	    Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	  } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
	    // If the parts cover less bits than value has, truncate the value.
	    assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
	           ValueVT.isInteger() &&
	           "Unknown mismatch!");
	    ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	    if (PartVT == MVT::x86mmx)
	      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	  }

	  // The value may have changed - recompute ValueVT.
	  ValueVT = Val.getValueType();
	  assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
	         "Failed to tile the value with PartVT!");

	  if (NumParts == 1) {
	    if (PartEVT != ValueVT) {
	      diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
	                                        "scalar-to-vector conversion failed");
	      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	    }

	    Parts[0] = Val;
	    return;
	  }

	  // Expand the value into multiple parts.
	  if (NumParts & (NumParts - 1)) {
	    // The number of parts is not a power of 2. Split off and copy the tail.
	    assert(PartVT.isInteger() && ValueVT.isInteger() &&
	           "Do not know what to expand to!");
	    unsigned RoundParts = 1 << Log2_32(NumParts);
	    unsigned RoundBits = RoundParts * PartBits;
	    unsigned OddParts = NumParts - RoundParts;
	    SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
	                                 DAG.getIntPtrConstant(RoundBits, DL));
	    getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V);

	    if (DAG.getDataLayout().isBigEndian())
	      // The odd parts were reversed by getCopyToParts - unreverse them.
	      std::reverse(Parts + RoundParts, Parts + NumParts);

	    NumParts = RoundParts;
	    ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
	    Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
	  }

	  // The number of parts is a power of 2. Repeatedly bisect the value using
	  // EXTRACT_ELEMENT.
	  Parts[0] = DAG.getNode(ISD::BITCAST, DL,
	                         EVT::getIntegerVT(*DAG.getContext(),
	                                           ValueVT.getSizeInBits()),
	                         Val);

	  for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
	    for (unsigned i = 0; i < NumParts; i += StepSize) {
	      unsigned ThisBits = StepSize * PartBits / 2;
	      EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
	      SDValue &Part0 = Parts[i];
	      SDValue &Part1 = Parts[i+StepSize/2];

	      // Part1 must be extracted first: it reads Part0 before Part0 is
	      // overwritten with its own low half.
	      Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
	                          ThisVT, Part0, DAG.getIntPtrConstant(1, DL));
	      Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
	                          ThisVT, Part0, DAG.getIntPtrConstant(0, DL));

	      if (ThisBits == PartBits && ThisVT != PartVT) {
	        Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0);
	        Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1);
	      }
	    }
	  }

	  // OrigNumParts (not NumParts) so that a split-off tail is included.
	  if (DAG.getDataLayout().isBigEndian())
	    std::reverse(Parts, Parts + OrigNumParts);
	}


	/// getCopyToPartsVector - Create a series of nodes that contain the specified
	/// value split into legal parts.
	///
	/// With a single part, Val is bitcast, widened with undef elements,
	/// any-extended/truncated, or reduced to a scalar so that it has type PartVT.
	/// With several parts, the target's vector type breakdown (the calling
	/// convention variant when IsABIRegCopy is set) decides the intermediate
	/// type; each intermediate operand is then passed to getCopyToParts.
	///
	/// \param Parts    Output array; NumParts entries are written.
	/// \param NumParts Must match the register count reported by the breakdown.
	/// \param PartVT   Must match the register type reported by the breakdown.
	/// \param V        IR value forwarded to getCopyToParts.
	static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
	                                 SDValue Val, SDValue *Parts, unsigned NumParts,
	                                 MVT PartVT, const Value *V,
	                                 bool IsABIRegCopy) {

	  EVT ValueVT = Val.getValueType();
	  assert(ValueVT.isVector() && "Not a vector");
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (NumParts == 1) {
	    EVT PartEVT = PartVT;
	    if (PartEVT == ValueVT) {
	      // Nothing to do.
	    } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
	      // Bitconvert vector->vector case.
	      Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
	    } else if (PartVT.isVector() &&
	               PartEVT.getVectorElementType() == ValueVT.getVectorElementType() &&
	               PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
	      EVT ElementVT = PartVT.getVectorElementType();
	      // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
	      // undef elements.
	      SmallVector<SDValue, 16> Ops;
	      for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
	        Ops.push_back(DAG.getNode(
	            ISD::EXTRACT_VECTOR_ELT, DL, ElementVT, Val,
	            DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));

	      for (unsigned i = ValueVT.getVectorNumElements(),
	           e = PartVT.getVectorNumElements(); i != e; ++i)
	        Ops.push_back(DAG.getUNDEF(ElementVT));

	      Val = DAG.getBuildVector(PartVT, DL, Ops);

	      // FIXME: Use CONCAT for 2x -> 4x.

	      //SDValue UndefElts = DAG.getUNDEF(VectorTy);
	      //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
	    } else if (PartVT.isVector() &&
	               PartEVT.getVectorElementType().bitsGE(
	                   ValueVT.getVectorElementType()) &&
	               PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {

	      // Promoted vector extract
	      Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
	    } else {
	      if (ValueVT.getVectorNumElements() == 1) {
	        Val = DAG.getNode(
	            ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
	            DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));

	      } else {
	        assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() &&
	               "lossy conversion of vector to scalar type");
	        EVT IntermediateType =
	            EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
	        Val = DAG.getBitcast(IntermediateType, Val);
	        Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
	      }
	    }

	    assert(Val.getValueType() == PartVT && "Unexpected vector part value type");
	    Parts[0] = Val;
	    return;
	  }

	  // Handle a multi-element vector.
	  EVT IntermediateVT;
	  MVT RegisterVT;
	  unsigned NumIntermediates;
	  unsigned NumRegs;
	  if (IsABIRegCopy) {
	    NumRegs = TLI.getVectorTypeBreakdownForCallingConv(
	        *DAG.getContext(), ValueVT, IntermediateVT, NumIntermediates,
	        RegisterVT);
	  } else {
	    NumRegs =
	        TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
	                                   NumIntermediates, RegisterVT);
	  }

	  assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
	  NumParts = NumRegs; // Silence a compiler warning.
	  assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");

	  // Number of elements each intermediate operand covers in the built vector.
	  unsigned IntermediateNumElts =
	      IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1;

	  // Convert the vector to the appropriate type if necessary.
	  unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
	  EVT BuiltVectorTy = EVT::getVectorVT(
	      *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
	  if (Val.getValueType() != BuiltVectorTy)
	    Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);

	  // Split the vector into intermediate operands.
	  SmallVector<SDValue, 8> Ops(NumIntermediates);
	  for (unsigned i = 0; i != NumIntermediates; ++i) {
	    if (IntermediateVT.isVector())
	      // Index in units of BuiltVectorTy elements: after the bitcast above,
	      // Val may no longer have the element count of the incoming value, so
	      // the subvector offset is derived from IntermediateVT instead.
	      Ops[i] =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
	                      DAG.getConstant(i * IntermediateNumElts, DL,
	                                      TLI.getVectorIdxTy(DAG.getDataLayout())));
	    else
	      Ops[i] = DAG.getNode(
	          ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
	          DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
	  }

	  // Split the intermediate operands into legal parts.
	  if (NumParts == NumIntermediates) {
	    // If the register was not expanded, promote or copy the value,
	    // as appropriate.
	    for (unsigned i = 0; i != NumParts; ++i)
	      getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V);
	  } else if (NumParts > 0) {
	    // If the intermediate type was expanded, split each value into
	    // legal parts.
	    assert(NumIntermediates != 0 && "division by zero");
	    assert(NumParts % NumIntermediates == 0 &&
	           "Must expand into a divisible number of parts!");
	    unsigned Factor = NumParts / NumIntermediates;
	    for (unsigned i = 0; i != NumIntermediates; ++i)
	      getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT, V);
	  }
	}

	/// Default constructor: an empty register set that is not ABI-mangled.
	RegsForValue::RegsForValue() : IsABIMangled(false) {}

	/// Describe a single value of type \p valuevt held in the registers \p regs,
	/// with register type \p regvt. All of \p regs belong to that one value.
	RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
	                           EVT valuevt, bool IsABIMangledValue)
	    : ValueVTs(1, valuevt),
	      RegVTs(1, regvt),
	      Regs(regs),
	      RegCount(1, regs.size()),
	      IsABIMangled(IsABIMangledValue) {}

	/// Describe the value(s) of IR type \p Ty as a run of consecutive registers
	/// starting at \p Reg. \p Ty is broken into value types with ComputeValueVTs;
	/// for each one the register count and register type are queried from
	/// \p TLI, using the calling-convention variants when \p IsABIMangledValue
	/// is set.
	RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
	                           const DataLayout &DL, unsigned Reg, Type *Ty,
	                           bool IsABIMangledValue) {
	  ComputeValueVTs(TLI, DL, Ty, ValueVTs);
	  IsABIMangled = IsABIMangledValue;

	  for (EVT ValueVT : ValueVTs) {
	    unsigned NumRegs;
	    MVT RegisterVT;
	    if (IsABIMangledValue) {
	      NumRegs = TLI.getNumRegistersForCallingConv(Context, ValueVT);
	      RegisterVT = TLI.getRegisterTypeForCallingConv(Context, ValueVT);
	    } else {
	      NumRegs = TLI.getNumRegisters(Context, ValueVT);
	      RegisterVT = TLI.getRegisterType(Context, ValueVT);
	    }

	    // Claim the next NumRegs register numbers; Reg is left pointing at the
	    // first register of the following value.
	    for (unsigned Remaining = NumRegs; Remaining != 0; --Remaining)
	      Regs.push_back(Reg++);

	    RegVTs.push_back(RegisterVT);
	    RegCount.push_back(NumRegs);
	  }
	}

	/// Emit CopyFromReg nodes for every register of this RegsForValue and
	/// assemble them into the described value(s).
	///
	/// \param Chain In/out: used as the chain of each CopyFromReg and updated to
	///              the chain result of the last copy emitted.
	/// \param Flag  Optional in/out flag. When non-null, each copy is built with
	///              *Flag and *Flag is updated to result 2 of that copy.
	/// \param V     IR value forwarded to getCopyFromParts.
	/// \returns an empty SDValue when there are no value types, otherwise a
	///          MERGE_VALUES node over all assembled values.
	SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
	                                      FunctionLoweringInfo &FuncInfo,
	                                      const SDLoc &dl, SDValue &Chain,
	                                      SDValue *Flag, const Value *V) const {
	  // A Value with type {} or [0 x %t] needs no registers.
	  if (ValueVTs.empty())
	    return SDValue();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // Assemble the legal parts into the final values.
	  SmallVector<SDValue, 4> Values(ValueVTs.size());
	  SmallVector<SDValue, 8> Parts;
	  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
	    // Copy the legal parts from the registers.
	    EVT ValueVT = ValueVTs[Value];
	    unsigned NumRegs = RegCount[Value];
	    MVT RegisterVT = IsABIMangled
	                         ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
	                         : RegVTs[Value];

	    Parts.resize(NumRegs);
	    for (unsigned i = 0; i != NumRegs; ++i) {
	      SDValue P;
	      if (!Flag) {
	        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
	      } else {
	        P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
	        *Flag = P.getValue(2);
	      }

	      Chain = P.getValue(1);
	      Parts[i] = P;

	      // If the source register was virtual and if we know something about it,
	      // add an assert node.
	      if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
	          !RegisterVT.isInteger() || RegisterVT.isVector())
	        continue;

	      const FunctionLoweringInfo::LiveOutInfo *LOI =
	          FuncInfo.GetLiveOutRegInfo(Regs[Part+i]);
	      if (!LOI)
	        continue;

	      unsigned RegSize = RegisterVT.getSizeInBits();
	      unsigned NumSignBits = LOI->NumSignBits;
	      unsigned NumZeroBits = LOI->Known.countMinLeadingZeros();

	      if (NumZeroBits == RegSize) {
	        // The current value is a zero.
	        // Explicitly express that as it would be easier for
	        // optimizations to kick in.
	        Parts[i] = DAG.getConstant(0, dl, RegisterVT);
	        continue;
	      }

	      // FIXME: We capture more information than the dag can represent. For
	      // now, just use the tightest assertzext/assertsext possible.
	      bool isSExt = true;
	      EVT FromVT(MVT::Other);
	      if (NumSignBits == RegSize) {
	        isSExt = true; // ASSERT SEXT 1
	        FromVT = MVT::i1;
	      } else if (NumZeroBits >= RegSize - 1) {
	        isSExt = false; // ASSERT ZEXT 1
	        FromVT = MVT::i1;
	      } else if (NumSignBits > RegSize - 8) {
	        isSExt = true; // ASSERT SEXT 8
	        FromVT = MVT::i8;
	      } else if (NumZeroBits >= RegSize - 8) {
	        isSExt = false; // ASSERT ZEXT 8
	        FromVT = MVT::i8;
	      } else if (NumSignBits > RegSize - 16) {
	        isSExt = true; // ASSERT SEXT 16
	        FromVT = MVT::i16;
	      } else if (NumZeroBits >= RegSize - 16) {
	        isSExt = false; // ASSERT ZEXT 16
	        FromVT = MVT::i16;
	      } else if (NumSignBits > RegSize - 32) {
	        isSExt = true; // ASSERT SEXT 32
	        FromVT = MVT::i32;
	      } else if (NumZeroBits >= RegSize - 32) {
	        isSExt = false; // ASSERT ZEXT 32
	        FromVT = MVT::i32;
	      } else {
	        continue;
	      }
	      // Add an assertion node.
	      assert(FromVT != MVT::Other);
	      Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
	                             RegisterVT, P, DAG.getValueType(FromVT));
	    }

	    Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(),
	                                     NumRegs, RegisterVT, ValueVT, V);
	    Part += NumRegs;
	    Parts.clear();
	  }

	  return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
	}

	/// Split \p Val into legal parts and emit a CopyToReg for each register of
	/// this RegsForValue.
	///
	/// \param Chain In/out: input chain of every CopyToReg. On return it is the
	///              chain of the last copy (single register, or Flag in use) or
	///              a TokenFactor over all copy chains.
	/// \param Flag  Optional in/out flag threaded through the copies.
	/// \param V     IR value forwarded to getCopyToParts.
	/// \param PreferredExtendType Initial extension kind; ANY_EXTEND is replaced
	///              by ZERO_EXTEND once TLI.isZExtFree reports it as free.
	void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
	                                 const SDLoc &dl, SDValue &Chain, SDValue *Flag,
	                                 const Value *V,
	                                 ISD::NodeType PreferredExtendType) const {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  ISD::NodeType ExtendKind = PreferredExtendType;

	  // Get the list of the values's legal parts.
	  unsigned NumRegs = Regs.size();
	  SmallVector<SDValue, 8> Parts(NumRegs);
	  for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
	    unsigned NumParts = RegCount[Value];

	    MVT RegisterVT = IsABIMangled
	                         ? TLI.getRegisterTypeForCallingConv(RegVTs[Value])
	                         : RegVTs[Value];

	    if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
	      ExtendKind = ISD::ZERO_EXTEND;

	    getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value),
	                   &Parts[Part], NumParts, RegisterVT, V, ExtendKind);
	    Part += NumParts;
	  }

	  // Copy the parts into the registers.
	  SmallVector<SDValue, 8> Chains(NumRegs);
	  for (unsigned i = 0; i != NumRegs; ++i) {
	    SDValue Part;
	    if (!Flag) {
	      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
	    } else {
	      Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
	      *Flag = Part.getValue(1);
	    }

	    Chains[i] = Part.getValue(0);
	  }

	  if (NumRegs == 1 || Flag)
	    // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is
	    // flagged to it. That is the CopyToReg nodes and the user are considered
	    // a single scheduling unit. If we create a TokenFactor and return it as
	    // chain, then the TokenFactor is both a predecessor (operand) of the
	    // user as well as a successor (the TF operands are flagged to the user).
	    // c1, f1 = CopyToReg
	    // c2, f2 = CopyToReg
	    // c3 = TokenFactor c1, c2
	    // ...
	    // = op c3, ..., f2
	    Chain = Chains[NumRegs-1];
	  else
	    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	}

	void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
	unsigned MatchingIdx, const SDLoc &dl,
	SelectionDAG &DAG,
	std::vector<SDValue> &Ops) const {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
	if (HasMatching)
	Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
	else if (!Regs.empty() &&
	TargetRegisterInfo::isVirtualRegister(Regs.front())) {
	// Put the register class of the virtual registers in the flag word. That
	// way, later passes can recompute register class constraints for inline
	// assembly as well as normal instructions.
	// Don't do this for tied operands that can use the regclass information
	// from the def.
	const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
	const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
	Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
	}

	SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
	Ops.push_back(Res);

	unsigned SP = TLI.getStackPointerRegisterToSaveRestore();
	for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
	unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
	MVT RegisterVT = RegVTs[Value];
	for (unsigned i = 0; i != NumRegs; ++i) {
	assert(Reg < Regs.size() && "Mismatch in # registers expected");
	unsigned TheReg = Regs[Reg++];
	Ops.push_back(DAG.getRegister(TheReg, RegisterVT));

	if (TheReg == SP && Code == InlineAsm::Kind_Clobber) {
	// If we clobbered the stack pointer, MFI should know about it.
	assert(DAG.getMachineFunction().getFrameInfo().hasOpaqueSPAdjustment());
	}
	}
	}
	}

	void SelectionDAGBuilder::init(GCFunctionInfo gfi, AliasAnalysis aa,
	const TargetLibraryInfo *li) {
	AA = aa;
	GFI = gfi;
	LibInfo = li;
	DL = &DAG.getDataLayout();
	Context = DAG.getContext();
	LPadToCallSiteMap.clear();
	}

	/// Reset the builder's per-block state: scalar bookkeeping goes back to its
	/// initial value and the value maps, pending lists and statepoint state are
	/// emptied.
	void SelectionDAGBuilder::clear() {
	  // Scalar state.
	  CurInst = nullptr;
	  HasTailCall = false;
	  SDNodeOrder = LowestSDNodeOrder;

	  // Value maps.
	  NodeMap.clear();
	  UnusedArgNodeMap.clear();

	  // Pending chains.
	  PendingLoads.clear();
	  PendingExports.clear();

	  StatepointLowering.clear();
	}

	/// Drop every entry of DanglingDebugInfoMap. This is separate from clear(),
	/// which leaves that map untouched.
	void SelectionDAGBuilder::clearDanglingDebugInfo() {
	DanglingDebugInfoMap.clear();
	}

	/// Return the current DAG root after folding in any pending loads. With no
	/// pending loads the existing root is returned unchanged; a single pending
	/// load becomes the root directly; several are merged with a TokenFactor.
	/// PendingLoads is empty afterwards.
	SDValue SelectionDAGBuilder::getRoot() {
	  if (PendingLoads.empty())
	    return DAG.getRoot();

	  SDValue NewRoot;
	  if (PendingLoads.size() == 1) {
	    NewRoot = PendingLoads[0];
	  } else {
	    // Otherwise, we have to make a token factor node.
	    NewRoot = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
	                          PendingLoads);
	  }

	  PendingLoads.clear();
	  DAG.setRoot(NewRoot);
	  return NewRoot;
	}

	/// Return the DAG root after folding in the pending exports. With no pending
	/// exports the current root is returned as is. Otherwise all pending exports
	/// (plus the current root, unless it is the entry token or is already
	/// operand 0 of one of them) are merged into a TokenFactor, which becomes
	/// the new root. PendingExports is empty afterwards.
	SDValue SelectionDAGBuilder::getControlRoot() {
	  SDValue Root = DAG.getRoot();

	  if (PendingExports.empty())
	    return Root;

	  // Turn all of the CopyToReg chains into one factored node.
	  if (Root.getOpcode() != ISD::EntryToken) {
	    bool DependsOnRoot = false;
	    for (unsigned i = 0, e = PendingExports.size(); i != e && !DependsOnRoot;
	         ++i) {
	      assert(PendingExports[i].getNode()->getNumOperands() > 1);
	      // Don't add the root if we already indirectly depend on it.
	      DependsOnRoot = PendingExports[i].getNode()->getOperand(0) == Root;
	    }

	    if (!DependsOnRoot)
	      PendingExports.push_back(Root);
	  }

	  Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
	                     PendingExports);
	  PendingExports.clear();
	  DAG.setRoot(Root);
	  return Root;
	}

	/// Lower a single IR instruction. CurInst points at \p I for the duration of
	/// the opcode-specific visit and is reset to null afterwards.
	void SelectionDAGBuilder::visit(const Instruction &I) {
	  const bool IsTerminator = isa<TerminatorInst>(&I);

	  // Set up outgoing PHI node register values before emitting the terminator.
	  if (IsTerminator)
	    HandlePHINodesInSuccessorBlocks(I.getParent());

	  // Increase the SDNodeOrder if dealing with a non-debug instruction.
	  if (!isa<DbgInfoIntrinsic>(I))
	    ++SDNodeOrder;

	  CurInst = &I;

	  visit(I.getOpcode(), I);

	  // Terminators and tail calls get no export copies; statepoints handle
	  // their exports internally.
	  if (!(IsTerminator || HasTailCall || isStatepoint(&I)))
	    CopyToExportRegsIfNeeded(&I);

	  CurInst = nullptr;
	}

	/// PHI nodes must never reach the visitor; getting here is a hard error.
	/// (visit(const Instruction &) calls HandlePHINodesInSuccessorBlocks before
	/// emitting a terminator.)
	void SelectionDAGBuilder::visitPHI(const PHINode &) {
	llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
	}

	/// Dispatch \p I to the visit<OPCODE> member that matches \p Opcode.
	/// The opcode is passed separately so that the same entry point can be used
	/// for any User (getValueImpl calls it with a ConstantExpr).
	void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
	// Note: this doesn't use InstVisitor, because it has to work with
	// ConstantExpr's in addition to instructions.
	switch (Opcode) {
	default: llvm_unreachable("Unknown instruction type encountered!");
	// Build the switch statement using the Instruction.def file.
	// Each HANDLE_INST entry expands to one case that casts I to the
	// instruction class and calls the corresponding visit##OPCODE method.
	#define HANDLE_INST(NUM, OPCODE, CLASS) \
	case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
	#include "llvm/IR/Instruction.def"
	}
	}

	// resolveDanglingDebugInfo - if an earlier dbg_value referred to V before V
	// had a definition, emit its debug data now that Val is available, then
	// reset the map entry for V. If Val has no node, the debug info is dropped.
	void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
	                                                   SDValue Val) {
	  DanglingDebugInfo &DDI = DanglingDebugInfoMap[V];
	  const DbgValueInst *DI = DDI.getDI();
	  if (!DI)
	    return; // Nothing was waiting on V.

	  DebugLoc dl = DDI.getdl();
	  unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
	  DILocalVariable *Variable = DI->getVariable();
	  DIExpression *Expr = DI->getExpression();
	  assert(Variable->isValidLocationForIntrinsic(dl) &&
	         "Expected inlined-at fields to agree");
	  uint64_t Offset = DI->getOffset();

	  if (!Val.getNode()) {
	    DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
	  } else if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, Offset, false,
	                                       Val)) {
	    // Not describable as a function argument: attach a regular SDDbgValue.
	    SDDbgValue *SDV =
	        getDbgValue(Val, Variable, Expr, Offset, dl, DbgSDNodeOrder);
	    DAG.AddDbgValue(SDV, Val.getNode(), false);
	  }

	  DanglingDebugInfoMap[V] = DanglingDebugInfo();
	}

	/// getCopyFromRegs - If there was virtual register allocated for the value V
	/// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise.
	///
	/// The register is looked up in FuncInfo.ValueMap. When one is found, the
	/// copies are chained to the DAG entry node and any dangling debug info for
	/// V is resolved against the result.
	SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
	  DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
	  SDValue Result;

	  if (It != FuncInfo.ValueMap.end()) {
	    unsigned InReg = It->second;
	-   bool IsABIRegCopy =
	-       V && ((isa<CallInst>(V) &&
	-              !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
	-             isa<ReturnInst>(V));

	    RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
	-                    DAG.getDataLayout(), InReg, Ty, IsABIRegCopy);
	+                    DAG.getDataLayout(), InReg, Ty, isABIRegCopy(V));
	    SDValue Chain = DAG.getEntryNode();
	    Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr,
	                                 V);
	    resolveDanglingDebugInfo(V, Result);
	  }

	  return Result;
	}

	/// getValue - Return an SDValue for the given Value. Lookup order: the
	/// NodeMap cache, then a virtual register recorded for V, and finally a
	/// freshly built node, which is cached in NodeMap.
	SDValue SelectionDAGBuilder::getValue(const Value *V) {
	  // The cache must be consulted first so that no CopyFromReg is created when
	  // a regular SDValue already exists.
	  SDValue &Cached = NodeMap[V];
	  if (Cached.getNode())
	    return Cached;

	  // A virtual register that was allocated and initialized for this value
	  // takes precedence over building a new node.
	  SDValue FromReg = getCopyFromRegs(V, V->getType());
	  if (FromReg)
	    return FromReg;

	  // Nothing known yet: build the node, remember it, and flush any debug
	  // info that was waiting for it.
	  SDValue Created = getValueImpl(V);
	  NodeMap[V] = Created;
	  resolveDanglingDebugInfo(V, Created);
	  return Created;
	}

	// Return true if an SDValue exists for the given Value, i.e. V has an entry
	// in either NodeMap or FuncInfo.ValueMap. Neither map is modified.
	bool SelectionDAGBuilder::findValue(const Value *V) const {
	  return (NodeMap.find(V) != NodeMap.end()) ||
	         (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end());
	}

	/// getNonRegisterValue - Return an SDValue for the given Value, but
	/// don't look in FuncInfo.ValueMap for a virtual register.
	///
	/// A cached constant node has its debug location cleared before it is
	/// returned; a newly built node is cached in NodeMap.
	SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
	  // If we already have an SDValue for this value, use it.
	  SDValue &N = NodeMap[V];
	  if (N.getNode()) {
	    if (isa<ConstantSDNode>(N) || isa<ConstantFPSDNode>(N)) {
	      // Remove the debug location from the node as the node is about to be used
	      // in a location which may differ from the original debug location. This
	      // is relevant to Constant and ConstantFP nodes because they can appear
	      // as constant expressions inside PHI nodes.
	      N->setDebugLoc(DebugLoc());
	    }
	    return N;
	  }

	  // Otherwise create a new SDValue and remember it.
	  SDValue Val = getValueImpl(V);
	  NodeMap[V] = Val;
	  resolveDanglingDebugInfo(V, Val);
	  return Val;
	}

	/// getValueImpl - Helper function for getValue and getNonRegisterValue.
	/// Create an SDValue for the given value.
	///
	/// Handles, in order: constants of every supported kind, static allocas
	/// (as frame indices), and instructions, which get a virtual register from
	/// FuncInfo.InitializeRegForValue and are read with CopyFromReg. Any other
	/// value is a hard error.
	SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (const Constant *C = dyn_cast<Constant>(V)) {
	    EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);

	    if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
	      return DAG.getConstant(*CI, getCurSDLoc(), VT);

	    if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
	      return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);

	    if (isa<ConstantPointerNull>(C)) {
	      unsigned AS = V->getType()->getPointerAddressSpace();
	      return DAG.getConstant(0, getCurSDLoc(),
	                             TLI.getPointerTy(DAG.getDataLayout(), AS));
	    }

	    if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
	      return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);

	    if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
	      return DAG.getUNDEF(VT);

	    if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
	      // Visiting the expression is expected to record its node in NodeMap.
	      visit(CE->getOpcode(), *CE);
	      SDValue N1 = NodeMap[V];
	      assert(N1.getNode() && "visit didn't populate the NodeMap!");
	      return N1;
	    }

	    if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
	      SmallVector<SDValue, 4> Constants;
	      for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
	           OI != OE; ++OI) {
	        SDNode *Val = getValue(*OI).getNode();
	        // If the operand is an empty aggregate, there are no values.
	        if (!Val) continue;
	        // Add each leaf value from the operand to the Constants list
	        // to form a flattened list of all the values.
	        for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
	          Constants.push_back(SDValue(Val, i));
	      }

	      return DAG.getMergeValues(Constants, getCurSDLoc());
	    }

	    if (const ConstantDataSequential *CDS =
	            dyn_cast<ConstantDataSequential>(C)) {
	      SmallVector<SDValue, 4> Ops;
	      for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
	        SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
	        // Add each leaf value from the operand to the Constants list
	        // to form a flattened list of all the values.
	        for (unsigned j = 0, NumVals = Val->getNumValues(); j != NumVals; ++j)
	          Ops.push_back(SDValue(Val, j));
	      }

	      if (isa<ArrayType>(CDS->getType()))
	        return DAG.getMergeValues(Ops, getCurSDLoc());
	      return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
	    }

	    if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
	      assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
	             "Unknown struct or array constant!");

	      SmallVector<EVT, 4> ValueVTs;
	      ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs);
	      unsigned NumElts = ValueVTs.size();
	      if (NumElts == 0)
	        return SDValue(); // empty struct
	      SmallVector<SDValue, 4> Constants(NumElts);
	      for (unsigned i = 0; i != NumElts; ++i) {
	        EVT EltVT = ValueVTs[i];
	        if (isa<UndefValue>(C))
	          Constants[i] = DAG.getUNDEF(EltVT);
	        else if (EltVT.isFloatingPoint())
	          Constants[i] = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
	        else
	          Constants[i] = DAG.getConstant(0, getCurSDLoc(), EltVT);
	      }

	      return DAG.getMergeValues(Constants, getCurSDLoc());
	    }

	    if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
	      return DAG.getBlockAddress(BA, VT);

	    // Every remaining constant kind must be a vector.
	    VectorType *VecTy = cast<VectorType>(V->getType());
	    unsigned NumElements = VecTy->getNumElements();

	    // Now that we know the number and type of the elements, get that number of
	    // elements into the Ops array based on what kind of constant it is.
	    SmallVector<SDValue, 16> Ops;
	    if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
	      for (unsigned i = 0; i != NumElements; ++i)
	        Ops.push_back(getValue(CV->getOperand(i)));
	    } else {
	      assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
	      EVT EltVT =
	          TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());

	      SDValue Op;
	      if (EltVT.isFloatingPoint())
	        Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
	      else
	        Op = DAG.getConstant(0, getCurSDLoc(), EltVT);
	      Ops.assign(NumElements, Op);
	    }

	    // Create a BUILD_VECTOR node.
	    return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
	  }

	  // If this is a static alloca, generate it as the frameindex instead of
	  // computation.
	  if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
	    DenseMap<const AllocaInst*, int>::iterator SI =
	        FuncInfo.StaticAllocaMap.find(AI);
	    if (SI != FuncInfo.StaticAllocaMap.end())
	      return DAG.getFrameIndex(SI->second,
	                               TLI.getFrameIndexTy(DAG.getDataLayout()));
	  }

	  // If this is an instruction which fast-isel has deferred, select it now.
	  if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
	    unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
	-   bool IsABIRegCopy =
	-       V && ((isa<CallInst>(V) &&
	-              !(static_cast<const CallInst *>(V))->isInlineAsm()) ||
	-             isa<ReturnInst>(V));

	    RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
	-                    Inst->getType(), IsABIRegCopy);
	+                    Inst->getType(), isABIRegCopy(V));
	    SDValue Chain = DAG.getEntryNode();
	    return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
	  }

	  llvm_unreachable("Can't get register for value!");
	}

	/// Lower a catchpad: mark the current machine block as an EH funclet entry
	/// for the MSVC C++ and CoreCLR personalities, then make a CATCHPAD node
	/// (chained to the control root) the new DAG root.
	void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
	  auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	  bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
	  bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
	  MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
	  // In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues.
	  if (IsMSVCCXX || IsCoreCLR)
	    CatchPadMBB->setIsEHFuncletEntry();

	  DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other, getControlRoot()));
	}

	/// Lower a catchret. The successor block is always added as a machine-CFG
	/// successor. For asynchronous (SEH) personalities only a plain branch is
	/// emitted, and only when it is needed; otherwise a CATCHRET node is built
	/// that names both the target block and the block of the parent funclet.
	void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
	  // Update machine-CFG edge.
	  MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()];
	  FuncInfo.MBB->addSuccessor(TargetMBB);

	  auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	  bool IsSEH = isAsynchronousEHPersonality(Pers);
	  if (IsSEH) {
	    // If this is not a fall-through branch or optimizations are switched off,
	    // emit the branch.
	    if (TargetMBB != NextBlock(FuncInfo.MBB) ||
	        TM.getOptLevel() == CodeGenOpt::None)
	      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
	                              getControlRoot(), DAG.getBasicBlock(TargetMBB)));
	    return;
	  }

	  // Figure out the funclet membership for the catchret's successor.
	  // This will be used by the FuncletLayout pass to determine how to order the
	  // BB's.
	  // A 'catchret' returns to the outer scope's color.
	  Value *ParentPad = I.getCatchSwitchParentPad();
	  const BasicBlock *SuccessorColor;
	  if (isa<ConstantTokenNone>(ParentPad))
	    SuccessorColor = &FuncInfo.Fn->getEntryBlock();
	  else
	    SuccessorColor = cast<Instruction>(ParentPad)->getParent();
	  assert(SuccessorColor && "No parent funclet for catchret!");
	  MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor];
	  assert(SuccessorColorMBB && "No MBB for SuccessorColor!");

	  // Create the terminator node.
	  SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other,
	                            getControlRoot(), DAG.getBasicBlock(TargetMBB),
	                            DAG.getBasicBlock(SuccessorColorMBB));
	  DAG.setRoot(Ret);
	}

	void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) {
	// Don't emit any special code for the cleanuppad instruction. It just marks
	// the start of a funclet.
	FuncInfo.MBB->setIsEHFuncletEntry();
	FuncInfo.MBB->setIsCleanupFuncletEntry();
	}

	/// When an invoke or a cleanupret unwinds to the next EH pad, there are
	/// many places it could ultimately go. In the IR, we have a single unwind
	/// destination, but in the machine CFG, we enumerate all the possible blocks.
	/// This function skips over imaginary basic blocks that hold catchswitch
	/// instructions, and finds all the "real" machine
	/// basic block destinations. As those destinations may not be successors of
	/// EHPadBB, here we also calculate the edge probability to those destinations.
	/// The passed-in Prob is the edge probability to EHPadBB.
	///
	/// \param EHPadBB     First EH pad block to inspect; may be null, in which
	///                    case nothing is added.
	/// \param UnwindDests Output list of (machine block, probability) pairs;
	///                    entries are appended.
	static void findUnwindDestinations(
	    FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
	    BranchProbability Prob,
	    SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
	        &UnwindDests) {
	  EHPersonality Personality =
	      classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
	  bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
	  bool IsCoreCLR = Personality == EHPersonality::CoreCLR;

	  while (EHPadBB) {
	    const Instruction *Pad = EHPadBB->getFirstNonPHI();
	    BasicBlock *NewEHPadBB = nullptr;
	    if (isa<LandingPadInst>(Pad)) {
	      // Stop on landingpads. They are not funclets.
	      UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
	      break;
	    } else if (isa<CleanupPadInst>(Pad)) {
	      // Stop on cleanup pads. Cleanups are always funclet entries for all known
	      // personalities.
	      UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
	      UnwindDests.back().first->setIsEHFuncletEntry();
	      break;
	    } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
	      // Add the catchpad handlers to the possible destinations.
	      for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
	        UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
	        // For MSVC++ and the CLR, catchblocks are funclets and need prologues.
	        if (IsMSVCCXX || IsCoreCLR)
	          UnwindDests.back().first->setIsEHFuncletEntry();
	      }
	      NewEHPadBB = CatchSwitch->getUnwindDest();
	    } else {
	      // NOTE(review): EHPadBB is not advanced on this path, so the loop
	      // would spin forever if a pad of any other kind ever got here.
	      // Presumably unreachable for well-formed IR - confirm.
	      continue;
	    }

	    // Scale the probability by the edge to the next pad in the chain.
	    BranchProbabilityInfo *BPI = FuncInfo.BPI;
	    if (BPI && NewEHPadBB)
	      Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB);
	    EHPadBB = NewEHPadBB;
	  }
	}

	void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) {
	// Update successor info.
	SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
	auto UnwindDest = I.getUnwindDest();
	BranchProbabilityInfo *BPI = FuncInfo.BPI;
	BranchProbability UnwindDestProb =
	(BPI && UnwindDest)
	? BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest)
	: BranchProbability::getZero();
	findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests);
	for (auto &UnwindDest : UnwindDests) {
	UnwindDest.first->setIsEHPad();
	addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second);
	}
	FuncInfo.MBB->normalizeSuccProbs();

	// Create the terminator node.
	SDValue Ret =
	DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot());
	DAG.setRoot(Ret);
	}

	/// Visitor for 'catchswitch' instructions. No lowering is implemented here:
	/// reaching this function unconditionally reports a fatal error. The CSI
	/// argument is not used.
	void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) {
	report_fatal_error("visitCatchSwitch not yet implemented!");
	}

	/// Lower an IR 'ret' instruction.
	///
	/// Three situations are handled:
	///  - the block ends in a deoptimize call: delegated to
	///    LowerDeoptimizingReturn();
	///  - the return value cannot be lowered directly
	///    (!FuncInfo.CanLowerReturn): the value is stored through the pointer
	///    held in FuncInfo.DemoteRegister and no output arguments are produced;
	///  - otherwise the returned value is split into register-sized parts that
	///    are appended to Outs/OutVals.
	/// A swifterror virtual register is appended last when applicable, and the
	/// target's LowerReturn hook produces the final chain, which becomes the new
	/// DAG root.
	void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  auto &DL = DAG.getDataLayout();
	  SDValue Chain = getControlRoot();
	  SmallVector<ISD::OutputArg, 8> Outs;
	  SmallVector<SDValue, 8> OutVals;

	  // Calls to @llvm.experimental.deoptimize don't generate a return value, so
	  // lower
	  //
	  //   %val = call <ty> @llvm.experimental.deoptimize()
	  //   ret <ty> %val
	  //
	  // differently.
	  if (I.getParent()->getTerminatingDeoptimizeCall()) {
	    LowerDeoptimizingReturn();
	    return;
	  }

	  if (!FuncInfo.CanLowerReturn) {
	    unsigned DemoteReg = FuncInfo.DemoteRegister;
	    const Function *F = I.getParent()->getParent();

	    // Emit a store of the return value through the virtual register.
	    // Leave Outs empty so that LowerReturn won't try to load return
	    // registers the usual way.
	    SmallVector<EVT, 1> PtrValueVTs;
	    ComputeValueVTs(TLI, DL, PointerType::getUnqual(F->getReturnType()),
	                    PtrValueVTs);

	    SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
	                                        DemoteReg, PtrValueVTs[0]);
	    SDValue RetOp = getValue(I.getOperand(0));

	    SmallVector<EVT, 4> ValueVTs;
	    SmallVector<uint64_t, 4> Offsets;
	    ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets);
	    unsigned NumValues = ValueVTs.size();

	    // An aggregate return value cannot wrap around the address space, so
	    // offsets to its parts don't wrap either.
	    SDNodeFlags Flags;
	    Flags.setNoUnsignedWrap(true);

	    // One store per leaf value, each at its own offset from the return
	    // pointer; the stores are joined by a TokenFactor below.
	    SmallVector<SDValue, 4> Chains(NumValues);
	    for (unsigned i = 0; i != NumValues; ++i) {
	      SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(),
	                                RetPtr.getValueType(), RetPtr,
	                                DAG.getIntPtrConstant(Offsets[i],
	                                                      getCurSDLoc()),
	                                Flags);
	      Chains[i] = DAG.getStore(Chain, getCurSDLoc(),
	                               SDValue(RetOp.getNode(), RetOp.getResNo() + i),
	                               // FIXME: better loc info would be nice.
	                               Add, MachinePointerInfo());
	    }

	    Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
	                        MVT::Other, Chains);
	  } else if (I.getNumOperands() != 0) {
	    SmallVector<EVT, 4> ValueVTs;
	    ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
	    unsigned NumValues = ValueVTs.size();
	    if (NumValues) {
	      SDValue RetOp = getValue(I.getOperand(0));

	      const Function *F = I.getParent()->getParent();

	      // The extension kind follows the signext/zeroext return attribute.
	      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
	      if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
	                                          Attribute::SExt))
	        ExtendKind = ISD::SIGN_EXTEND;
	      else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
	                                               Attribute::ZExt))
	        ExtendKind = ISD::ZERO_EXTEND;

	      LLVMContext &Context = F->getContext();
	      bool RetInReg = F->getAttributes().hasAttribute(
	          AttributeList::ReturnIndex, Attribute::InReg);

	      for (unsigned j = 0; j != NumValues; ++j) {
	        EVT VT = ValueVTs[j];

	        if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
	          VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);

	        unsigned NumParts = TLI.getNumRegistersForCallingConv(Context, VT);
	        MVT PartVT = TLI.getRegisterTypeForCallingConv(Context, VT);
	        SmallVector<SDValue, 4> Parts(NumParts);
	        getCopyToParts(DAG, getCurSDLoc(),
	                       SDValue(RetOp.getNode(), RetOp.getResNo() + j),
	                       &Parts[0], NumParts, PartVT, &I, ExtendKind, true);

	        // 'inreg' on function refers to return value
	        ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
	        if (RetInReg)
	          Flags.setInReg();

	        // Propagate extension type if any
	        if (ExtendKind == ISD::SIGN_EXTEND)
	          Flags.setSExt();
	        else if (ExtendKind == ISD::ZERO_EXTEND)
	          Flags.setZExt();

	        for (unsigned i = 0; i < NumParts; ++i) {
	          Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
	                                        VT, /*isfixed=*/true, 0, 0));
	          OutVals.push_back(Parts[i]);
	        }
	      }
	    }
	  }

	  // Push in swifterror virtual register as the last element of Outs. This
	  // makes sure swifterror virtual register will be returned in the swifterror
	  // physical register.
	  const Function *F = I.getParent()->getParent();
	  if (TLI.supportSwiftError() &&
	      F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) {
	    assert(FuncInfo.SwiftErrorArg && "Need a swift error argument");
	    ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
	    Flags.setSwiftError();
	    Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/,
	                                  EVT(TLI.getPointerTy(DL)) /*argvt*/,
	                                  true /*isfixed*/, 1 /*origidx*/,
	                                  0 /*partOffs*/));
	    // Create SDNode for the swifterror virtual register.
	    OutVals.push_back(
	        DAG.getRegister(FuncInfo.getOrCreateSwiftErrorVRegUseAt(
	                            &I, FuncInfo.MBB, FuncInfo.SwiftErrorArg).first,
	                        EVT(TLI.getPointerTy(DL))));
	  }

	  bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
	  CallingConv::ID CallConv =
	      DAG.getMachineFunction().getFunction()->getCallingConv();
	  Chain = DAG.getTargetLoweringInfo().LowerReturn(
	      Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);

	  // Verify that the target's LowerReturn behaved as expected.
	  assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
	         "LowerReturn didn't return a valid chain!");

	  // Update the DAG with the new chain value resulting from return lowering.
	  DAG.setRoot(Chain);
	}

	/// CopyToExportRegsIfNeeded - When virtual registers have already been
	/// assigned to V (it has an entry in FuncInfo.ValueMap), emit the nodes that
	/// copy V's value into them. Values of empty type and values without an
	/// entry are left alone.
	void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
	  // Nothing to copy for empty types.
	  if (V->getType()->isEmptyTy())
	    return;

	  auto RegIt = FuncInfo.ValueMap.find(V);
	  if (RegIt == FuncInfo.ValueMap.end())
	    return;

	  assert(!V->use_empty() && "Unused value assigned virtual registers!");
	  CopyValueToVirtualRegister(V, RegIt->second);
	}

	/// ExportFromCurrentBlock - Make sure V is exported from the current basic
	/// block: unless it is already exported, give it a virtual register and emit
	/// the copy into that register so that a CopyTo/FromReg pair is generated.
	/// Values that are neither instructions nor arguments are ignored.
	void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
	  // Constants never need exporting, and an already-exported value has its
	  // register set up.
	  const bool IsExportable = isa<Instruction>(V) || isa<Argument>(V);
	  if (!IsExportable || FuncInfo.isExportedInst(V))
	    return;

	  unsigned VReg = FuncInfo.InitializeRegForValue(V);
	  CopyValueToVirtualRegister(V, VReg);
	}

	bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
	const BasicBlock *FromBB) {
	// The operands of the setcc have to be in this block. We don't know
	// how to export them from some other block.
	if (const Instruction *VI = dyn_cast<Instruction>(V)) {
	// Can export from current BB.
	if (VI->getParent() == FromBB)
	return true;

	// Is already exported, noop.
	return FuncInfo.isExportedInst(V);
	}

	// If this is an argument, we can export it if the BB is the entry block or
	// if it is already exported.
	if (isa<Argument>(V)) {
	if (FromBB == &FromBB->getParent()->getEntryBlock())
	return true;

	// Otherwise, can only export this if it is already exported.
	return FuncInfo.isExportedInst(V);
	}

	// Otherwise, constants can always be exported.
	return true;
	}

	/// Return the probability of the edge Src -> Dst, looked up through the IR
	/// blocks behind the two machine blocks. Without BranchProbabilityInfo the
	/// result is the uniform value 1 / N, N being the number of IR successors of
	/// Src's block (treated as 1 when there are none).
	BranchProbability
	SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src,
	                                        const MachineBasicBlock *Dst) const {
	  const BasicBlock *SrcBB = Src->getBasicBlock();
	  const BasicBlock *DstBB = Dst->getBasicBlock();

	  if (BranchProbabilityInfo *BPI = FuncInfo.BPI)
	    return BPI->getEdgeProbability(SrcBB, DstBB);

	  // No profile data: spread the probability evenly over the successors,
	  // clamping the count to at least one.
	  auto NumSuccs = std::max<uint32_t>(
	      std::distance(succ_begin(SrcBB), succ_end(SrcBB)), 1);
	  return BranchProbability(1, NumSuccs);
	}

	/// Add Dst as a machine-CFG successor of Src. Without
	/// BranchProbabilityInfo the edge is added with no probability; otherwise it
	/// is added with Prob, or with the edge probability computed by
	/// getEdgeProbability() when Prob is unknown.
	void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src,
	                                               MachineBasicBlock *Dst,
	                                               BranchProbability Prob) {
	  if (!FuncInfo.BPI) {
	    Src->addSuccessorWithoutProb(Dst);
	    return;
	  }

	  Src->addSuccessor(Dst,
	                    Prob.isUnknown() ? getEdgeProbability(Src, Dst) : Prob);
	}

	/// Return true if V may be used from BB without crossing a block boundary:
	/// either V is an instruction whose parent is BB, or V is not an instruction
	/// at all.
	static bool InBlock(const Value *V, const BasicBlock *BB) {
	  if (const Instruction *I = dyn_cast<Instruction>(V))
	    return I->getParent() == BB;
	  return true;
	}

	/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
	/// This function emits a branch and is used at the leaves of an OR or an
	/// AND operator tree.
	///
	/// The branch is not emitted directly: a CaseBlock describing it is appended
	/// to SwitchCases.
	///
	/// \param Cond       Leaf condition to branch on.
	/// \param TBB        Block taken when the (possibly inverted) condition
	///                   holds.
	/// \param FBB        Block taken otherwise.
	/// \param CurBB      Machine block that will contain the branch.
	/// \param SwitchBB   First block of the branch sequence.
	/// \param TProb      Probability of the edge to TBB.
	/// \param FProb      Probability of the edge to FBB.
	/// \param InvertCond Whether Cond must be tested negated.
	void
	SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
	                                                  MachineBasicBlock *TBB,
	                                                  MachineBasicBlock *FBB,
	                                                  MachineBasicBlock *CurBB,
	                                                  MachineBasicBlock *SwitchBB,
	                                                  BranchProbability TProb,
	                                                  BranchProbability FProb,
	                                                  bool InvertCond) {
	  const BasicBlock *BB = CurBB->getBasicBlock();

	  // If the leaf of the tree is a comparison, merge the condition into
	  // the caseblock.
	  if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
	    // The operands of the cmp have to be in this block. We don't know
	    // how to export them from some other block. If this is the first block
	    // of the sequence, no exporting is needed.
	    if (CurBB == SwitchBB ||
	        (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
	         isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
	      ISD::CondCode Condition;
	      if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
	        ICmpInst::Predicate Pred =
	            InvertCond ? IC->getInversePredicate() : IC->getPredicate();
	        Condition = getICmpCondCode(Pred);
	      } else {
	        const FCmpInst *FC = cast<FCmpInst>(Cond);
	        FCmpInst::Predicate Pred =
	            InvertCond ? FC->getInversePredicate() : FC->getPredicate();
	        Condition = getFCmpCondCode(Pred);
	        if (TM.Options.NoNaNsFPMath)
	          Condition = getFCmpCodeWithoutNaN(Condition);
	      }

	      CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr,
	                   TBB, FBB, CurBB, TProb, FProb);
	      SwitchCases.push_back(CB);
	      return;
	    }
	  }

	  // Create a CaseBlock record representing this branch: compare Cond against
	  // 'true', using SETNE instead of SETEQ when the condition is inverted.
	  ISD::CondCode Opc = InvertCond ? ISD::SETNE : ISD::SETEQ;
	  CaseBlock CB(Opc, Cond, ConstantInt::getTrue(*DAG.getContext()),
	               nullptr, TBB, FBB, CurBB, TProb, FProb);
	  SwitchCases.push_back(CB);
	}

	/// FindMergedConditions - If Cond is an expression like
	void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
	MachineBasicBlock *TBB,
	MachineBasicBlock *FBB,
	MachineBasicBlock *CurBB,
	MachineBasicBlock *SwitchBB,
	Instruction::BinaryOps Opc,
	BranchProbability TProb,
	BranchProbability FProb,
	bool InvertCond) {
	// Skip over not part of the tree and remember to invert op and operands at
	// next level.
	if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) {
	const Value *CondOp = BinaryOperator::getNotArgument(Cond);
	if (InBlock(CondOp, CurBB->getBasicBlock())) {
	FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
	!InvertCond);
	return;
	}
	}

	const Instruction *BOp = dyn_cast<Instruction>(Cond);
	// Compute the effective opcode for Cond, taking into account whether it needs
	// to be inverted, e.g.
	// and (not (or A, B)), C
	// gets lowered as
	// and (and (not A, not B), C)
	unsigned BOpc = 0;
	if (BOp) {
	BOpc = BOp->getOpcode();
	if (InvertCond) {
	if (BOpc == Instruction::And)
	BOpc = Instruction::Or;
	else if (BOpc == Instruction::Or)
	BOpc = Instruction::And;
	}
	}

	// If this node is not part of the or/and tree, emit it as a branch.
	if (!BOp \|\| !(isa<BinaryOperator>(BOp) \|\| isa<CmpInst>(BOp)) \|\|
	BOpc != Opc \|\| !BOp->hasOneUse() \|\|
	BOp->getParent() != CurBB->getBasicBlock() \|\|
	!InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) \|\|
	!InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
	EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
	TProb, FProb, InvertCond);
	return;
	}

	// Create TmpBB after CurBB.
	MachineFunction::iterator BBI(CurBB);
	MachineFunction &MF = DAG.getMachineFunction();
	MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
	CurBB->getParent()->insert(++BBI, TmpBB);

	if (Opc == Instruction::Or) {
	// Codegen X \| Y as:
	// BB1:
	// jmp_if_X TBB
	// jmp TmpBB
	// TmpBB:
	// jmp_if_Y TBB
	// jmp FBB
	//

	// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
	// The requirement is that
	// TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
	// = TrueProb for original BB.
	// Assuming the original probabilities are A and B, one choice is to set
	// BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to
	// A/(1+B) and 2B/(1+B). This choice assumes that
	// TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
	// Another choice is to assume TrueProb for BB1 equals to TrueProb for
	// TmpBB, but the math is more complicated.

	auto NewTrueProb = TProb / 2;
	auto NewFalseProb = TProb / 2 + FProb;
	// Emit the LHS condition.
	FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc,
	NewTrueProb, NewFalseProb, InvertCond);

	// Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
	SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
	BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
	// Emit the RHS condition into TmpBB.
	FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
	Probs[0], Probs[1], InvertCond);
	} else {
	assert(Opc == Instruction::And && "Unknown merge op!");
	// Codegen X & Y as:
	// BB1:
	// jmp_if_X TmpBB
	// jmp FBB
	// TmpBB:
	// jmp_if_Y TBB
	// jmp FBB
	//
	// This requires creation of TmpBB after CurBB.

	// We have flexibility in setting Prob for BB1 and Prob for TmpBB.
	// The requirement is that
	// FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
	// = FalseProb for original BB.
	// Assuming the original probabilities are A and B, one choice is to set
	// BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to
	// 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 ==
	// TrueProb for BB1 * FalseProb for TmpBB.

	auto NewTrueProb = TProb + FProb / 2;
	auto NewFalseProb = FProb / 2;
	// Emit the LHS condition.
	FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc,
	NewTrueProb, NewFalseProb, InvertCond);

	// Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
	SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
	BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
	// Emit the RHS condition into TmpBB.
	FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
	Probs[0], Probs[1], InvertCond);
	}
	}

	/// If the set of cases should be emitted as a series of branches, return true.
	/// If we should emit this as a bunch of and/or'd together conditions, return
	/// false.
	///
	/// Only a set of exactly two cases can be rejected; any other size is always
	/// emitted as branches.
	bool
	SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases) {
	  if (Cases.size() != 2) return true;

	  // If this is two comparisons of the same values or'd or and'd together, they
	  // will get folded into a single comparison, so don't emit two blocks.
	  if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
	       Cases[0].CmpRHS == Cases[1].CmpRHS) ||
	      (Cases[0].CmpRHS == Cases[1].CmpLHS &&
	       Cases[0].CmpLHS == Cases[1].CmpRHS)) {
	    return false;
	  }

	  // Handle: (X != null) | (Y != null) --> (X|Y) != 0
	  // Handle: (X == null) & (Y == null) --> (X|Y) == 0
	  if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
	      Cases[0].CC == Cases[1].CC &&
	      isa<Constant>(Cases[0].CmpRHS) &&
	      cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
	    if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
	      return false;
	    if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
	      return false;
	  }

	  return true;
	}

	/// Lower an IR 'br' instruction.
	///
	/// Unconditional branches add the successor edge and emit a BR node unless
	/// the target is the fall-through block (always emitted at -O0). Conditional
	/// branches whose condition is a single-use and/or tree are, when jumps are
	/// not expensive and the branch is not marked unpredictable, split into a
	/// sequence of simpler branches; everything else becomes one CaseBlock that
	/// is lowered through visitSwitchCase().
	void SelectionDAGBuilder::visitBr(const BranchInst &I) {
	  MachineBasicBlock *BrMBB = FuncInfo.MBB;

	  // Update machine-CFG edges.
	  MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];

	  if (I.isUnconditional()) {
	    // Update machine-CFG edges.
	    BrMBB->addSuccessor(Succ0MBB);

	    // If this is not a fall-through branch or optimizations are switched off,
	    // emit the branch.
	    if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None)
	      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
	                              MVT::Other, getControlRoot(),
	                              DAG.getBasicBlock(Succ0MBB)));

	    return;
	  }

	  // If this condition is one of the special cases we handle, do special stuff
	  // now.
	  const Value *CondVal = I.getCondition();
	  MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];

	  // If this is a series of conditions that are or'd or and'd together, emit
	  // this as a sequence of branches instead of setcc's with and/or operations.
	  // As long as jumps are not expensive, this should improve performance.
	  // For example, instead of something like:
	  //     cmp A, B
	  //     C = seteq
	  //     cmp D, E
	  //     F = setle
	  //     or C, F
	  //     jnz foo
	  // Emit:
	  //     cmp A, B
	  //     je foo
	  //     cmp D, E
	  //     jle foo
	  //
	  if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
	    Instruction::BinaryOps Opcode = BOp->getOpcode();
	    if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
	        !I.getMetadata(LLVMContext::MD_unpredictable) &&
	        (Opcode == Instruction::And || Opcode == Instruction::Or)) {
	      FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
	                           Opcode,
	                           getEdgeProbability(BrMBB, Succ0MBB),
	                           getEdgeProbability(BrMBB, Succ1MBB),
	                           /*InvertCond=*/false);
	      // If the compares in later blocks need to use values not currently
	      // exported from this block, export them now. This block should always
	      // be the first entry.
	      assert(SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");

	      // Allow some cases to be rejected.
	      if (ShouldEmitAsBranches(SwitchCases)) {
	        for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) {
	          ExportFromCurrentBlock(SwitchCases[i].CmpLHS);
	          ExportFromCurrentBlock(SwitchCases[i].CmpRHS);
	        }

	        // Emit the branch for this block. The remaining entries stay queued
	        // in SwitchCases.
	        visitSwitchCase(SwitchCases[0], BrMBB);
	        SwitchCases.erase(SwitchCases.begin());
	        return;
	      }

	      // Okay, we decided not to do this, remove any inserted MBB's and clear
	      // SwitchCases.
	      for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i)
	        FuncInfo.MF->erase(SwitchCases[i].ThisBB);

	      SwitchCases.clear();
	    }
	  }

	  // Create a CaseBlock record representing this branch.
	  CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
	               nullptr, Succ0MBB, Succ1MBB, BrMBB);

	  // Use visitSwitchCase to actually insert the fast branch sequence for this
	  // cond branch.
	  visitSwitchCase(CB, BrMBB);
	}

	/// visitSwitchCase - Emit the code for one CaseBlock, i.e. a single node of
	/// the binary search tree produced when lowering a switch (also used for
	/// plain conditional branches): compute the condition, record the successor
	/// edges of SwitchBB and emit a BRCOND to the true block followed by a BR to
	/// the false block. CB.TrueBB and CB.FalseBB are swapped in place when the
	/// condition is inverted for fall-through.
	void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
	                                          MachineBasicBlock *SwitchBB) {
	  SDValue LHSVal = getValue(CB.CmpLHS);
	  SDLoc Loc = getCurSDLoc();
	  SDValue Cond;

	  // Build the setcc now.
	  if (CB.CmpMHS) {
	    // Range check: CmpLHS <= CmpMHS <= CmpRHS with constant bounds.
	    assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");

	    const APInt &Lo = cast<ConstantInt>(CB.CmpLHS)->getValue();
	    const APInt &Hi = cast<ConstantInt>(CB.CmpRHS)->getValue();

	    SDValue MidVal = getValue(CB.CmpMHS);
	    EVT MidVT = MidVal.getValueType();

	    if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
	      // The lower bound is the signed minimum, so only the upper bound needs
	      // to be tested.
	      Cond = DAG.getSetCC(Loc, MVT::i1, MidVal,
	                          DAG.getConstant(Hi, Loc, MidVT), ISD::SETLE);
	    } else {
	      // Rebase to the lower bound and do a single unsigned comparison.
	      SDValue Rebased = DAG.getNode(ISD::SUB, Loc, MidVT, MidVal,
	                                    DAG.getConstant(Lo, Loc, MidVT));
	      Cond = DAG.getSetCC(Loc, MVT::i1, Rebased,
	                          DAG.getConstant(Hi - Lo, Loc, MidVT), ISD::SETULE);
	    }
	  } else if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
	             CB.CC == ISD::SETEQ) {
	    // "(X == true)" is just X; a common product of branch lowering.
	    Cond = LHSVal;
	  } else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
	             CB.CC == ISD::SETEQ) {
	    // "(X == false)" is !X, expressed as X xor 1.
	    EVT LHSVT = LHSVal.getValueType();
	    Cond = DAG.getNode(ISD::XOR, Loc, LHSVT, LHSVal,
	                       DAG.getConstant(1, Loc, LHSVT));
	  } else {
	    Cond = DAG.getSetCC(Loc, MVT::i1, LHSVal, getValue(CB.CmpRHS), CB.CC);
	  }

	  // Update successor info. TrueBB and FalseBB are always different unless the
	  // incoming IR is degenerate, which only happens when running llc on weird
	  // IR; in that case the edge is added once.
	  addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
	  if (CB.TrueBB != CB.FalseBB)
	    addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb);
	  SwitchBB->normalizeSuccProbs();

	  // When the true block is the next block in layout, flip the condition and
	  // swap the targets so that we can fall through to it.
	  if (CB.TrueBB == NextBlock(SwitchBB)) {
	    std::swap(CB.TrueBB, CB.FalseBB);
	    EVT CondVT = Cond.getValueType();
	    Cond = DAG.getNode(ISD::XOR, Loc, CondVT, Cond,
	                       DAG.getConstant(1, Loc, CondVT));
	  }

	  SDValue CondBr = DAG.getNode(ISD::BRCOND, Loc, MVT::Other, getControlRoot(),
	                               Cond, DAG.getBasicBlock(CB.TrueBB));

	  // Always add the explicit branch to the false block, even if it is a fall
	  // through: this makes it easier to do DAG optimizations which require
	  // inverting the branch condition.
	  DAG.setRoot(DAG.getNode(ISD::BR, Loc, MVT::Other, CondBr,
	                          DAG.getBasicBlock(CB.FalseBB)));
	}

	/// visitJumpTable - Emit the BR_JT node for JT in the current MBB. The index
	/// is read back from the virtual register JT.Reg, which must already have
	/// been assigned (asserted to differ from -1U).
	void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
	  assert(JT.Reg != -1U && "Should lower JT Header first!");

	  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  // Fetch the table index from its register, then branch through the table.
	  SDValue IndexVal = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
	                                        JT.Reg, PtrVT);
	  SDValue TableNode = DAG.getJumpTable(JT.JTI, PtrVT);
	  DAG.setRoot(DAG.getNode(ISD::BR_JT, getCurSDLoc(), MVT::Other,
	                          IndexVal.getValue(1), TableNode, IndexVal));
	}

	/// visitJumpTableHeader - Emit the code that turns the switched-on value
	/// into a jump table index: the rebased value is stored in a new virtual
	/// register (recorded in JT.Reg) and a range check branches to JT.Default
	/// when the value lies outside [JTH.First, JTH.Last].
	void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
	                                               JumpTableHeader &JTH,
	                                               MachineBasicBlock *SwitchBB) {
	  SDLoc Loc = getCurSDLoc();

	  // Rebase the switch value so that the smallest case maps to zero. Values
	  // above (largest - smallest), compared unsigned, go to the default block.
	  SDValue SwitchVal = getValue(JTH.SValue);
	  EVT SwitchVT = SwitchVal.getValueType();
	  SDValue Rebased = DAG.getNode(ISD::SUB, Loc, SwitchVT, SwitchVal,
	                                DAG.getConstant(JTH.First, Loc, SwitchVT));

	  // The rebased value is consumed as a table index in a later basic block, so
	  // it has to live in a virtual register of pointer type. It may be narrower
	  // or wider than a pointer, hence the zero-extension or truncation.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	  SDValue TableIndex = DAG.getZExtOrTrunc(Rebased, Loc, PtrVT);

	  unsigned IndexReg = FuncInfo.CreateReg(PtrVT);
	  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), Loc, IndexReg,
	                                    TableIndex);
	  JT.Reg = IndexReg;

	  // Range check on the (un-extended) rebased value: branch to the default
	  // block of the switch when it exceeds the largest case.
	  SDValue OutOfRange = DAG.getSetCC(
	      Loc,
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                             Rebased.getValueType()),
	      Rebased, DAG.getConstant(JTH.Last - JTH.First, Loc, SwitchVT),
	      ISD::SETUGT);

	  SDValue Terminator = DAG.getNode(ISD::BRCOND, Loc, MVT::Other, CopyTo,
	                                   OutOfRange, DAG.getBasicBlock(JT.Default));

	  // No explicit branch is needed when the jump table block comes next.
	  if (JT.MBB != NextBlock(SwitchBB))
	    Terminator = DAG.getNode(ISD::BR, Loc, MVT::Other, Terminator,
	                             DAG.getBasicBlock(JT.MBB));

	  DAG.setRoot(Terminator);
	}

	/// Create a LOAD_STACK_GUARD node, and let it carry the target specific global
	/// variable if there exists one.
	///
	/// \param DAG   DAG in which the machine node is created.
	/// \param DL    Source location attached to the node.
	/// \param Chain Input chain operand of the node.
	/// \returns Result 0 of the new LOAD_STACK_GUARD node, of pointer type.
	static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
	                                 SDValue &Chain) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	  MachineFunction &MF = DAG.getMachineFunction();
	  Value *Global = TLI.getSDagStackGuard(*MF.getFunction()->getParent());
	  MachineSDNode *Node =
	      DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain);
	  if (Global) {
	    // Describe the access to the guard variable with a memory operand:
	    // a pointer-sized, invariant, dereferenceable load.
	    MachinePointerInfo MPInfo(Global);
	    MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
	    auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
	                 MachineMemOperand::MODereferenceable;
	    *MemRefs = MF.getMachineMemOperand(MPInfo, Flags, PtrTy.getSizeInBits() / 8,
	                                       DAG.getEVTAlignment(PtrTy));
	    Node->setMemRefs(MemRefs, MemRefs + 1);
	  }
	  return SDValue(Node, 0);
	}

	/// Codegen a new tail for a stack protector check ParentMBB which has had its
	/// tail spliced into a stack protector check success bb.
	///
	/// The saved guard is loaded from the stack protector frame slot. If the
	/// target supplies a guard check function, a call to it with the loaded
	/// value becomes the block's tail. Otherwise the reference guard is loaded,
	/// compared against the saved copy, and a conditional branch to the failure
	/// block followed by a branch to the success block is emitted.
	///
	/// For a high level explanation of how this fits into the stack protector
	/// generation see the comment on the declaration of class
	/// StackProtectorDescriptor.
	void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
	                                                  MachineBasicBlock *ParentBB) {

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());

	  // Find the frame slot that holds the saved guard value.
	  MachineFrameInfo &MFI = ParentBB->getParent()->getFrameInfo();
	  int GuardSlotFI = MFI.getStackProtectorIndex();

	  SDLoc Loc = getCurSDLoc();
	  SDValue SlotAddr = DAG.getFrameIndex(GuardSlotFI, PtrTy);
	  const Module &M = *ParentBB->getParent()->getFunction()->getParent();
	  unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext()));

	  // Volatile load of the guard copy stored in the stack slot.
	  SDValue SlotVal = DAG.getLoad(
	      PtrTy, Loc, DAG.getEntryNode(), SlotAddr,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), GuardSlotFI),
	      Align, MachineMemOperand::MOVolatile);

	  // A non-null guard check function means the target validates the guard
	  // itself; nullptr means the instrumentation is inlined below.
	  if (const Value *GuardCheck = TLI.getSSPStackGuardCheck(M)) {
	    // Call the check function with the content of the guard slot as its
	    // single argument.
	    auto *CheckFn = cast<Function>(GuardCheck);
	    FunctionType *CheckFnTy = CheckFn->getFunctionType();
	    assert(CheckFnTy->getNumParams() == 1 && "Invalid function signature");

	    TargetLowering::ArgListEntry SlotArg;
	    SlotArg.Node = SlotVal;
	    SlotArg.Ty = CheckFnTy->getParamType(0);
	    if (CheckFn->hasAttribute(1, Attribute::AttrKind::InReg))
	      SlotArg.IsInReg = true;
	    TargetLowering::ArgListTy Args;
	    Args.push_back(SlotArg);

	    TargetLowering::CallLoweringInfo CLI(DAG);
	    CLI.setDebugLoc(getCurSDLoc())
	        .setChain(DAG.getEntryNode())
	        .setCallee(CheckFn->getCallingConv(), CheckFnTy->getReturnType(),
	                   getValue(GuardCheck), std::move(Args));

	    // The chain produced by the call becomes the new root.
	    DAG.setRoot(TLI.LowerCallTo(CLI).second);
	    return;
	  }

	  // Obtain the reference guard value: through a LOAD_STACK_GUARD node when
	  // useLoadStackGuardNode returns true, otherwise through a volatile load of
	  // the IR-level guard variable.
	  SDValue Chain = DAG.getEntryNode();
	  SDValue Guard;
	  if (TLI.useLoadStackGuardNode()) {
	    Guard = getLoadStackGuard(DAG, Loc, Chain);
	  } else {
	    const Value *IRGuard = TLI.getSDagStackGuard(M);
	    SDValue GuardAddr = getValue(IRGuard);

	    Guard = DAG.getLoad(PtrTy, Loc, Chain, GuardAddr,
	                        MachinePointerInfo(IRGuard, 0), Align,
	                        MachineMemOperand::MOVolatile);
	  }

	  // Compare by subtracting and testing the difference against zero.
	  EVT GuardVT = Guard.getValueType();
	  SDValue Diff = DAG.getNode(ISD::SUB, Loc, GuardVT, Guard, SlotVal);

	  SDValue Mismatch = DAG.getSetCC(
	      Loc,
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                             Diff.getValueType()),
	      Diff, DAG.getConstant(0, Loc, GuardVT), ISD::SETNE);

	  // A non-zero difference means guard and stack slot disagree: branch to the
	  // failure MBB. Otherwise continue to the success MBB.
	  SDValue FailBr = DAG.getNode(ISD::BRCOND, Loc, MVT::Other,
	                               SlotVal.getOperand(0), Mismatch,
	                               DAG.getBasicBlock(SPD.getFailureMBB()));
	  DAG.setRoot(DAG.getNode(ISD::BR, Loc, MVT::Other, FailBr,
	                          DAG.getBasicBlock(SPD.getSuccessMBB())));
	}

	/// Codegen the failure basic block for a stack protector check.
	///
	/// The block consists solely of the RTLIB::STACKPROTECTOR_CHECK_FAIL library
	/// call (void result, no arguments); the call's output chain becomes the DAG
	/// root. SPD is not consulted.
	///
	/// For a high level explanation of how this fits into the stack protector
	/// generation see the comment on the declaration of class
	/// StackProtectorDescriptor.
	void
	SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  auto CallResult =
	      TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
	                      None, false, getCurSDLoc(), false, false);
	  // Only the chain half of the (value, chain) pair is needed.
	  DAG.setRoot(CallResult.second);
	}

	/// visitBitTestHeader - Emit the code that produces the value used by the
	/// "bit tests" of B: the switch value minus B.First is copied into a new
	/// virtual register (recorded in B.Reg / B.RegVT) and a range check branches
	/// to B.Default when it exceeds B.Range.
	void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
	                                             MachineBasicBlock *SwitchBB) {
	  SDLoc Loc = getCurSDLoc();

	  // Rebase the switch value to the minimum case value.
	  SDValue SwitchVal = getValue(B.SValue);
	  EVT VT = SwitchVal.getValueType();
	  SDValue Rebased = DAG.getNode(ISD::SUB, Loc, VT, SwitchVal,
	                                DAG.getConstant(B.First, Loc, VT));

	  // Range check, performed on the rebased value in its original type.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue OutOfRange = DAG.getSetCC(
	      Loc,
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                             Rebased.getValueType()),
	      Rebased, DAG.getConstant(B.Range, Loc, VT), ISD::SETUGT);

	  // Decide the type of the test operands. The pointer type is used when VT
	  // is not legal or when some case mask does not fit into VT; case ranges
	  // are encoded as masks, and the pointer type is guaranteed to hold them.
	  bool UsePtrType = !TLI.isTypeLegal(VT);
	  for (unsigned Idx = 0, NumCases = B.Cases.size();
	       !UsePtrType && Idx != NumCases; ++Idx)
	    UsePtrType = !isUIntN(VT.getSizeInBits(), B.Cases[Idx].Mask);

	  if (UsePtrType) {
	    VT = TLI.getPointerTy(DAG.getDataLayout());
	    Rebased = DAG.getZExtOrTrunc(Rebased, Loc, VT);
	  }

	  // Park the rebased value in a fresh virtual register for the test blocks.
	  B.RegVT = VT.getSimpleVT();
	  B.Reg = FuncInfo.CreateReg(B.RegVT);
	  SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), Loc, B.Reg, Rebased);

	  MachineBasicBlock *FirstTestMBB = B.Cases[0].ThisBB;

	  addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
	  addSuccessorWithProb(SwitchBB, FirstTestMBB, B.Prob);
	  SwitchBB->normalizeSuccProbs();

	  SDValue Terminator = DAG.getNode(ISD::BRCOND, Loc, MVT::Other, CopyTo,
	                                   OutOfRange, DAG.getBasicBlock(B.Default));

	  // No explicit branch is needed when the first test block comes next.
	  if (FirstTestMBB != NextBlock(SwitchBB))
	    Terminator = DAG.getNode(ISD::BR, Loc, MVT::Other, Terminator,
	                             DAG.getBasicBlock(FirstTestMBB));

	  DAG.setRoot(Terminator);
	}

	/// visitBitTestCase - Emit one "bit test": read the value held in Reg, test
	/// it against B.Mask, branch to B.TargetBB when the test succeeds and fall
	/// through (or branch) to NextMBB otherwise.
	void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
	                                           MachineBasicBlock* NextMBB,
	                                           BranchProbability BranchProbToNext,
	                                           unsigned Reg,
	                                           BitTestCase &B,
	                                           MachineBasicBlock *SwitchBB) {
	  SDLoc Loc = getCurSDLoc();
	  MVT VT = BB.RegVT;
	  SDValue ShiftAmt = DAG.getCopyFromReg(getControlRoot(), Loc, Reg, VT);
	  unsigned SetBits = countPopulation(B.Mask);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  // All three variants below build a setcc with this same result type.
	  EVT CmpVT =
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);

	  SDValue Cmp;
	  if (SetBits == 1) {
	    // Testing for a single bit; just compare the shift count with what it
	    // would need to be to shift a 1 bit in that position.
	    Cmp = DAG.getSetCC(Loc, CmpVT, ShiftAmt,
	                       DAG.getConstant(countTrailingZeros(B.Mask), Loc, VT),
	                       ISD::SETEQ);
	  } else if (SetBits == BB.Range) {
	    // There is only one zero bit in the range, test for it directly.
	    Cmp = DAG.getSetCC(Loc, CmpVT, ShiftAmt,
	                       DAG.getConstant(countTrailingOnes(B.Mask), Loc, VT),
	                       ISD::SETNE);
	  } else {
	    // General case: (1 << ShiftAmt) & Mask != 0.
	    SDValue OneBit = DAG.getNode(ISD::SHL, Loc, VT,
	                                 DAG.getConstant(1, Loc, VT), ShiftAmt);
	    SDValue Masked = DAG.getNode(ISD::AND, Loc, VT, OneBit,
	                                 DAG.getConstant(B.Mask, Loc, VT));
	    Cmp = DAG.getSetCC(Loc, CmpVT, Masked, DAG.getConstant(0, Loc, VT),
	                       ISD::SETNE);
	  }

	  // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb.
	  addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb);
	  // The branch probability from SwitchBB to NextMBB is BranchProbToNext.
	  addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext);
	  // B.ExtraProb and BranchProbToNext are relative probabilities (they act
	  // like weights) and need not sum to one, so normalize them.
	  SwitchBB->normalizeSuccProbs();

	  SDValue Branch = DAG.getNode(ISD::BRCOND, Loc, MVT::Other, getControlRoot(),
	                               Cmp, DAG.getBasicBlock(B.TargetBB));

	  // Avoid emitting unnecessary branches to the next block.
	  if (NextMBB != NextBlock(SwitchBB)) {
	    Branch = DAG.getNode(ISD::BR, Loc, MVT::Other, Branch,
	                         DAG.getBasicBlock(NextMBB));
	  }

	  DAG.setRoot(Branch);
	}

	void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
	MachineBasicBlock *InvokeMBB = FuncInfo.MBB;

	// Retrieve successors. Look through artificial IR level blocks like
	// catchswitch for successors.
	MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
	const BasicBlock *EHPadBB = I.getSuccessor(1);

	// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
	// have to do anything here to lower funclet bundles.
	assert(!I.hasOperandBundlesOtherThan(
	{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
	"Cannot lower invokes with arbitrary operand bundles yet!");

	const Value *Callee(I.getCalledValue());
	const Function *Fn = dyn_cast<Function>(Callee);
	if (isa<InlineAsm>(Callee))
	visitInlineAsm(&I);
	else if (Fn && Fn->isIntrinsic()) {
	switch (Fn->getIntrinsicID()) {
	default:
	llvm_unreachable("Cannot invoke this intrinsic");
	case Intrinsic::donothing:
	// Ignore invokes to @llvm.donothing: jump directly to the next BB.
	break;
	case Intrinsic::experimental_patchpoint_void:
	case Intrinsic::experimental_patchpoint_i64:
	visitPatchpoint(&I, EHPadBB);
	break;
	case Intrinsic::experimental_gc_statepoint:
	LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
	break;
	}
	} else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
	// Currently we do not lower any intrinsic calls with deopt operand bundles.
	// Eventually we will support lowering the @llvm.experimental.deoptimize
	// intrinsic, and right now there are no plans to support other intrinsics
	// with deopt state.
	LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
	} else {
	LowerCallTo(&I, getValue(Callee), false, EHPadBB);
	}

	// If the value of the invoke is used outside of its defining block, make it
	// available as a virtual register.
	// We already took care of the exported value for the statepoint instruction
	// during call to the LowerStatepoint.
	if (!isStatepoint(I)) {
	CopyToExportRegsIfNeeded(&I);
	}

	SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
	BranchProbabilityInfo *BPI = FuncInfo.BPI;
	BranchProbability EHPadBBProb =
	BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB)
	: BranchProbability::getZero();
	findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests);

	// Update successor info.
	addSuccessorWithProb(InvokeMBB, Return);
	for (auto &UnwindDest : UnwindDests) {
	UnwindDest.first->setIsEHPad();
	addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second);
	}
	InvokeMBB->normalizeSuccProbs();

	// Drop into normal successor.
	DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
	MVT::Other, getControlRoot(),
	DAG.getBasicBlock(Return)));
	}

	// Resume instructions are never expected to reach the DAG builder, so
	// getting here is treated as unreachable.
	void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
	  llvm_unreachable(
	      "SelectionDAGBuilder shouldn't visit resume instructions!");
	}

	void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
	assert(FuncInfo.MBB->isEHPad() &&
	"Call to landingpad not in landing pad!");

	MachineBasicBlock *MBB = FuncInfo.MBB;
	addLandingPadInfo(LP, *MBB);

	// If there aren't registers to copy the values into (e.g., during SjLj
	// exceptions), then don't bother to create these DAG nodes.
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn();
	if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
	TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
	return;

	// If landingpad's return type is token type, we don't create DAG nodes
	// for its exception pointer and selector value. The extraction of exception
	// pointer or selector value from token type landingpads is not currently
	// supported.
	if (LP.getType()->isTokenTy())
	return;

	SmallVector<EVT, 2> ValueVTs;
	SDLoc dl = getCurSDLoc();
	ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs);
	assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");

	// Get the two live-in registers as SDValues. The physregs have already been
	// copied into virtual registers.
	SDValue Ops[2];
	if (FuncInfo.ExceptionPointerVirtReg) {
	Ops[0] = DAG.getZExtOrTrunc(
	DAG.getCopyFromReg(DAG.getEntryNode(), dl,
	FuncInfo.ExceptionPointerVirtReg,
	TLI.getPointerTy(DAG.getDataLayout())),
	dl, ValueVTs[0]);
	} else {
	Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()));
	}
	Ops[1] = DAG.getZExtOrTrunc(
	DAG.getCopyFromReg(DAG.getEntryNode(), dl,
	FuncInfo.ExceptionSelectorVirtReg,
	TLI.getPointerTy(DAG.getDataLayout())),
	dl, ValueVTs[1]);

	// Merge into one.
	SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
	DAG.getVTList(ValueVTs), Ops);
	setValue(&LP, Res);
	}

	// Sort single-case clusters by signed case value, then compact the vector
	// in place by folding each cluster into its predecessor when both target
	// the same block and their values are consecutive.
	void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) {
	#ifndef NDEBUG
	  for (const CaseCluster &Cluster : Clusters)
	    assert(Cluster.Low == Cluster.High && "Input clusters must be single-case");
	#endif

	  std::sort(Clusters.begin(), Clusters.end(),
	            [](const CaseCluster &LHS, const CaseCluster &RHS) {
	              return LHS.Low->getValue().slt(RHS.Low->getValue());
	            });

	  // Merge adjacent clusters with the same destination. Out is the number of
	  // clusters kept so far; In walks over the sorted input.
	  const unsigned NumClusters = Clusters.size();
	  unsigned Out = 0;
	  for (unsigned In = 0; In < NumClusters; ++In) {
	    CaseCluster &Current = Clusters[In];
	    const ConstantInt *Val = Current.Low;
	    MachineBasicBlock *Dest = Current.MBB;

	    const bool MergeWithPrev =
	        Out != 0 && Clusters[Out - 1].MBB == Dest &&
	        (Val->getValue() - Clusters[Out - 1].High->getValue()) == 1;
	    if (MergeWithPrev) {
	      // Same successor and a direct neighbour: extend the previous cluster
	      // and accumulate the probability.
	      Clusters[Out - 1].High = Val;
	      Clusters[Out - 1].Prob += Current.Prob;
	      continue;
	    }

	    // Otherwise keep this cluster, moving it down to the next free slot.
	    std::memmove(&Clusters[Out++], &Clusters[In], sizeof(Clusters[In]));
	  }
	  Clusters.resize(Out);
	}

	// Retarget pending jump-table headers and bit-test parents that still
	// reference block First so that they reference block Last instead.
	void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
	                                           MachineBasicBlock *Last) {
	  // Update JTCases.
	  for (auto &JTCase : JTCases) {
	    if (JTCase.first.HeaderBB == First)
	      JTCase.first.HeaderBB = Last;
	  }

	  // Update BitTestCases.
	  for (auto &BTCase : BitTestCases) {
	    if (BTCase.Parent == First)
	      BTCase.Parent = Last;
	  }
	}

	void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
	MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;

	// Update machine-CFG edges with unique successors.
	SmallSet<BasicBlock*, 32> Done;
	for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
	BasicBlock *BB = I.getSuccessor(i);
	bool Inserted = Done.insert(BB).second;
	if (!Inserted)
	continue;

	MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
	addSuccessorWithProb(IndirectBrMBB, Succ);
	}
	IndirectBrMBB->normalizeSuccProbs();

	DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(),
	MVT::Other, getControlRoot(),
	getValue(I.getAddress())));
	}

	void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
	if (DAG.getTarget().Options.TrapUnreachable)
	DAG.setRoot(
	DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
	}

	void SelectionDAGBuilder::visitFSub(const User &I) {
	// -0.0 - X --> fneg
	Type *Ty = I.getType();
	if (isa<Constant>(I.getOperand(0)) &&
	I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
	SDValue Op2 = getValue(I.getOperand(1));
	setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(),
	Op2.getValueType(), Op2));
	return;
	}

	visitBinary(I, ISD::FSUB);
	}

	/// Checks if the given instruction performs a vector reduction, in which case
	/// we have the freedom to alter the elements in the result as long as the
	/// reduction of them stays unchanged.
	static bool isVectorReductionOp(const User *I) {
	const Instruction *Inst = dyn_cast<Instruction>(I);
	if (!Inst \|\| !Inst->getType()->isVectorTy())
	return false;

	auto OpCode = Inst->getOpcode();
	switch (OpCode) {
	case Instruction::Add:
	case Instruction::Mul:
	case Instruction::And:
	case Instruction::Or:
	case Instruction::Xor:
	break;
	case Instruction::FAdd:
	case Instruction::FMul:
	if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
	if (FPOp->getFastMathFlags().unsafeAlgebra())
	break;
	LLVM_FALLTHROUGH;
	default:
	return false;
	}

	unsigned ElemNum = Inst->getType()->getVectorNumElements();
	unsigned ElemNumToReduce = ElemNum;

	// Do DFS search on the def-use chain from the given instruction. We only
	// allow four kinds of operations during the search until we reach the
	// instruction that extracts the first element from the vector:
	//
	// 1. The reduction operation of the same opcode as the given instruction.
	//
	// 2. PHI node.
	//
	// 3. ShuffleVector instruction together with a reduction operation that
	// does a partial reduction.
	//
	// 4. ExtractElement that extracts the first element from the vector, and we
	// stop searching the def-use chain here.
	//
	// 3 & 4 above perform a reduction on all elements of the vector. We push defs
	// from 1-3 to the stack to continue the DFS. The given instruction is not
	// a reduction operation if we meet any other instructions other than those
	// listed above.

	SmallVector<const User *, 16> UsersToVisit{Inst};
	SmallPtrSet<const User *, 16> Visited;
	bool ReduxExtracted = false;

	while (!UsersToVisit.empty()) {
	auto User = UsersToVisit.back();
	UsersToVisit.pop_back();
	if (!Visited.insert(User).second)
	continue;

	for (const auto &U : User->users()) {
	auto Inst = dyn_cast<Instruction>(U);
	if (!Inst)
	return false;

	if (Inst->getOpcode() == OpCode \|\| isa<PHINode>(U)) {
	if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
	if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
	return false;
	UsersToVisit.push_back(U);
	} else if (const ShuffleVectorInst *ShufInst =
	dyn_cast<ShuffleVectorInst>(U)) {
	// Detect the following pattern: A ShuffleVector instruction together
	// with a reduction that do partial reduction on the first and second
	// ElemNumToReduce / 2 elements, and store the result in
	// ElemNumToReduce / 2 elements in another vector.

	unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
	if (ResultElements < ElemNum)
	return false;

	if (ElemNumToReduce == 1)
	return false;
	if (!isa<UndefValue>(U->getOperand(1)))
	return false;
	for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
	if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
	return false;
	for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
	if (ShufInst->getMaskValue(i) != -1)
	return false;

	// There is only one user of this ShuffleVector instruction, which
	// must be a reduction operation.
	if (!U->hasOneUse())
	return false;

	auto U2 = dyn_cast<Instruction>(*U->user_begin());
	if (!U2 \|\| U2->getOpcode() != OpCode)
	return false;

	// Check operands of the reduction operation.
	if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) \|\|
	(U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
	UsersToVisit.push_back(U2);
	ElemNumToReduce /= 2;
	} else
	return false;
	} else if (isa<ExtractElementInst>(U)) {
	// At this moment we should have reduced all elements in the vector.
	if (ElemNumToReduce != 1)
	return false;

	const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
	if (!Val \|\| Val->getZExtValue() != 0)
	return false;

	ReduxExtracted = true;
	} else
	return false;
	}
	}
	return ReduxExtracted;
	}

	void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));

	bool nuw = false;
	bool nsw = false;
	bool exact = false;
	bool vec_redux = false;
	FastMathFlags FMF;

	if (const OverflowingBinaryOperator *OFBinOp =
	dyn_cast<const OverflowingBinaryOperator>(&I)) {
	nuw = OFBinOp->hasNoUnsignedWrap();
	nsw = OFBinOp->hasNoSignedWrap();
	}
	if (const PossiblyExactOperator *ExactOp =
	dyn_cast<const PossiblyExactOperator>(&I))
	exact = ExactOp->isExact();
	if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
	FMF = FPOp->getFastMathFlags();

	if (isVectorReductionOp(&I)) {
	vec_redux = true;
	DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
	}

	SDNodeFlags Flags;
	Flags.setExact(exact);
	Flags.setNoSignedWrap(nsw);
	Flags.setNoUnsignedWrap(nuw);
	Flags.setVectorReduction(vec_redux);
	Flags.setAllowReciprocal(FMF.allowReciprocal());
	Flags.setAllowContract(FMF.allowContract());
	Flags.setNoInfs(FMF.noInfs());
	Flags.setNoNaNs(FMF.noNaNs());
	Flags.setNoSignedZeros(FMF.noSignedZeros());
	Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());

	SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
	Op1, Op2, Flags);
	setValue(&I, BinNodeValue);
	}

	// Lower a shift instruction to the DAG node Opcode. For scalar shifts the
	// shift amount is first coerced to the target's shift-amount type when
	// that is possible; nuw/nsw/exact flags from the IR are carried over for
	// SRL, SRA and SHL.
	void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
	  SDValue Op1 = getValue(I.getOperand(0));
	  SDValue Op2 = getValue(I.getOperand(1));

	  EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
	      Op2.getValueType(), DAG.getDataLayout());

	  // Coerce the shift amount to the right type if we can.
	  if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
	    unsigned ShiftSize = ShiftTy.getSizeInBits();
	    unsigned Op2Size = Op2.getValueSizeInBits();
	    SDLoc DL = getCurSDLoc();

	    // If the operand is smaller than the shift count type, promote it.
	    if (ShiftSize > Op2Size)
	      Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2);

	    // If the operand is larger than the shift count type but the shift
	    // count type has enough bits to represent any shift value, truncate
	    // it now. This is a common case and it exposes the truncate to
	    // optimization early. (Op2 is unchanged on this path, so the size
	    // computed above is reused.)
	    else if (ShiftSize >= Log2_32_Ceil(Op2Size))
	      Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
	    // Otherwise we'll need to temporarily settle for some other convenient
	    // type. Type legalization will make adjustments once the shiftee is
	    // split.
	    else
	      Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
	  }

	  bool nuw = false;
	  bool nsw = false;
	  bool exact = false;

	  // Only the three plain shift opcodes pick up the IR flags.
	  if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
	    if (const OverflowingBinaryOperator *OFBinOp =
	            dyn_cast<const OverflowingBinaryOperator>(&I)) {
	      nuw = OFBinOp->hasNoUnsignedWrap();
	      nsw = OFBinOp->hasNoSignedWrap();
	    }
	    if (const PossiblyExactOperator *ExactOp =
	            dyn_cast<const PossiblyExactOperator>(&I))
	      exact = ExactOp->isExact();
	  }

	  SDNodeFlags Flags;
	  Flags.setExact(exact);
	  Flags.setNoSignedWrap(nsw);
	  Flags.setNoUnsignedWrap(nuw);
	  SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
	                            Flags);
	  setValue(&I, Res);
	}

	void SelectionDAGBuilder::visitSDiv(const User &I) {
	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));

	SDNodeFlags Flags;
	Flags.setExact(isa<PossiblyExactOperator>(&I) &&
	cast<PossiblyExactOperator>(&I)->isExact());
	setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
	Op2, Flags));
	}

	void SelectionDAGBuilder::visitICmp(const User &I) {
	ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
	if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
	predicate = IC->getPredicate();
	else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
	predicate = ICmpInst::Predicate(IC->getPredicate());
	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));
	ISD::CondCode Opcode = getICmpCondCode(predicate);

	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
	}

	void SelectionDAGBuilder::visitFCmp(const User &I) {
	FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
	if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
	predicate = FC->getPredicate();
	else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
	predicate = FCmpInst::Predicate(FC->getPredicate());
	SDValue Op1 = getValue(I.getOperand(0));
	SDValue Op2 = getValue(I.getOperand(1));
	ISD::CondCode Condition = getFCmpCondCode(predicate);

	// FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them.
	// FIXME: We should propagate the fast-math-flags to the DAG node itself for
	// further optimization, but currently FMF is only applicable to binary nodes.
	if (TM.Options.NoNaNsFPMath)
	Condition = getFCmpCodeWithoutNaN(Condition);
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
	}

	// Check if the condition of the select has one use or two users that are both
	// selects with the same condition.
	static bool hasOnlySelectUsers(const Value *Cond) {
	return all_of(Cond->users(), [](const Value *V) {
	return isa<SelectInst>(V);
	});
	}

	void SelectionDAGBuilder::visitSelect(const User &I) {
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
	ValueVTs);
	unsigned NumValues = ValueVTs.size();
	if (NumValues == 0) return;

	SmallVector<SDValue, 4> Values(NumValues);
	SDValue Cond = getValue(I.getOperand(0));
	SDValue LHSVal = getValue(I.getOperand(1));
	SDValue RHSVal = getValue(I.getOperand(2));
	auto BaseOps = {Cond};
	ISD::NodeType OpCode = Cond.getValueType().isVector() ?
	ISD::VSELECT : ISD::SELECT;

	// Min/max matching is only viable if all output VTs are the same.
	if (std::equal(ValueVTs.begin(), ValueVTs.end(), ValueVTs.begin())) {
	EVT VT = ValueVTs[0];
	LLVMContext &Ctx = *DAG.getContext();
	auto &TLI = DAG.getTargetLoweringInfo();

	// We care about the legality of the operation after it has been type
	// legalized.
	while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal &&
	VT != TLI.getTypeToTransformTo(Ctx, VT))
	VT = TLI.getTypeToTransformTo(Ctx, VT);

	// If the vselect is legal, assume we want to leave this as a vector setcc +
	// vselect. Otherwise, if this is going to be scalarized, we want to see if
	// min/max is legal on the scalar type.
	bool UseScalarMinMax = VT.isVector() &&
	!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT);

	Value LHS, RHS;
	auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
	ISD::NodeType Opc = ISD::DELETED_NODE;
	switch (SPR.Flavor) {
	case SPF_UMAX: Opc = ISD::UMAX; break;
	case SPF_UMIN: Opc = ISD::UMIN; break;
	case SPF_SMAX: Opc = ISD::SMAX; break;
	case SPF_SMIN: Opc = ISD::SMIN; break;
	case SPF_FMINNUM:
	switch (SPR.NaNBehavior) {
	case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
	case SPNB_RETURNS_NAN: Opc = ISD::FMINNAN; break;
	case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
	case SPNB_RETURNS_ANY: {
	if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
	Opc = ISD::FMINNUM;
	else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT))
	Opc = ISD::FMINNAN;
	else if (UseScalarMinMax)
	Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
	ISD::FMINNUM : ISD::FMINNAN;
	break;
	}
	}
	break;
	case SPF_FMAXNUM:
	switch (SPR.NaNBehavior) {
	case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
	case SPNB_RETURNS_NAN: Opc = ISD::FMAXNAN; break;
	case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
	case SPNB_RETURNS_ANY:

	if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
	Opc = ISD::FMAXNUM;
	else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT))
	Opc = ISD::FMAXNAN;
	else if (UseScalarMinMax)
	Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
	ISD::FMAXNUM : ISD::FMAXNAN;
	break;
	}
	break;
	default: break;
	}

	if (Opc != ISD::DELETED_NODE &&
	(TLI.isOperationLegalOrCustom(Opc, VT) \|\|
	(UseScalarMinMax &&
	TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) &&
	// If the underlying comparison instruction is used by any other
	// instruction, the consumed instructions won't be destroyed, so it is
	// not profitable to convert to a min/max.
	hasOnlySelectUsers(cast<SelectInst>(I).getCondition())) {
	OpCode = Opc;
	LHSVal = getValue(LHS);
	RHSVal = getValue(RHS);
	BaseOps = {};
	}
	}

	for (unsigned i = 0; i != NumValues; ++i) {
	SmallVector<SDValue, 3> Ops(BaseOps.begin(), BaseOps.end());
	Ops.push_back(SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
	Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i));
	Values[i] = DAG.getNode(OpCode, getCurSDLoc(),
	LHSVal.getNode()->getValueType(LHSVal.getResNo()+i),
	Ops);
	}

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	DAG.getVTList(ValueVTs), Values));
	}

	void SelectionDAGBuilder::visitTrunc(const User &I) {
	// TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitZExt(const User &I) {
	// ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
	// ZExt also can't be a cast to bool for same reason. So, nothing much to do
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitSExt(const User &I) {
	// SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
	// SExt also can't be a cast to bool for same reason. So, nothing much to do
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitFPTrunc(const User &I) {
	// FPTrunc is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	SDLoc dl = getCurSDLoc();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
	DAG.getTargetConstant(
	0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
	}

	void SelectionDAGBuilder::visitFPExt(const User &I) {
	// FPExt is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitFPToUI(const User &I) {
	// FPToUI is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitFPToSI(const User &I) {
	// FPToSI is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitUIToFP(const User &I) {
	// UIToFP is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitSIToFP(const User &I) {
	// SIToFP is never a no-op cast, no need to check
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
	}

	void SelectionDAGBuilder::visitPtrToInt(const User &I) {
	// What to do depends on the size of the integer and the size of the pointer.
	// We can either truncate, zero extend, or no-op, accordingly.
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
	}

	void SelectionDAGBuilder::visitIntToPtr(const User &I) {
	// What to do depends on the size of the integer and the size of the pointer.
	// We can either truncate, zero extend, or no-op, accordingly.
	SDValue N = getValue(I.getOperand(0));
	EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType());
	setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
	}

	// Lower a bitcast. Three outcomes are possible:
	//  - the value types differ: emit a BITCAST node;
	//  - the types match and the IR operand is a literal ConstantInt: emit an
	//    opaque, non-target constant of the destination type;
	//  - otherwise: reuse the operand's value unchanged.
	void SelectionDAGBuilder::visitBitCast(const User &I) {
	  SDValue N = getValue(I.getOperand(0));
	  SDLoc dl = getCurSDLoc();
	  EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	                                                        I.getType());

	  // BitCast assures us that source and destination are the same size so this is
	  // either a BITCAST or a no-op.
	  if (DestVT != N.getValueType())
	    setValue(&I, DAG.getNode(ISD::BITCAST, dl,
	                             DestVT, N)); // convert types.
	  // Check if the original LLVM IR Operand was a ConstantInt, because getValue()
	  // might fold any kind of constant expression to an integer constant and that
	  // is not what we are looking for. Only recognize a bitcast of a genuine
	  // constant integer as an opaque constant.
	  else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
	    setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
	                                 /*isOpaque*/true));
	  else
	    setValue(&I, N);            // noop cast.
	}

	void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const Value *SV = I.getOperand(0);
	SDValue N = getValue(SV);
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());

	unsigned SrcAS = SV->getType()->getPointerAddressSpace();
	unsigned DestAS = I.getType()->getPointerAddressSpace();

	if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
	N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);

	setValue(&I, N);
	}

	void SelectionDAGBuilder::visitInsertElement(const User &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue InVec = getValue(I.getOperand(0));
	SDValue InVal = getValue(I.getOperand(1));
	SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(),
	TLI.getVectorIdxTy(DAG.getDataLayout()));
	setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
	TLI.getValueType(DAG.getDataLayout(), I.getType()),
	InVec, InVal, InIdx));
	}

	void SelectionDAGBuilder::visitExtractElement(const User &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue InVec = getValue(I.getOperand(0));
	SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(),
	TLI.getVectorIdxTy(DAG.getDataLayout()));
	setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
	TLI.getValueType(DAG.getDataLayout(), I.getType()),
	InVec, InIdx));
	}

	void SelectionDAGBuilder::visitShuffleVector(const User &I) {
	SDValue Src1 = getValue(I.getOperand(0));
	SDValue Src2 = getValue(I.getOperand(1));
	SDLoc DL = getCurSDLoc();

	SmallVector<int, 8> Mask;
	ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
	unsigned MaskNumElts = Mask.size();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	EVT SrcVT = Src1.getValueType();
	unsigned SrcNumElts = SrcVT.getVectorNumElements();

	if (SrcNumElts == MaskNumElts) {
	setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask));
	return;
	}

	// Normalize the shuffle vector since mask and vector length don't match.
	if (SrcNumElts < MaskNumElts) {
	// Mask is longer than the source vectors. We can use concatenate vector to
	// make the mask and vectors lengths match.

	if (MaskNumElts % SrcNumElts == 0) {
	// Mask length is a multiple of the source vector length.
	// Check if the shuffle is some kind of concatenation of the input
	// vectors.
	unsigned NumConcat = MaskNumElts / SrcNumElts;
	bool IsConcat = true;
	SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
	for (unsigned i = 0; i != MaskNumElts; ++i) {
	int Idx = Mask[i];
	if (Idx < 0)
	continue;
	// Ensure the indices in each SrcVT sized piece are sequential and that
	// the same source is used for the whole piece.
	if ((Idx % SrcNumElts != (i % SrcNumElts)) \|\|
	(ConcatSrcs[i / SrcNumElts] >= 0 &&
	ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) {
	IsConcat = false;
	break;
	}
	// Remember which source this index came from.
	ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
	}

	// The shuffle is concatenating multiple vectors together. Just emit
	// a CONCAT_VECTORS operation.
	if (IsConcat) {
	SmallVector<SDValue, 8> ConcatOps;
	for (auto Src : ConcatSrcs) {
	if (Src < 0)
	ConcatOps.push_back(DAG.getUNDEF(SrcVT));
	else if (Src == 0)
	ConcatOps.push_back(Src1);
	else
	ConcatOps.push_back(Src2);
	}
	setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps));
	return;
	}
	}

	unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
	unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
	EVT PaddedVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
	PaddedMaskNumElts);

	// Pad both vectors with undefs to make them the same length as the mask.
	SDValue UndefVal = DAG.getUNDEF(SrcVT);

	SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
	SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
	MOps1[0] = Src1;
	MOps2[0] = Src2;

	Src1 = Src1.isUndef()
	? DAG.getUNDEF(PaddedVT)
	: DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps1);
	Src2 = Src2.isUndef()
	? DAG.getUNDEF(PaddedVT)
	: DAG.getNode(ISD::CONCAT_VECTORS, DL, PaddedVT, MOps2);

	// Readjust mask for new input vector length.
	SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
	for (unsigned i = 0; i != MaskNumElts; ++i) {
	int Idx = Mask[i];
	if (Idx >= (int)SrcNumElts)
	Idx -= SrcNumElts - PaddedMaskNumElts;
	MappedOps[i] = Idx;
	}

	SDValue Result = DAG.getVectorShuffle(PaddedVT, DL, Src1, Src2, MappedOps);

	// If the concatenated vector was padded, extract a subvector with the
	// correct number of elements.
	if (MaskNumElts != PaddedMaskNumElts)
	Result = DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
	DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));

	setValue(&I, Result);
	return;
	}

	if (SrcNumElts > MaskNumElts) {
	// Analyze the access pattern of the vector to see if we can extract
	// two subvectors and do the shuffle.
	int StartIdx[2] = { -1, -1 }; // StartIdx to extract from
	bool CanExtract = true;
	for (int Idx : Mask) {
	unsigned Input = 0;
	if (Idx < 0)
	continue;

	if (Idx >= (int)SrcNumElts) {
	Input = 1;
	Idx -= SrcNumElts;
	}

	// If all the indices come from the same MaskNumElts sized portion of
	// the sources we can use extract. Also make sure the extract wouldn't
	// extract past the end of the source.
	int NewStartIdx = alignDown(Idx, MaskNumElts);
	if (NewStartIdx + MaskNumElts > SrcNumElts \|\|
	(StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
	CanExtract = false;
	// Make sure we always update StartIdx as we use it to track if all
	// elements are undef.
	StartIdx[Input] = NewStartIdx;
	}

	if (StartIdx[0] < 0 && StartIdx[1] < 0) {
	setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
	return;
	}
	if (CanExtract) {
	// Extract appropriate subvector and generate a vector shuffle
	for (unsigned Input = 0; Input < 2; ++Input) {
	SDValue &Src = Input == 0 ? Src1 : Src2;
	if (StartIdx[Input] < 0)
	Src = DAG.getUNDEF(VT);
	else {
	Src = DAG.getNode(
	ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
	DAG.getConstant(StartIdx[Input], DL,
	TLI.getVectorIdxTy(DAG.getDataLayout())));
	}
	}

	// Calculate new mask.
	SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end());
	for (int &Idx : MappedOps) {
	if (Idx >= (int)SrcNumElts)
	Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
	else if (Idx >= 0)
	Idx -= StartIdx[0];
	}

	setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
	return;
	}
	}

	// We can't use either concat vectors or extract subvectors so fall back to
	// replacing the shuffle with extract and build vector.
	// to insert and build vector.
	EVT EltVT = VT.getVectorElementType();
	EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
	SmallVector<SDValue,8> Ops;
	for (int Idx : Mask) {
	SDValue Res;

	if (Idx < 0) {
	Res = DAG.getUNDEF(EltVT);
	} else {
	SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
	if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;

	Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
	EltVT, Src, DAG.getConstant(Idx, DL, IdxVT));
	}

	Ops.push_back(Res);
	}

	setValue(&I, DAG.getBuildVector(VT, DL, Ops));
	}

	void SelectionDAGBuilder::visitInsertValue(const User &I) {
	ArrayRef<unsigned> Indices;
	if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(&I))
	Indices = IV->getIndices();
	else
	Indices = cast<ConstantExpr>(&I)->getIndices();

	const Value *Op0 = I.getOperand(0);
	const Value *Op1 = I.getOperand(1);
	Type *AggTy = I.getType();
	Type *ValTy = Op1->getType();
	bool IntoUndef = isa<UndefValue>(Op0);
	bool FromUndef = isa<UndefValue>(Op1);

	unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SmallVector<EVT, 4> AggValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs);
	SmallVector<EVT, 4> ValValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);

	unsigned NumAggValues = AggValueVTs.size();
	unsigned NumValValues = ValValueVTs.size();
	SmallVector<SDValue, 4> Values(NumAggValues);

	// Ignore an insertvalue that produces an empty object
	if (!NumAggValues) {
	setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
	return;
	}

	SDValue Agg = getValue(Op0);
	unsigned i = 0;
	// Copy the beginning value(s) from the original aggregate.
	for (; i != LinearIndex; ++i)
	Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
	SDValue(Agg.getNode(), Agg.getResNo() + i);
	// Copy values from the inserted value(s).
	if (NumValValues) {
	SDValue Val = getValue(Op1);
	for (; i != LinearIndex + NumValValues; ++i)
	Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
	SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
	}
	// Copy remaining value(s) from the original aggregate.
	for (; i != NumAggValues; ++i)
	Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
	SDValue(Agg.getNode(), Agg.getResNo() + i);

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	DAG.getVTList(AggValueVTs), Values));
	}

	void SelectionDAGBuilder::visitExtractValue(const User &I) {
	ArrayRef<unsigned> Indices;
	if (const ExtractValueInst *EV = dyn_cast<ExtractValueInst>(&I))
	Indices = EV->getIndices();
	else
	Indices = cast<ConstantExpr>(&I)->getIndices();

	const Value *Op0 = I.getOperand(0);
	Type *AggTy = Op0->getType();
	Type *ValTy = I.getType();
	bool OutOfUndef = isa<UndefValue>(Op0);

	unsigned LinearIndex = ComputeLinearIndex(AggTy, Indices);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SmallVector<EVT, 4> ValValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);

	unsigned NumValValues = ValValueVTs.size();

	// Ignore a extractvalue that produces an empty object
	if (!NumValValues) {
	setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
	return;
	}

	SmallVector<SDValue, 4> Values(NumValValues);

	SDValue Agg = getValue(Op0);
	// Copy out the selected value(s).
	for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
	Values[i - LinearIndex] =
	OutOfUndef ?
	DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
	SDValue(Agg.getNode(), Agg.getResNo() + i);

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
	DAG.getVTList(ValValueVTs), Values));
	}

	void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
	Value *Op0 = I.getOperand(0);
	// Note that the pointer operand may be a vector of pointers. Take the scalar
	// element which holds a pointer.
	unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
	SDValue N = getValue(Op0);
	SDLoc dl = getCurSDLoc();

	// Normalize Vector GEP - all scalar operands should be converted to the
	// splat vector.
	unsigned VectorWidth = I.getType()->isVectorTy() ?
	cast<VectorType>(I.getType())->getVectorNumElements() : 0;

	if (VectorWidth && !N.getValueType().isVector()) {
	LLVMContext &Context = *DAG.getContext();
	EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth);
	N = DAG.getSplatBuildVector(VT, dl, N);
	}

	for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
	GTI != E; ++GTI) {
	const Value *Idx = GTI.getOperand();
	if (StructType *StTy = GTI.getStructTypeOrNull()) {
	unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
	if (Field) {
	// N = N + Offset
	uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);

	// In an inbounds GEP with an offset that is nonnegative even when
	// interpreted as signed, assume there is no unsigned overflow.
	SDNodeFlags Flags;
	if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
	Flags.setNoUnsignedWrap(true);

	N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
	DAG.getConstant(Offset, dl, N.getValueType()), Flags);
	}
	} else {
	MVT PtrTy =
	DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout(), AS);
	unsigned PtrSize = PtrTy.getSizeInBits();
	APInt ElementSize(PtrSize, DL->getTypeAllocSize(GTI.getIndexedType()));

	// If this is a scalar constant or a splat vector of constants,
	// handle it quickly.
	const auto *CI = dyn_cast<ConstantInt>(Idx);
	if (!CI && isa<ConstantDataVector>(Idx) &&
	cast<ConstantDataVector>(Idx)->getSplatValue())
	CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());

	if (CI) {
	if (CI->isZero())
	continue;
	APInt Offs = ElementSize * CI->getValue().sextOrTrunc(PtrSize);
	LLVMContext &Context = *DAG.getContext();
	SDValue OffsVal = VectorWidth ?
	DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, PtrTy, VectorWidth)) :
	DAG.getConstant(Offs, dl, PtrTy);

	// In an inbouds GEP with an offset that is nonnegative even when
	// interpreted as signed, assume there is no unsigned overflow.
	SDNodeFlags Flags;
	if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
	Flags.setNoUnsignedWrap(true);

	N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, Flags);
	continue;
	}

	// N = N + Idx * ElementSize;
	SDValue IdxN = getValue(Idx);

	if (!IdxN.getValueType().isVector() && VectorWidth) {
	EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth);
	IdxN = DAG.getSplatBuildVector(VT, dl, IdxN);
	}

	// If the index is smaller or larger than intptr_t, truncate or extend
	// it.
	IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());

	// If this is a multiply by a power of two, turn it into a shl
	// immediately. This is a very common case.
	if (ElementSize != 1) {
	if (ElementSize.isPowerOf2()) {
	unsigned Amt = ElementSize.logBase2();
	IdxN = DAG.getNode(ISD::SHL, dl,
	N.getValueType(), IdxN,
	DAG.getConstant(Amt, dl, IdxN.getValueType()));
	} else {
	SDValue Scale = DAG.getConstant(ElementSize, dl, IdxN.getValueType());
	IdxN = DAG.getNode(ISD::MUL, dl,
	N.getValueType(), IdxN, Scale);
	}
	}

	N = DAG.getNode(ISD::ADD, dl,
	N.getValueType(), N, IdxN);
	}
	}

	setValue(&I, N);
	}

	void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
	// If this is a fixed sized alloca in the entry block of the function,
	// allocate it statically on the stack.
	if (FuncInfo.StaticAllocaMap.count(&I))
	return; // getValue will auto-populate this.

	SDLoc dl = getCurSDLoc();
	Type *Ty = I.getAllocatedType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	auto &DL = DAG.getDataLayout();
	uint64_t TySize = DL.getTypeAllocSize(Ty);
	unsigned Align =
	std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment());

	SDValue AllocSize = getValue(I.getArraySize());

	EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
	if (AllocSize.getValueType() != IntPtr)
	AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr);

	AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr,
	AllocSize,
	DAG.getConstant(TySize, dl, IntPtr));

	// Handle alignment. If the requested alignment is less than or equal to
	// the stack alignment, ignore it. If the size is greater than or equal to
	// the stack alignment, we note this in the DYNAMIC_STACKALLOC node.
	unsigned StackAlign =
	DAG.getSubtarget().getFrameLowering()->getStackAlignment();
	if (Align <= StackAlign)
	Align = 0;

	// Round the size of the allocation up to the stack alignment size
	// by add SA-1 to the size. This doesn't overflow because we're computing
	// an address inside an alloca.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);
	AllocSize = DAG.getNode(ISD::ADD, dl,
	AllocSize.getValueType(), AllocSize,
	DAG.getIntPtrConstant(StackAlign - 1, dl), Flags);

	// Mask out the low bits for alignment purposes.
	AllocSize = DAG.getNode(ISD::AND, dl,
	AllocSize.getValueType(), AllocSize,
	DAG.getIntPtrConstant(~(uint64_t)(StackAlign - 1),
	dl));

	SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align, dl) };
	SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
	SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops);
	setValue(&I, DSA);
	DAG.setRoot(DSA.getValue(1));

	assert(FuncInfo.MF->getFrameInfo().hasVarSizedObjects());
	}

	void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
	if (I.isAtomic())
	return visitAtomicLoad(I);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const Value *SV = I.getOperand(0);
	if (TLI.supportSwiftError()) {
	// Swifterror values can come from either a function parameter with
	// swifterror attribute or an alloca with swifterror attribute.
	if (const Argument *Arg = dyn_cast<Argument>(SV)) {
	if (Arg->hasSwiftErrorAttr())
	return visitLoadFromSwiftError(I);
	}

	if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
	if (Alloca->isSwiftError())
	return visitLoadFromSwiftError(I);
	}
	}

	SDValue Ptr = getValue(SV);

	Type *Ty = I.getType();

	bool isVolatile = I.isVolatile();
	bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
	bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr;
	bool isDereferenceable = isDereferenceablePointer(SV, DAG.getDataLayout());
	unsigned Alignment = I.getAlignment();

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

	SmallVector<EVT, 4> ValueVTs;
	SmallVector<uint64_t, 4> Offsets;
	ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &Offsets);
	unsigned NumValues = ValueVTs.size();
	if (NumValues == 0)
	return;

	SDValue Root;
	bool ConstantMemory = false;
	if (isVolatile \|\| NumValues > MaxParallelChains)
	// Serialize volatile loads with other side effects.
	Root = getRoot();
	else if (AA && AA->pointsToConstantMemory(MemoryLocation(
	SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) {
	// Do not serialize (non-volatile) loads of constant memory with anything.
	Root = DAG.getEntryNode();
	ConstantMemory = true;
	} else {
	// Do not serialize non-volatile loads against each other.
	Root = DAG.getRoot();
	}

	SDLoc dl = getCurSDLoc();

	if (isVolatile)
	Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);

	// An aggregate load cannot wrap around the address space, so offsets to its
	// parts don't wrap either.
	SDNodeFlags Flags;
	Flags.setNoUnsignedWrap(true);

	SmallVector<SDValue, 4> Values(NumValues);
	SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
	EVT PtrVT = Ptr.getValueType();
	unsigned ChainI = 0;
	for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
	// Serializing loads here may result in excessive register pressure, and
	// TokenFactor places arbitrary choke points on the scheduler. SD scheduling
	// could recover a bit by hoisting nodes upward in the chain by recognizing
	// they are side-effect free or do not alias. The optimizer should really
	// avoid this case by converting large object/array copies to llvm.memcpy
	// (MaxParallelChains should always remain as failsafe).
	if (ChainI == MaxParallelChains) {
	assert(PendingLoads.empty() && "PendingLoads must be serialized first");
	SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	makeArrayRef(Chains.data(), ChainI));
	Root = Chain;
	ChainI = 0;
	}
	SDValue A = DAG.getNode(ISD::ADD, dl,
	PtrVT, Ptr,
	DAG.getConstant(Offsets[i], dl, PtrVT),
	Flags);
	auto MMOFlags = MachineMemOperand::MONone;
	if (isVolatile)
	MMOFlags \|= MachineMemOperand::MOVolatile;
	if (isNonTemporal)
	MMOFlags \|= MachineMemOperand::MONonTemporal;
	if (isInvariant)
	MMOFlags \|= MachineMemOperand::MOInvariant;
	if (isDereferenceable)
	MMOFlags \|= MachineMemOperand::MODereferenceable;
	MMOFlags \|= TLI.getMMOFlags(I);

	SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A,
	MachinePointerInfo(SV, Offsets[i]), Alignment,
	MMOFlags, AAInfo, Ranges);

	Values[i] = L;
	Chains[ChainI] = L.getValue(1);
	}

	if (!ConstantMemory) {
	SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	makeArrayRef(Chains.data(), ChainI));
	if (isVolatile)
	DAG.setRoot(Chain);
	else
	PendingLoads.push_back(Chain);
	}

	setValue(&I, DAG.getNode(ISD::MERGE_VALUES, dl,
	DAG.getVTList(ValueVTs), Values));
	}

	void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
	assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
	"call visitStoreToSwiftError when backend supports swifterror");

	SmallVector<EVT, 4> ValueVTs;
	SmallVector<uint64_t, 4> Offsets;
	const Value *SrcV = I.getOperand(0);
	ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
	SrcV->getType(), ValueVTs, &Offsets);
	assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
	"expect a single EVT for swifterror");

	SDValue Src = getValue(SrcV);
	// Create a virtual register, then update the virtual register.
	unsigned VReg; bool CreatedVReg;
	std::tie(VReg, CreatedVReg) = FuncInfo.getOrCreateSwiftErrorVRegDefAt(&I);
	// Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
	// Chain can be getRoot or getControlRoot.
	SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
	SDValue(Src.getNode(), Src.getResNo()));
	DAG.setRoot(CopyNode);
	if (CreatedVReg)
	FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg);
	}

	/// Lower a load whose pointer operand is a swifterror value: instead of a
	/// memory load, the result is a CopyFromReg of the swifterror virtual
	/// register that FuncInfo provides for this use. The load must be
	/// non-volatile, carry no nontemporal/invariant metadata, and yield exactly
	/// one value at offset zero (all checked by assertions).
	void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
	  assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
	         "call visitLoadFromSwiftError when backend supports swifterror");

	  assert(!I.isVolatile() &&
	         I.getMetadata(LLVMContext::MD_nontemporal) == nullptr &&
	         I.getMetadata(LLVMContext::MD_invariant_load) == nullptr &&
	         "Support volatile, non temporal, invariant for load_from_swift_error");

	  const Value *SV = I.getOperand(0);
	  Type *Ty = I.getType();
	  AAMDNodes AAInfo;
	  I.getAAMetadata(AAInfo);
	  assert((!AA || !AA->pointsToConstantMemory(MemoryLocation(
	             SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) &&
	         "load_from_swift_error should not be constant memory");

	  SmallVector<EVT, 4> ValueVTs;
	  SmallVector<uint64_t, 4> Offsets;
	  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
	                  ValueVTs, &Offsets);
	  assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
	         "expect a single EVT for swifterror");

	  // Operands are Chain, DL, Reg, VT (optionally followed by Glue).
	  SDValue L = DAG.getCopyFromReg(
	      getRoot(), getCurSDLoc(),
	      FuncInfo.getOrCreateSwiftErrorVRegUseAt(&I, FuncInfo.MBB, SV).first,
	      ValueVTs[0]);

	  setValue(&I, L);
	}

	/// Lower a non-atomic store. Atomic stores are forwarded to visitAtomicStore
	/// and stores to swifterror pointers to visitStoreToSwiftError. An aggregate
	/// store is split into one store per flattened scalar piece, and the
	/// resulting chains are joined with TokenFactor to form the new DAG root.
	void SelectionDAGBuilder::visitStore(const StoreInst &I) {
	  if (I.isAtomic())
	    return visitAtomicStore(I);

	  const Value *SrcV = I.getOperand(0);
	  const Value *PtrV = I.getOperand(1);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (TLI.supportSwiftError()) {
	    // Swifterror values can come from either a function parameter with
	    // swifterror attribute or an alloca with swifterror attribute.
	    if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
	      if (Arg->hasSwiftErrorAttr())
	        return visitStoreToSwiftError(I);
	    }

	    if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
	      if (Alloca->isSwiftError())
	        return visitStoreToSwiftError(I);
	    }
	  }

	  // Flatten the stored type into scalar pieces and their byte offsets.
	  SmallVector<EVT, 4> ValueVTs;
	  SmallVector<uint64_t, 4> Offsets;
	  ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
	                  SrcV->getType(), ValueVTs, &Offsets);
	  unsigned NumValues = ValueVTs.size();
	  if (NumValues == 0)
	    return;

	  // Get the lowered operands. Note that we do this after
	  // checking if NumValues is zero, because with zero values
	  // the operands won't have values in the map.
	  SDValue Src = getValue(SrcV);
	  SDValue Ptr = getValue(PtrV);

	  SDValue Root = getRoot();
	  SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
	  SDLoc dl = getCurSDLoc();
	  EVT PtrVT = Ptr.getValueType();
	  unsigned Alignment = I.getAlignment();
	  AAMDNodes AAInfo;
	  I.getAAMetadata(AAInfo);

	  auto MMOFlags = MachineMemOperand::MONone;
	  if (I.isVolatile())
	    MMOFlags |= MachineMemOperand::MOVolatile;
	  if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
	    MMOFlags |= MachineMemOperand::MONonTemporal;
	  MMOFlags |= TLI.getMMOFlags(I);

	  // An aggregate store cannot wrap around the address space, so offsets to
	  // its parts don't wrap either.
	  SDNodeFlags Flags;
	  Flags.setNoUnsignedWrap(true);

	  unsigned ChainI = 0;
	  for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
	    // See visitLoad comments.
	    if (ChainI == MaxParallelChains) {
	      // Fold the chains collected so far into one and continue from it.
	      SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                                  makeArrayRef(Chains.data(), ChainI));
	      Root = Chain;
	      ChainI = 0;
	    }
	    // Address of piece i: Ptr + Offsets[i].
	    SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
	                              DAG.getConstant(Offsets[i], dl, PtrVT), Flags);
	    SDValue St = DAG.getStore(
	        Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add,
	        MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo);
	    Chains[ChainI] = St;
	  }

	  SDValue StoreNode = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                                  makeArrayRef(Chains.data(), ChainI));
	  DAG.setRoot(StoreNode);
	}

	/// Lower llvm.masked.store or, when IsCompressing is set,
	/// llvm.masked.compressstore to a masked-store node. The two intrinsics
	/// differ in operand layout: the compressing form has no alignment argument,
	/// so the alignment then falls back to the natural alignment of the stored
	/// value type.
	void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
	                                           bool IsCompressing) {
	  SDLoc sdl = getCurSDLoc();

	  auto getMaskedStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	                               unsigned& Alignment) {
	    // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
	    Src0 = I.getArgOperand(0);
	    Ptr = I.getArgOperand(1);
	    Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
	    Mask = I.getArgOperand(3);
	  };
	  auto getCompressingStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	                                    unsigned& Alignment) {
	    // llvm.masked.compressstore.*(Src0, Ptr, Mask)
	    Src0 = I.getArgOperand(0);
	    Ptr = I.getArgOperand(1);
	    Mask = I.getArgOperand(2);
	    Alignment = 0;
	  };

	  // All three are pointers: the helpers above fill them in by reference.
	  Value *PtrOperand, *MaskOperand, *Src0Operand;
	  unsigned Alignment;
	  if (IsCompressing)
	    getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
	  else
	    getMaskedStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);

	  SDValue Ptr = getValue(PtrOperand);
	  SDValue Src0 = getValue(Src0Operand);
	  SDValue Mask = getValue(MaskOperand);

	  // An alignment of zero means "use the natural alignment of the value type".
	  EVT VT = Src0.getValueType();
	  if (!Alignment)
	    Alignment = DAG.getEVTAlignment(VT);

	  AAMDNodes AAInfo;
	  I.getAAMetadata(AAInfo);

	  MachineMemOperand *MMO =
	      DAG.getMachineFunction().
	      getMachineMemOperand(MachinePointerInfo(PtrOperand),
	                           MachineMemOperand::MOStore, VT.getStoreSize(),
	                           Alignment, AAInfo);
	  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
	                                         MMO, false /* Truncating */,
	                                         IsCompressing);
	  DAG.setRoot(StoreNode);
	  setValue(&I, StoreNode);
	}

	// Get a uniform base for the Gather/Scatter intrinsic.
	// The first argument of the Gather/Scatter intrinsic is a vector of pointers.
	// We try to represent it as a base pointer + vector of indices.
	// Usually, the vector of pointers comes from a 'getelementptr' instruction.
	// The first operand of the GEP may be a single pointer or a vector of pointers
	// Example:
	// %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind
	// or
	// %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind
	// %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, ..
	//
	// When the first GEP operand is a single pointer - it is the uniform base we
	// are looking for. If first operand of the GEP is a splat vector - we
	// extract the splat value and use it as a uniform base.
	// In all other cases the function returns 'false'.
	//
	// On success Ptr is updated to the uniform base pointer, Base receives its
	// DAG value and Index the (vector) index value. Note that Ptr may already
	// have been overwritten when the function returns 'false'.
	static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
	                           SelectionDAGBuilder* SDB) {

	  SelectionDAG& DAG = SDB->DAG;
	  LLVMContext &Context = *DAG.getContext();

	  assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type");
	  // Only a GEP with a single index operand is handled.
	  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
	  if (!GEP || GEP->getNumOperands() > 2)
	    return false;

	  const Value *GEPPtr = GEP->getPointerOperand();
	  if (!GEPPtr->getType()->isVectorTy())
	    Ptr = GEPPtr;
	  else if (!(Ptr = getSplatValue(GEPPtr)))
	    return false;

	  Value *IndexVal = GEP->getOperand(1);

	  // The operands of the GEP may be defined in another basic block.
	  // In this case we'll not find nodes for the operands.
	  if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal))
	    return false;

	  Base = SDB->getValue(Ptr);
	  Index = SDB->getValue(IndexVal);

	  // Suppress sign extension: use the pre-extension index when a node for it
	  // is available.
	  if (SExtInst* Sext = dyn_cast<SExtInst>(IndexVal)) {
	    if (SDB->findValue(Sext->getOperand(0))) {
	      IndexVal = Sext->getOperand(0);
	      Index = SDB->getValue(IndexVal);
	    }
	  }
	  // A scalar index is splatted to the width of the GEP result.
	  if (!Index.getValueType().isVector()) {
	    unsigned GEPWidth = GEP->getType()->getVectorNumElements();
	    EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
	    Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index);
	  }
	  return true;
	}

	void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
	SDLoc sdl = getCurSDLoc();

	// llvm.masked.scatter.*(Src0, Ptrs, alignemt, Mask)
	const Value *Ptr = I.getArgOperand(1);
	SDValue Src0 = getValue(I.getArgOperand(0));
	SDValue Mask = getValue(I.getArgOperand(3));
	EVT VT = Src0.getValueType();
	unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
	if (!Alignment)
	Alignment = DAG.getEVTAlignment(VT);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);

	SDValue Base;
	SDValue Index;
	const Value *BasePtr = Ptr;
	bool UniformBase = getUniformBase(BasePtr, Base, Index, this);

	const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
	MachineMemOperand *MMO = DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
	MachineMemOperand::MOStore, VT.getStoreSize(),
	Alignment, AAInfo);
	if (!UniformBase) {
	Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
	Index = getValue(Ptr);
	}
	SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index };
	SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
	Ops, MMO);
	DAG.setRoot(Scatter);
	setValue(&I, Scatter);
	}

	void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
	SDLoc sdl = getCurSDLoc();

	auto getMaskedLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	unsigned& Alignment) {
	// @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
	Ptr = I.getArgOperand(0);
	Alignment = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
	Mask = I.getArgOperand(2);
	Src0 = I.getArgOperand(3);
	};
	auto getExpandingLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
	unsigned& Alignment) {
	// @llvm.masked.expandload.*(Ptr, Mask, Src0)
	Ptr = I.getArgOperand(0);
	Alignment = 0;
	Mask = I.getArgOperand(1);
	Src0 = I.getArgOperand(2);
	};

	Value PtrOperand, MaskOperand, *Src0Operand;
	unsigned Alignment;
	if (IsExpanding)
	getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
	else
	getMaskedLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);

	SDValue Ptr = getValue(PtrOperand);
	SDValue Src0 = getValue(Src0Operand);
	SDValue Mask = getValue(MaskOperand);

	EVT VT = Src0.getValueType();
	if (!Alignment)
	Alignment = DAG.getEVTAlignment(VT);

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

	// Do not serialize masked loads of constant memory with anything.
	bool AddToChain = !AA \|\| !AA->pointsToConstantMemory(MemoryLocation(
	PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
	SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();

	MachineMemOperand *MMO =
	DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(PtrOperand),
	MachineMemOperand::MOLoad, VT.getStoreSize(),
	Alignment, AAInfo, Ranges);

	SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
	ISD::NON_EXTLOAD, IsExpanding);
	if (AddToChain) {
	SDValue OutChain = Load.getValue(1);
	DAG.setRoot(OutChain);
	}
	setValue(&I, Load);
	}

	void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
	SDLoc sdl = getCurSDLoc();

	// @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
	const Value *Ptr = I.getArgOperand(0);
	SDValue Src0 = getValue(I.getArgOperand(3));
	SDValue Mask = getValue(I.getArgOperand(2));

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
	if (!Alignment)
	Alignment = DAG.getEVTAlignment(VT);

	AAMDNodes AAInfo;
	I.getAAMetadata(AAInfo);
	const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);

	SDValue Root = DAG.getRoot();
	SDValue Base;
	SDValue Index;
	const Value *BasePtr = Ptr;
	bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
	bool ConstantMemory = false;
	if (UniformBase &&
	AA && AA->pointsToConstantMemory(MemoryLocation(
	BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()),
	AAInfo))) {
	// Do not serialize (non-volatile) loads of constant memory with anything.
	Root = DAG.getEntryNode();
	ConstantMemory = true;
	}

	MachineMemOperand *MMO =
	DAG.getMachineFunction().
	getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr),
	MachineMemOperand::MOLoad, VT.getStoreSize(),
	Alignment, AAInfo, Ranges);

	if (!UniformBase) {
	Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
	Index = getValue(Ptr);
	}
	SDValue Ops[] = { Root, Src0, Mask, Base, Index };
	SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
	Ops, MMO);

	SDValue OutChain = Gather.getValue(1);
	if (!ConstantMemory)
	PendingLoads.push_back(OutChain);
	setValue(&I, Gather);
	}

	/// Lower a cmpxchg instruction to an ATOMIC_CMP_SWAP_WITH_SUCCESS node. The
	/// node produces three results: the loaded value, an i1 success flag, and
	/// the output chain, which becomes the new DAG root.
	void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
	  SDLoc dl = getCurSDLoc();
	  AtomicOrdering SuccessOrder = I.getSuccessOrdering();
	  AtomicOrdering FailureOrder = I.getFailureOrdering();
	  SyncScope::ID SSID = I.getSyncScopeID();

	  SDValue InChain = getRoot();

	  // The memory type is the type of the compare operand.
	  MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
	  SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
	  SDValue L = DAG.getAtomicCmpSwap(
	      ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain,
	      getValue(I.getPointerOperand()), getValue(I.getCompareOperand()),
	      getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()),
	      /*Alignment=*/ 0, SuccessOrder, FailureOrder, SSID);

	  // Result 2 is the chain.
	  SDValue OutChain = L.getValue(2);

	  setValue(&I, L);
	  DAG.setRoot(OutChain);
	}

	void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
	SDLoc dl = getCurSDLoc();
	ISD::NodeType NT;
	switch (I.getOperation()) {
	default: llvm_unreachable("Unknown atomicrmw operation");
	case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
	case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break;
	case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break;
	case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break;
	case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
	case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break;
	case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break;
	case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break;
	case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break;
	case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
	case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
	}
	AtomicOrdering Order = I.getOrdering();
	SyncScope::ID SSID = I.getSyncScopeID();

	SDValue InChain = getRoot();

	SDValue L =
	DAG.getAtomic(NT, dl,
	getValue(I.getValOperand()).getSimpleValueType(),
	InChain,
	getValue(I.getPointerOperand()),
	getValue(I.getValOperand()),
	I.getPointerOperand(),
	/* Alignment=*/ 0, Order, SSID);

	SDValue OutChain = L.getValue(1);

	setValue(&I, L);
	DAG.setRoot(OutChain);
	}

	void SelectionDAGBuilder::visitFence(const FenceInst &I) {
	SDLoc dl = getCurSDLoc();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Ops[3];
	Ops[0] = getRoot();
	Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl,
	TLI.getFenceOperandTy(DAG.getDataLayout()));
	Ops[2] = DAG.getConstant(I.getSyncScopeID(), dl,
	TLI.getFenceOperandTy(DAG.getDataLayout()));
	DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
	}

	/// Lower an atomic load to an ATOMIC_LOAD node. A load whose alignment is
	/// smaller than the byte size of the loaded type is rejected with a fatal
	/// error. The memory operand is marked volatile, and the node's second
	/// result (the output chain) becomes the new DAG root.
	void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
	  SDLoc dl = getCurSDLoc();
	  AtomicOrdering Order = I.getOrdering();
	  SyncScope::ID SSID = I.getSyncScopeID();

	  SDValue InChain = getRoot();

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());

	  if (I.getAlignment() < VT.getSizeInBits() / 8)
	    report_fatal_error("Cannot generate unaligned atomic load");

	  // An alignment of zero falls back to the natural alignment of VT.
	  MachineMemOperand *MMO =
	      DAG.getMachineFunction().
	      getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
	                           MachineMemOperand::MOVolatile |
	                           MachineMemOperand::MOLoad,
	                           VT.getStoreSize(),
	                           I.getAlignment() ? I.getAlignment() :
	                                              DAG.getEVTAlignment(VT),
	                           AAMDNodes(), nullptr, SSID, Order);

	  InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
	  SDValue L =
	      DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
	                    getValue(I.getPointerOperand()), MMO);

	  // Result 1 is the chain.
	  SDValue OutChain = L.getValue(1);

	  setValue(&I, L);
	  DAG.setRoot(OutChain);
	}

	void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
	SDLoc dl = getCurSDLoc();

	AtomicOrdering Order = I.getOrdering();
	SyncScope::ID SSID = I.getSyncScopeID();

	SDValue InChain = getRoot();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	EVT VT =
	TLI.getValueType(DAG.getDataLayout(), I.getValueOperand()->getType());

	if (I.getAlignment() < VT.getSizeInBits() / 8)
	report_fatal_error("Cannot generate unaligned atomic store");

	SDValue OutChain =
	DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
	InChain,
	getValue(I.getPointerOperand()),
	getValue(I.getValueOperand()),
	I.getPointerOperand(), I.getAlignment(),
	Order, SSID);

	DAG.setRoot(OutChain);
	}

	/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
	/// node.
	void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
	unsigned Intrinsic) {
	// Ignore the callsite's attributes. A specific call site may be marked with
	// readnone, but the lowering code will expect the chain based on the
	// definition.
	const Function *F = I.getCalledFunction();
	bool HasChain = !F->doesNotAccessMemory();
	bool OnlyLoad = HasChain && F->onlyReadsMemory();

	// Build the operand list.
	SmallVector<SDValue, 8> Ops;
	if (HasChain) { // If this intrinsic has side-effects, chainify it.
	if (OnlyLoad) {
	// We don't need to serialize loads against other loads.
	Ops.push_back(DAG.getRoot());
	} else {
	Ops.push_back(getRoot());
	}
	}

	// Info is set by getTgtMemInstrinsic
	TargetLowering::IntrinsicInfo Info;
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);

	// Add the intrinsic ID as an integer operand if it's not a target intrinsic.
	if (!IsTgtIntrinsic \|\| Info.opc == ISD::INTRINSIC_VOID \|\|
	Info.opc == ISD::INTRINSIC_W_CHAIN)
	Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
	TLI.getPointerTy(DAG.getDataLayout())));

	// Add all operands of the call to the operand list.
	for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
	SDValue Op = getValue(I.getArgOperand(i));
	Ops.push_back(Op);
	}

	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);

	if (HasChain)
	ValueVTs.push_back(MVT::Other);

	SDVTList VTs = DAG.getVTList(ValueVTs);

	// Create the node.
	SDValue Result;
	if (IsTgtIntrinsic) {
	// This is target intrinsic that touches memory
	Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(),
	VTs, Ops, Info.memVT,
	MachinePointerInfo(Info.ptrVal, Info.offset),
	Info.align, Info.vol,
	Info.readMem, Info.writeMem, Info.size);
	} else if (!HasChain) {
	Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
	} else if (!I.getType()->isVoidTy()) {
	Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
	} else {
	Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
	}

	if (HasChain) {
	SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
	if (OnlyLoad)
	PendingLoads.push_back(Chain);
	else
	DAG.setRoot(Chain);
	}

	if (!I.getType()->isVoidTy()) {
	if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
	EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
	Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
	} else
	Result = lowerRangeToAssertZExt(DAG, I, Result);

	setValue(&I, Result);
	}
	}

	/// GetSignificand - Isolate the significand bits of an f32 bit pattern and
	/// rebuild them into a floating-point number with exponent of 1:
	///
	///   Op = (Op & 0x007fffff) | 0x3f800000;
	///
	/// \p Op is the i32 holding the raw bits of the floating-point value; the
	/// returned node is the combined bits reinterpreted as f32.
	static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) {
	  SDValue SignificandMask = DAG.getConstant(0x007fffff, dl, MVT::i32);
	  SDValue SignificandBits =
	      DAG.getNode(ISD::AND, dl, MVT::i32, Op, SignificandMask);

	  SDValue ExponentBits = DAG.getConstant(0x3f800000, dl, MVT::i32);
	  SDValue Combined =
	      DAG.getNode(ISD::OR, dl, MVT::i32, SignificandBits, ExponentBits);

	  return DAG.getNode(ISD::BITCAST, dl, MVT::f32, Combined);
	}

	/// GetExponent - Extract the unbiased exponent of an f32 bit pattern as a
	/// float:
	///
	///   (float)(int)(((Op & 0x7f800000) >> 23) - 127);
	///
	/// \p Op is the i32 holding the raw bits of the floating-point value. The
	/// shift amount constant is created with the target's pointer type.
	static SDValue GetExponent(SelectionDAG &DAG, SDValue Op,
	                           const TargetLowering &TLI, const SDLoc &dl) {
	  SDValue ExponentMask = DAG.getConstant(0x7f800000, dl, MVT::i32);
	  SDValue ExponentField = DAG.getNode(ISD::AND, dl, MVT::i32, Op, ExponentMask);

	  SDValue ShiftAmount =
	      DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout()));
	  SDValue BiasedExponent =
	      DAG.getNode(ISD::SRL, dl, MVT::i32, ExponentField, ShiftAmount);

	  SDValue Bias = DAG.getConstant(127, dl, MVT::i32);
	  SDValue UnbiasedExponent =
	      DAG.getNode(ISD::SUB, dl, MVT::i32, BiasedExponent, Bias);

	  return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, UnbiasedExponent);
	}

	/// getF32Constant - Build an f32 constant node from a raw 32-bit pattern.
	///
	/// \p Flt is interpreted as the IEEE single-precision bit representation of
	/// the desired value.
	static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt,
	                              const SDLoc &dl) {
	  APInt RawBits(32, Flt);
	  APFloat Value(APFloat::IEEEsingle(), RawBits);
	  return DAG.getConstantFP(Value, dl, MVT::f32);
	}

	/// getLimitedPrecisionExp2 - Build a limited-precision approximation of
	/// 2^t0 out of f32/i32 DAG nodes.
	///
	/// t0 is split into an integer part (FP_TO_SINT) and a fractional part.
	/// 2^fraction is approximated by a polynomial whose degree is selected by
	/// LimitFloatPrecision (<= 6, <= 12, otherwise the 18-bit variant). The
	/// integer part, shifted left by 23, is then added to the bit pattern of the
	/// polynomial result in the integer domain.
	///
	/// Callers in this file only invoke it when LimitFloatPrecision is in
	/// (0, 18] and the operand is f32.
	static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
	SelectionDAG &DAG) {
	// TODO: What fast-math-flags should be set on the floating-point nodes?

	// IntegerPartOfX = (int32_t)t0;
	SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);

	// FractionalPartOfX = t0 - (float)IntegerPartOfX;
	SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
	SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);

	// IntegerPartOfX <<= 23;
	// (the shift amount constant is created with the target's pointer type)
	IntegerPartOfX = DAG.getNode(
	ISD::SHL, dl, MVT::i32, IntegerPartOfX,
	DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy(
	DAG.getDataLayout())));

	// Each branch below evaluates its polynomial from the highest-order
	// coefficient down: multiply by X, add the next coefficient, repeat.
	SDValue TwoToFractionalPartOfX;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// TwoToFractionalPartOfX =
	// 0.997535578f +
	// (0.735607626f + 0.252464424f * x) * x;
	//
	// error 0.0144103317, which is 6 bits
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3e814304, dl));
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f3c50c8, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f7f5e7e, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// TwoToFractionalPartOfX =
	// 0.999892986f +
	// (0.696457318f +
	// (0.224338339f + 0.792043434e-1f * x) * x) * x;
	//
	// error 0.000107046256, which is 13 to 14 bits
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3da235e3, dl));
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3e65b8f3, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f324b07, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3f7ff8fd, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// TwoToFractionalPartOfX =
	// 0.999999982f +
	// (0.693148872f +
	// (0.240227044f +
	// (0.554906021e-1f +
	// (0.961591928e-2f +
	// (0.136028312e-2f + 0.157059148e-3f * x) * x) * x) * x) * x) * x;
	// error 2.47208000*10^(-7), which is better than 18 bits
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3924b03e, dl));
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3ab24b87, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3c1d8c17, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3d634a1d, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x3e75fe14, dl));
	SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
	SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
	getF32Constant(DAG, 0x3f317234, dl));
	SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
	TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
	getF32Constant(DAG, 0x3f800000, dl));
	}

	// Add the exponent into the result in integer domain.
	SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFractionalPartOfX);
	return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
	DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX));
	}

	/// expandExp - Lower an exp intrinsic.
	///
	/// In limited-precision mode (f32 operand and LimitFloatPrecision in
	/// (0, 18]) exp(Op) is computed as exp2(Op * log2(e)) through
	/// getLimitedPrecisionExp2; otherwise a plain ISD::FEXP node is emitted.
	static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	                         const TargetLowering &TLI) {
	  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
	                                   LimitFloatPrecision > 0 &&
	                                   LimitFloatPrecision <= 18;

	  // No special expansion.
	  if (!UseLimitedPrecision)
	    return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op);

	  // Put the exponent in the right bit position for later addition to the
	  // final result:
	  //
	  //   #define LOG2OFe 1.4426950f
	  //   Scaled = Op * LOG2OFe
	  //
	  // TODO: What fast-math-flags should be set here?
	  SDValue Log2OfE = getF32Constant(DAG, 0x3fb8aa3b, dl);
	  SDValue Scaled = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op, Log2OfE);
	  return getLimitedPrecisionExp2(Scaled, dl, DAG);
	}

	/// expandLog - Lower a log intrinsic. Handles the special sequences for
	/// limited-precision mode.
	///
	/// In limited-precision mode (f32 operand, LimitFloatPrecision in (0, 18])
	/// the result is built as
	///   GetExponent(bits) * log(2) + polynomial(GetSignificand(bits))
	/// where the polynomial degree depends on LimitFloatPrecision. Otherwise a
	/// plain ISD::FLOG node is returned.
	static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	const TargetLowering &TLI) {

	// TODO: What fast-math-flags should be set on the floating-point nodes?

	if (Op.getValueType() == MVT::f32 &&
	LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
	// Reinterpret the operand as i32 so its bit fields can be picked apart.
	SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

	// Scale the exponent by log(2) [0.69314718f].
	SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
	SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
	getF32Constant(DAG, 0x3f317218, dl));

	// Get the significand and build it into a floating-point number with
	// exponent of 1.
	SDValue X = GetSignificand(DAG, Op1, dl);

	// Each branch evaluates its polynomial from the highest-order coefficient
	// down. Negative coefficients in the formulas show up either as a constant
	// with the sign bit set or as an FSUB of the coefficient's magnitude.
	SDValue LogOfMantissa;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// LogofMantissa =
	// -1.1609546f +
	// (1.4034025f - 0.23903021f * x) * x;
	//
	// error 0.0034276066, which is better than 8 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbe74c456, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3fb3a2b1, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f949a29, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// LogOfMantissa =
	// -1.7417939f +
	// (2.8212026f +
	// (-1.4699568f +
	// (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
	//
	// error 0.000061011436, which is 14 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbd67b6d6, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3ee4f4b8, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3fbc278b, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x40348e95, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3fdef31a, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// LogOfMantissa =
	// -2.1072184f +
	// (4.2372794f +
	// (-3.7029485f +
	// (2.2781945f +
	// (-0.87823314f +
	// (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
	//
	// error 0.0000023660568, which is better than 18 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbc91e5ac, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3e4350aa, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f60d3e3, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x4011cdf0, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x406cfd1c, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x408797cb, dl));
	SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
	LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
	getF32Constant(DAG, 0x4006dcab, dl));
	}

	// log(x) = exponent * log(2) + log(mantissa)
	return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa);
	}

	// No special expansion.
	return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op);
	}

	/// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
	/// limited-precision mode.
	///
	/// In limited-precision mode (f32 operand, LimitFloatPrecision in (0, 18])
	/// the result is built as
	///   GetExponent(bits) + polynomial(GetSignificand(bits))
	/// where the polynomial degree depends on LimitFloatPrecision. Unlike
	/// expandLog / expandLog10, the exponent is used unscaled. Otherwise a plain
	/// ISD::FLOG2 node is returned.
	static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	const TargetLowering &TLI) {

	// TODO: What fast-math-flags should be set on the floating-point nodes?

	if (Op.getValueType() == MVT::f32 &&
	LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
	// Reinterpret the operand as i32 so its bit fields can be picked apart.
	SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

	// Get the exponent.
	SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);

	// Get the significand and build it into a floating-point number with
	// exponent of 1.
	SDValue X = GetSignificand(DAG, Op1, dl);

	// Different possible minimax approximations of significand in
	// floating-point for various degrees of accuracy over [1,2].
	// Each branch evaluates its polynomial from the highest-order coefficient
	// down; negative coefficients appear either as a constant with the sign bit
	// set or as an FSUB of the coefficient's magnitude.
	SDValue Log2ofMantissa;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
	//
	// error 0.0049451742, which is more than 7 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbeb08fe0, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x40019463, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3fd6633d, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// Log2ofMantissa =
	// -2.51285454f +
	// (4.07009056f +
	// (-2.12067489f +
	// (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
	//
	// error 0.0000876136000, which is better than 13 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbda7262e, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3f25280b, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x4007b923, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x40823e2f, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x4020d29c, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// Log2ofMantissa =
	// -3.0400495f +
	// (6.1129976f +
	// (-5.3420409f +
	// (3.2865683f +
	// (-1.2669343f +
	// (0.27515199f -
	// 0.25691327e-1f * x) * x) * x) * x) * x) * x;
	//
	// error 0.0000018516, which is better than 18 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbcd2769e, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3e8ce0b9, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3fa22ae7, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x40525723, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x40aaf200, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x40c39dad, dl));
	SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
	Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
	getF32Constant(DAG, 0x4042902c, dl));
	}

	// log2(x) = exponent + log2(mantissa)
	return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa);
	}

	// No special expansion.
	return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op);
	}

	/// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
	/// limited-precision mode.
	///
	/// In limited-precision mode (f32 operand, LimitFloatPrecision in (0, 18])
	/// the result is built as
	///   GetExponent(bits) * log10(2) + polynomial(GetSignificand(bits))
	/// where the polynomial degree depends on LimitFloatPrecision. Otherwise a
	/// plain ISD::FLOG10 node is returned.
	static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	const TargetLowering &TLI) {

	// TODO: What fast-math-flags should be set on the floating-point nodes?

	if (Op.getValueType() == MVT::f32 &&
	LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
	// Reinterpret the operand as i32 so its bit fields can be picked apart.
	SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);

	// Scale the exponent by log10(2) [0.30102999f].
	SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
	SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
	getF32Constant(DAG, 0x3e9a209a, dl));

	// Get the significand and build it into a floating-point number with
	// exponent of 1.
	SDValue X = GetSignificand(DAG, Op1, dl);

	// Each branch evaluates its polynomial from the highest-order coefficient
	// down; negative coefficients appear either as a constant with the sign bit
	// set or as an FSUB of the coefficient's magnitude.
	SDValue Log10ofMantissa;
	if (LimitFloatPrecision <= 6) {
	// For floating-point precision of 6:
	//
	// Log10ofMantissa =
	// -0.50419619f +
	// (0.60948995f - 0.10380950f * x) * x;
	//
	// error 0.0014886165, which is 6 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0xbdd49a13, dl));
	SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3f1c0789, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f011300, dl));
	} else if (LimitFloatPrecision <= 12) {
	// For floating-point precision of 12:
	//
	// Log10ofMantissa =
	// -0.64831180f +
	// (0.91751397f +
	// (-0.31664806f + 0.47637168e-1f * x) * x) * x;
	//
	// error 0.00019228036, which is better than 12 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3d431f31, dl));
	SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3ea21fb2, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3f6ae232, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f25f7c3, dl));
	} else { // LimitFloatPrecision <= 18
	// For floating-point precision of 18:
	//
	// Log10ofMantissa =
	// -0.84299375f +
	// (1.5327582f +
	// (-1.0688956f +
	// (0.49102474f +
	// (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
	//
	// error 0.0000037995730, which is better than 18 bits
	SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
	getF32Constant(DAG, 0x3c5d51ce, dl));
	SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
	getF32Constant(DAG, 0x3e00685a, dl));
	SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
	SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
	getF32Constant(DAG, 0x3efb6798, dl));
	SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
	SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
	getF32Constant(DAG, 0x3f88d192, dl));
	SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
	SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
	getF32Constant(DAG, 0x3fc4316c, dl));
	SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
	Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
	getF32Constant(DAG, 0x3f57ce70, dl));
	}

	// log10(x) = exponent * log10(2) + log10(mantissa)
	return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
	}

	// No special expansion.
	return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);
	}

	/// expandExp2 - Lower an exp2 intrinsic.
	///
	/// In limited-precision mode (f32 operand and LimitFloatPrecision in
	/// (0, 18]) the operand is handed straight to getLimitedPrecisionExp2;
	/// otherwise a plain ISD::FEXP2 node is emitted.
	static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
	                          const TargetLowering &TLI) {
	  const bool UseLimitedPrecision = Op.getValueType() == MVT::f32 &&
	                                   LimitFloatPrecision > 0 &&
	                                   LimitFloatPrecision <= 18;

	  // No special expansion.
	  if (!UseLimitedPrecision)
	    return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);

	  return getLimitedPrecisionExp2(Op, dl, DAG);
	}

	/// expandPow - Lower a pow intrinsic.
	///
	/// The only special case is limited-precision mode (both operands f32 and
	/// LimitFloatPrecision in (0, 18]) with a constant base of exactly 10.0f:
	/// then pow(10, RHS) is computed as exp2(RHS * log2(10)) through
	/// getLimitedPrecisionExp2. Everything else becomes a plain ISD::FPOW node.
	static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
	                         SelectionDAG &DAG, const TargetLowering &TLI) {
	  const bool LimitedPrecisionF32 =
	      LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
	      LimitFloatPrecision > 0 && LimitFloatPrecision <= 18;

	  if (LimitedPrecisionF32) {
	    ConstantFPSDNode *Base = dyn_cast<ConstantFPSDNode>(LHS);
	    if (Base && Base->isExactlyValue(APFloat(10.0f))) {
	      // Put the exponent in the right bit position for later addition to the
	      // final result:
	      //
	      //   #define LOG2OF10 3.3219281f
	      //   Scaled = Op * LOG2OF10;
	      //
	      // TODO: What fast-math-flags should be set on the FMUL node?
	      SDValue Log2Of10 = getF32Constant(DAG, 0x40549a78, dl);
	      SDValue Scaled = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS, Log2Of10);
	      return getLimitedPrecisionExp2(Scaled, dl, DAG);
	    }
	  }

	  // No special expansion.
	  return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);
	}


	/// ExpandPowI - Expand a llvm.powi intrinsic.
	///
	/// \param DL   location for the created nodes.
	/// \param LHS  the floating-point base.
	/// \param RHS  the integer exponent.
	/// \returns a multiplication tree (wrapped in 1.0 / x for negative
	///          exponents) when RHS is a constant and the expansion is allowed,
	///          the constant 1.0 for a zero exponent, and an ISD::FPOWI node
	///          otherwise.
	static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
	                          SelectionDAG &DAG) {
	  // If RHS is a constant, we can expand this out to a multiplication tree,
	  // otherwise we end up lowering to a call to __powidf2 (for example). When
	  // optimizing for size, we only want to do this if the expansion would produce
	  // a small number of multiplies, otherwise we do the full expansion.
	  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
	    // Get the exponent as a positive value.
	    unsigned Val = RHSC->getSExtValue();
	    if ((int)Val < 0) Val = -Val;

	    // powi(x, 0) -> 1.0
	    if (Val == 0)
	      return DAG.getConstantFP(1.0, DL, LHS.getValueType());

	    const Function *F = DAG.getMachineFunction().getFunction();
	    if (!F->optForSize() ||
	        // If optimizing for size, don't insert too many multiplies.
	        // This inserts up to 5 multiplies.
	        countPopulation(Val) + Log2_32(Val) < 7) {
	      // We use the simple binary decomposition method to generate the multiply
	      // sequence. There are more optimal ways to do this (for example,
	      // powi(x,15) generates one more multiply than it should), but this has
	      // the benefit of being both really simple and much better than a libcall.
	      SDValue Res;  // Logically starts equal to 1.0
	      SDValue CurSquare = LHS;
	      // TODO: Intrinsics should have fast-math-flags that propagate to these
	      // nodes.
	      while (Val) {
	        // Multiply the current power of LHS into the result for every set
	        // bit of the exponent.
	        if (Val & 1) {
	          if (Res.getNode())
	            Res = DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
	          else
	            Res = CurSquare;  // 1.0*CurSquare.
	        }

	        CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
	                                CurSquare, CurSquare);
	        Val >>= 1;
	      }

	      // If the original was negative, invert the result, producing 1/(x*x*x).
	      if (RHSC->getSExtValue() < 0)
	        Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
	                          DAG.getConstantFP(1.0, DL, LHS.getValueType()), Res);
	      return Res;
	    }
	  }

	  // Otherwise, expand to a libcall.
	  return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
	}

	// getUnderlyingArgReg - Find underlying register used for a truncated or
	// bitcasted argument.
	static unsigned getUnderlyingArgReg(const SDValue &N) {
	switch (N.getOpcode()) {
	case ISD::CopyFromReg:
	return cast<RegisterSDNode>(N.getOperand(1))->getReg();
	case ISD::BITCAST:
	case ISD::AssertZext:
	case ISD::AssertSext:
	case ISD::TRUNCATE:
	return getUnderlyingArgReg(N.getOperand(0));
	default:
	return 0;
	}
	}

	/// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
	/// argument, create the corresponding DBG_VALUE machine instruction for it now.
	/// At the end of instruction selection, they will be inserted to the entry BB.
	bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
	const Value V, DILocalVariable Variable, DIExpression *Expr,
	DILocation *DL, int64_t Offset, bool IsDbgDeclare, const SDValue &N) {
	const Argument *Arg = dyn_cast<Argument>(V);
	if (!Arg)
	return false;

	MachineFunction &MF = DAG.getMachineFunction();
	const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();

	// Ignore inlined function arguments here.
	//
	// FIXME: Should we be checking DL->inlinedAt() to determine this?
	if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction()))
	return false;

	bool IsIndirect = false;
	Optional<MachineOperand> Op;
	// Some arguments' frame index is recorded during argument lowering.
	int FI = FuncInfo.getArgumentFrameIndex(Arg);
	if (FI != INT_MAX)
	Op = MachineOperand::CreateFI(FI);

	if (!Op && N.getNode()) {
	unsigned Reg = getUnderlyingArgReg(N);
	if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
	MachineRegisterInfo &RegInfo = MF.getRegInfo();
	unsigned PR = RegInfo.getLiveInPhysReg(Reg);
	if (PR)
	Reg = PR;
	}
	if (Reg) {
	Op = MachineOperand::CreateReg(Reg, false);
	IsIndirect = IsDbgDeclare;
	}
	}

	if (!Op) {
	// Check if ValueMap has reg number.
	DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
	if (VMI != FuncInfo.ValueMap.end()) {
	Op = MachineOperand::CreateReg(VMI->second, false);
	IsIndirect = IsDbgDeclare;
	}
	}

	if (!Op && N.getNode())
	// Check if frame index is available.
	if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(N.getNode()))
	if (FrameIndexSDNode *FINode =
	dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
	Op = MachineOperand::CreateFI(FINode->getIndex());

	if (!Op)
	return false;

	assert(Variable->isValidLocationForIntrinsic(DL) &&
	"Expected inlined-at fields to agree");
	if (Op->isReg())
	FuncInfo.ArgDbgValues.push_back(
	BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
	Op->getReg(), Offset, Variable, Expr));
	else
	FuncInfo.ArgDbgValues.push_back(
	BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE))
	.add(*Op)
	.addImm(Offset)
	.addMetadata(Variable)
	.addMetadata(Expr));

	return true;
	}

	/// Build the SDDbgValue that describes \p Variable using node \p N.
	///
	/// When \p N is a FrameIndexSDNode, a frame-index debug value is created from
	/// its index; in every other case a regular debug value referring to \p N's
	/// node and result number is created with the given \p Offset.
	SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
	                                             DILocalVariable *Variable,
	                                             DIExpression *Expr, int64_t Offset,
	                                             const DebugLoc &dl,
	                                             unsigned DbgSDNodeOrder) {
	  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(N.getNode());
	  if (!FrameIdxNode)
	    return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false,
	                           Offset, dl, DbgSDNodeOrder);

	  // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe
	  // stack slot locations as such instead of as indirectly addressed
	  // locations.
	  return DAG.getFrameIndexDbgValue(Variable, Expr, FrameIdxNode->getIndex(), 0,
	                                   dl, DbgSDNodeOrder);
	}

	// VisualStudio defines setjmp as _setjmp
	// When building with MSVC and such a macro is active, save its definition
	// with push_macro and undefine it, so that the identifier `setjmp` used
	// below (e.g. in Intrinsic::setjmp) is not rewritten by the preprocessor.
	// setjmp_undefined_for_msvc records that this was done.
	// NOTE(review): presumably the macro is restored with pop_macro further
	// down in the file; that part is not visible here.
	#if defined(_MSC_VER) && defined(setjmp) && \
	!defined(setjmp_undefined_for_msvc)
	# pragma push_macro("setjmp")
	# undef setjmp
	# define setjmp_undefined_for_msvc
	#endif

	/// Lower the call to the specified intrinsic function. If we want to emit this
	/// as a call to a named external function, return the name. Otherwise, lower it
	/// and return null.
	const char *
	SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDLoc sdl = getCurSDLoc();
	DebugLoc dl = getCurDebugLoc();
	SDValue Res;

	switch (Intrinsic) {
	default:
	// By default, turn this into a target intrinsic node.
	visitTargetIntrinsic(I, Intrinsic);
	return nullptr;
	case Intrinsic::vastart: visitVAStart(I); return nullptr;
	case Intrinsic::vaend: visitVAEnd(I); return nullptr;
	case Intrinsic::vacopy: visitVACopy(I); return nullptr;
	case Intrinsic::returnaddress:
	setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return nullptr;
	case Intrinsic::addressofreturnaddress:
	setValue(&I, DAG.getNode(ISD::ADDROFRETURNADDR, sdl,
	TLI.getPointerTy(DAG.getDataLayout())));
	return nullptr;
	case Intrinsic::frameaddress:
	setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return nullptr;
	case Intrinsic::read_register: {
	Value *Reg = I.getArgOperand(0);
	SDValue Chain = getRoot();
	SDValue RegName =
	DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	Res = DAG.getNode(ISD::READ_REGISTER, sdl,
	DAG.getVTList(VT, MVT::Other), Chain, RegName);
	setValue(&I, Res);
	DAG.setRoot(Res.getValue(1));
	return nullptr;
	}
	case Intrinsic::write_register: {
	Value *Reg = I.getArgOperand(0);
	Value *RegValue = I.getArgOperand(1);
	SDValue Chain = getRoot();
	SDValue RegName =
	DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
	DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
	RegName, getValue(RegValue)));
	return nullptr;
	}
	case Intrinsic::setjmp:
	return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];
	case Intrinsic::longjmp:
	return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
	case Intrinsic::memcpy: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
	if (!Align)
	Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment.
	bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
	false, isTC,
	MachinePointerInfo(I.getArgOperand(0)),
	MachinePointerInfo(I.getArgOperand(1)));
	updateDAGForMaybeTailCall(MC);
	return nullptr;
	}
	case Intrinsic::memset: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
	if (!Align)
	Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment.
	bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
	isTC, MachinePointerInfo(I.getArgOperand(0)));
	updateDAGForMaybeTailCall(MS);
	return nullptr;
	}
	case Intrinsic::memmove: {
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));
	SDValue Op3 = getValue(I.getArgOperand(2));
	unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
	if (!Align)
	Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment.
	bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
	bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
	SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
	isTC, MachinePointerInfo(I.getArgOperand(0)),
	MachinePointerInfo(I.getArgOperand(1)));
	updateDAGForMaybeTailCall(MM);
	return nullptr;
	}
	case Intrinsic::memcpy_element_unordered_atomic: {
	const ElementUnorderedAtomicMemCpyInst &MI =
	cast<ElementUnorderedAtomicMemCpyInst>(I);
	SDValue Dst = getValue(MI.getRawDest());
	SDValue Src = getValue(MI.getRawSource());
	SDValue Length = getValue(MI.getLength());

	// Emit a library call.
	TargetLowering::ArgListTy Args;
	TargetLowering::ArgListEntry Entry;
	Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
	Entry.Node = Dst;
	Args.push_back(Entry);

	Entry.Node = Src;
	Args.push_back(Entry);

	Entry.Ty = MI.getLength()->getType();
	Entry.Node = Length;
	Args.push_back(Entry);

	uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
	RTLIB::Libcall LibraryCall =
	RTLIB::getMEMCPY_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
	if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
	report_fatal_error("Unsupported element size");

	TargetLowering::CallLoweringInfo CLI(DAG);
	CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
	TLI.getLibcallCallingConv(LibraryCall),
	Type::getVoidTy(*DAG.getContext()),
	DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
	TLI.getPointerTy(DAG.getDataLayout())),
	std::move(Args));

	std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
	DAG.setRoot(CallResult.second);
	return nullptr;
	}
	case Intrinsic::memmove_element_unordered_atomic: {
	auto &MI = cast<ElementUnorderedAtomicMemMoveInst>(I);
	SDValue Dst = getValue(MI.getRawDest());
	SDValue Src = getValue(MI.getRawSource());
	SDValue Length = getValue(MI.getLength());

	// Emit a library call.
	TargetLowering::ArgListTy Args;
	TargetLowering::ArgListEntry Entry;
	Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
	Entry.Node = Dst;
	Args.push_back(Entry);

	Entry.Node = Src;
	Args.push_back(Entry);

	Entry.Ty = MI.getLength()->getType();
	Entry.Node = Length;
	Args.push_back(Entry);

	uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
	RTLIB::Libcall LibraryCall =
	RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
	if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
	report_fatal_error("Unsupported element size");

	TargetLowering::CallLoweringInfo CLI(DAG);
	CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
	TLI.getLibcallCallingConv(LibraryCall),
	Type::getVoidTy(*DAG.getContext()),
	DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
	TLI.getPointerTy(DAG.getDataLayout())),
	std::move(Args));

	std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
	DAG.setRoot(CallResult.second);
	return nullptr;
	}
	case Intrinsic::memset_element_unordered_atomic: {
	auto &MI = cast<ElementUnorderedAtomicMemSetInst>(I);
	SDValue Dst = getValue(MI.getRawDest());
	SDValue Val = getValue(MI.getValue());
	SDValue Length = getValue(MI.getLength());

	// Emit a library call.
	TargetLowering::ArgListTy Args;
	TargetLowering::ArgListEntry Entry;
	Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
	Entry.Node = Dst;
	Args.push_back(Entry);

	Entry.Ty = Type::getInt8Ty(*DAG.getContext());
	Entry.Node = Val;
	Args.push_back(Entry);

	Entry.Ty = MI.getLength()->getType();
	Entry.Node = Length;
	Args.push_back(Entry);

	uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
	RTLIB::Libcall LibraryCall =
	RTLIB::getMEMSET_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
	if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
	report_fatal_error("Unsupported element size");

	TargetLowering::CallLoweringInfo CLI(DAG);
	CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
	TLI.getLibcallCallingConv(LibraryCall),
	Type::getVoidTy(*DAG.getContext()),
	DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
	TLI.getPointerTy(DAG.getDataLayout())),
	std::move(Args));

	std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
	DAG.setRoot(CallResult.second);
	return nullptr;
	}
	case Intrinsic::dbg_declare: {
	const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
	DILocalVariable *Variable = DI.getVariable();
	DIExpression *Expression = DI.getExpression();
	const Value *Address = DI.getAddress();
	assert(Variable && "Missing variable");
	if (!Address) {
	DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
	return nullptr;
	}

	// Check if address has undef value.
	if (isa<UndefValue>(Address) \|\|
	(Address->use_empty() && !isa<Argument>(Address))) {
	DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
	return nullptr;
	}

	// Byval arguments with frame indices were already handled after argument
	// lowering and before isel.
	const auto *Arg =
	dyn_cast<Argument>(Address->stripInBoundsConstantOffsets());
	if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX)
	return nullptr;

	SDValue &N = NodeMap[Address];
	if (!N.getNode() && isa<Argument>(Address))
	// Check unused arguments map.
	N = UnusedArgNodeMap[Address];
	SDDbgValue *SDV;
	if (N.getNode()) {
	if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
	Address = BCI->getOperand(0);
	// Parameters are handled specially.
	bool isParameter = Variable->isParameter() \|\| isa<Argument>(Address);
	auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
	if (isParameter && FINode) {
	// Byval parameter. We have a frame index at this point.
	SDV = DAG.getFrameIndexDbgValue(Variable, Expression,
	FINode->getIndex(), 0, dl, SDNodeOrder);
	} else if (isa<Argument>(Address)) {
	// Address is an argument, so try to emit its dbg value using
	// virtual register info from the FuncInfo.ValueMap.
	EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N);
	return nullptr;
	} else {
	SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
	true, 0, dl, SDNodeOrder);
	}
	DAG.AddDbgValue(SDV, N.getNode(), isParameter);
	} else {
	// If Address is an argument then try to emit its dbg value using
	// virtual register info from the FuncInfo.ValueMap.
	if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true,
	N)) {
	DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
	}
	}
	return nullptr;
	}
	case Intrinsic::dbg_value: {
	const DbgValueInst &DI = cast<DbgValueInst>(I);
	assert(DI.getVariable() && "Missing variable");

	DILocalVariable *Variable = DI.getVariable();
	DIExpression *Expression = DI.getExpression();
	uint64_t Offset = DI.getOffset();
	const Value *V = DI.getValue();
	if (!V)
	return nullptr;

	SDDbgValue *SDV;
	if (isa<ConstantInt>(V) \|\| isa<ConstantFP>(V) \|\| isa<UndefValue>(V)) {
	SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl,
	SDNodeOrder);
	DAG.AddDbgValue(SDV, nullptr, false);
	return nullptr;
	}

	// Do not use getValue() in here; we don't want to generate code at
	// this point if it hasn't been done yet.
	SDValue N = NodeMap[V];
	if (!N.getNode() && isa<Argument>(V)) // Check unused arguments map.
	N = UnusedArgNodeMap[V];
	if (N.getNode()) {
	if (EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset, false,
	N))
	return nullptr;
	SDV = getDbgValue(N, Variable, Expression, Offset, dl, SDNodeOrder);
	DAG.AddDbgValue(SDV, N.getNode(), false);
	return nullptr;
	}

	if (!V->use_empty() ) {
	// Do not call getValue(V) yet, as we don't want to generate code.
	// Remember it for later.
	DanglingDebugInfo DDI(&DI, dl, SDNodeOrder);
	DanglingDebugInfoMap[V] = DDI;
	return nullptr;
	}

	DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n");
	DEBUG(dbgs() << " Last seen at:\n " << *V << "\n");
	return nullptr;
	}

	case Intrinsic::eh_typeid_for: {
	// Find the type id for the given typeinfo.
	GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0));
	unsigned TypeID = DAG.getMachineFunction().getTypeIDFor(GV);
	Res = DAG.getConstant(TypeID, sdl, MVT::i32);
	setValue(&I, Res);
	return nullptr;
	}

	case Intrinsic::eh_return_i32:
	case Intrinsic::eh_return_i64:
	DAG.getMachineFunction().setCallsEHReturn(true);
	DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl,
	MVT::Other,
	getControlRoot(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return nullptr;
	case Intrinsic::eh_unwind_init:
	DAG.getMachineFunction().setCallsUnwindInit(true);
	return nullptr;
	case Intrinsic::eh_dwarf_cfa: {
	setValue(&I, DAG.getNode(ISD::EH_DWARF_CFA, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return nullptr;
	}
	case Intrinsic::eh_sjlj_callsite: {
	MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
	ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0));
	assert(CI && "Non-constant call site value in eh.sjlj.callsite!");
	assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");

	MMI.setCurrentCallSite(CI->getZExtValue());
	return nullptr;
	}
	case Intrinsic::eh_sjlj_functioncontext: {
	// Get and store the index of the function context.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	AllocaInst *FnCtx =
	cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
	int FI = FuncInfo.StaticAllocaMap[FnCtx];
	MFI.setFunctionContextIndex(FI);
	return nullptr;
	}
	case Intrinsic::eh_sjlj_setjmp: {
	SDValue Ops[2];
	Ops[0] = getRoot();
	Ops[1] = getValue(I.getArgOperand(0));
	SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl,
	DAG.getVTList(MVT::i32, MVT::Other), Ops);
	setValue(&I, Op.getValue(0));
	DAG.setRoot(Op.getValue(1));
	return nullptr;
	}
	case Intrinsic::eh_sjlj_longjmp: {
	DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other,
	getRoot(), getValue(I.getArgOperand(0))));
	return nullptr;
	}
	case Intrinsic::eh_sjlj_setup_dispatch: {
	DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other,
	getRoot()));
	return nullptr;
	}

	case Intrinsic::masked_gather:
	visitMaskedGather(I);
	return nullptr;
	case Intrinsic::masked_load:
	visitMaskedLoad(I);
	return nullptr;
	case Intrinsic::masked_scatter:
	visitMaskedScatter(I);
	return nullptr;
	case Intrinsic::masked_store:
	visitMaskedStore(I);
	return nullptr;
	case Intrinsic::masked_expandload:
	visitMaskedLoad(I, true /* IsExpanding */);
	return nullptr;
	case Intrinsic::masked_compressstore:
	visitMaskedStore(I, true /* IsCompressing */);
	return nullptr;
	case Intrinsic::x86_mmx_pslli_w:
	case Intrinsic::x86_mmx_pslli_d:
	case Intrinsic::x86_mmx_pslli_q:
	case Intrinsic::x86_mmx_psrli_w:
	case Intrinsic::x86_mmx_psrli_d:
	case Intrinsic::x86_mmx_psrli_q:
	case Intrinsic::x86_mmx_psrai_w:
	case Intrinsic::x86_mmx_psrai_d: {
	SDValue ShAmt = getValue(I.getArgOperand(1));
	if (isa<ConstantSDNode>(ShAmt)) {
	visitTargetIntrinsic(I, Intrinsic);
	return nullptr;
	}
	unsigned NewIntrinsic = 0;
	EVT ShAmtVT = MVT::v2i32;
	switch (Intrinsic) {
	case Intrinsic::x86_mmx_pslli_w:
	NewIntrinsic = Intrinsic::x86_mmx_psll_w;
	break;
	case Intrinsic::x86_mmx_pslli_d:
	NewIntrinsic = Intrinsic::x86_mmx_psll_d;
	break;
	case Intrinsic::x86_mmx_pslli_q:
	NewIntrinsic = Intrinsic::x86_mmx_psll_q;
	break;
	case Intrinsic::x86_mmx_psrli_w:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
	break;
	case Intrinsic::x86_mmx_psrli_d:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
	break;
	case Intrinsic::x86_mmx_psrli_q:
	NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
	break;
	case Intrinsic::x86_mmx_psrai_w:
	NewIntrinsic = Intrinsic::x86_mmx_psra_w;
	break;
	case Intrinsic::x86_mmx_psrai_d:
	NewIntrinsic = Intrinsic::x86_mmx_psra_d;
	break;
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	}

	// The vector shift intrinsics with scalars uses 32b shift amounts but
	// the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
	// to be zero.
	// We must do this early because v2i32 is not a legal type.
	SDValue ShOps[2];
	ShOps[0] = ShAmt;
	ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
	ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps);
	EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
	Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
	DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
	getValue(I.getArgOperand(0)), ShAmt);
	setValue(&I, Res);
	return nullptr;
	}
	case Intrinsic::powi:
	setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)), DAG));
	return nullptr;
	case Intrinsic::log:
	setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return nullptr;
	case Intrinsic::log2:
	setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return nullptr;
	case Intrinsic::log10:
	setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return nullptr;
	case Intrinsic::exp:
	setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return nullptr;
	case Intrinsic::exp2:
	setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
	return nullptr;
	case Intrinsic::pow:
	setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)), DAG, TLI));
	return nullptr;
	case Intrinsic::sqrt:
	case Intrinsic::fabs:
	case Intrinsic::sin:
	case Intrinsic::cos:
	case Intrinsic::floor:
	case Intrinsic::ceil:
	case Intrinsic::trunc:
	case Intrinsic::rint:
	case Intrinsic::nearbyint:
	case Intrinsic::round:
	case Intrinsic::canonicalize: {
	unsigned Opcode;
	switch (Intrinsic) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
	case Intrinsic::fabs: Opcode = ISD::FABS; break;
	case Intrinsic::sin: Opcode = ISD::FSIN; break;
	case Intrinsic::cos: Opcode = ISD::FCOS; break;
	case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
	case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
	case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
	case Intrinsic::rint: Opcode = ISD::FRINT; break;
	case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
	case Intrinsic::round: Opcode = ISD::FROUND; break;
	case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
	}

	setValue(&I, DAG.getNode(Opcode, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0))));
	return nullptr;
	}
	case Intrinsic::minnum: {
	auto VT = getValue(I.getArgOperand(0)).getValueType();
	unsigned Opc =
	I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)
	? ISD::FMINNAN
	: ISD::FMINNUM;
	setValue(&I, DAG.getNode(Opc, sdl, VT,
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return nullptr;
	}
	case Intrinsic::maxnum: {
	auto VT = getValue(I.getArgOperand(0)).getValueType();
	unsigned Opc =
	I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)
	? ISD::FMAXNAN
	: ISD::FMAXNUM;
	setValue(&I, DAG.getNode(Opc, sdl, VT,
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return nullptr;
	}
	case Intrinsic::copysign:
	setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1))));
	return nullptr;
	case Intrinsic::fma:
	setValue(&I, DAG.getNode(ISD::FMA, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)),
	getValue(I.getArgOperand(2))));
	return nullptr;
	case Intrinsic::experimental_constrained_fadd:
	case Intrinsic::experimental_constrained_fsub:
	case Intrinsic::experimental_constrained_fmul:
	case Intrinsic::experimental_constrained_fdiv:
	case Intrinsic::experimental_constrained_frem:
	case Intrinsic::experimental_constrained_sqrt:
	case Intrinsic::experimental_constrained_pow:
	case Intrinsic::experimental_constrained_powi:
	case Intrinsic::experimental_constrained_sin:
	case Intrinsic::experimental_constrained_cos:
	case Intrinsic::experimental_constrained_exp:
	case Intrinsic::experimental_constrained_exp2:
	case Intrinsic::experimental_constrained_log:
	case Intrinsic::experimental_constrained_log10:
	case Intrinsic::experimental_constrained_log2:
	case Intrinsic::experimental_constrained_rint:
	case Intrinsic::experimental_constrained_nearbyint:
	visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
	return nullptr;
	case Intrinsic::fmuladd: {
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
	TLI.isFMAFasterThanFMulAndFAdd(VT)) {
	setValue(&I, DAG.getNode(ISD::FMA, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)),
	getValue(I.getArgOperand(2))));
	} else {
	// TODO: Intrinsic calls should have fast-math-flags.
	SDValue Mul = DAG.getNode(ISD::FMUL, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)));
	SDValue Add = DAG.getNode(ISD::FADD, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	Mul,
	getValue(I.getArgOperand(2)));
	setValue(&I, Add);
	}
	return nullptr;
	}
	case Intrinsic::convert_to_fp16:
	setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
	DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
	getValue(I.getArgOperand(0)),
	DAG.getTargetConstant(0, sdl,
	MVT::i32))));
	return nullptr;
	case Intrinsic::convert_from_fp16:
	setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl,
	TLI.getValueType(DAG.getDataLayout(), I.getType()),
	DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
	getValue(I.getArgOperand(0)))));
	return nullptr;
	case Intrinsic::pcmarker: {
	SDValue Tmp = getValue(I.getArgOperand(0));
	DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp));
	return nullptr;
	}
	case Intrinsic::readcyclecounter: {
	SDValue Op = getRoot();
	Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl,
	DAG.getVTList(MVT::i64, MVT::Other), Op);
	setValue(&I, Res);
	DAG.setRoot(Res.getValue(1));
	return nullptr;
	}
	case Intrinsic::bitreverse:
	setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0))));
	return nullptr;
	case Intrinsic::bswap:
	setValue(&I, DAG.getNode(ISD::BSWAP, sdl,
	getValue(I.getArgOperand(0)).getValueType(),
	getValue(I.getArgOperand(0))));
	return nullptr;
	case Intrinsic::cttz: {
	SDValue Arg = getValue(I.getArgOperand(0));
	ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
	EVT Ty = Arg.getValueType();
	setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
	sdl, Ty, Arg));
	return nullptr;
	}
	case Intrinsic::ctlz: {
	SDValue Arg = getValue(I.getArgOperand(0));
	ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
	EVT Ty = Arg.getValueType();
	setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
	sdl, Ty, Arg));
	return nullptr;
	}
	case Intrinsic::ctpop: {
	SDValue Arg = getValue(I.getArgOperand(0));
	EVT Ty = Arg.getValueType();
	setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
	return nullptr;
	}
	case Intrinsic::stacksave: {
	SDValue Op = getRoot();
	Res = DAG.getNode(
	ISD::STACKSAVE, sdl,
	DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op);
	setValue(&I, Res);
	DAG.setRoot(Res.getValue(1));
	return nullptr;
	}
	case Intrinsic::stackrestore: {
	Res = getValue(I.getArgOperand(0));
	DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
	return nullptr;
	}
	case Intrinsic::get_dynamic_area_offset: {
	SDValue Op = getRoot();
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
	// Result type for @llvm.get.dynamic.area.offset should match PtrTy for
	// target.
	if (PtrTy != ResTy)
	report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset"
	" intrinsic!");
	Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy),
	Op);
	DAG.setRoot(Op);
	setValue(&I, Res);
	return nullptr;
	}
	case Intrinsic::stackguard: {
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	MachineFunction &MF = DAG.getMachineFunction();
	const Module &M = *MF.getFunction()->getParent();
	SDValue Chain = getRoot();
	if (TLI.useLoadStackGuardNode()) {
	Res = getLoadStackGuard(DAG, sdl, Chain);
	} else {
	const Value *Global = TLI.getSDagStackGuard(M);
	unsigned Align = DL->getPrefTypeAlignment(Global->getType());
	Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
	MachinePointerInfo(Global, 0), Align,
	MachineMemOperand::MOVolatile);
	}
	DAG.setRoot(Chain);
	setValue(&I, Res);
	return nullptr;
	}
	case Intrinsic::stackprotector: {
	// Emit code into the DAG to store the stack guard onto the stack.
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
	SDValue Src, Chain = getRoot();

	if (TLI.useLoadStackGuardNode())
	Src = getLoadStackGuard(DAG, sdl, Chain);
	else
	Src = getValue(I.getArgOperand(0)); // The guard's value.

	AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));

	int FI = FuncInfo.StaticAllocaMap[Slot];
	MFI.setStackProtectorIndex(FI);

	SDValue FIN = DAG.getFrameIndex(FI, PtrTy);

	// Store the stack protector onto the stack.
	Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(
	DAG.getMachineFunction(), FI),
	/* Alignment = */ 0, MachineMemOperand::MOVolatile);
	setValue(&I, Res);
	DAG.setRoot(Res);
	return nullptr;
	}
	case Intrinsic::objectsize: {
	// If we don't know by now, we're never going to know.
	ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));

	assert(CI && "Non-constant type in __builtin_object_size?");

	SDValue Arg = getValue(I.getCalledValue());
	EVT Ty = Arg.getValueType();

	if (CI->isZero())
	Res = DAG.getConstant(-1ULL, sdl, Ty);
	else
	Res = DAG.getConstant(0, sdl, Ty);

	setValue(&I, Res);
	return nullptr;
	}
	case Intrinsic::annotation:
	case Intrinsic::ptr_annotation:
	case Intrinsic::invariant_group_barrier:
	// Drop the intrinsic, but forward the value
	setValue(&I, getValue(I.getOperand(0)));
	return nullptr;
	case Intrinsic::assume:
	case Intrinsic::var_annotation:
	// Discard annotate attributes and assumptions
	return nullptr;

	case Intrinsic::init_trampoline: {
	const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());

	SDValue Ops[6];
	Ops[0] = getRoot();
	Ops[1] = getValue(I.getArgOperand(0));
	Ops[2] = getValue(I.getArgOperand(1));
	Ops[3] = getValue(I.getArgOperand(2));
	Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
	Ops[5] = DAG.getSrcValue(F);

	Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops);

	DAG.setRoot(Res);
	return nullptr;
	}
	case Intrinsic::adjust_trampoline: {
	setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
	TLI.getPointerTy(DAG.getDataLayout()),
	getValue(I.getArgOperand(0))));
	return nullptr;
	}
	case Intrinsic::gcroot: {
	MachineFunction &MF = DAG.getMachineFunction();
	const Function *F = MF.getFunction();
	(void)F;
	assert(F->hasGC() &&
	"only valid in functions with gc specified, enforced by Verifier");
	assert(GFI && "implied by previous");
	const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
	const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));

	FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
	GFI->addStackRoot(FI->getIndex(), TypeMap);
	return nullptr;
	}
	case Intrinsic::gcread:
	case Intrinsic::gcwrite:
	llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
	case Intrinsic::flt_rounds:
	setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32));
	return nullptr;

	case Intrinsic::expect: {
	// Just replace __builtin_expect(exp, c) with EXP.
	setValue(&I, getValue(I.getArgOperand(0)));
	return nullptr;
	}

	case Intrinsic::debugtrap:
	case Intrinsic::trap: {
	StringRef TrapFuncName =
	I.getAttributes()
	.getAttribute(AttributeList::FunctionIndex, "trap-func-name")
	.getValueAsString();
	if (TrapFuncName.empty()) {
	ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
	ISD::TRAP : ISD::DEBUGTRAP;
	DAG.setRoot(DAG.getNode(Op, sdl,MVT::Other, getRoot()));
	return nullptr;
	}
	TargetLowering::ArgListTy Args;

	TargetLowering::CallLoweringInfo CLI(DAG);
	CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
	CallingConv::C, I.getType(),
	DAG.getExternalSymbol(TrapFuncName.data(),
	TLI.getPointerTy(DAG.getDataLayout())),
	std::move(Args));

	std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
	DAG.setRoot(Result.second);
	return nullptr;
	}

	case Intrinsic::uadd_with_overflow:
	case Intrinsic::sadd_with_overflow:
	case Intrinsic::usub_with_overflow:
	case Intrinsic::ssub_with_overflow:
	case Intrinsic::umul_with_overflow:
	case Intrinsic::smul_with_overflow: {
	ISD::NodeType Op;
	switch (Intrinsic) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break;
	case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break;
	case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break;
	case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break;
	case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break;
	case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break;
	}
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2 = getValue(I.getArgOperand(1));

	SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
	setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
	return nullptr;
	}
	case Intrinsic::prefetch: {
	SDValue Ops[5];
	unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
	Ops[0] = getRoot();
	Ops[1] = getValue(I.getArgOperand(0));
	Ops[2] = getValue(I.getArgOperand(1));
	Ops[3] = getValue(I.getArgOperand(2));
	Ops[4] = getValue(I.getArgOperand(3));
	DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
	DAG.getVTList(MVT::Other), Ops,
	EVT::getIntegerVT(*Context, 8),
	MachinePointerInfo(I.getArgOperand(0)),
	0, /* align */
	false, /* volatile */
	rw==0, /* read */
	rw==1)); /* write */
	return nullptr;
	}
	case Intrinsic::lifetime_start:
	case Intrinsic::lifetime_end: {
	bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
	// Stack coloring is not enabled in O0, discard region information.
	if (TM.getOptLevel() == CodeGenOpt::None)
	return nullptr;

	SmallVector<Value *, 4> Allocas;
	GetUnderlyingObjects(I.getArgOperand(1), Allocas, *DL);

	for (SmallVectorImpl<Value*>::iterator Object = Allocas.begin(),
	E = Allocas.end(); Object != E; ++Object) {
	AllocaInst LifetimeObject = dyn_cast_or_null<AllocaInst>(Object);

	// Could not find an Alloca.
	if (!LifetimeObject)
	continue;

	// First check that the Alloca is static, otherwise it won't have a
	// valid frame index.
	auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
	if (SI == FuncInfo.StaticAllocaMap.end())
	return nullptr;

	int FI = SI->second;

	SDValue Ops[2];
	Ops[0] = getRoot();
	Ops[1] =
	DAG.getFrameIndex(FI, TLI.getFrameIndexTy(DAG.getDataLayout()), true);
	unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);

	Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops);
	DAG.setRoot(Res);
	}
	return nullptr;
	}
	case Intrinsic::invariant_start:
	// Discard region information.
	setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout())));
	return nullptr;
	case Intrinsic::invariant_end:
	// Discard region information.
	return nullptr;
	case Intrinsic::clear_cache:
	return TLI.getClearCacheBuiltinName();
	case Intrinsic::donothing:
	// ignore
	return nullptr;
	case Intrinsic::experimental_stackmap: {
	visitStackmap(I);
	return nullptr;
	}
	case Intrinsic::experimental_patchpoint_void:
	case Intrinsic::experimental_patchpoint_i64: {
	visitPatchpoint(&I);
	return nullptr;
	}
	case Intrinsic::experimental_gc_statepoint: {
	LowerStatepoint(ImmutableStatepoint(&I));
	return nullptr;
	}
	case Intrinsic::experimental_gc_result: {
	visitGCResult(cast<GCResultInst>(I));
	return nullptr;
	}
	case Intrinsic::experimental_gc_relocate: {
	visitGCRelocate(cast<GCRelocateInst>(I));
	return nullptr;
	}
	case Intrinsic::instrprof_increment:
	llvm_unreachable("instrprof failed to lower an increment");
	case Intrinsic::instrprof_value_profile:
	llvm_unreachable("instrprof failed to lower a value profiling call");
	case Intrinsic::localescape: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();

	// Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
	// is the same on all targets.
	for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) {
	Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
	if (isa<ConstantPointerNull>(Arg))
	continue; // Skip null pointers. They represent a hole in index space.
	AllocaInst *Slot = cast<AllocaInst>(Arg);
	assert(FuncInfo.StaticAllocaMap.count(Slot) &&
	"can only escape static allocas");
	int FI = FuncInfo.StaticAllocaMap[Slot];
	MCSymbol *FrameAllocSym =
	MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
	GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx);
	BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
	TII->get(TargetOpcode::LOCAL_ESCAPE))
	.addSym(FrameAllocSym)
	.addFrameIndex(FI);
	}

	return nullptr;
	}

	case Intrinsic::localrecover: {
	// i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
	MachineFunction &MF = DAG.getMachineFunction();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0);

	// Get the symbol that defines the frame offset.
	auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
	auto *Idx = cast<ConstantInt>(I.getArgOperand(2));
	unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX));
	MCSymbol *FrameAllocSym =
	MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);

	// Create a MCSymbol for the label to avoid any target lowering
	// that would make this PC relative.
	SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
	SDValue OffsetVal =
	DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);

	// Add the offset to the FP.
	Value *FP = I.getArgOperand(1);
	SDValue FPVal = getValue(FP);
	SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal);
	setValue(&I, Add);

	return nullptr;
	}

	case Intrinsic::eh_exceptionpointer:
	case Intrinsic::eh_exceptioncode: {
	// Get the exception pointer vreg, copy from it, and resize it to fit.
	const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0));
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT);
	unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC);
	SDValue N =
	DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT);
	if (Intrinsic == Intrinsic::eh_exceptioncode)
	N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32);
	setValue(&I, N);
	return nullptr;
	}
	case Intrinsic::xray_customevent: {
	// Here we want to make sure that the intrinsic behaves as if it has a
	// specific calling convention, and only for x86_64.
	// FIXME: Support other platforms later.
	const auto &Triple = DAG.getTarget().getTargetTriple();
	if (Triple.getArch() != Triple::x86_64 \|\| !Triple.isOSLinux())
	return nullptr;

	SDLoc DL = getCurSDLoc();
	SmallVector<SDValue, 8> Ops;

	// We want to say that we always want the arguments in registers.
	SDValue LogEntryVal = getValue(I.getArgOperand(0));
	SDValue StrSizeVal = getValue(I.getArgOperand(1));
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SDValue Chain = getRoot();
	Ops.push_back(LogEntryVal);
	Ops.push_back(StrSizeVal);
	Ops.push_back(Chain);

	// We need to enforce the calling convention for the callsite, so that
	// argument ordering is enforced correctly, and that register allocation can
	// see that some registers may be assumed clobbered and have to preserve
	// them across calls to the intrinsic.
	MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHABLE_EVENT_CALL,
	DL, NodeTys, Ops);
	SDValue patchableNode = SDValue(MN, 0);
	DAG.setRoot(patchableNode);
	setValue(&I, patchableNode);
	return nullptr;
	}
	case Intrinsic::experimental_deoptimize:
	LowerDeoptimizeCall(&I);
	return nullptr;

	case Intrinsic::experimental_vector_reduce_fadd:
	case Intrinsic::experimental_vector_reduce_fmul:
	case Intrinsic::experimental_vector_reduce_add:
	case Intrinsic::experimental_vector_reduce_mul:
	case Intrinsic::experimental_vector_reduce_and:
	case Intrinsic::experimental_vector_reduce_or:
	case Intrinsic::experimental_vector_reduce_xor:
	case Intrinsic::experimental_vector_reduce_smax:
	case Intrinsic::experimental_vector_reduce_smin:
	case Intrinsic::experimental_vector_reduce_umax:
	case Intrinsic::experimental_vector_reduce_umin:
	case Intrinsic::experimental_vector_reduce_fmax:
	case Intrinsic::experimental_vector_reduce_fmin: {
	visitVectorReduce(I, Intrinsic);
	return nullptr;
	}

	}
	}

	void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
	const ConstrainedFPIntrinsic &FPI) {
	SDLoc sdl = getCurSDLoc();
	unsigned Opcode;
	switch (FPI.getIntrinsicID()) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::experimental_constrained_fadd:
	Opcode = ISD::STRICT_FADD;
	break;
	case Intrinsic::experimental_constrained_fsub:
	Opcode = ISD::STRICT_FSUB;
	break;
	case Intrinsic::experimental_constrained_fmul:
	Opcode = ISD::STRICT_FMUL;
	break;
	case Intrinsic::experimental_constrained_fdiv:
	Opcode = ISD::STRICT_FDIV;
	break;
	case Intrinsic::experimental_constrained_frem:
	Opcode = ISD::STRICT_FREM;
	break;
	case Intrinsic::experimental_constrained_sqrt:
	Opcode = ISD::STRICT_FSQRT;
	break;
	case Intrinsic::experimental_constrained_pow:
	Opcode = ISD::STRICT_FPOW;
	break;
	case Intrinsic::experimental_constrained_powi:
	Opcode = ISD::STRICT_FPOWI;
	break;
	case Intrinsic::experimental_constrained_sin:
	Opcode = ISD::STRICT_FSIN;
	break;
	case Intrinsic::experimental_constrained_cos:
	Opcode = ISD::STRICT_FCOS;
	break;
	case Intrinsic::experimental_constrained_exp:
	Opcode = ISD::STRICT_FEXP;
	break;
	case Intrinsic::experimental_constrained_exp2:
	Opcode = ISD::STRICT_FEXP2;
	break;
	case Intrinsic::experimental_constrained_log:
	Opcode = ISD::STRICT_FLOG;
	break;
	case Intrinsic::experimental_constrained_log10:
	Opcode = ISD::STRICT_FLOG10;
	break;
	case Intrinsic::experimental_constrained_log2:
	Opcode = ISD::STRICT_FLOG2;
	break;
	case Intrinsic::experimental_constrained_rint:
	Opcode = ISD::STRICT_FRINT;
	break;
	case Intrinsic::experimental_constrained_nearbyint:
	Opcode = ISD::STRICT_FNEARBYINT;
	break;
	}
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Chain = getRoot();
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs);
	ValueVTs.push_back(MVT::Other); // Out chain

	SDVTList VTs = DAG.getVTList(ValueVTs);
	SDValue Result;
	if (FPI.isUnaryOp())
	Result = DAG.getNode(Opcode, sdl, VTs,
	{ Chain, getValue(FPI.getArgOperand(0)) });
	else
	Result = DAG.getNode(Opcode, sdl, VTs,
	{ Chain, getValue(FPI.getArgOperand(0)),
	getValue(FPI.getArgOperand(1)) });

	assert(Result.getNode()->getNumValues() == 2);
	SDValue OutChain = Result.getValue(1);
	DAG.setRoot(OutChain);
	SDValue FPResult = Result.getValue(0);
	setValue(&FPI, FPResult);
	}

	/// Lower the call described by \p CLI via TargetLowering::LowerCallTo.
	///
	/// When \p EHPadBB is non-null the call is bracketed by a begin and an end
	/// EH label, and the label range is recorded for that pad (either in the
	/// WinEH function info or through MachineFunction::addInvoke).
	///
	/// \return the pair produced by LowerCallTo. A null second element (the
	/// chain) means a tail call was emitted and the DAG root is already updated.
	std::pair<SDValue, SDValue>
	SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
	                                    const BasicBlock *EHPadBB) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineModuleInfo &MMI = MF.getMMI();
	  MCSymbol *BeginLabel = nullptr;

	  if (EHPadBB) {
	    // Insert a label before the invoke call to mark the try range. This can be
	    // used to detect deletion of the invoke via the MachineModuleInfo.
	    BeginLabel = MMI.getContext().createTempSymbol();

	    // For SjLj, keep track of which landing pads go with which invokes
	    // so as to maintain the ordering of pads in the LSDA.
	    unsigned CallSiteIndex = MMI.getCurrentCallSite();
	    if (CallSiteIndex) {
	      MF.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
	      LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);

	      // Now that the call site is handled, stop tracking it.
	      MMI.setCurrentCallSite(0);
	    }

	    // Both PendingLoads and PendingExports must be flushed here;
	    // this call might not return.
	    (void)getRoot();
	    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel));

	    // The call must be chained after the begin label.
	    CLI.setChain(getRoot());
	  }
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);

	  assert((CLI.IsTailCall || Result.second.getNode()) &&
	         "Non-null chain expected with non-tail call!");
	  assert((Result.second.getNode() || !Result.first.getNode()) &&
	         "Null value expected with tail call!");

	  if (!Result.second.getNode()) {
	    // As a special case, a null chain means that a tail call has been emitted
	    // and the DAG root is already updated.
	    HasTailCall = true;

	    // Since there's no actual continuation from this block, nothing can be
	    // relying on us setting vregs for them.
	    PendingExports.clear();
	  } else {
	    DAG.setRoot(Result.second);
	  }

	  if (EHPadBB) {
	    // Insert a label at the end of the invoke call to mark the try range. This
	    // can be used to detect deletion of the invoke via the MachineModuleInfo.
	    MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
	    DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel));

	    // Inform MachineModuleInfo of range.
	    if (MF.hasEHFunclets()) {
	      assert(CLI.CS);
	      WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
	      EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS->getInstruction()),
	                                BeginLabel, EndLabel);
	    } else {
	      MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
	    }
	  }

	  return Result;
	}

	/// Lower the call site \p CS to \p Callee.
	///
	/// Builds the argument list (skipping arguments of empty type), clears
	/// \p isTailCall whenever one of the checks below rules a tail call out,
	/// lowers the call through lowerInvokable, and records the returned value
	/// for the call instruction. If a swifterror argument was passed and the
	/// target supports swifterror, the last element of CLI.InVals is copied into
	/// a virtual register after the call.
	///
	/// \param EHPadBB forwarded unchanged to lowerInvokable.
	void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
	                                      bool isTailCall,
	                                      const BasicBlock *EHPadBB) {
	  auto &DL = DAG.getDataLayout();
	  FunctionType *FTy = CS.getFunctionType();
	  Type *RetTy = CS.getType();

	  TargetLowering::ArgListTy Args;
	  Args.reserve(CS.arg_size());

	  const Value *SwiftErrorVal = nullptr;
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // We can't tail call inside a function with a swifterror argument. Lowering
	  // does not support this yet. It would have to move into the swifterror
	  // register before the call.
	  auto *Caller = CS.getInstruction()->getParent()->getParent();
	  if (TLI.supportSwiftError() &&
	      Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
	    isTailCall = false;

	  for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
	       i != e; ++i) {
	    TargetLowering::ArgListEntry Entry;
	    const Value *V = *i;

	    // Skip empty types
	    if (V->getType()->isEmptyTy())
	      continue;

	    SDValue ArgNode = getValue(V);
	    Entry.Node = ArgNode;
	    Entry.Ty = V->getType();

	    // The attribute index is the argument's position in the call site.
	    Entry.setAttributes(&CS, i - CS.arg_begin());

	    // Use swifterror virtual register as input to the call.
	    if (Entry.IsSwiftError && TLI.supportSwiftError()) {
	      SwiftErrorVal = V;
	      // We find the virtual register for the actual swifterror argument.
	      // Instead of using the Value, we use the virtual register instead.
	      Entry.Node = DAG.getRegister(FuncInfo
	                                       .getOrCreateSwiftErrorVRegUseAt(
	                                           CS.getInstruction(), FuncInfo.MBB, V)
	                                       .first,
	                                   EVT(TLI.getPointerTy(DL)));
	    }

	    Args.push_back(Entry);

	    // If we have an explicit sret argument that is an Instruction, (i.e., it
	    // might point to function-local memory), we can't meaningfully tail-call.
	    if (Entry.IsSRet && isa<Instruction>(V))
	      isTailCall = false;
	  }

	  // Check if target-independent constraints permit a tail call here.
	  // Target-dependent constraints are checked within TLI->LowerCallTo.
	  if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget()))
	    isTailCall = false;

	  // Disable tail calls if there is a swifterror argument. Targets have not
	  // been updated to support tail calls.
	  if (TLI.supportSwiftError() && SwiftErrorVal)
	    isTailCall = false;

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(getCurSDLoc())
	      .setChain(getRoot())
	      .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
	      .setTailCall(isTailCall)
	      .setConvergent(CS.isConvergent());
	  std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);

	  if (Result.first.getNode()) {
	    const Instruction *Inst = CS.getInstruction();
	    Result.first = lowerRangeToAssertZExt(DAG, *Inst, Result.first);
	    setValue(Inst, Result.first);
	  }

	  // The last element of CLI.InVals has the SDValue for swifterror return.
	  // Here we copy it to a virtual register and update SwiftErrorMap for
	  // book-keeping.
	  if (SwiftErrorVal && TLI.supportSwiftError()) {
	    // Get the last element of InVals.
	    SDValue Src = CLI.InVals.back();
	    unsigned VReg;
	    bool CreatedVReg;
	    std::tie(VReg, CreatedVReg) =
	        FuncInfo.getOrCreateSwiftErrorVRegDefAt(CS.getInstruction());
	    SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
	    // We update the virtual register for the actual swifterror argument.
	    if (CreatedVReg)
	      FuncInfo.setCurrentSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg);
	    DAG.setRoot(CopyNode);
	  }
	}

	/// Produce the value of type \p LoadVT read through \p PtrVal for an
	/// expanded memcmp.
	///
	/// If \p PtrVal is a Constant and the load folds, the folded constant is
	/// returned. Otherwise a load with alignment 1 is emitted; unless alias
	/// analysis reports the memory as constant, the load's chain is appended to
	/// the builder's PendingLoads.
	static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
	                             SelectionDAGBuilder &Builder) {

	  // Check to see if this load can be trivially constant folded, e.g. if the
	  // input is from a string literal.
	  if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
	    // Cast pointer to the type we really want to load.
	    Type *LoadTy =
	        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
	    if (LoadVT.isVector())
	      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());

	    LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
	                                         PointerType::getUnqual(LoadTy));

	    if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr(
	            const_cast<Constant *>(LoadInput), LoadTy, *Builder.DL))
	      return Builder.getValue(LoadCst);
	  }

	  // Otherwise, we have to emit the load. If the pointer is to unfoldable but
	  // still constant memory, the input chain can be the entry node.
	  SDValue Root;
	  bool ConstantMemory = false;

	  // Do not serialize (non-volatile) loads of constant memory with anything.
	  if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) {
	    Root = Builder.DAG.getEntryNode();
	    ConstantMemory = true;
	  } else {
	    // Do not serialize non-volatile loads against each other.
	    Root = Builder.DAG.getRoot();
	  }

	  SDValue Ptr = Builder.getValue(PtrVal);
	  SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root,
	                                        Ptr, MachinePointerInfo(PtrVal),
	                                        /* Alignment = */ 1);

	  // Value 1 of the load is its output chain.
	  if (!ConstantMemory)
	    Builder.PendingLoads.push_back(LoadVal.getValue(1));
	  return LoadVal;
	}

	/// Record the value for an instruction that produces an integer result,
	/// converting the type where necessary.
	void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
	SDValue Value,
	bool IsSigned) {
	EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType(), true);
	if (IsSigned)
	Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
	else
	Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
	setValue(&I, Value);
	}

	/// See if we can lower a memcmp call into an optimized form. If so, return
	/// true and lower it. Otherwise return false, and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
	const Value LHS = I.getArgOperand(0), RHS = I.getArgOperand(1);
	const Value *Size = I.getArgOperand(2);
	const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
	if (CSize && CSize->getZExtValue() == 0) {
	EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
	I.getType(), true);
	setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
	return true;
	}

	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
	DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
	getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
	if (Res.first.getNode()) {
	processIntegerCallValue(I, Res.first, true);
	PendingLoads.push_back(Res.second);
	return true;
	}

	// memcmp(S1,S2,2) != 0 -> ((short)LHS != (short)RHS) != 0
	// memcmp(S1,S2,4) != 0 -> ((int)LHS != (int)RHS) != 0
	if (!CSize \|\| !isOnlyUsedInZeroEqualityComparison(&I))
	return false;

	// If the target has a fast compare for the given size, it will return a
	// preferred load type for that size. Require that the load VT is legal and
	// that the target supports unaligned loads of that type. Otherwise, return
	// INVALID.
	auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT LVT = TLI.hasFastEqualityCompare(NumBits);
	if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
	// TODO: Handle 5 byte compare as 4-byte + 1 byte.
	// TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
	// TODO: Check alignment of src and dest ptrs.
	unsigned DstAS = LHS->getType()->getPointerAddressSpace();
	unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
	if (!TLI.isTypeLegal(LVT) \|\|
	!TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) \|\|
	!TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
	LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	return LVT;
	};

	// This turns into unaligned loads. We only do this if the target natively
	// supports the MVT we'll be loading or if it is small enough (<= 4) that
	// we'll only produce a small number of byte loads.
	MVT LoadVT;
	unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
	switch (NumBitsToCompare) {
	default:
	return false;
	case 16:
	LoadVT = MVT::i16;
	break;
	case 32:
	LoadVT = MVT::i32;
	break;
	case 64:
	case 128:
	case 256:
	LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
	break;
	}

	if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
	return false;

	SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
	SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);

	// Bitcast to a wide integer type if the loads are vectors.
	if (LoadVT.isVector()) {
	EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
	LoadL = DAG.getBitcast(CmpVT, LoadL);
	LoadR = DAG.getBitcast(CmpVT, LoadR);
	}

	SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
	processIntegerCallValue(I, Cmp, false);
	return true;
	}

	/// See if we can lower a memchr call into an optimized form. If so, return
	/// true and lower it. Otherwise return false, and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
	const Value *Src = I.getArgOperand(0);
	const Value *Char = I.getArgOperand(1);
	const Value *Length = I.getArgOperand(2);

	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	std::pair<SDValue, SDValue> Res =
	TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
	getValue(Src), getValue(Char), getValue(Length),
	MachinePointerInfo(Src));
	if (Res.first.getNode()) {
	setValue(&I, Res.first);
	PendingLoads.push_back(Res.second);
	return true;
	}

	return false;
	}

	/// See if we can lower a mempcpy call into an optimized form. If so, return
	/// true and lower it. Otherwise return false, and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	///
	/// The call is emitted as a memcpy node followed by an ADD computing
	/// Dst + Size, which becomes the call's value. This implementation always
	/// returns true.
	bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
	  SDValue Dst = getValue(I.getArgOperand(0));
	  SDValue Src = getValue(I.getArgOperand(1));
	  SDValue Size = getValue(I.getArgOperand(2));

	  unsigned DstAlign = DAG.InferPtrAlignment(Dst);
	  unsigned SrcAlign = DAG.InferPtrAlignment(Src);
	  unsigned Align = std::min(DstAlign, SrcAlign);
	  if (Align == 0) // Alignment of one or both could not be inferred.
	    Align = 1;    // 0 and 1 both specify no alignment, but 0 is reserved.

	  bool isVol = false;
	  SDLoc sdl = getCurSDLoc();

	  // In the mempcpy context we need to pass in a false value for isTailCall
	  // because the return pointer needs to be adjusted by the size of
	  // the copied memory.
	  SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Align, isVol,
	                             false, /*isTailCall=*/false,
	                             MachinePointerInfo(I.getArgOperand(0)),
	                             MachinePointerInfo(I.getArgOperand(1)));
	  assert(MC.getNode() != nullptr &&
	         " memcpy should not be lowered as TailCall in mempcpy context ");
	  DAG.setRoot(MC);

	  // Check if Size needs to be truncated or extended.
	  Size = DAG.getSExtOrTrunc(Size, sdl, Dst.getValueType());

	  // Adjust return pointer to point just past the last dst byte.
	  SDValue DstPlusSize = DAG.getNode(ISD::ADD, sdl, Dst.getValueType(),
	                                    Dst, Size);
	  setValue(&I, DstPlusSize);
	  return true;
	}

	/// See if we can lower a strcpy call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	///
	/// \param isStpcpy forwarded to the target hook; visitCall passes true for
	/// stpcpy and false for strcpy.
	bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
	  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);

	  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	  std::pair<SDValue, SDValue> Res =
	      TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
	                                  getValue(Arg0), getValue(Arg1),
	                                  MachinePointerInfo(Arg0),
	                                  MachinePointerInfo(Arg1), isStpcpy);
	  if (Res.first.getNode()) {
	    setValue(&I, Res.first);
	    // Unlike the read-only string helpers, the returned chain becomes the
	    // new DAG root instead of being queued in PendingLoads.
	    DAG.setRoot(Res.second);
	    return true;
	  }

	  return false;
	}

	/// See if we can lower a strcmp call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
	  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);

	  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	  std::pair<SDValue, SDValue> Res =
	      TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
	                                  getValue(Arg0), getValue(Arg1),
	                                  MachinePointerInfo(Arg0),
	                                  MachinePointerInfo(Arg1));
	  if (Res.first.getNode()) {
	    // The result is converted to the call's type as a signed value.
	    processIntegerCallValue(I, Res.first, true);
	    PendingLoads.push_back(Res.second);
	    return true;
	  }

	  return false;
	}

	/// See if we can lower a strlen call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
	const Value *Arg0 = I.getArgOperand(0);

	const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	std::pair<SDValue, SDValue> Res =
	TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
	getValue(Arg0), MachinePointerInfo(Arg0));
	if (Res.first.getNode()) {
	processIntegerCallValue(I, Res.first, false);
	PendingLoads.push_back(Res.second);
	return true;
	}

	return false;
	}

	/// See if we can lower a strnlen call into an optimized form. If so, return
	/// true and lower it, otherwise return false and it will be lowered like a
	/// normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
	  const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);

	  const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
	  std::pair<SDValue, SDValue> Res =
	      TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
	                                   getValue(Arg0), getValue(Arg1),
	                                   MachinePointerInfo(Arg0));
	  if (Res.first.getNode()) {
	    // The length is converted to the call's type as an unsigned value.
	    processIntegerCallValue(I, Res.first, false);
	    PendingLoads.push_back(Res.second);
	    return true;
	  }

	  return false;
	}

	/// See if we can lower a unary floating-point operation into an SDNode with
	/// the specified Opcode. If so, return true and lower it, otherwise return
	/// false and it will be lowered like a normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
	unsigned Opcode) {
	// We already checked this call's prototype; verify it doesn't modify errno.
	if (!I.onlyReadsMemory())
	return false;

	SDValue Tmp = getValue(I.getArgOperand(0));
	setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp));
	return true;
	}

	/// See if we can lower a binary floating-point operation into an SDNode with
	/// the specified Opcode. If so, return true and lower it. Otherwise return
	/// false, and it will be lowered like a normal call.
	/// The caller already checked that \p I calls the appropriate LibFunc with a
	/// correct prototype.
	bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
	unsigned Opcode) {
	// We already checked this call's prototype; verify it doesn't modify errno.
	if (!I.onlyReadsMemory())
	return false;

	SDValue Tmp0 = getValue(I.getArgOperand(0));
	SDValue Tmp1 = getValue(I.getArgOperand(1));
	EVT VT = Tmp0.getValueType();
	setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1));
	return true;
	}

	void SelectionDAGBuilder::visitCall(const CallInst &I) {
	// Handle inline assembly differently.
	if (isa<InlineAsm>(I.getCalledValue())) {
	visitInlineAsm(&I);
	return;
	}

	MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
	computeUsesVAFloatArgument(I, MMI);

	const char *RenameFn = nullptr;
	if (Function *F = I.getCalledFunction()) {
	if (F->isDeclaration()) {
	if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) {
	if (unsigned IID = II->getIntrinsicID(F)) {
	RenameFn = visitIntrinsicCall(I, IID);
	if (!RenameFn)
	return;
	}
	}
	if (Intrinsic::ID IID = F->getIntrinsicID()) {
	RenameFn = visitIntrinsicCall(I, IID);
	if (!RenameFn)
	return;
	}
	}

	// Check for well-known libc/libm calls. If the function is internal, it
	// can't be a library call. Don't do the check if marked as nobuiltin for
	// some reason.
	LibFunc Func;
	if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
	LibInfo->getLibFunc(*F, Func) &&
	LibInfo->hasOptimizedCodeGen(Func)) {
	switch (Func) {
	default: break;
	case LibFunc_copysign:
	case LibFunc_copysignf:
	case LibFunc_copysignl:
	// We already checked this call's prototype; verify it doesn't modify
	// errno.
	if (I.onlyReadsMemory()) {
	SDValue LHS = getValue(I.getArgOperand(0));
	SDValue RHS = getValue(I.getArgOperand(1));
	setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
	LHS.getValueType(), LHS, RHS));
	return;
	}
	break;
	case LibFunc_fabs:
	case LibFunc_fabsf:
	case LibFunc_fabsl:
	if (visitUnaryFloatCall(I, ISD::FABS))
	return;
	break;
	case LibFunc_fmin:
	case LibFunc_fminf:
	case LibFunc_fminl:
	if (visitBinaryFloatCall(I, ISD::FMINNUM))
	return;
	break;
	case LibFunc_fmax:
	case LibFunc_fmaxf:
	case LibFunc_fmaxl:
	if (visitBinaryFloatCall(I, ISD::FMAXNUM))
	return;
	break;
	case LibFunc_sin:
	case LibFunc_sinf:
	case LibFunc_sinl:
	if (visitUnaryFloatCall(I, ISD::FSIN))
	return;
	break;
	case LibFunc_cos:
	case LibFunc_cosf:
	case LibFunc_cosl:
	if (visitUnaryFloatCall(I, ISD::FCOS))
	return;
	break;
	case LibFunc_sqrt:
	case LibFunc_sqrtf:
	case LibFunc_sqrtl:
	case LibFunc_sqrt_finite:
	case LibFunc_sqrtf_finite:
	case LibFunc_sqrtl_finite:
	if (visitUnaryFloatCall(I, ISD::FSQRT))
	return;
	break;
	case LibFunc_floor:
	case LibFunc_floorf:
	case LibFunc_floorl:
	if (visitUnaryFloatCall(I, ISD::FFLOOR))
	return;
	break;
	case LibFunc_nearbyint:
	case LibFunc_nearbyintf:
	case LibFunc_nearbyintl:
	if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
	return;
	break;
	case LibFunc_ceil:
	case LibFunc_ceilf:
	case LibFunc_ceill:
	if (visitUnaryFloatCall(I, ISD::FCEIL))
	return;
	break;
	case LibFunc_rint:
	case LibFunc_rintf:
	case LibFunc_rintl:
	if (visitUnaryFloatCall(I, ISD::FRINT))
	return;
	break;
	case LibFunc_round:
	case LibFunc_roundf:
	case LibFunc_roundl:
	if (visitUnaryFloatCall(I, ISD::FROUND))
	return;
	break;
	case LibFunc_trunc:
	case LibFunc_truncf:
	case LibFunc_truncl:
	if (visitUnaryFloatCall(I, ISD::FTRUNC))
	return;
	break;
	case LibFunc_log2:
	case LibFunc_log2f:
	case LibFunc_log2l:
	if (visitUnaryFloatCall(I, ISD::FLOG2))
	return;
	break;
	case LibFunc_exp2:
	case LibFunc_exp2f:
	case LibFunc_exp2l:
	if (visitUnaryFloatCall(I, ISD::FEXP2))
	return;
	break;
	case LibFunc_memcmp:
	if (visitMemCmpCall(I))
	return;
	break;
	case LibFunc_mempcpy:
	if (visitMemPCpyCall(I))
	return;
	break;
	case LibFunc_memchr:
	if (visitMemChrCall(I))
	return;
	break;
	case LibFunc_strcpy:
	if (visitStrCpyCall(I, false))
	return;
	break;
	case LibFunc_stpcpy:
	if (visitStrCpyCall(I, true))
	return;
	break;
	case LibFunc_strcmp:
	if (visitStrCmpCall(I))
	return;
	break;
	case LibFunc_strlen:
	if (visitStrLenCall(I))
	return;
	break;
	case LibFunc_strnlen:
	if (visitStrNLenCall(I))
	return;
	break;
	}
	}
	}

	SDValue Callee;
	if (!RenameFn)
	Callee = getValue(I.getCalledValue());
	else
	Callee = DAG.getExternalSymbol(
	RenameFn,
	DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));

	// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
	// have to do anything here to lower funclet bundles.
	assert(!I.hasOperandBundlesOtherThan(
	{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
	"Cannot lower calls with arbitrary operand bundles!");

	if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
	LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
	else
	// Check if we can potentially perform a tail call. More detailed checking
	// is be done within LowerCallTo, after more information about the call is
	// known.
	LowerCallTo(&I, Callee, I.isTailCall());
	}

	namespace {

	/// AsmOperandInfo - This contains information for each constraint that we are
	/// lowering.
	class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
	public:
	/// CallOperand - If this is the result output operand or a clobber
	/// this is null, otherwise it is the incoming operand to the CallInst.
	/// This gets modified as the asm is processed.
	SDValue CallOperand;

	/// AssignedRegs - If this is a register or register class operand, this
	/// contains the set of register corresponding to the operand.
	RegsForValue AssignedRegs;

	explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info)
	: TargetLowering::AsmOperandInfo(info), CallOperand(nullptr,0) {
	}

	/// Whether or not this operand accesses memory
	bool hasMemory(const TargetLowering &TLI) const {
	// Indirect operand accesses access memory.
	if (isIndirect)
	return true;

	for (const auto &Code : Codes)
	if (TLI.getConstraintType(Code) == TargetLowering::C_Memory)
	return true;

	return false;
	}

	/// getCallOperandValEVT - Return the EVT of the Value* that this operand
	/// corresponds to. If there is no Value* for this operand, it returns
	/// MVT::Other.
	EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
	const DataLayout &DL) const {
	if (!CallOperandVal) return MVT::Other;

	if (isa<BasicBlock>(CallOperandVal))
	return TLI.getPointerTy(DL);

	llvm::Type *OpTy = CallOperandVal->getType();

	// FIXME: code duplicated from TargetLowering::ParseConstraints().
	// If this is an indirect operand, the operand is a pointer to the
	// accessed type.
	if (isIndirect) {
	llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
	if (!PtrTy)
	report_fatal_error("Indirect operand for inline asm not a pointer!");
	OpTy = PtrTy->getElementType();
	}

	// Look for vector wrapped in a struct. e.g. { <16 x i8> }.
	if (StructType *STy = dyn_cast<StructType>(OpTy))
	if (STy->getNumElements() == 1)
	OpTy = STy->getElementType(0);

	// If OpTy is not a single value, it may be a struct/union that we
	// can tile with integers.
	if (!OpTy->isSingleValueType() && OpTy->isSized()) {
	unsigned BitSize = DL.getTypeSizeInBits(OpTy);
	switch (BitSize) {
	default: break;
	case 1:
	case 8:
	case 16:
	case 32:
	case 64:
	case 128:
	OpTy = IntegerType::get(Context, BitSize);
	break;
	}
	}

	return TLI.getValueType(DL, OpTy, true);
	}
	};

	typedef SmallVector<SDISelAsmOperandInfo,16> SDISelAsmOperandInfoVector;

	} // end anonymous namespace

	/// Make sure that the output operand \p OpInfo and its corresponding input
	/// operand \p MatchingOpInfo have compatible constraint types (otherwise error
	/// out).
	///
	/// Nothing is done when the two constraint VTs already agree. Otherwise the
	/// operands must agree on integer-ness and resolve to the same register
	/// class; \p MatchingOpInfo then takes on \p OpInfo's constraint VT.
	static void patchMatchingInput(const SDISelAsmOperandInfo &OpInfo,
	                               SDISelAsmOperandInfo &MatchingOpInfo,
	                               SelectionDAG &DAG) {
	  if (OpInfo.ConstraintVT == MatchingOpInfo.ConstraintVT)
	    return;

	  const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
	  const auto &TLI = DAG.getTargetLoweringInfo();

	  std::pair<unsigned, const TargetRegisterClass *> MatchRC =
	      TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
	                                       OpInfo.ConstraintVT);
	  std::pair<unsigned, const TargetRegisterClass *> InputRC =
	      TLI.getRegForInlineAsmConstraint(TRI, MatchingOpInfo.ConstraintCode,
	                                       MatchingOpInfo.ConstraintVT);
	  if ((OpInfo.ConstraintVT.isInteger() !=
	       MatchingOpInfo.ConstraintVT.isInteger()) ||
	      (MatchRC.second != InputRC.second)) {
	    // FIXME: error out in a more elegant fashion
	    report_fatal_error("Unsupported asm: input constraint"
	                       " with a matching output constraint of"
	                       " incompatible type!");
	  }
	  MatchingOpInfo.ConstraintVT = OpInfo.ConstraintVT;
	}

	/// Get a direct memory input to behave well as an indirect operand.
	/// This may introduce stores, hence the need for a \p Chain.
	///
	/// On return OpInfo.CallOperand holds an address: a constant pool entry for
	/// FP, integer and vector constants, otherwise a fresh stack slot into which
	/// the original operand has been stored.
	///
	/// \return The (possibly updated) chain.
	static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
	                                        SDISelAsmOperandInfo &OpInfo,
	                                        SelectionDAG &DAG) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // If we don't have an indirect input, put it in the constpool if we can,
	  // otherwise spill it to a stack slot.
	  // TODO: This isn't quite right. We need to handle these according to
	  // the addressing mode that the constraint wants. Also, this may take
	  // an additional register for the computation and we don't want that
	  // either.

	  // If the operand is a float, integer, or vector constant, spill to a
	  // constant pool entry to get its address.
	  const Value *OpVal = OpInfo.CallOperandVal;
	  if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
	      isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
	    OpInfo.CallOperand = DAG.getConstantPool(
	        cast<Constant>(OpVal), TLI.getPointerTy(DAG.getDataLayout()));
	    return Chain;
	  }

	  // Otherwise, create a stack slot and emit a store to it before the asm.
	  Type *Ty = OpVal->getType();
	  auto &DL = DAG.getDataLayout();
	  uint64_t TySize = DL.getTypeAllocSize(Ty);
	  unsigned Align = DL.getPrefTypeAlignment(Ty);
	  MachineFunction &MF = DAG.getMachineFunction();
	  int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
	  Chain = DAG.getStore(Chain, Location, OpInfo.CallOperand, StackSlot,
	                       MachinePointerInfo::getFixedStack(MF, SSFI));
	  OpInfo.CallOperand = StackSlot;

	  return Chain;
	}

	/// GetRegistersForValue - Assign registers (virtual or physical) for the
	/// specified operand. We prefer to assign virtual registers, to allow the
	/// register allocator to handle the assignment process. However, if the asm
	/// uses features that we can't model on machineinstrs, we have SDISel do the
	/// allocation. This produces generally horrible, but correct, code.
	///
	/// OpInfo describes the operand. On success OpInfo.AssignedRegs is filled
	/// in; if the constraint resolves to neither a physical register nor a
	/// register class, it is left untouched.
	///
	static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
	                                 const SDLoc &DL,
	                                 SDISelAsmOperandInfo &OpInfo) {
	  LLVMContext &Context = *DAG.getContext();

	  MachineFunction &MF = DAG.getMachineFunction();
	  SmallVector<unsigned, 4> Regs;
	  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();

	  // If this is a constraint for a single physreg, or a constraint for a
	  // register class, find it.
	  std::pair<unsigned, const TargetRegisterClass *> PhysReg =
	      TLI.getRegForInlineAsmConstraint(&TRI, OpInfo.ConstraintCode,
	                                       OpInfo.ConstraintVT);

	  unsigned NumRegs = 1;
	  if (OpInfo.ConstraintVT != MVT::Other) {
	    // If this is a FP input in an integer register (or visa versa) insert a bit
	    // cast of the input value. More generally, handle any case where the input
	    // value disagrees with the register class we plan to stick this in.
	    if (OpInfo.Type == InlineAsm::isInput && PhysReg.second &&
	        !TRI.isTypeLegalForClass(*PhysReg.second, OpInfo.ConstraintVT)) {
	      // Try to convert to the first EVT that the reg class contains. If the
	      // types are identical size, use a bitcast to convert (e.g. two differing
	      // vector types).
	      MVT RegVT = *TRI.legalclasstypes_begin(*PhysReg.second);
	      if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) {
	        OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
	                                         RegVT, OpInfo.CallOperand);
	        OpInfo.ConstraintVT = RegVT;
	      } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
	        // If the input is a FP value and we want it in FP registers, do a
	        // bitcast to the corresponding integer type. This turns an f64 value
	        // into i64, which can be passed with two i32 values on a 32-bit
	        // machine.
	        RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
	        OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
	                                         RegVT, OpInfo.CallOperand);
	        OpInfo.ConstraintVT = RegVT;
	      }
	    }

	    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);
	  }

	  MVT RegVT;
	  EVT ValueVT = OpInfo.ConstraintVT;

	  // If this is a constraint for a specific physical register, like {r17},
	  // assign it now.
	  if (unsigned AssignedReg = PhysReg.first) {
	    const TargetRegisterClass *RC = PhysReg.second;
	    if (OpInfo.ConstraintVT == MVT::Other)
	      ValueVT = *TRI.legalclasstypes_begin(*RC);

	    // Get the actual register value type. This is important, because the user
	    // may have asked for (e.g.) the AX register in i32 type. We need to
	    // remember that AX is actually i16 to get the right extension.
	    RegVT = *TRI.legalclasstypes_begin(*RC);

	    // This is a explicit reference to a physical register.
	    Regs.push_back(AssignedReg);

	    // If this is an expanded reference, add the rest of the regs to Regs.
	    if (NumRegs != 1) {
	      // Locate AssignedReg within its class.
	      // NOTE(review): the loop condition dereferences I before the assert
	      // compares it against RC->end().
	      TargetRegisterClass::iterator I = RC->begin();
	      for (; *I != AssignedReg; ++I)
	        assert(I != RC->end() && "Didn't find reg!");

	      // Already added the first reg.
	      --NumRegs; ++I;
	      for (; NumRegs; --NumRegs, ++I) {
	        assert(I != RC->end() && "Ran out of registers to allocate!");
	        Regs.push_back(*I);
	      }
	    }

	    OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
	    return;
	  }

	  // Otherwise, if this was a reference to an LLVM register class, create vregs
	  // for this reference.
	  if (const TargetRegisterClass *RC = PhysReg.second) {
	    RegVT = *TRI.legalclasstypes_begin(*RC);
	    if (OpInfo.ConstraintVT == MVT::Other)
	      ValueVT = RegVT;

	    // Create the appropriate number of virtual registers.
	    MachineRegisterInfo &RegInfo = MF.getRegInfo();
	    for (; NumRegs; --NumRegs)
	      Regs.push_back(RegInfo.createVirtualRegister(RC));

	    OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
	    return;
	  }

	  // Otherwise, we couldn't allocate enough registers for this.
	}

	/// Return the index into \p AsmNodeOperands of the flag word that starts
	/// operand number \p OperandNo, walking forward from
	/// InlineAsm::Op_FirstOperand and skipping each earlier operand's flag word
	/// together with the registers it announces.
	static unsigned
	findMatchingInlineAsmOperand(unsigned OperandNo,
	                             const std::vector<SDValue> &AsmNodeOperands) {
	  // Scan until we find the definition we already emitted of this operand.
	  unsigned CurOp = InlineAsm::Op_FirstOperand;
	  for (; OperandNo; --OperandNo) {
	    // Advance to the next operand.
	    unsigned OpFlag =
	        cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
	    assert((InlineAsm::isRegDefKind(OpFlag) ||
	            InlineAsm::isRegDefEarlyClobberKind(OpFlag) ||
	            InlineAsm::isMemKind(OpFlag)) &&
	           "Skipped past definitions?");
	    // One slot for the flag word itself plus one per register it describes.
	    CurOp += InlineAsm::getNumOperandRegisters(OpFlag) + 1;
	  }
	  return CurOp;
	}

	/// Append \p NumRegs freshly created virtual registers to \p Regs, each in
	/// the register class the target reports for \p RegVT.
	/// \return false as soon as the target has no register class for \p RegVT
	/// (registers appended before that point stay in \p Regs), true otherwise.
	static bool createVirtualRegs(SmallVector<unsigned, 4> &Regs, unsigned NumRegs,
	                              MVT RegVT, SelectionDAG &DAG) {
	  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
	  const TargetLowering &Lowering = DAG.getTargetLoweringInfo();

	  for (unsigned Remaining = NumRegs; Remaining != 0; --Remaining) {
	    const TargetRegisterClass *Class = Lowering.getRegClassFor(RegVT);
	    if (!Class)
	      return false;
	    Regs.push_back(MRI.createVirtualRegister(Class));
	  }
	  return true;
	}

	/// Accumulates the InlineAsm::Extra_* bits for one inline asm call site.
	/// The constructor seeds the bits that depend only on the asm itself and the
	/// call site; update() adds MayLoad/MayStore as operands are examined.
	class ExtraFlags {
	  unsigned Flags = 0;

	public:
	  /// Record the side-effect, align-stack, convergent and dialect bits of the
	  /// inline asm called by \p CS.
	  explicit ExtraFlags(ImmutableCallSite CS) {
	    const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
	    if (IA->hasSideEffects())
	      Flags |= InlineAsm::Extra_HasSideEffects;
	    if (IA->isAlignStack())
	      Flags |= InlineAsm::Extra_IsAlignStack;
	    if (CS.isConvergent())
	      Flags |= InlineAsm::Extra_IsConvergent;
	    Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
	  }

	  /// Fold the memory effects implied by one operand constraint into the
	  /// flags.
	  void update(const llvm::TargetLowering::AsmOperandInfo &OpInfo) {
	    // Ideally, we would only check against memory constraints. However, the
	    // meaning of an Other constraint can be target-specific and we can't easily
	    // reason about it. Therefore, be conservative and set MayLoad/MayStore
	    // for Other constraints as well.
	    if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
	        OpInfo.ConstraintType == TargetLowering::C_Other) {
	      if (OpInfo.Type == InlineAsm::isInput)
	        Flags |= InlineAsm::Extra_MayLoad;
	      else if (OpInfo.Type == InlineAsm::isOutput)
	        Flags |= InlineAsm::Extra_MayStore;
	      else if (OpInfo.Type == InlineAsm::isClobber)
	        Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
	    }
	  }

	  /// Return the accumulated flag word.
	  unsigned get() const { return Flags; }
	};

	/// visitInlineAsm - Handle a call to an InlineAsm object.
	///
	/// The lowering runs in several passes over the parsed constraints:
	///  1. compute each operand's value type and the MayLoad/MayStore flags,
	///  2. pick the constraint alternative and allocate specific physregs,
	///  3. allocate registers for register-class operands,
	///  4. build the ISD::INLINEASM operand list and emit the node,
	///  5. copy register results out and emit stores for indirect outputs.
	/// On any unsupported or unsatisfiable operand an error is emitted through
	/// emitInlineAsmError and lowering of this call stops.
	void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
	  const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());

	  /// ConstraintOperands - Information about all of the constraints.
	  SDISelAsmOperandInfoVector ConstraintOperands;

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
	      DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS);

	  bool hasMemory = false;

	  // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
	  ExtraFlags ExtraInfo(CS);

	  unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
	  unsigned ResNo = 0; // ResNo - The result number of the next output.
	  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
	    ConstraintOperands.push_back(SDISelAsmOperandInfo(TargetConstraints[i]));
	    SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();

	    MVT OpVT = MVT::Other;

	    // Compute the value type for each operand.
	    if (OpInfo.Type == InlineAsm::isInput ||
	        (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
	      OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));

	      // Process the call argument. BasicBlocks are labels, currently appearing
	      // only in asm's.
	      if (const BasicBlock *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
	        OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
	      } else {
	        OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
	      }

	      OpVT =
	          OpInfo
	              .getCallOperandValEVT(*DAG.getContext(), TLI, DAG.getDataLayout())
	              .getSimpleVT();
	    }

	    if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
	      // The return value of the call is this value. As such, there is no
	      // corresponding argument.
	      assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
	      if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
	        OpVT = TLI.getSimpleValueType(DAG.getDataLayout(),
	                                      STy->getElementType(ResNo));
	      } else {
	        assert(ResNo == 0 && "Asm only has one result!");
	        OpVT = TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType());
	      }
	      ++ResNo;
	    }

	    OpInfo.ConstraintVT = OpVT;

	    if (!hasMemory)
	      hasMemory = OpInfo.hasMemory(TLI);

	    // Determine if this InlineAsm MayLoad or MayStore based on the constraints.
	    // FIXME: Could we compute this on OpInfo rather than TargetConstraints[i]?
	    // A copy is taken on purpose: ComputeConstraintToUse is handed the copy,
	    // so TargetConstraints[i] itself is not passed to it.
	    auto TargetConstraint = TargetConstraints[i];

	    // Compute the constraint code and ConstraintType to use.
	    TLI.ComputeConstraintToUse(TargetConstraint, SDValue());

	    ExtraInfo.update(TargetConstraint);
	  }

	  SDValue Chain, Flag;

	  // We won't need to flush pending loads if this asm doesn't touch
	  // memory and is nonvolatile.
	  if (hasMemory || IA->hasSideEffects())
	    Chain = getRoot();
	  else
	    Chain = DAG.getRoot();

	  // Second pass over the constraints: compute which constraint option to use
	  // and assign registers to constraints that want a specific physreg.
	  for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
	    SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];

	    // If this is an output operand with a matching input operand, look up the
	    // matching input. If their types mismatch, e.g. one is an integer, the
	    // other is floating point, or their sizes are different, flag it as an
	    // error.
	    if (OpInfo.hasMatchingInput()) {
	      SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
	      patchMatchingInput(OpInfo, Input, DAG);
	    }

	    // Compute the constraint code and ConstraintType to use.
	    TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);

	    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
	        OpInfo.Type == InlineAsm::isClobber)
	      continue;

	    // If this is a memory input, and if the operand is not indirect, do what we
	    // need to to provide an address for the memory input.
	    if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
	        !OpInfo.isIndirect) {
	      assert((OpInfo.isMultipleAlternative ||
	              (OpInfo.Type == InlineAsm::isInput)) &&
	             "Can only indirectify direct input operands!");

	      // Memory operands really want the address of the value.
	      Chain = getAddressForMemoryInput(Chain, getCurSDLoc(), OpInfo, DAG);

	      // There is no longer a Value* corresponding to this operand.
	      OpInfo.CallOperandVal = nullptr;

	      // It is now an indirect operand.
	      OpInfo.isIndirect = true;
	    }

	    // If this constraint is for a specific register, allocate it before
	    // anything else.
	    if (OpInfo.ConstraintType == TargetLowering::C_Register)
	      GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo);
	  }

	  // Third pass - Loop over all of the operands, assigning virtual or physregs
	  // to register class operands.
	  for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
	    SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];

	    // C_Register operands have already been allocated, Other/Memory don't need
	    // to be.
	    if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass)
	      GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo);
	  }

	  // AsmNodeOperands - The operands for the ISD::INLINEASM node.
	  std::vector<SDValue> AsmNodeOperands;
	  AsmNodeOperands.push_back(SDValue());  // reserve space for input chain
	  AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
	      IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout())));

	  // If we have a !srcloc metadata node associated with it, we want to attach
	  // this to the ultimately generated inline asm machineinstr. To do this, we
	  // pass in the third operand as this (potentially null) inline asm MDNode.
	  const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
	  AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));

	  // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
	  // bits as operand 3.
	  AsmNodeOperands.push_back(DAG.getTargetConstant(
	      ExtraInfo.get(), getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));

	  // Loop over all of the inputs, copying the operand values into the
	  // appropriate registers and processing the output regs.
	  RegsForValue RetValRegs;

	  // IndirectStoresToEmit - The set of stores to emit after the inline asm node.
	  std::vector<std::pair<RegsForValue, Value*> > IndirectStoresToEmit;

	  for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
	    SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];

	    switch (OpInfo.Type) {
	    case InlineAsm::isOutput: {
	      if (OpInfo.ConstraintType != TargetLowering::C_RegisterClass &&
	          OpInfo.ConstraintType != TargetLowering::C_Register) {
	        // Memory output, or 'other' output (e.g. 'X' constraint).
	        assert(OpInfo.isIndirect && "Memory output must be indirect operand");

	        unsigned ConstraintID =
	            TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
	        assert(ConstraintID != InlineAsm::Constraint_Unknown &&
	               "Failed to convert memory constraint code to constraint id.");

	        // Add information to the INLINEASM node to know about this output.
	        unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
	        OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
	        AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, getCurSDLoc(),
	                                                        MVT::i32));
	        AsmNodeOperands.push_back(OpInfo.CallOperand);
	        break;
	      }

	      // Otherwise, this is a register or register class output.

	      // Copy the output from the appropriate register. Find a register that
	      // we can use.
	      if (OpInfo.AssignedRegs.Regs.empty()) {
	        emitInlineAsmError(
	            CS, "couldn't allocate output register for constraint '" +
	                    Twine(OpInfo.ConstraintCode) + "'");
	        return;
	      }

	      // If this is an indirect operand, store through the pointer after the
	      // asm.
	      if (OpInfo.isIndirect) {
	        IndirectStoresToEmit.push_back(std::make_pair(OpInfo.AssignedRegs,
	                                                      OpInfo.CallOperandVal));
	      } else {
	        // This is the result value of the call.
	        assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
	        // Concatenate this output onto the outputs list.
	        RetValRegs.append(OpInfo.AssignedRegs);
	      }

	      // Add information to the INLINEASM node to know that this register is
	      // set.
	      OpInfo.AssignedRegs
	          .AddInlineAsmOperands(OpInfo.isEarlyClobber
	                                    ? InlineAsm::Kind_RegDefEarlyClobber
	                                    : InlineAsm::Kind_RegDef,
	                                false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
	      break;
	    }
	    case InlineAsm::isInput: {
	      SDValue InOperandVal = OpInfo.CallOperand;

	      if (OpInfo.isMatchingInputConstraint()) {
	        // If this is required to match an output register we have already set,
	        // just use its register.
	        auto CurOp = findMatchingInlineAsmOperand(OpInfo.getMatchedOperand(),
	                                                  AsmNodeOperands);
	        unsigned OpFlag =
	          cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
	        if (InlineAsm::isRegDefKind(OpFlag) ||
	            InlineAsm::isRegDefEarlyClobberKind(OpFlag)) {
	          // Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
	          if (OpInfo.isIndirect) {
	            // This happens on gcc/testsuite/gcc.dg/pr8788-1.c
	            emitInlineAsmError(CS, "inline asm not supported yet:"
	                                   " don't know how to handle tied "
	                                   "indirect register inputs");
	            return;
	          }

	          MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType();
	          SmallVector<unsigned, 4> Regs;

	          if (!createVirtualRegs(Regs,
	                                 InlineAsm::getNumOperandRegisters(OpFlag),
	                                 RegVT, DAG)) {
	            emitInlineAsmError(CS, "inline asm error: This value type register "
	                                   "class is not natively supported!");
	            return;
	          }

	          RegsForValue MatchedRegs(Regs, RegVT, InOperandVal.getValueType());

	          SDLoc dl = getCurSDLoc();
	          // Use the produced MatchedRegs object to copy the input value into
	          // the new virtual registers and to record them as a register use
	          // tied to the matched operand.
	          MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
	                                    CS.getInstruction());
	          MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
	                                           true, OpInfo.getMatchedOperand(), dl,
	                                           DAG, AsmNodeOperands);
	          break;
	        }

	        assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!");
	        assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 &&
	               "Unexpected number of operands");
	        // Add information to the INLINEASM node to know about this input.
	        // See InlineAsm.h isUseOperandTiedToDef.
	        OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag);
	        OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
	                                                    OpInfo.getMatchedOperand());
	        AsmNodeOperands.push_back(DAG.getTargetConstant(
	            OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
	        AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
	        break;
	      }

	      // Treat indirect 'X' constraint as memory.
	      if (OpInfo.ConstraintType == TargetLowering::C_Other &&
	          OpInfo.isIndirect)
	        OpInfo.ConstraintType = TargetLowering::C_Memory;

	      if (OpInfo.ConstraintType == TargetLowering::C_Other) {
	        std::vector<SDValue> Ops;
	        TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
	                                          Ops, DAG);
	        if (Ops.empty()) {
	          emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
	                                     Twine(OpInfo.ConstraintCode) + "'");
	          return;
	        }

	        // Add information to the INLINEASM node to know about this input.
	        unsigned ResOpType =
	          InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
	        AsmNodeOperands.push_back(DAG.getTargetConstant(
	            ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
	        AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
	        break;
	      }

	      if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
	        assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
	        assert(InOperandVal.getValueType() ==
	                   TLI.getPointerTy(DAG.getDataLayout()) &&
	               "Memory operands expect pointer values");

	        unsigned ConstraintID =
	            TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
	        assert(ConstraintID != InlineAsm::Constraint_Unknown &&
	               "Failed to convert memory constraint code to constraint id.");

	        // Add information to the INLINEASM node to know about this input.
	        unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
	        ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID);
	        AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
	                                                        getCurSDLoc(),
	                                                        MVT::i32));
	        AsmNodeOperands.push_back(InOperandVal);
	        break;
	      }

	      assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
	              OpInfo.ConstraintType == TargetLowering::C_Register) &&
	             "Unknown constraint type!");

	      // TODO: Support this.
	      if (OpInfo.isIndirect) {
	        emitInlineAsmError(
	            CS, "Don't know how to handle indirect register inputs yet "
	                "for constraint '" +
	                    Twine(OpInfo.ConstraintCode) + "'");
	        return;
	      }

	      // Copy the input into the appropriate registers.
	      if (OpInfo.AssignedRegs.Regs.empty()) {
	        emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" +
	                                   Twine(OpInfo.ConstraintCode) + "'");
	        return;
	      }

	      SDLoc dl = getCurSDLoc();

	      OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl,
	                                        Chain, &Flag, CS.getInstruction());

	      OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
	                                               dl, DAG, AsmNodeOperands);
	      break;
	    }
	    case InlineAsm::isClobber: {
	      // Add the clobbered value to the operand list, so that the register
	      // allocator is aware that the physreg got clobbered.
	      if (!OpInfo.AssignedRegs.Regs.empty())
	        OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber,
	                                                 false, 0, getCurSDLoc(), DAG,
	                                                 AsmNodeOperands);
	      break;
	    }
	    }
	  }

	  // Finish up input operands. Set the input chain and add the flag last.
	  AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
	  if (Flag.getNode()) AsmNodeOperands.push_back(Flag);

	  Chain = DAG.getNode(ISD::INLINEASM, getCurSDLoc(),
	                      DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
	  Flag = Chain.getValue(1);

	  // If this asm returns a register value, copy the result from that register
	  // and set it as the value of the call.
	  if (!RetValRegs.Regs.empty()) {
	    SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
	                                             Chain, &Flag, CS.getInstruction());

	    // FIXME: Why don't we do this for inline asms with MRVs?
	    if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
	      EVT ResultType = TLI.getValueType(DAG.getDataLayout(), CS.getType());

	      // If any of the results of the inline asm is a vector, it may have the
	      // wrong width/num elts. This can happen for register classes that can
	      // contain multiple different value types. The preg or vreg allocated may
	      // not have the same VT as was expected. Convert it to the right type
	      // with bit_convert.
	      if (ResultType != Val.getValueType() && Val.getValueType().isVector()) {
	        Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(),
	                          ResultType, Val);

	      } else if (ResultType != Val.getValueType() &&
	                 ResultType.isInteger() && Val.getValueType().isInteger()) {
	        // If a result value was tied to an input value, the computed result may
	        // have a wider width than the expected result. Extract the relevant
	        // portion.
	        Val = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultType, Val);
	      }

	      assert(ResultType == Val.getValueType() && "Asm result value mismatch!");
	    }

	    setValue(CS.getInstruction(), Val);
	    // Don't need to use this as a chain in this case.
	    if (!IA->hasSideEffects() && !hasMemory && IndirectStoresToEmit.empty())
	      return;
	  }

	  std::vector<std::pair<SDValue, const Value *> > StoresToEmit;

	  // Process indirect outputs, first output all of the flagged copies out of
	  // physregs.
	  for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) {
	    RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
	    const Value *Ptr = IndirectStoresToEmit[i].second;
	    SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
	                                             Chain, &Flag, IA);
	    StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
	  }

	  // Emit the non-flagged stores from the physregs.
	  SmallVector<SDValue, 8> OutChains;
	  for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) {
	    SDValue Val = DAG.getStore(Chain, getCurSDLoc(), StoresToEmit[i].first,
	                               getValue(StoresToEmit[i].second),
	                               MachinePointerInfo(StoresToEmit[i].second));
	    OutChains.push_back(Val);
	  }

	  if (!OutChains.empty())
	    Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains);

	  DAG.setRoot(Chain);
	}

	/// Report \p Message as an error on the inline asm call \p CS and give the
	/// call an undef value so that later uses of its result still find a node.
	void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
	                                             const Twine &Message) {
	  LLVMContext &Ctx = *DAG.getContext();
	  Ctx.emitError(CS.getInstruction(), Message);

	  // Make sure we leave the DAG in a valid state. The asm may return nothing
	  // or a struct of several results, so split the return type into its
	  // component value types instead of asking for a single value type.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SmallVector<EVT, 1> ValueVTs;
	  ComputeValueVTs(TLI, DAG.getDataLayout(), CS.getType(), ValueVTs);

	  // No component value types: there is no result to define.
	  if (ValueVTs.empty())
	    return;

	  SmallVector<SDValue, 1> Ops;
	  for (unsigned i = 0, e = ValueVTs.size(); i != e; ++i)
	    Ops.push_back(DAG.getUNDEF(ValueVTs[i]));

	  setValue(CS.getInstruction(), DAG.getMergeValues(Ops, getCurSDLoc()));
	}

	void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
	DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
	MVT::Other, getRoot(),
	getValue(I.getArgOperand(0)),
	DAG.getSrcValue(I.getArgOperand(0))));
	}

	void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	const DataLayout &DL = DAG.getDataLayout();
	SDValue V = DAG.getVAArg(TLI.getValueType(DAG.getDataLayout(), I.getType()),
	getCurSDLoc(), getRoot(), getValue(I.getOperand(0)),
	DAG.getSrcValue(I.getOperand(0)),
	DL.getABITypeAlignment(I.getType()));
	setValue(&I, V);
	DAG.setRoot(V.getValue(1));
	}

	void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
	DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(),
	MVT::Other, getRoot(),
	getValue(I.getArgOperand(0)),
	DAG.getSrcValue(I.getArgOperand(0))));
	}

	void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
	DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(),
	MVT::Other, getRoot(),
	getValue(I.getArgOperand(0)),
	getValue(I.getArgOperand(1)),
	DAG.getSrcValue(I.getArgOperand(0)),
	DAG.getSrcValue(I.getArgOperand(1))));
	}

	/// If \p I carries !range metadata describing an unsigned range that starts
	/// at zero, wrap the first result of \p Op in an AssertZext to the number of
	/// bits needed for the range's maximum. Any further results of \p Op are
	/// passed through unchanged via a merge node. Returns \p Op untouched when
	/// there is no usable range.
	SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
	                                                    const Instruction &I,
	                                                    SDValue Op) {
	  const MDNode *Range = I.getMetadata(LLVMContext::MD_range);
	  if (!Range)
	    return Op;

	  ConstantRange CR = getConstantRangeFromMetadata(*Range);
	  if (CR.isFullSet() || CR.isEmptySet() || CR.isWrappedSet())
	    return Op;

	  // Only a range whose unsigned minimum is zero is turned into an AssertZext.
	  APInt Lo = CR.getUnsignedMin();
	  if (!Lo.isMinValue())
	    return Op;

	  APInt Hi = CR.getUnsignedMax();
	  unsigned Bits = Hi.getActiveBits();

	  EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits);

	  SDLoc SL = getCurSDLoc();

	  SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(), Op,
	                             DAG.getValueType(SmallVT));
	  unsigned NumVals = Op.getNode()->getNumValues();
	  if (NumVals == 1)
	    return ZExt;

	  SmallVector<SDValue, 4> Ops;

	  Ops.push_back(ZExt);
	  // Use a distinct index name so the Instruction parameter I is not shadowed.
	  for (unsigned ValIdx = 1; ValIdx != NumVals; ++ValIdx)
	    Ops.push_back(Op.getValue(ValIdx));

	  return DAG.getMergeValues(Ops, SL);
	}

	/// \brief Populate a CallLoweringInfo (into \p CLI) based on the properties of
	/// the call being lowered.
	///
	/// This is a helper for lowering intrinsics that follow a target calling
	/// convention or require stack pointer adjustment. Only a subset of the
	/// intrinsic's operands need to participate in the calling convention:
	/// the \p NumArgs operands of \p CS starting at operand \p ArgIdx.
	void SelectionDAGBuilder::populateCallLoweringInfo(
	    TargetLowering::CallLoweringInfo &CLI, ImmutableCallSite CS,
	    unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy,
	    bool IsPatchPoint) {
	  TargetLowering::ArgListTy Args;
	  Args.reserve(NumArgs);

	  // Populate the argument list.
	  for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs;
	       ArgI != ArgE; ++ArgI) {
	    const Value *V = CS->getOperand(ArgI);

	    assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");

	    TargetLowering::ArgListEntry Entry;
	    Entry.Node = getValue(V);
	    Entry.Ty = V->getType();
	    // Look up the attributes of the operand being added, not those of the
	    // first forwarded operand.
	    // NOTE(review): assumes setAttributes takes the same index space as
	    // CS->getOperand(); confirm against ArgListEntry::setAttributes.
	    Entry.setAttributes(&CS, ArgI);
	    Args.push_back(Entry);
	  }

	  CLI.setDebugLoc(getCurSDLoc())
	      .setChain(getRoot())
	      .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args))
	      .setDiscardResult(CS->use_empty())
	      .setIsPatchPoint(IsPatchPoint);
	}

	/// \brief Append the live-variable operands of a stackmap or patchpoint call
	/// (arguments of \p CS from index \p StartIdx onwards) to \p Ops.
	///
	/// Constant operands become a StackMaps::ConstantOp marker followed by the
	/// value as an i64 TargetConstant; this is purely an optimization to avoid
	/// constant materialization and register allocation.
	///
	/// FrameIndex operands become TargetFrameIndex so that ISEL does not
	/// generate address computation nodes, and so ExpandISelPseudo can convert the
	/// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids
	/// address materialization and register allocation, but may also be required
	/// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an
	/// alloca in the entry block, then the runtime may assume that the alloca's
	/// StackMap location can be read immediately after compilation and that the
	/// location is valid at any point during execution (this is similar to the
	/// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
	/// only available in a register, then the runtime would need to trap when
	/// execution reaches the StackMap in order to read the alloca's location.
	///
	/// Every other operand is appended unchanged.
	static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
	                                const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
	                                SelectionDAGBuilder &Builder) {
	  const unsigned NumArgs = CS.arg_size();
	  for (unsigned ArgNo = StartIdx; ArgNo != NumArgs; ++ArgNo) {
	    SDValue LiveVal = Builder.getValue(CS.getArgument(ArgNo));

	    if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(LiveVal)) {
	      Ops.push_back(
	          Builder.DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
	      Ops.push_back(
	          Builder.DAG.getTargetConstant(Const->getSExtValue(), DL, MVT::i64));
	      continue;
	    }

	    if (FrameIndexSDNode *Frame = dyn_cast<FrameIndexSDNode>(LiveVal)) {
	      const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
	      Ops.push_back(Builder.DAG.getTargetFrameIndex(
	          Frame->getIndex(), TLI.getFrameIndexTy(Builder.DAG.getDataLayout())));
	      continue;
	    }

	    Ops.push_back(LiveVal);
	  }
	}

	/// \brief Lower llvm.experimental.stackmap directly to its target opcode.
	void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
	// void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
	// [live variables...])

	assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");

	SDValue Chain, InFlag, Callee, NullPtr;
	SmallVector<SDValue, 32> Ops;

	SDLoc DL = getCurSDLoc();
	Callee = getValue(CI.getCalledValue());
	NullPtr = DAG.getIntPtrConstant(0, DL, true);

	// The stackmap intrinsic only records the live variables (the arguemnts
	// passed to it) and emits NOPS (if requested). Unlike the patchpoint
	// intrinsic, this won't be lowered to a function call. This means we don't
	// have to worry about calling conventions and target specific lowering code.
	// Instead we perform the call lowering right here.
	//
	// chain, flag = CALLSEQ_START(chain, 0, 0)
	// chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
	// chain, flag = CALLSEQ_END(chain, 0, 0, flag)
	//
	Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL);
	InFlag = Chain.getValue(1);

	// Add the <id> and <numBytes> constants.
	SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos));
	Ops.push_back(DAG.getTargetConstant(
	cast<ConstantSDNode>(IDVal)->getZExtValue(), DL, MVT::i64));
	SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos));
	Ops.push_back(DAG.getTargetConstant(
	cast<ConstantSDNode>(NBytesVal)->getZExtValue(), DL,
	MVT::i32));

	// Push live variables for the stack map.
	addStackMapLiveVars(&CI, 2, DL, Ops, *this);

	// We are not pushing any register mask info here on the operands list,
	// because the stackmap doesn't clobber anything.

	// Push the chain and the glue flag.
	Ops.push_back(Chain);
	Ops.push_back(InFlag);

	// Create the STACKMAP node.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops);
	Chain = SDValue(SM, 0);
	InFlag = Chain.getValue(1);

	Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL);

	// Stackmaps don't generate values, so nothing goes into the NodeMap.

	// Set the root to the target-lowered call chain.
	DAG.setRoot(Chain);

	// Inform the Frame Information that we have a stackmap in this function.
	FuncInfo.MF->getFrameInfo().setHasStackMap();
	}

	/// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
	///
	/// The call is first lowered like an ordinary call through lowerInvokable;
	/// the resulting target call node is then replaced by a PATCHPOINT machine
	/// node that carries the patchpoint meta operands, the call operands, the
	/// live variables, the register mask, the chain and (if present) the glue.
	void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
	                                          const BasicBlock *EHPadBB) {
	  // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
	  //                                                 i32 <numBytes>,
	  //                                                 i8* <target>,
	  //                                                 i32 <numArgs>,
	  //                                                 [Args...],
	  //                                                 [live variables...])

	  CallingConv::ID CC = CS.getCallingConv();
	  bool IsAnyRegCC = CC == CallingConv::AnyReg;
	  bool HasDef = !CS->getType()->isVoidTy();
	  SDLoc dl = getCurSDLoc();
	  SDValue Callee = getValue(CS->getOperand(PatchPointOpers::TargetPos));

	  // Handle immediate and symbolic callees.
	  if (auto* ConstCallee = dyn_cast<ConstantSDNode>(Callee))
	    Callee = DAG.getIntPtrConstant(ConstCallee->getZExtValue(), dl,
	                                   /*isTarget=*/true);
	  else if (auto* SymbolicCallee = dyn_cast<GlobalAddressSDNode>(Callee))
	    Callee =  DAG.getTargetGlobalAddress(SymbolicCallee->getGlobal(),
	                                         SDLoc(SymbolicCallee),
	                                         SymbolicCallee->getValueType(0));

	  // Get the real number of arguments participating in the call <numArgs>
	  SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos));
	  unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();

	  // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
	  // Intrinsics include all meta-operands up to but not including CC.
	  unsigned NumMetaOpers = PatchPointOpers::CCPos;
	  assert(CS.arg_size() >= NumMetaOpers + NumArgs &&
	         "Not enough arguments provided to the patchpoint intrinsic");

	  // For AnyRegCC the arguments are lowered later on manually.
	  unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
	  Type *ReturnTy =
	    IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType();

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  populateCallLoweringInfo(CLI, CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy,
	                           true);
	  std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);

	  SDNode *CallEnd = Result.second.getNode();
	  if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
	    CallEnd = CallEnd->getOperand(0).getNode();

	  /// Get a call instruction from the call sequence chain.
	  /// Tail calls are not allowed.
	  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
	         "Expected a callseq node.");
	  SDNode *Call = CallEnd->getOperand(0).getNode();
	  bool HasGlue = Call->getGluedNode();

	  // Replace the target specific call node with the patchable intrinsic.
	  SmallVector<SDValue, 8> Ops;

	  // Add the <id> and <numBytes> constants.
	  SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos));
	  Ops.push_back(DAG.getTargetConstant(
	                  cast<ConstantSDNode>(IDVal)->getZExtValue(), dl, MVT::i64));
	  SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos));
	  Ops.push_back(DAG.getTargetConstant(
	                  cast<ConstantSDNode>(NBytesVal)->getZExtValue(), dl,
	                  MVT::i32));

	  // Add the callee.
	  Ops.push_back(Callee);

	  // Adjust <numArgs> to account for any arguments that have been passed on the
	  // stack instead.
	  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
	  unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3);
	  NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs;
	  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, dl, MVT::i32));

	  // Add the calling convention
	  Ops.push_back(DAG.getTargetConstant((unsigned)CC, dl, MVT::i32));

	  // Add the arguments we omitted previously. The register allocator should
	  // place these in any free register.
	  if (IsAnyRegCC)
	    for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
	      Ops.push_back(getValue(CS.getArgument(i)));

	  // Push the arguments from the call instruction up to the register mask.
	  SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
	  Ops.append(Call->op_begin() + 2, e);

	  // Push live variables for the stack map.
	  addStackMapLiveVars(CS, NumMetaOpers + NumArgs, dl, Ops, *this);

	  // Push the register mask info.
	  if (HasGlue)
	    Ops.push_back(*(Call->op_end()-2));
	  else
	    Ops.push_back(*(Call->op_end()-1));

	  // Push the chain (this is originally the first operand of the call, but
	  // becomes now the last or second to last operand).
	  Ops.push_back(*(Call->op_begin()));

	  // Push the glue flag (last operand).
	  if (HasGlue)
	    Ops.push_back(*(Call->op_end()-1));

	  SDVTList NodeTys;
	  if (IsAnyRegCC && HasDef) {
	    // Create the return types based on the intrinsic definition
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    SmallVector<EVT, 3> ValueVTs;
	    ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
	    assert(ValueVTs.size() == 1 && "Expected only one return value type.");

	    // There is always a chain and a glue type at the end
	    ValueVTs.push_back(MVT::Other);
	    ValueVTs.push_back(MVT::Glue);
	    NodeTys = DAG.getVTList(ValueVTs);
	  } else
	    NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // Replace the target specific call node with a PATCHPOINT node.
	  MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
	                                         dl, NodeTys, Ops);

	  // Update the NodeMap.
	  if (HasDef) {
	    if (IsAnyRegCC)
	      setValue(CS.getInstruction(), SDValue(MN, 0));
	    else
	      setValue(CS.getInstruction(), Result.first);
	  }

	  // Fixup the consumers of the intrinsic. The chain and glue may be used in the
	  // call sequence. Furthermore the location of the chain and glue can change
	  // when the AnyReg calling convention is used and the intrinsic returns a
	  // value.
	  if (IsAnyRegCC && HasDef) {
	    SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
	    SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
	    DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
	  } else
	    DAG.ReplaceAllUsesWith(Call, MN);
	  DAG.DeleteNode(Call);

	  // Inform the Frame Information that we have a patchpoint in this function.
	  FuncInfo.MF->getFrameInfo().setHasPatchPoint();
	}

	void SelectionDAGBuilder::visitVectorReduce(const CallInst &I,
	unsigned Intrinsic) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	SDValue Op1 = getValue(I.getArgOperand(0));
	SDValue Op2;
	if (I.getNumArgOperands() > 1)
	Op2 = getValue(I.getArgOperand(1));
	SDLoc dl = getCurSDLoc();
	EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
	SDValue Res;
	FastMathFlags FMF;
	if (isa<FPMathOperator>(I))
	FMF = I.getFastMathFlags();
	SDNodeFlags SDFlags;
	SDFlags.setNoNaNs(FMF.noNaNs());

	switch (Intrinsic) {
	case Intrinsic::experimental_vector_reduce_fadd:
	if (FMF.unsafeAlgebra())
	Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2);
	else
	Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2);
	break;
	case Intrinsic::experimental_vector_reduce_fmul:
	if (FMF.unsafeAlgebra())
	Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2);
	else
	Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2);
	break;
	case Intrinsic::experimental_vector_reduce_add:
	Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_mul:
	Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_and:
	Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_or:
	Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_xor:
	Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_smax:
	Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_smin:
	Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_umax:
	Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_umin:
	Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1);
	break;
	case Intrinsic::experimental_vector_reduce_fmax: {
	Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags);
	break;
	}
	case Intrinsic::experimental_vector_reduce_fmin: {
	Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags);
	break;
	}
	default:
	llvm_unreachable("Unhandled vector reduce intrinsic");
	}
	setValue(&I, Res);
	}

	/// Build the AttributeList for the return value of the call described by
	/// \p CLI. SExt, ZExt and InReg are added (in that order) when the matching
	/// CLI.RetSExt / CLI.RetZExt / CLI.IsInReg flag is set; the list is attached
	/// at AttributeList::ReturnIndex in the context of CLI.RetTy.
	static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
	  SmallVector<Attribute::AttrKind, 2> RetAttrKinds;

	  if (CLI.RetSExt) {
	    RetAttrKinds.push_back(Attribute::SExt);
	  }
	  if (CLI.RetZExt) {
	    RetAttrKinds.push_back(Attribute::ZExt);
	  }
	  if (CLI.IsInReg) {
	    RetAttrKinds.push_back(Attribute::InReg);
	  }

	  auto &Ctx = CLI.RetTy->getContext();
	  return AttributeList::get(Ctx, AttributeList::ReturnIndex, RetAttrKinds);
	}

	/// TargetLowering::LowerCallTo - This is the default LowerCallTo
	/// implementation. It populates CLI.Ins, CLI.Outs and CLI.OutVals from the
	/// call description in \p CLI, invokes the target's LowerCall, and then
	/// reassembles the lowered parts into the call's result.
	///
	/// When the target reports (via CanLowerReturn) that the return value cannot
	/// be lowered directly, the return is demoted to a hidden sret stack slot
	/// that is prepended to the argument list and reloaded after the call.
	///
	/// \returns a (result, chain) pair:
	///   - for a tail call, two null SDValues (the DAG root is set to the chain);
	///   - for a call with no return values, a null result and the output chain;
	///   - otherwise a MERGE_VALUES node of the return values and the chain.
	///
	/// FIXME: When all targets are
	/// migrated to using LowerCall, this hook should be integrated into SDISel.
	std::pair<SDValue, SDValue>
	TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
	  // Handle the incoming return values from the call.
	  CLI.Ins.clear();
	  Type *OrigRetTy = CLI.RetTy;
	  SmallVector<EVT, 4> RetTys;
	  SmallVector<uint64_t, 4> Offsets;
	  auto &DL = CLI.DAG.getDataLayout();
	  ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);

	  if (CLI.IsPostTypeLegalization) {
	    // If we are lowering a libcall after legalization, split the return type.
	    SmallVector<EVT, 4> OldRetTys = std::move(RetTys);
	    SmallVector<uint64_t, 4> OldOffsets = std::move(Offsets);
	    for (size_t i = 0, e = OldRetTys.size(); i != e; ++i) {
	      EVT RetVT = OldRetTys[i];
	      uint64_t Offset = OldOffsets[i];
	      MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), RetVT);
	      unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), RetVT);
	      // Offsets[] is later added to the demoted stack-slot pointer and handed
	      // to MachinePointerInfo::getFixedStack, so each step must be expressed
	      // in bytes, not in bits.
	      unsigned RegisterVTByteSZ = RegisterVT.getSizeInBits() / 8;
	      RetTys.append(NumRegs, RegisterVT);
	      for (unsigned j = 0; j != NumRegs; ++j)
	        Offsets.push_back(Offset + j * RegisterVTByteSZ);
	    }
	  }

	  SmallVector<ISD::OutputArg, 4> Outs;
	  GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);

	  bool CanLowerReturn =
	      this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
	                           CLI.IsVarArg, Outs, CLI.RetTy->getContext());

	  SDValue DemoteStackSlot;
	  // Placeholder frame index; only meaningful once sret demotion creates the
	  // stack object below.
	  int DemoteStackIdx = -100;
	  if (!CanLowerReturn) {
	    // FIXME: equivalent assert?
	    // assert(!CS.hasInAllocaArgument() &&
	    //        "sret demotion is incompatible with inalloca");
	    uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
	    unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
	    MachineFunction &MF = CLI.DAG.getMachineFunction();
	    DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
	    Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy);

	    DemoteStackSlot =
	        CLI.DAG.getFrameIndex(DemoteStackIdx, getFrameIndexTy(DL));
	    ArgListEntry Entry;
	    Entry.Node = DemoteStackSlot;
	    Entry.Ty = StackSlotPtrType;
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Entry.IsInReg = false;
	    Entry.IsSRet = true;
	    Entry.IsNest = false;
	    Entry.IsByVal = false;
	    Entry.IsReturned = false;
	    Entry.IsSwiftSelf = false;
	    Entry.IsSwiftError = false;
	    Entry.Alignment = Align;
	    // The hidden sret pointer becomes the first argument and the call itself
	    // now returns void.
	    CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
	    CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());

	    // sret demotion isn't compatible with tail-calls, since the sret argument
	    // points into the callers stack frame.
	    CLI.IsTailCall = false;
	  } else {
	    for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
	      EVT VT = RetTys[I];
	      MVT RegisterVT =
	          getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
	      unsigned NumRegs =
	          getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
	      for (unsigned i = 0; i != NumRegs; ++i) {
	        ISD::InputArg MyFlags;
	        MyFlags.VT = RegisterVT;
	        MyFlags.ArgVT = VT;
	        MyFlags.Used = CLI.IsReturnValueUsed;
	        if (CLI.RetSExt)
	          MyFlags.Flags.setSExt();
	        if (CLI.RetZExt)
	          MyFlags.Flags.setZExt();
	        if (CLI.IsInReg)
	          MyFlags.Flags.setInReg();
	        CLI.Ins.push_back(MyFlags);
	      }
	    }
	  }

	  // We push in swifterror return as the last element of CLI.Ins.
	  ArgListTy &Args = CLI.getArgs();
	  if (supportSwiftError()) {
	    for (unsigned i = 0, e = Args.size(); i != e; ++i) {
	      if (Args[i].IsSwiftError) {
	        ISD::InputArg MyFlags;
	        MyFlags.VT = getPointerTy(DL);
	        MyFlags.ArgVT = EVT(getPointerTy(DL));
	        MyFlags.Flags.setSwiftError();
	        CLI.Ins.push_back(MyFlags);
	      }
	    }
	  }

	  // Handle all of the outgoing arguments.
	  CLI.Outs.clear();
	  CLI.OutVals.clear();
	  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
	    SmallVector<EVT, 4> ValueVTs;
	    ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
	    // FIXME: Split arguments if CLI.IsPostTypeLegalization
	    Type *FinalType = Args[i].Ty;
	    if (Args[i].IsByVal)
	      FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
	    bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
	        FinalType, CLI.CallConv, CLI.IsVarArg);
	    for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
	         ++Value) {
	      EVT VT = ValueVTs[Value];
	      Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
	      SDValue Op = SDValue(Args[i].Node.getNode(),
	                           Args[i].Node.getResNo() + Value);
	      ISD::ArgFlagsTy Flags;

	      // Certain targets (such as MIPS), may have a different ABI alignment
	      // for a type depending on the context. Give the target a chance to
	      // specify the alignment it wants.
	      unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL);

	      if (Args[i].IsZExt)
	        Flags.setZExt();
	      if (Args[i].IsSExt)
	        Flags.setSExt();
	      if (Args[i].IsInReg) {
	        // If we are using vectorcall calling convention, a structure that is
	        // passed InReg - is surely an HVA
	        if (CLI.CallConv == CallingConv::X86_VectorCall &&
	            isa<StructType>(FinalType)) {
	          // The first value of a structure is marked
	          if (0 == Value)
	            Flags.setHvaStart();
	          Flags.setHva();
	        }
	        // Set InReg Flag
	        Flags.setInReg();
	      }
	      if (Args[i].IsSRet)
	        Flags.setSRet();
	      if (Args[i].IsSwiftSelf)
	        Flags.setSwiftSelf();
	      if (Args[i].IsSwiftError)
	        Flags.setSwiftError();
	      if (Args[i].IsByVal)
	        Flags.setByVal();
	      if (Args[i].IsInAlloca) {
	        Flags.setInAlloca();
	        // Set the byval flag for CCAssignFn callbacks that don't know about
	        // inalloca. This way we can know how many bytes we should've allocated
	        // and how many bytes a callee cleanup function will pop. If we port
	        // inalloca to more targets, we'll have to add custom inalloca handling
	        // in the various CC lowering callbacks.
	        Flags.setByVal();
	      }
	      if (Args[i].IsByVal || Args[i].IsInAlloca) {
	        PointerType *Ty = cast<PointerType>(Args[i].Ty);
	        Type *ElementTy = Ty->getElementType();
	        Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
	        // For ByVal, alignment should come from FE. BE will guess if this
	        // info is not there but there are cases it cannot get right.
	        unsigned FrameAlign;
	        if (Args[i].Alignment)
	          FrameAlign = Args[i].Alignment;
	        else
	          FrameAlign = getByValTypeAlignment(ElementTy, DL);
	        Flags.setByValAlign(FrameAlign);
	      }
	      if (Args[i].IsNest)
	        Flags.setNest();
	      if (NeedsRegBlock)
	        Flags.setInConsecutiveRegs();
	      Flags.setOrigAlign(OriginalAlignment);

	      MVT PartVT = getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
	      unsigned NumParts =
	          getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);
	      SmallVector<SDValue, 4> Parts(NumParts);
	      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;

	      if (Args[i].IsSExt)
	        ExtendKind = ISD::SIGN_EXTEND;
	      else if (Args[i].IsZExt)
	        ExtendKind = ISD::ZERO_EXTEND;

	      // Conservatively only handle 'returned' on non-vectors for now
	      if (Args[i].IsReturned && !Op.getValueType().isVector()) {
	        assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&
	               "unexpected use of 'returned'");
	        // Before passing 'returned' to the target lowering code, ensure that
	        // either the register MVT and the actual EVT are the same size or that
	        // the return value and argument are extended in the same way; in these
	        // cases it's safe to pass the argument register value unchanged as the
	        // return register value (although it's at the target's option whether
	        // to do so)
	        // TODO: allow code generation to take advantage of partially preserved
	        // registers rather than clobbering the entire register when the
	        // parameter extension method is not compatible with the return
	        // extension method
	        if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
	            (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
	             CLI.RetZExt == Args[i].IsZExt))
	          Flags.setReturned();
	      }

	      getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
	                     CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind,
	                     true);

	      for (unsigned j = 0; j != NumParts; ++j) {
	        // if it isn't first piece, alignment must be 1
	        ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
	                               i < CLI.NumFixedArgs,
	                               i, j*Parts[j].getValueType().getStoreSize());
	        if (NumParts > 1 && j == 0)
	          MyFlags.Flags.setSplit();
	        else if (j != 0) {
	          MyFlags.Flags.setOrigAlign(1);
	          if (j == NumParts - 1)
	            MyFlags.Flags.setSplitEnd();
	        }

	        CLI.Outs.push_back(MyFlags);
	        CLI.OutVals.push_back(Parts[j]);
	      }

	      if (NeedsRegBlock && Value == NumValues - 1)
	        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
	    }
	  }

	  SmallVector<SDValue, 4> InVals;
	  CLI.Chain = LowerCall(CLI, InVals);

	  // Update CLI.InVals to use outside of this function.
	  CLI.InVals = InVals;

	  // Verify that the target's LowerCall behaved as expected.
	  assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
	         "LowerCall didn't return a valid chain!");
	  assert((!CLI.IsTailCall || InVals.empty()) &&
	         "LowerCall emitted a return value for a tail call!");
	  assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
	         "LowerCall didn't emit the correct number of values!");

	  // For a tail call, the return value is merely live-out and there aren't
	  // any nodes in the DAG representing it. Return a special value to
	  // indicate that a tail call has been emitted and no more Instructions
	  // should be processed in the current block.
	  if (CLI.IsTailCall) {
	    CLI.DAG.setRoot(CLI.Chain);
	    return std::make_pair(SDValue(), SDValue());
	  }

	#ifndef NDEBUG
	  for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
	    assert(InVals[i].getNode() && "LowerCall emitted a null value!");
	    assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
	           "LowerCall emitted a value with the wrong type!");
	  }
	#endif

	  SmallVector<SDValue, 4> ReturnValues;
	  if (!CanLowerReturn) {
	    // The instruction result is the result of loading from the
	    // hidden sret parameter.
	    SmallVector<EVT, 1> PVTs;
	    Type *PtrRetTy = PointerType::getUnqual(OrigRetTy);

	    ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
	    assert(PVTs.size() == 1 && "Pointers should fit in one register");
	    EVT PtrVT = PVTs[0];

	    unsigned NumValues = RetTys.size();
	    ReturnValues.resize(NumValues);
	    SmallVector<SDValue, 4> Chains(NumValues);

	    // An aggregate return value cannot wrap around the address space, so
	    // offsets to its parts don't wrap either.
	    SDNodeFlags Flags;
	    Flags.setNoUnsignedWrap(true);

	    for (unsigned i = 0; i < NumValues; ++i) {
	      SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
	                                    CLI.DAG.getConstant(Offsets[i], CLI.DL,
	                                                        PtrVT), Flags);
	      SDValue L = CLI.DAG.getLoad(
	          RetTys[i], CLI.DL, CLI.Chain, Add,
	          MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
	                                            DemoteStackIdx, Offsets[i]),
	          /* Alignment = */ 1);
	      ReturnValues[i] = L;
	      Chains[i] = L.getValue(1);
	    }

	    // Join the chains of all part loads into the call's output chain.
	    CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains);
	  } else {
	    // Collect the legal value parts into potentially illegal values
	    // that correspond to the original function's return values.
	    Optional<ISD::NodeType> AssertOp;
	    if (CLI.RetSExt)
	      AssertOp = ISD::AssertSext;
	    else if (CLI.RetZExt)
	      AssertOp = ISD::AssertZext;
	    unsigned CurReg = 0;
	    for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
	      EVT VT = RetTys[I];
	      MVT RegisterVT =
	          getRegisterTypeForCallingConv(CLI.RetTy->getContext(), VT);
	      unsigned NumRegs =
	          getNumRegistersForCallingConv(CLI.RetTy->getContext(), VT);

	      ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
	                                              NumRegs, RegisterVT, VT, nullptr,
	                                              AssertOp, true));
	      CurReg += NumRegs;
	    }

	    // For a function returning void, there is no return value. We can't create
	    // such a node, so we just return a null return value in that case. In
	    // that case, nothing will actually look at the value.
	    if (ReturnValues.empty())
	      return std::make_pair(SDValue(), CLI.Chain);
	  }

	  SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
	                                CLI.DAG.getVTList(RetTys), ReturnValues);
	  return std::make_pair(Res, CLI.Chain);
	}

	/// Run LowerOperation on result 0 of \p N and, when it yields a non-null
	/// value, append that value to \p Results. Nothing is appended otherwise.
	void TargetLowering::LowerOperationWrapper(SDNode *N,
	                                           SmallVectorImpl<SDValue> &Results,
	                                           SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);
	  if (!Lowered.getNode())
	    return;
	  Results.push_back(Lowered);
	}

	/// Base-class implementation of the operation-lowering hook. It does no
	/// lowering at all: reaching it is reported through llvm_unreachable.
	SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	  llvm_unreachable("LowerOperation not implemented for this target!");
	}

	/// Emit a copy of the value computed for \p V into virtual register \p Reg
	/// and queue the resulting chain on PendingExports.
	///
	/// \param V    the IR value to export; its type and context describe the
	///             register layout.
	/// \param Reg  destination register; asserted not to be a physical register.
	///
	/// NOTE: this block is shown as a patch hunk. Lines marked '-' are the old
	/// inline IsABIRegCopy computation; the '+' line replaces it with a call to
	/// isABIRegCopy(V).
	void
	SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
	SDValue Op = getNonRegisterValue(V);
	// Guard against emitting a copy from Reg back into Reg.
	assert((Op.getOpcode() != ISD::CopyFromReg \|\|
	cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
	"Copy from a reg to the same reg!");
	assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	// If this is an InlineAsm we have to match the registers required, not the
	// notional registers required by the type.
	- bool IsABIRegCopy =
	- V && ((isa<CallInst>(V) &&
	- !(static_cast<const CallInst *>(V))->isInlineAsm()) \|\|
	- isa<ReturnInst>(V));

	RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
	- V->getType(), IsABIRegCopy);
	+ V->getType(), isABIRegCopy(V));
	// The copy is chained off the DAG entry node.
	SDValue Chain = DAG.getEntryNode();

	// Use the extension recorded for V in FuncInfo.PreferredExtendType when
	// there is one; otherwise fall back to ANY_EXTEND.
	ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
	FuncInfo.PreferredExtendType.end())
	? ISD::ANY_EXTEND
	: FuncInfo.PreferredExtendType[V];
	RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
	PendingExports.push_back(Chain);
	}

	#include "llvm/CodeGen/SelectionDAGISel.h"

	/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
	/// entry block, return true. A use by a switch counts as a use outside the
	/// entry block, since the switch may expand into multiple basic blocks.
	///
	/// \param A         the argument whose users are inspected.
	/// \param FastISel  when true, the answer is simply whether \p A has no
	///                  uses at all.
	static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
	  // With FastISel active, we may be splitting blocks, so force creation
	  // of virtual registers for all non-dead arguments.
	  if (FastISel)
	    return A->use_empty();

	  const BasicBlock &Entry = A->getParent()->front();
	  for (const User *U : A->users())
	    if (cast<Instruction>(U)->getParent() != &Entry || isa<SwitchInst>(U))
	      return false; // Used outside the entry block, or by a switch.

	  return true;
	}

	/// Maps an argument to the (alloca, store) pair that copies it into a local
	/// stack slot. Both members are pointers: entries are inserted as
	/// {Arg, {AI, SI}} and read back as 'const AllocaInst *' / 'const StoreInst *'.
	typedef DenseMap<const Argument *,
	                 std::pair<const AllocaInst *, const StoreInst *>>
	    ArgCopyElisionMapTy;

	/// Scan the entry block of the function in FuncInfo for arguments that look
	/// like copies into a local alloca. Record any copied arguments in
	/// ArgCopyElisionCandidates.
	///
	/// \param DL        data layout used to compare the argument's store size
	///                  with the alloca's allocation size.
	/// \param FuncInfo  supplies the function being scanned and its
	///                  StaticAllocaMap.
	/// \param ArgCopyElisionCandidates  output map from argument to the
	///                  (alloca, store) pair implementing the copy.
	static void
	findArgumentCopyElisionCandidates(const DataLayout &DL,
	                                  FunctionLoweringInfo *FuncInfo,
	                                  ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
	  // Record the state of every static alloca used in the entry block. Argument
	  // allocas are all used in the entry block, so we need approximately as many
	  // entries as we have arguments.
	  enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
	  SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
	  unsigned NumArgs = FuncInfo->Fn->arg_size();
	  StaticAllocas.reserve(NumArgs * 2);

	  // Returns a pointer to the tracked state of V's underlying static alloca
	  // (creating an Unknown entry on first sight), or nullptr when V is null or
	  // does not strip down to a static alloca present in StaticAllocaMap.
	  auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
	    if (!V)
	      return nullptr;
	    V = V->stripPointerCasts();
	    const auto *AI = dyn_cast<AllocaInst>(V);
	    if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
	      return nullptr;
	    auto Iter = StaticAllocas.insert({AI, Unknown});
	    return &Iter.first->second;
	  };

	  // Look for stores of arguments to static allocas. Look through bitcasts and
	  // GEPs to handle type coercions, as long as the alloca is fully initialized
	  // by the store. Any non-store use of an alloca escapes it and any subsequent
	  // unanalyzed store might write it.
	  // FIXME: Handle structs initialized with multiple stores.
	  for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
	    // Look for stores, and handle non-store uses conservatively.
	    const auto *SI = dyn_cast<StoreInst>(&I);
	    if (!SI) {
	      // We will look through cast uses, so ignore them completely.
	      if (I.isCast())
	        continue;
	      // Ignore debug info intrinsics, they don't escape or store to allocas.
	      if (isa<DbgInfoIntrinsic>(I))
	        continue;
	      // This is an unknown instruction. Assume it escapes or writes to all
	      // static alloca operands.
	      for (const Use &U : I.operands()) {
	        if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
	          *Info = StaticAllocaInfo::Clobbered;
	      }
	      continue;
	    }

	    // If the stored value is a static alloca, mark it as escaped.
	    if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
	      *Info = StaticAllocaInfo::Clobbered;

	    // Check if the destination is a static alloca.
	    const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
	    StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
	    if (!Info)
	      continue;
	    const AllocaInst *AI = cast<AllocaInst>(Dst);

	    // Skip allocas that have been initialized or clobbered.
	    if (*Info != StaticAllocaInfo::Unknown)
	      continue;

	    // Check if the stored value is an argument, and that this store fully
	    // initializes the alloca. Don't elide copies from the same argument twice.
	    const Value *Val = SI->getValueOperand()->stripPointerCasts();
	    const auto *Arg = dyn_cast<Argument>(Val);
	    if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
	        Arg->getType()->isEmptyTy() ||
	        DL.getTypeStoreSize(Arg->getType()) !=
	            DL.getTypeAllocSize(AI->getAllocatedType()) ||
	        ArgCopyElisionCandidates.count(Arg)) {
	      *Info = StaticAllocaInfo::Clobbered;
	      continue;
	    }

	    DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');

	    // Mark this alloca and store for argument copy elision.
	    *Info = StaticAllocaInfo::Elidable;
	    ArgCopyElisionCandidates.insert({Arg, {AI, SI}});

	    // Stop scanning if we've seen all arguments. This will happen early in -O0
	    // builds, which is useful, because -O0 builds have large entry blocks and
	    // many allocas.
	    if (ArgCopyElisionCandidates.size() == NumArgs)
	      break;
	  }
	}

	/// Attempt to drop the copy of \p Arg into its local alloca by making the
	/// alloca share the fixed stack object the argument was loaded from.
	///
	/// Nothing happens unless \p ArgVal is a load whose base pointer is a frame
	/// index, the fixed object has the same size as the alloca's current stack
	/// object, and the fixed object is at least as aligned as the alloca needs.
	/// On success the old stack object is removed, the fixed object is marked
	/// mutable, the index remapping is recorded in
	/// \p ArgCopyElisionFrameIndexMap, the load's chain is appended to
	/// \p Chains, the copying store is added to \p ElidedArgCopyInstrs, and
	/// \p ArgHasUses is set to true if \p Arg has a user other than that store.
	static void tryToElideArgumentCopy(
	    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
	    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
	    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
	    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
	    SDValue ArgVal, bool &ArgHasUses) {
	  // The argument must be a load straight from a frame index.
	  auto *ArgLoad = dyn_cast<LoadSDNode>(ArgVal);
	  if (!ArgLoad)
	    return;
	  auto *BaseFI = dyn_cast<FrameIndexSDNode>(ArgLoad->getBasePtr().getNode());
	  if (!BaseFI)
	    return;

	  // Look up the alloca/store pair recorded for this argument.
	  auto CandidateIt = ArgCopyElisionCandidates.find(&Arg);
	  assert(CandidateIt != ArgCopyElisionCandidates.end());
	  const AllocaInst *AI = CandidateIt->second.first;

	  const int FixedIndex = BaseFI->getIndex();
	  int &AllocaSlot = FuncInfo->StaticAllocaMap[AI];
	  const int OldIndex = AllocaSlot;
	  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();

	  // The two stack objects must have identical sizes.
	  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
	    DEBUG(dbgs() << " argument copy elision failed due to bad fixed stack "
	                    "object size\n");
	    return;
	  }

	  // Use the alignment written on the alloca; when it is zero, fall back to
	  // the ABI alignment of the allocated type.
	  const unsigned ExplicitAlign = AI->getAlignment();
	  const unsigned RequiredAlignment =
	      ExplicitAlign ? ExplicitAlign
	                    : FuncInfo->MF->getDataLayout().getABITypeAlignment(
	                          AI->getAllocatedType());
	  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
	    DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
	                    "greater than stack argument alignment ("
	                 << RequiredAlignment << " vs "
	                 << MFI.getObjectAlignment(FixedIndex) << ")\n");
	    return;
	  }

	  // All checks passed: retire the alloca's stack object, make the fixed
	  // object writable, and redirect the alloca to it.
	  DEBUG({
	    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
	           << " Replacing frame index " << OldIndex << " with " << FixedIndex
	           << '\n';
	  });
	  MFI.RemoveStackObject(OldIndex);
	  MFI.setIsImmutableObjectIndex(FixedIndex, false);
	  AllocaSlot = FixedIndex;
	  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
	  Chains.push_back(ArgVal.getValue(1));

	  // The store that performed the copy no longer needs to be emitted.
	  const StoreInst *CopyStore = CandidateIt->second.second;
	  ElidedArgCopyInstrs.insert(CopyStore);

	  // Re-check the users so ArgVal is not exported when the elided store was
	  // the only thing using the argument.
	  for (const Value *U : Arg.users()) {
	    if (U == CopyStore)
	      continue;
	    ArgHasUses = true;
	    break;
	  }
	}

	void SelectionDAGISel::LowerArguments(const Function &F) {
	SelectionDAG &DAG = SDB->DAG;
	SDLoc dl = SDB->getCurSDLoc();
	const DataLayout &DL = DAG.getDataLayout();
	SmallVector<ISD::InputArg, 16> Ins;

	if (!FuncInfo->CanLowerReturn) {
	// Put in an sret pointer parameter before all the other parameters.
	SmallVector<EVT, 1> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(),
	PointerType::getUnqual(F.getReturnType()), ValueVTs);

	// NOTE: Assuming that a pointer will never break down to more than one VT
	// or one register.
	ISD::ArgFlagsTy Flags;
	Flags.setSRet();
	MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
	ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
	ISD::InputArg::NoArgIndex, 0);
	Ins.push_back(RetArg);
	}

	// Look for stores of arguments to static allocas. Mark such arguments with a
	// flag to ask the target to give us the memory location of that argument if
	// available.
	ArgCopyElisionMapTy ArgCopyElisionCandidates;
	findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);

	// Set up the incoming argument description vector.
	for (const Argument &Arg : F.args()) {
	unsigned ArgNo = Arg.getArgNo();
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
	bool isArgValueUsed = !Arg.use_empty();
	unsigned PartBase = 0;
	Type *FinalType = Arg.getType();
	if (Arg.hasAttribute(Attribute::ByVal))
	FinalType = cast<PointerType>(FinalType)->getElementType();
	bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
	FinalType, F.getCallingConv(), F.isVarArg());
	for (unsigned Value = 0, NumValues = ValueVTs.size();
	Value != NumValues; ++Value) {
	EVT VT = ValueVTs[Value];
	Type ArgTy = VT.getTypeForEVT(DAG.getContext());
	ISD::ArgFlagsTy Flags;

	// Certain targets (such as MIPS), may have a different ABI alignment
	// for a type depending on the context. Give the target a chance to
	// specify the alignment it wants.
	unsigned OriginalAlignment =
	TLI->getABIAlignmentForCallingConv(ArgTy, DL);

	if (Arg.hasAttribute(Attribute::ZExt))
	Flags.setZExt();
	if (Arg.hasAttribute(Attribute::SExt))
	Flags.setSExt();
	if (Arg.hasAttribute(Attribute::InReg)) {
	// If we are using vectorcall calling convention, a structure that is
	// passed InReg - is surely an HVA
	if (F.getCallingConv() == CallingConv::X86_VectorCall &&
	isa<StructType>(Arg.getType())) {
	// The first value of a structure is marked
	if (0 == Value)
	Flags.setHvaStart();
	Flags.setHva();
	}
	// Set InReg Flag
	Flags.setInReg();
	}
	if (Arg.hasAttribute(Attribute::StructRet))
	Flags.setSRet();
	if (Arg.hasAttribute(Attribute::SwiftSelf))
	Flags.setSwiftSelf();
	if (Arg.hasAttribute(Attribute::SwiftError))
	Flags.setSwiftError();
	if (Arg.hasAttribute(Attribute::ByVal))
	Flags.setByVal();
	if (Arg.hasAttribute(Attribute::InAlloca)) {
	Flags.setInAlloca();
	// Set the byval flag for CCAssignFn callbacks that don't know about
	// inalloca. This way we can know how many bytes we should've allocated
	// and how many bytes a callee cleanup function will pop. If we port
	// inalloca to more targets, we'll have to add custom inalloca handling
	// in the various CC lowering callbacks.
	Flags.setByVal();
	}
	if (F.getCallingConv() == CallingConv::X86_INTR) {
	// IA Interrupt passes frame (1st parameter) by value in the stack.
	if (ArgNo == 0)
	Flags.setByVal();
	}
	if (Flags.isByVal() \|\| Flags.isInAlloca()) {
	PointerType *Ty = cast<PointerType>(Arg.getType());
	Type *ElementTy = Ty->getElementType();
	Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
	// For ByVal, alignment should be passed from FE. BE will guess if
	// this info is not there but there are cases it cannot get right.
	unsigned FrameAlign;
	if (Arg.getParamAlignment())
	FrameAlign = Arg.getParamAlignment();
	else
	FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
	Flags.setByValAlign(FrameAlign);
	}
	if (Arg.hasAttribute(Attribute::Nest))
	Flags.setNest();
	if (NeedsRegBlock)
	Flags.setInConsecutiveRegs();
	Flags.setOrigAlign(OriginalAlignment);
	if (ArgCopyElisionCandidates.count(&Arg))
	Flags.setCopyElisionCandidate();

	MVT RegisterVT =
	TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT);
	unsigned NumRegs =
	TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT);
	for (unsigned i = 0; i != NumRegs; ++i) {
	ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
	ArgNo, PartBase+i*RegisterVT.getStoreSize());
	if (NumRegs > 1 && i == 0)
	MyFlags.Flags.setSplit();
	// if it isn't first piece, alignment must be 1
	else if (i > 0) {
	MyFlags.Flags.setOrigAlign(1);
	if (i == NumRegs - 1)
	MyFlags.Flags.setSplitEnd();
	}
	Ins.push_back(MyFlags);
	}
	if (NeedsRegBlock && Value == NumValues - 1)
	Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
	PartBase += VT.getStoreSize();
	}
	}

	// Call the target to set up the argument values.
	SmallVector<SDValue, 8> InVals;
	SDValue NewRoot = TLI->LowerFormalArguments(
	DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);

	// Verify that the target's LowerFormalArguments behaved as expected.
	assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
	"LowerFormalArguments didn't return a valid chain!");
	assert(InVals.size() == Ins.size() &&
	"LowerFormalArguments didn't emit the correct number of values!");
	DEBUG({
	for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	assert(InVals[i].getNode() &&
	"LowerFormalArguments emitted a null value!");
	assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
	"LowerFormalArguments emitted a value with the wrong type!");
	}
	});

	// Update the DAG with the new chain value resulting from argument lowering.
	DAG.setRoot(NewRoot);

	// Set up the argument values.
	unsigned i = 0;
	if (!FuncInfo->CanLowerReturn) {
	// Create a virtual register for the sret pointer, and put in a copy
	// from the sret argument into it.
	SmallVector<EVT, 1> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(),
	PointerType::getUnqual(F.getReturnType()), ValueVTs);
	MVT VT = ValueVTs[0].getSimpleVT();
	MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
	Optional<ISD::NodeType> AssertOp = None;
	SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
	RegVT, VT, nullptr, AssertOp);

	MachineFunction& MF = SDB->DAG.getMachineFunction();
	MachineRegisterInfo& RegInfo = MF.getRegInfo();
	unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
	FuncInfo->DemoteRegister = SRetReg;
	NewRoot =
	SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
	DAG.setRoot(NewRoot);

	// i indexes lowered arguments. Bump it past the hidden sret argument.
	++i;
	}

	SmallVector<SDValue, 4> Chains;
	DenseMap<int, int> ArgCopyElisionFrameIndexMap;
	for (const Argument &Arg : F.args()) {
	SmallVector<SDValue, 4> ArgValues;
	SmallVector<EVT, 4> ValueVTs;
	ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
	unsigned NumValues = ValueVTs.size();
	if (NumValues == 0)
	continue;

	bool ArgHasUses = !Arg.use_empty();

	// Elide the copying store if the target loaded this argument from a
	// suitable fixed stack object.
	if (Ins[i].Flags.isCopyElisionCandidate()) {
	tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
	ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
	InVals[i], ArgHasUses);
	}

	// If this argument is unused then remember its value. It is used to generate
	// debugging information.
	bool isSwiftErrorArg =
	TLI->supportSwiftError() &&
	Arg.hasAttribute(Attribute::SwiftError);
	if (!ArgHasUses && !isSwiftErrorArg) {
	SDB->setUnusedArgValue(&Arg, InVals[i]);

	// Also remember any frame index for use in FastISel.
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
	FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
	}

	for (unsigned Val = 0; Val != NumValues; ++Val) {
	EVT VT = ValueVTs[Val];
	MVT PartVT =
	TLI->getRegisterTypeForCallingConv(*CurDAG->getContext(), VT);
	unsigned NumParts =
	TLI->getNumRegistersForCallingConv(*CurDAG->getContext(), VT);

	// Even an apparent 'unused' swifterror argument needs to be returned. So
	// we do generate a copy for it that can be used on return from the
	// function.
	if (ArgHasUses \|\| isSwiftErrorArg) {
	Optional<ISD::NodeType> AssertOp;
	if (Arg.hasAttribute(Attribute::SExt))
	AssertOp = ISD::AssertSext;
	else if (Arg.hasAttribute(Attribute::ZExt))
	AssertOp = ISD::AssertZext;

	ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
	PartVT, VT, nullptr, AssertOp,
	true));
	}

	i += NumParts;
	}

	// We don't need to do anything else for unused arguments.
	if (ArgValues.empty())
	continue;

	// Note down frame index.
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
	FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());

	SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
	SDB->getCurSDLoc());

	SDB->setValue(&Arg, Res);
	if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
	if (LoadSDNode *LNode =
	dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
	if (FrameIndexSDNode *FI =
	dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
	FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
	}

	// Update the SwiftErrorVRegDefMap.
	if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) {
	unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
	if (TargetRegisterInfo::isVirtualRegister(Reg))
	FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB,
	FuncInfo->SwiftErrorArg, Reg);
	}

	// If this argument is live outside of the entry block, insert a copy from
	// wherever we got it to the vreg that other BB's will reference it as.
	if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) {
	// If we can, though, try to skip creating an unnecessary vreg.
	// FIXME: This isn't very clean... it would be nice to make this more
	// general. It's also subtly incompatible with the hacks FastISel
	// uses with vregs.
	unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
	if (TargetRegisterInfo::isVirtualRegister(Reg)) {
	FuncInfo->ValueMap[&Arg] = Reg;
	continue;
	}
	}
	if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
	FuncInfo->InitializeRegForValue(&Arg);
	SDB->CopyToExportRegsIfNeeded(&Arg);
	}
	}

	if (!Chains.empty()) {
	Chains.push_back(NewRoot);
	NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	}

	DAG.setRoot(NewRoot);

	assert(i == InVals.size() && "Argument register count mismatch!");

	// If any argument copy elisions occurred and we have debug info, update the
	// stale frame indices used in the dbg.declare variable info table.
	MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo();
	if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
	for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
	auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
	if (I != ArgCopyElisionFrameIndexMap.end())
	VI.Slot = I->second;
	}
	}

	// Finally, if the target has anything special to do, allow it to do so.
	EmitFunctionEntryCode();
	}

	/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
	/// ensure constants are generated when needed. Remember the virtual registers
	/// that need to be added to the Machine PHI nodes as input. We cannot just
	/// directly add them, because expansion might result in multiple MBB's for one
	/// BB. As such, the start of the BB might correspond to a different MBB than
	/// the end.
	///
	void
	SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
	const TerminatorInst *TI = LLVMBB->getTerminator();

	SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;

	// Check PHI nodes in successors that expect a value to be available from this
	// block.
	for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
	const BasicBlock *SuccBB = TI->getSuccessor(succ);
	if (!isa<PHINode>(SuccBB->begin())) continue;
	MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];

	// If this terminator has multiple identical successors (common for
	// switches), only handle each succ once.
	if (!SuccsHandled.insert(SuccMBB).second)
	continue;

	MachineBasicBlock::iterator MBBI = SuccMBB->begin();

	// At this point we know that there is a 1-1 correspondence between LLVM PHI
	// nodes and Machine PHI nodes, but the incoming operands have not been
	// emitted yet.
	for (BasicBlock::const_iterator I = SuccBB->begin();
	const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
	// Ignore dead phi's.
	if (PN->use_empty()) continue;

	// Skip empty types
	if (PN->getType()->isEmptyTy())
	continue;

	unsigned Reg;
	const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);

	if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
	unsigned &RegOut = ConstantsOut[C];
	if (RegOut == 0) {
	RegOut = FuncInfo.CreateRegs(C->getType());
	CopyValueToVirtualRegister(C, RegOut);
	}
	Reg = RegOut;
	} else {
	DenseMap<const Value *, unsigned>::iterator I =
	FuncInfo.ValueMap.find(PHIOp);
	if (I != FuncInfo.ValueMap.end())
	Reg = I->second;
	else {
	assert(isa<AllocaInst>(PHIOp) &&
	FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
	"Didn't codegen value into a register!??");
	Reg = FuncInfo.CreateRegs(PHIOp->getType());
	CopyValueToVirtualRegister(PHIOp, Reg);
	}
	}

	// Remember that this register needs to added to the machine PHI node as
	// the input for this MBB.
	SmallVector<EVT, 4> ValueVTs;
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	ComputeValueVTs(TLI, DAG.getDataLayout(), PN->getType(), ValueVTs);
	for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
	EVT VT = ValueVTs[vti];
	unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
	for (unsigned i = 0, e = NumRegisters; i != e; ++i)
	FuncInfo.PHINodesToUpdate.push_back(
	std::make_pair(&*MBBI++, Reg + i));
	Reg += NumRegisters;
	}
	}
	}

	ConstantsOut.clear();
	}

	/// Add a successor MBB to ParentMBB, creating a new MachineBB for BB if
	/// SuccMBB is null. A newly created block is inserted into the function
	/// directly after ParentMBB. The edge is given the stack-protector branch
	/// probability selected by IsLikely. Returns the successor block.
	MachineBasicBlock *
	SelectionDAGBuilder::StackProtectorDescriptor::
	AddSuccessorMBB(const BasicBlock *BB,
	                MachineBasicBlock *ParentMBB,
	                bool IsLikely,
	                MachineBasicBlock *SuccMBB) {
	  // Lazily create the successor block the first time it is requested.
	  if (SuccMBB == nullptr) {
	    MachineFunction *ParentFn = ParentMBB->getParent();
	    MachineFunction::iterator InsertPos(ParentMBB);
	    SuccMBB = ParentFn->CreateMachineBasicBlock(BB);
	    ++InsertPos;
	    ParentFn->insert(InsertPos, SuccMBB);
	  }

	  // Record the CFG edge ParentMBB -> SuccMBB.
	  ParentMBB->addSuccessor(
	      SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely));
	  return SuccMBB;
	}

	/// Return the machine basic block that follows \p MBB in the current
	/// function's block list, or nullptr if \p MBB is the last block.
	MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) {
	  MachineFunction::iterator I(MBB);
	  if (++I == FuncInfo.MF->end())
	    return nullptr;
	  return &*I;
	}

	/// During lowering new call nodes can be created (such as memset, etc.).
	/// Those will become new roots of the current DAG, but complications arise
	/// when they are tail calls. In such cases, the call lowering will update
	/// the root, but the builder still needs to know that a tail call has been
	/// lowered in order to avoid generating an additional return.
	///
	/// \p MaybeTC is the value produced by the call lowering: a null node is
	/// treated as "a tail call was emitted"; any other value becomes the new
	/// DAG root.
	void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
	  // A null node means the lowering produced a tail call.
	  if (!MaybeTC.getNode()) {
	    HasTailCall = true;
	    return;
	  }

	  DAG.setRoot(MaybeTC);
	}

	uint64_t
	SelectionDAGBuilder::getJumpTableRange(const CaseClusterVector &Clusters,
	unsigned First, unsigned Last) const {
	assert(Last >= First);
	const APInt &LowCase = Clusters[First].Low->getValue();
	const APInt &HighCase = Clusters[Last].High->getValue();
	assert(LowCase.getBitWidth() == HighCase.getBitWidth());

	// FIXME: A range of consecutive cases has 100% density, but only requires one
	// comparison to lower. We should discriminate against such consecutive ranges
	// in jump tables.

	return (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100) + 1;
	}

	/// Return the number of cases contained in clusters First..Last, computed
	/// from the prefix sums in \p TotalCases (TotalCases[i] holds the case count
	/// of clusters 0..i).
	uint64_t SelectionDAGBuilder::getJumpTableNumCases(
	    const SmallVectorImpl<unsigned> &TotalCases, unsigned First,
	    unsigned Last) const {
	  assert(Last >= First);
	  assert(TotalCases[Last] >= TotalCases[First]);

	  // Cases that belong to the clusters preceding First.
	  unsigned CasesBefore = 0;
	  if (First != 0)
	    CasesBefore = TotalCases[First - 1];

	  return TotalCases[Last] - CasesBefore;
	}

	bool SelectionDAGBuilder::buildJumpTable(const CaseClusterVector &Clusters,
	unsigned First, unsigned Last,
	const SwitchInst *SI,
	MachineBasicBlock *DefaultMBB,
	CaseCluster &JTCluster) {
	assert(First <= Last);

	auto Prob = BranchProbability::getZero();
	unsigned NumCmps = 0;
	std::vector<MachineBasicBlock*> Table;
	DenseMap<MachineBasicBlock*, BranchProbability> JTProbs;

	// Initialize probabilities in JTProbs.
	for (unsigned I = First; I <= Last; ++I)
	JTProbs[Clusters[I].MBB] = BranchProbability::getZero();

	for (unsigned I = First; I <= Last; ++I) {
	assert(Clusters[I].Kind == CC_Range);
	Prob += Clusters[I].Prob;
	const APInt &Low = Clusters[I].Low->getValue();
	const APInt &High = Clusters[I].High->getValue();
	NumCmps += (Low == High) ? 1 : 2;
	if (I != First) {
	// Fill the gap between this and the previous cluster.
	const APInt &PreviousHigh = Clusters[I - 1].High->getValue();
	assert(PreviousHigh.slt(Low));
	uint64_t Gap = (Low - PreviousHigh).getLimitedValue() - 1;
	for (uint64_t J = 0; J < Gap; J++)
	Table.push_back(DefaultMBB);
	}
	uint64_t ClusterSize = (High - Low).getLimitedValue() + 1;
	for (uint64_t J = 0; J < ClusterSize; ++J)
	Table.push_back(Clusters[I].MBB);
	JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
	}

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned NumDests = JTProbs.size();
	if (TLI.isSuitableForBitTests(
	NumDests, NumCmps, Clusters[First].Low->getValue(),
	Clusters[Last].High->getValue(), DAG.getDataLayout())) {
	// Clusters[First..Last] should be lowered as bit tests instead.
	return false;
	}

	// Create the MBB that will load from and jump through the table.
	// Note: We create it here, but it's not inserted into the function yet.
	MachineFunction *CurMF = FuncInfo.MF;
	MachineBasicBlock *JumpTableMBB =
	CurMF->CreateMachineBasicBlock(SI->getParent());

	// Add successors. Note: use table order for determinism.
	SmallPtrSet<MachineBasicBlock *, 8> Done;
	for (MachineBasicBlock *Succ : Table) {
	if (Done.count(Succ))
	continue;
	addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]);
	Done.insert(Succ);
	}
	JumpTableMBB->normalizeSuccProbs();

	unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding())
	->createJumpTableIndex(Table);

	// Set up the jump table info.
	JumpTable JT(-1U, JTI, JumpTableMBB, nullptr);
	JumpTableHeader JTH(Clusters[First].Low->getValue(),
	Clusters[Last].High->getValue(), SI->getCondition(),
	nullptr, false);
	JTCases.emplace_back(std::move(JTH), std::move(JT));

	JTCluster = CaseCluster::jumpTable(Clusters[First].Low, Clusters[Last].High,
	JTCases.size() - 1, Prob);
	return true;
	}

	/// Scan the sorted range clusters of a switch and replace runs of them with
	/// jump-table clusters where the target considers that profitable.
	///
	/// \p Clusters must be non-empty, sorted, and contain only CC_Range clusters;
	/// it is rewritten in place and shrunk to the resulting cluster count.
	/// \p SI is the switch being lowered and \p DefaultMBB the block used for
	/// holes in any table that gets built. Nothing is changed when the target
	/// disallows jump tables for the function or there are too few clusters.
	void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
	                                         const SwitchInst *SI,
	                                         MachineBasicBlock *DefaultMBB) {
	#ifndef NDEBUG
	  // Clusters must be non-empty, sorted, and only contain Range clusters.
	  assert(!Clusters.empty());
	  for (CaseCluster &C : Clusters)
	    assert(C.Kind == CC_Range);
	  for (unsigned i = 1, e = Clusters.size(); i < e; ++i)
	    assert(Clusters[i - 1].High->getValue().slt(Clusters[i].Low->getValue()));
	#endif

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.areJTsAllowed(SI->getParent()->getParent()))
	    return;

	  const int64_t N = Clusters.size();
	  const unsigned MinJumpTableEntries = TLI.getMinimumJumpTableEntries();
	  const unsigned SmallNumberOfEntries = MinJumpTableEntries / 2;

	  // Too few clusters to ever form a table.
	  if (N < 2 || N < MinJumpTableEntries)
	    return;

	  // TotalCases[i]: Total nbr of cases in Clusters[0..i].
	  SmallVector<unsigned, 8> TotalCases(N);
	  for (unsigned i = 0; i < N; ++i) {
	    const APInt &Hi = Clusters[i].High->getValue();
	    const APInt &Lo = Clusters[i].Low->getValue();
	    TotalCases[i] = (Hi - Lo).getLimitedValue() + 1;
	    if (i != 0)
	      TotalCases[i] += TotalCases[i - 1];
	  }

	  // Cheap case: the whole range may be suitable for jump table.
	  uint64_t Range = getJumpTableRange(Clusters,0, N - 1);
	  uint64_t NumCases = getJumpTableNumCases(TotalCases, 0, N - 1);
	  assert(NumCases < UINT64_MAX / 100);
	  assert(Range >= NumCases);
	  if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
	    CaseCluster JTCluster;
	    if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
	      Clusters[0] = JTCluster;
	      Clusters.resize(1);
	      return;
	    }
	  }

	  // The algorithm below is not suitable for -O0.
	  if (TM.getOptLevel() == CodeGenOpt::None)
	    return;

	  // Split Clusters into minimum number of dense partitions. The algorithm uses
	  // the same idea as Kannan & Proebsting "Correction to 'Producing Good Code
	  // for the Case Statement'" (1994), but builds the MinPartitions array in
	  // reverse order to make it easier to reconstruct the partitions in ascending
	  // order. In the choice between two optimal partitionings, it picks the one
	  // which yields more jump tables.

	  // MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].
	  SmallVector<unsigned, 8> MinPartitions(N);
	  // LastElement[i] is the last element of the partition starting at i.
	  SmallVector<unsigned, 8> LastElement(N);
	  // PartitionsScore[i] is used to break ties when choosing between two
	  // partitionings resulting in the same number of partitions.
	  SmallVector<unsigned, 8> PartitionsScore(N);
	  // For PartitionsScore, a small number of comparisons is considered as good as
	  // a jump table and a single comparison is considered better than a jump
	  // table.
	  enum PartitionScores : unsigned {
	    NoTable = 0,
	    Table = 1,
	    FewCases = 1,
	    SingleCase = 2
	  };

	  // Base case: There is only one way to partition Clusters[N-1].
	  MinPartitions[N - 1] = 1;
	  LastElement[N - 1] = N - 1;
	  PartitionsScore[N - 1] = PartitionScores::SingleCase;

	  // Note: loop indexes are signed to avoid underflow.
	  for (int64_t i = N - 2; i >= 0; i--) {
	    // Find optimal partitioning of Clusters[i..N-1].
	    // Baseline: Put Clusters[i] into a partition on its own.
	    MinPartitions[i] = MinPartitions[i + 1] + 1;
	    LastElement[i] = i;
	    PartitionsScore[i] = PartitionsScore[i + 1] + PartitionScores::SingleCase;

	    // Search for a solution that results in fewer partitions.
	    for (int64_t j = N - 1; j > i; j--) {
	      // Try building a partition from Clusters[i..j].
	      uint64_t Range = getJumpTableRange(Clusters, i, j);
	      uint64_t NumCases = getJumpTableNumCases(TotalCases, i, j);
	      assert(NumCases < UINT64_MAX / 100);
	      assert(Range >= NumCases);
	      if (TLI.isSuitableForJumpTable(SI, NumCases, Range)) {
	        unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
	        unsigned Score = j == N - 1 ? 0 : PartitionsScore[j + 1];
	        int64_t NumEntries = j - i + 1;

	        if (NumEntries == 1)
	          Score += PartitionScores::SingleCase;
	        else if (NumEntries <= SmallNumberOfEntries)
	          Score += PartitionScores::FewCases;
	        else if (NumEntries >= MinJumpTableEntries)
	          Score += PartitionScores::Table;

	        // If this leads to fewer partitions, or to the same number of
	        // partitions with better score, it is a better partitioning.
	        if (NumPartitions < MinPartitions[i] ||
	            (NumPartitions == MinPartitions[i] && Score > PartitionsScore[i])) {
	          MinPartitions[i] = NumPartitions;
	          LastElement[i] = j;
	          PartitionsScore[i] = Score;
	        }
	      }
	    }
	  }

	  // Iterate over the partitions, replacing some with jump tables in-place.
	  // DstIndex never runs ahead of First, so compacting towards the front of
	  // the vector cannot overwrite clusters that are still to be read.
	  unsigned DstIndex = 0;
	  for (unsigned First = 0, Last; First < N; First = Last + 1) {
	    Last = LastElement[First];
	    assert(Last >= First);
	    assert(DstIndex <= First);
	    unsigned NumClusters = Last - First + 1;

	    CaseCluster JTCluster;
	    if (NumClusters >= MinJumpTableEntries &&
	        buildJumpTable(Clusters, First, Last, SI, DefaultMBB, JTCluster)) {
	      Clusters[DstIndex++] = JTCluster;
	    } else {
	      // Partition stays as plain range clusters; just slide them down.
	      for (unsigned I = First; I <= Last; ++I)
	        std::memmove(&Clusters[DstIndex++], &Clusters[I], sizeof(Clusters[I]));
	    }
	  }
	  Clusters.resize(DstIndex);
	}

	/// Try to lower Clusters[First..Last] as a sequence of bit tests.
	///
	/// Returns false when the range holds a single cluster or the target does
	/// not consider it suitable for bit tests. Otherwise one bit mask is built
	/// per destination block, a BitTestBlock is appended to BitTestCases (with a
	/// fresh, not yet inserted MachineBasicBlock per mask), \p BTCluster is set
	/// to a bit-test cluster covering the range, and true is returned.
	bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
	                                        unsigned First, unsigned Last,
	                                        const SwitchInst *SI,
	                                        CaseCluster &BTCluster) {
	  assert(First <= Last);
	  if (First == Last)
	    return false;

	  // Count distinct destinations and the compares a plain lowering would need.
	  BitVector Dests(FuncInfo.MF->getNumBlockIDs());
	  unsigned NumCmps = 0;
	  for (int64_t I = First; I <= Last; ++I) {
	    assert(Clusters[I].Kind == CC_Range);
	    Dests.set(Clusters[I].MBB->getNumber());
	    NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
	  }
	  unsigned NumDests = Dests.count();

	  APInt Low = Clusters[First].Low->getValue();
	  APInt High = Clusters[Last].High->getValue();
	  assert(Low.slt(High));

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();
	  if (!TLI.isSuitableForBitTests(NumDests, NumCmps, Low, High, DL))
	    return false;

	  APInt LowBound;
	  APInt CmpRange;

	  const int BitWidth = TLI.getPointerTy(DL).getSizeInBits();
	  assert(TLI.rangeFitsInWord(Low, High, DL) &&
	         "Case range must fit in bit mask!");

	  // Check if the clusters cover a contiguous range such that no value in the
	  // range will jump to the default statement.
	  bool ContiguousRange = true;
	  for (int64_t I = First + 1; I <= Last; ++I) {
	    if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) {
	      ContiguousRange = false;
	      break;
	    }
	  }

	  if (Low.isStrictlyPositive() && High.slt(BitWidth)) {
	    // Optimize the case where all the case values fit in a word without having
	    // to subtract minValue. In this case, we can optimize away the subtraction.
	    LowBound = APInt::getNullValue(Low.getBitWidth());
	    CmpRange = High;
	    // Values in [0, Low) are now inside the tested range but match no case.
	    ContiguousRange = false;
	  } else {
	    LowBound = Low;
	    CmpRange = High - Low;
	  }

	  CaseBitsVector CBV;
	  auto TotalProb = BranchProbability::getZero();
	  for (unsigned i = First; i <= Last; ++i) {
	    // Find the CaseBits for this destination.
	    unsigned j;
	    for (j = 0; j < CBV.size(); ++j)
	      if (CBV[j].BB == Clusters[i].MBB)
	        break;
	    if (j == CBV.size())
	      CBV.push_back(
	          CaseBits(0, Clusters[i].MBB, 0, BranchProbability::getZero()));
	    CaseBits *CB = &CBV[j];

	    // Update Mask, Bits and ExtraProb.
	    uint64_t Lo = (Clusters[i].Low->getValue() - LowBound).getZExtValue();
	    uint64_t Hi = (Clusters[i].High->getValue() - LowBound).getZExtValue();
	    assert(Hi >= Lo && Hi < 64 && "Invalid bit case!");
	    // Set bits Lo..Hi (inclusive) in the destination's mask.
	    CB->Mask |= (-1ULL >> (63 - (Hi - Lo))) << Lo;
	    CB->Bits += Hi - Lo + 1;
	    CB->ExtraProb += Clusters[i].Prob;
	    TotalProb += Clusters[i].Prob;
	  }

	  BitTestInfo BTI;
	  std::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) {
	    // Sort by probability first, number of bits second.
	    if (a.ExtraProb != b.ExtraProb)
	      return a.ExtraProb > b.ExtraProb;
	    return a.Bits > b.Bits;
	  });

	  // One test block per destination mask, in the sorted order.
	  for (auto &CB : CBV) {
	    MachineBasicBlock *BitTestBB =
	        FuncInfo.MF->CreateMachineBasicBlock(SI->getParent());
	    BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb));
	  }
	  BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange),
	                            SI->getCondition(), -1U, MVT::Other, false,
	                            ContiguousRange, nullptr, nullptr, std::move(BTI),
	                            TotalProb);

	  BTCluster = CaseCluster::bitTests(Clusters[First].Low, Clusters[Last].High,
	                                    BitTestCases.size() - 1, TotalProb);
	  return true;
	}

	/// Scan the sorted clusters of a switch and replace runs of range clusters
	/// with bit-test clusters where possible.
	///
	/// \p Clusters must be non-empty, sorted, and contain only CC_Range or
	/// CC_JumpTable clusters; it is rewritten in place and shrunk to the
	/// resulting cluster count. Nothing is changed at -O0 or when SHL is not
	/// legal for the target's pointer type.
	void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
	                                              const SwitchInst *SI) {
	  // Partition Clusters into as few subsets as possible, where each subset has a
	  // range that fits in a machine word and has <= 3 unique destinations.

	#ifndef NDEBUG
	  // Clusters must be sorted and contain Range or JumpTable clusters.
	  assert(!Clusters.empty());
	  assert(Clusters[0].Kind == CC_Range || Clusters[0].Kind == CC_JumpTable);
	  for (const CaseCluster &C : Clusters)
	    assert(C.Kind == CC_Range || C.Kind == CC_JumpTable);
	  for (unsigned i = 1; i < Clusters.size(); ++i)
	    assert(Clusters[i-1].High->getValue().slt(Clusters[i].Low->getValue()));
	#endif

	  // The algorithm below is not suitable for -O0.
	  if (TM.getOptLevel() == CodeGenOpt::None)
	    return;

	  // If target does not have legal shift left, do not emit bit tests at all.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  const DataLayout &DL = DAG.getDataLayout();

	  EVT PTy = TLI.getPointerTy(DL);
	  if (!TLI.isOperationLegal(ISD::SHL, PTy))
	    return;

	  int BitWidth = PTy.getSizeInBits();
	  const int64_t N = Clusters.size();

	  // MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].
	  SmallVector<unsigned, 8> MinPartitions(N);
	  // LastElement[i] is the last element of the partition starting at i.
	  SmallVector<unsigned, 8> LastElement(N);

	  // FIXME: This might not be the best algorithm for finding bit test clusters.

	  // Base case: There is only one way to partition Clusters[N-1].
	  MinPartitions[N - 1] = 1;
	  LastElement[N - 1] = N - 1;

	  // Note: loop indexes are signed to avoid underflow.
	  for (int64_t i = N - 2; i >= 0; --i) {
	    // Find optimal partitioning of Clusters[i..N-1].
	    // Baseline: Put Clusters[i] into a partition on its own.
	    MinPartitions[i] = MinPartitions[i + 1] + 1;
	    LastElement[i] = i;

	    // Search for a solution that results in fewer partitions.
	    // Note: the search is limited by BitWidth, reducing time complexity.
	    for (int64_t j = std::min(N - 1, i + BitWidth - 1); j > i; --j) {
	      // Try building a partition from Clusters[i..j].

	      // Check the range.
	      if (!TLI.rangeFitsInWord(Clusters[i].Low->getValue(),
	                               Clusters[j].High->getValue(), DL))
	        continue;

	      // Check nbr of destinations and cluster types.
	      // FIXME: This works, but doesn't seem very efficient.
	      bool RangesOnly = true;
	      BitVector Dests(FuncInfo.MF->getNumBlockIDs());
	      for (int64_t k = i; k <= j; k++) {
	        if (Clusters[k].Kind != CC_Range) {
	          RangesOnly = false;
	          break;
	        }
	        Dests.set(Clusters[k].MBB->getNumber());
	      }
	      // Stop searching from i as soon as a candidate partition is rejected
	      // for containing a non-range cluster or more than 3 destinations.
	      if (!RangesOnly || Dests.count() > 3)
	        break;

	      // Check if it's a better partition.
	      unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
	      if (NumPartitions < MinPartitions[i]) {
	        // Found a better partition.
	        MinPartitions[i] = NumPartitions;
	        LastElement[i] = j;
	      }
	    }
	  }

	  // Iterate over the partitions, replacing with bit-test clusters in-place.
	  // DstIndex never runs ahead of First, so compacting towards the front of
	  // the vector cannot overwrite clusters that are still to be read.
	  unsigned DstIndex = 0;
	  for (unsigned First = 0, Last; First < N; First = Last + 1) {
	    Last = LastElement[First];
	    assert(First <= Last);
	    assert(DstIndex <= First);

	    CaseCluster BitTestCluster;
	    if (buildBitTests(Clusters, First, Last, SI, BitTestCluster)) {
	      Clusters[DstIndex++] = BitTestCluster;
	    } else {
	      // Partition stays as it is; slide its clusters down as one chunk.
	      size_t NumClusters = Last - First + 1;
	      std::memmove(&Clusters[DstIndex], &Clusters[First],
	                   sizeof(Clusters[0]) * NumClusters);
	      DstIndex += NumClusters;
	    }
	  }
	  Clusters.resize(DstIndex);
	}

	/// Emit the comparison chain for one switch work item.
	///
	/// \p W holds the block to emit into (W.MBB), the inclusive cluster range
	/// [W.FirstCluster, W.LastCluster] and the default probability. \p Cond is
	/// the switch condition, \p SwitchMBB the block containing the switch, and
	/// \p DefaultMBB the destination when no cluster matches. Each cluster gets
	/// its own test; all but the last fall through to a newly created block, and
	/// the last falls through to \p DefaultMBB. Tests belonging to \p SwitchMBB
	/// are emitted immediately; the rest are recorded (SwitchCases, JTCases,
	/// BitTestCases) for later emission.
	void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
	                                        MachineBasicBlock *SwitchMBB,
	                                        MachineBasicBlock *DefaultMBB) {
	  MachineFunction *CurMF = FuncInfo.MF;
	  MachineBasicBlock *NextMBB = nullptr;
	  // BBI ends up pointing just past W.MBB; new blocks are inserted there.
	  MachineFunction::iterator BBI(W.MBB);
	  if (++BBI != FuncInfo.MF->end())
	    NextMBB = &*BBI;

	  unsigned Size = W.LastCluster - W.FirstCluster + 1;

	  BranchProbabilityInfo *BPI = FuncInfo.BPI;

	  if (Size == 2 && W.MBB == SwitchMBB) {
	    // If any two of the cases has the same destination, and if one value
	    // is the same as the other, but has one bit unset that the other has set,
	    // use bit manipulation to do two compares at once. For example:
	    // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
	    // TODO: This could be extended to merge any 2 cases in switches with 3
	    // cases.
	    // TODO: Handle cases where W.CaseBB != SwitchBB.
	    CaseCluster &Small = *W.FirstCluster;
	    CaseCluster &Big = *W.LastCluster;

	    if (Small.Low == Small.High && Big.Low == Big.High &&
	        Small.MBB == Big.MBB) {
	      const APInt &SmallValue = Small.Low->getValue();
	      const APInt &BigValue = Big.Low->getValue();

	      // Check that there is only one bit different.
	      APInt CommonBit = BigValue ^ SmallValue;
	      if (CommonBit.isPowerOf2()) {
	        SDValue CondLHS = getValue(Cond);
	        EVT VT = CondLHS.getValueType();
	        SDLoc DL = getCurSDLoc();

	        SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
	                                 DAG.getConstant(CommonBit, DL, VT));
	        // Note: this local intentionally shadows the Cond parameter for the
	        // remainder of this scope.
	        SDValue Cond = DAG.getSetCC(
	            DL, MVT::i1, Or, DAG.getConstant(BigValue | SmallValue, DL, VT),
	            ISD::SETEQ);

	        // Update successor info.
	        // Both Small and Big will jump to Small.BB, so we sum up the
	        // probabilities.
	        addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob);
	        if (BPI)
	          addSuccessorWithProb(
	              SwitchMBB, DefaultMBB,
	              // The default destination is the first successor in IR.
	              BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0));
	        else
	          addSuccessorWithProb(SwitchMBB, DefaultMBB);

	        // Insert the true branch.
	        SDValue BrCond =
	            DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(), Cond,
	                        DAG.getBasicBlock(Small.MBB));
	        // Insert the false branch.
	        BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond,
	                             DAG.getBasicBlock(DefaultMBB));

	        DAG.setRoot(BrCond);
	        return;
	      }
	    }
	  }

	  if (TM.getOptLevel() != CodeGenOpt::None) {
	    // Order cases by probability so the most likely case will be checked first.
	    std::sort(W.FirstCluster, W.LastCluster + 1,
	              [](const CaseCluster &a, const CaseCluster &b) {
	      return a.Prob > b.Prob;
	    });

	    // Rearrange the case blocks so that the last one falls through if possible
	    // without changing the order of probabilities.
	    for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) {
	      --I;
	      if (I->Prob > W.LastCluster->Prob)
	        break;
	      if (I->Kind == CC_Range && I->MBB == NextMBB) {
	        // Swap the clusters themselves. Swapping the iterators would move
	        // W.LastCluster backwards and silently drop the trailing clusters
	        // from this work item.
	        std::swap(*I, *W.LastCluster);
	        break;
	      }
	    }
	  }

	  // Compute total probability.
	  BranchProbability DefaultProb = W.DefaultProb;
	  BranchProbability UnhandledProbs = DefaultProb;
	  for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I)
	    UnhandledProbs += I->Prob;

	  MachineBasicBlock *CurMBB = W.MBB;
	  for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) {
	    MachineBasicBlock *Fallthrough;
	    if (I == W.LastCluster) {
	      // For the last cluster, fall through to the default destination.
	      Fallthrough = DefaultMBB;
	    } else {
	      Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock());
	      CurMF->insert(BBI, Fallthrough);
	      // Put Cond in a virtual register to make it available from the new blocks.
	      ExportFromCurrentBlock(Cond);
	    }
	    // From here on UnhandledProbs covers only what is left after this cluster.
	    UnhandledProbs -= I->Prob;

	    switch (I->Kind) {
	      case CC_JumpTable: {
	        // FIXME: Optimize away range check based on pivot comparisons.
	        JumpTableHeader *JTH = &JTCases[I->JTCasesIndex].first;
	        JumpTable *JT = &JTCases[I->JTCasesIndex].second;

	        // The jump block hasn't been inserted yet; insert it here.
	        MachineBasicBlock *JumpMBB = JT->MBB;
	        CurMF->insert(BBI, JumpMBB);

	        auto JumpProb = I->Prob;
	        auto FallthroughProb = UnhandledProbs;

	        // If the default statement is a target of the jump table, we evenly
	        // distribute the default probability to successors of CurMBB. Also
	        // update the probability on the edge from JumpMBB to Fallthrough.
	        for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(),
	                                              SE = JumpMBB->succ_end();
	             SI != SE; ++SI) {
	          if (*SI == DefaultMBB) {
	            JumpProb += DefaultProb / 2;
	            FallthroughProb -= DefaultProb / 2;
	            JumpMBB->setSuccProbability(SI, DefaultProb / 2);
	            JumpMBB->normalizeSuccProbs();
	            break;
	          }
	        }

	        addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
	        addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
	        CurMBB->normalizeSuccProbs();

	        // The jump table header will be inserted in our current block, do the
	        // range check, and fall through to our fallthrough block.
	        JTH->HeaderBB = CurMBB;
	        JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader.

	        // If we're in the right place, emit the jump table header right now.
	        if (CurMBB == SwitchMBB) {
	          visitJumpTableHeader(*JT, *JTH, SwitchMBB);
	          JTH->Emitted = true;
	        }
	        break;
	      }
	      case CC_BitTests: {
	        // FIXME: Optimize away range check based on pivot comparisons.
	        BitTestBlock *BTB = &BitTestCases[I->BTCasesIndex];

	        // The bit test blocks haven't been inserted yet; insert them here.
	        for (BitTestCase &BTC : BTB->Cases)
	          CurMF->insert(BBI, BTC.ThisBB);

	        // Fill in fields of the BitTestBlock.
	        BTB->Parent = CurMBB;
	        BTB->Default = Fallthrough;

	        BTB->DefaultProb = UnhandledProbs;
	        // If the cases in bit test don't form a contiguous range, we evenly
	        // distribute the probability on the edge to Fallthrough to two
	        // successors of CurMBB.
	        if (!BTB->ContiguousRange) {
	          BTB->Prob += DefaultProb / 2;
	          BTB->DefaultProb -= DefaultProb / 2;
	        }

	        // If we're in the right place, emit the bit test header right now.
	        if (CurMBB == SwitchMBB) {
	          visitBitTestHeader(*BTB, SwitchMBB);
	          BTB->Emitted = true;
	        }
	        break;
	      }
	      case CC_Range: {
	        const Value *RHS, *LHS, *MHS;
	        ISD::CondCode CC;
	        if (I->Low == I->High) {
	          // Check Cond == I->Low.
	          CC = ISD::SETEQ;
	          LHS = Cond;
	          RHS=I->Low;
	          MHS = nullptr;
	        } else {
	          // Check I->Low <= Cond <= I->High.
	          CC = ISD::SETLE;
	          LHS = I->Low;
	          MHS = Cond;
	          RHS = I->High;
	        }

	        // The false probability is the sum of all unhandled cases.
	        CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, I->Prob,
	                     UnhandledProbs);

	        // Emit right away when in the switch block; otherwise queue it.
	        if (CurMBB == SwitchMBB)
	          visitSwitchCase(CB, SwitchMBB);
	        else
	          SwitchCases.push_back(CB);

	        break;
	      }
	    }
	    CurMBB = Fallthrough;
	  }
	}

	unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC,
	CaseClusterIt First,
	CaseClusterIt Last) {
	return std::count_if(First, Last + 1, [&](const CaseCluster &X) {
	if (X.Prob != CC.Prob)
	return X.Prob > CC.Prob;

	// Ties are broken by comparing the case value.
	return X.Low->getValue().slt(CC.Low->getValue());
	});
	}

	void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList,
	const SwitchWorkListItem &W,
	Value *Cond,
	MachineBasicBlock *SwitchMBB) {
	assert(W.FirstCluster->Low->getValue().slt(W.LastCluster->Low->getValue()) &&
	"Clusters not sorted?");

	assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!");

	// Balance the tree based on branch probabilities to create a near-optimal (in
	// terms of search time given key frequency) binary search tree. See e.g. Kurt
	// Mehlhorn "Nearly Optimal Binary Search Trees" (1975).
	CaseClusterIt LastLeft = W.FirstCluster;
	CaseClusterIt FirstRight = W.LastCluster;
	auto LeftProb = LastLeft->Prob + W.DefaultProb / 2;
	auto RightProb = FirstRight->Prob + W.DefaultProb / 2;

	// Move LastLeft and FirstRight towards each other from opposite directions to
	// find a partitioning of the clusters which balances the probability on both
	// sides. If LeftProb and RightProb are equal, alternate which side is
	// taken to ensure 0-probability nodes are distributed evenly.
	unsigned I = 0;
	while (LastLeft + 1 < FirstRight) {
	if (LeftProb < RightProb \|\| (LeftProb == RightProb && (I & 1)))
	LeftProb += (++LastLeft)->Prob;
	else
	RightProb += (--FirstRight)->Prob;
	I++;
	}

	for (;;) {
	// Our binary search tree differs from a typical BST in that ours can have up
	// to three values in each leaf. The pivot selection above doesn't take that
	// into account, which means the tree might require more nodes and be less
	// efficient. We compensate for this here.

	unsigned NumLeft = LastLeft - W.FirstCluster + 1;
	unsigned NumRight = W.LastCluster - FirstRight + 1;

	if (std::min(NumLeft, NumRight) < 3 && std::max(NumLeft, NumRight) > 3) {
	// If one side has less than 3 clusters, and the other has more than 3,
	// consider taking a cluster from the other side.

	if (NumLeft < NumRight) {
	// Consider moving the first cluster on the right to the left side.
	CaseCluster &CC = *FirstRight;
	unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
	unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
	if (LeftSideRank <= RightSideRank) {
	// Moving the cluster to the left does not demote it.
	++LastLeft;
	++FirstRight;
	continue;
	}
	} else {
	assert(NumRight < NumLeft);
	// Consider moving the last element on the left to the right side.
	CaseCluster &CC = *LastLeft;
	unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
	unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
	if (RightSideRank <= LeftSideRank) {
	// Moving the cluster to the right does not demote it.
	--LastLeft;
	--FirstRight;
	continue;
	}
	}
	}
	break;
	}

	assert(LastLeft + 1 == FirstRight);
	assert(LastLeft >= W.FirstCluster);
	assert(FirstRight <= W.LastCluster);

	// Use the first element on the right as pivot since we will make less-than
	// comparisons against it.
	CaseClusterIt PivotCluster = FirstRight;
	assert(PivotCluster > W.FirstCluster);
	assert(PivotCluster <= W.LastCluster);

	CaseClusterIt FirstLeft = W.FirstCluster;
	CaseClusterIt LastRight = W.LastCluster;

	const ConstantInt *Pivot = PivotCluster->Low;

	// New blocks will be inserted immediately after the current one.
	MachineFunction::iterator BBI(W.MBB);
	++BBI;

	// We will branch to the LHS if Value < Pivot. If LHS is a single cluster,
	// we can branch to its destination directly if it's squeezed exactly in
	// between the known lower bound and Pivot - 1.
	MachineBasicBlock *LeftMBB;
	if (FirstLeft == LastLeft && FirstLeft->Kind == CC_Range &&
	FirstLeft->Low == W.GE &&
	(FirstLeft->High->getValue() + 1LL) == Pivot->getValue()) {
	LeftMBB = FirstLeft->MBB;
	} else {
	LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
	FuncInfo.MF->insert(BBI, LeftMBB);
	WorkList.push_back(
	{LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2});
	// Put Cond in a virtual register to make it available from the new blocks.
	ExportFromCurrentBlock(Cond);
	}

	// Similarly, we will branch to the RHS if Value >= Pivot. If RHS is a
	// single cluster, RHS.Low == Pivot, and we can branch to its destination
	// directly if RHS.High equals the current upper bound.
	MachineBasicBlock *RightMBB;
	if (FirstRight == LastRight && FirstRight->Kind == CC_Range &&
	W.LT && (FirstRight->High->getValue() + 1ULL) == W.LT->getValue()) {
	RightMBB = FirstRight->MBB;
	} else {
	RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
	FuncInfo.MF->insert(BBI, RightMBB);
	WorkList.push_back(
	{RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2});
	// Put Cond in a virtual register to make it available from the new blocks.
	ExportFromCurrentBlock(Cond);
	}

	// Create the CaseBlock record that will be used to lower the branch.
	CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB,
	LeftProb, RightProb);

	if (W.MBB == SwitchMBB)
	visitSwitchCase(CB, SwitchMBB);
	else
	SwitchCases.push_back(CB);
	}

	/// Lower the switch instruction \p SI.
	///
	/// Each case becomes a single-value cluster; the clusters are sorted and
	/// merged by sortAndRangeify() and then offered to findJumpTables() and
	/// findBitTestClusters(). The resulting cluster list is lowered through a
	/// work list: when optimizing (and not for minimum size) a work item with
	/// more than three clusters is split with splitWorkItem(); every other item
	/// is handed to lowerWorkItem().
	void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
	// Extract cases from the switch.
	BranchProbabilityInfo *BPI = FuncInfo.BPI;
	CaseClusterVector Clusters;
	Clusters.reserve(SI.getNumCases());
	for (auto I : SI.cases()) {
	MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()];
	const ConstantInt *CaseVal = I.getCaseValue();
	// Without branch probability info every case gets the same probability,
	// 1 / (number of cases + 1); the extra share presumably stands for the
	// default destination.
	BranchProbability Prob =
	BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex())
	: BranchProbability(1, SI.getNumCases() + 1);
	// Each case starts out as a range cluster whose low and high are the same.
	Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob));
	}

	MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()];

	// Cluster adjacent cases with the same destination. We do this at all
	// optimization levels because it's cheap to do and will make codegen faster
	// if there are many clusters.
	sortAndRangeify(Clusters);

	if (TM.getOptLevel() != CodeGenOpt::None) {
	// Replace an unreachable default with the most popular destination.
	// FIXME: Exploit unreachable default more aggressively.
	// The default counts as unreachable when the first instruction of its block
	// (ignoring PHIs and debug intrinsics) is an UnreachableInst.
	bool UnreachableDefault =
	isa<UnreachableInst>(SI.getDefaultDest()->getFirstNonPHIOrDbg());
	if (UnreachableDefault && !Clusters.empty()) {
	// Count how many cases branch to each successor. The comparison is strict,
	// so on a tie the successor that reached the maximum first is kept.
	DenseMap<const BasicBlock *, unsigned> Popularity;
	unsigned MaxPop = 0;
	const BasicBlock *MaxBB = nullptr;
	for (auto I : SI.cases()) {
	const BasicBlock *BB = I.getCaseSuccessor();
	if (++Popularity[BB] > MaxPop) {
	MaxPop = Popularity[BB];
	MaxBB = BB;
	}
	}
	// Set new default.
	assert(MaxPop > 0 && MaxBB);
	DefaultMBB = FuncInfo.MBBMap[MaxBB];

	// Remove cases that were pointing to the destination that is now the
	// default.
	CaseClusterVector New;
	New.reserve(Clusters.size());
	for (CaseCluster &CC : Clusters) {
	if (CC.MBB != DefaultMBB)
	New.push_back(CC);
	}
	Clusters = std::move(New);
	}
	}

	// If there is only the default destination, jump there directly.
	MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
	if (Clusters.empty()) {
	SwitchMBB->addSuccessor(DefaultMBB);
	// An explicit branch is only emitted when the default block is not the
	// block returned by NextBlock(SwitchMBB) (presumably the fall-through
	// successor in layout order).
	if (DefaultMBB != NextBlock(SwitchMBB)) {
	DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
	getControlRoot(), DAG.getBasicBlock(DefaultMBB)));
	}
	return;
	}

	// Both helpers receive the cluster list by non-const reference; the debug
	// dump below shows that clusters may afterwards have kind CC_JumpTable or
	// CC_BitTests.
	findJumpTables(Clusters, &SI, DefaultMBB);
	findBitTestClusters(Clusters, &SI);

	// Debug output: one entry per cluster, printed as "Low" or "Low-High" and
	// prefixed with "JT:" / "BT:" for jump-table / bit-test clusters.
	DEBUG({
	dbgs() << "Case clusters: ";
	for (const CaseCluster &C : Clusters) {
	if (C.Kind == CC_JumpTable) dbgs() << "JT:";
	if (C.Kind == CC_BitTests) dbgs() << "BT:";

	C.Low->getValue().print(dbgs(), true);
	if (C.Low != C.High) {
	dbgs() << '-';
	C.High->getValue().print(dbgs(), true);
	}
	dbgs() << ' ';
	}
	dbgs() << '\n';
	});

	assert(!Clusters.empty());
	SwitchWorkList WorkList;
	CaseClusterIt First = Clusters.begin();
	CaseClusterIt Last = Clusters.end() - 1;
	auto DefaultProb = getEdgeProbability(SwitchMBB, DefaultMBB);
	// The initial work item covers all clusters (First and Last are both
	// inclusive); the two nullptr fields are the GE/LT bounds, i.e. no lower or
	// upper bound on the condition value is known yet.
	WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr, DefaultProb});

	// Process items in LIFO order; splitWorkItem() may push further items.
	while (!WorkList.empty()) {
	SwitchWorkListItem W = WorkList.back();
	WorkList.pop_back();
	unsigned NumClusters = W.LastCluster - W.FirstCluster + 1;

	if (NumClusters > 3 && TM.getOptLevel() != CodeGenOpt::None &&
	!DefaultMBB->getParent()->getFunction()->optForMinSize()) {
	// For optimized builds, lower large range as a balanced binary tree.
	splitWorkItem(WorkList, W, SI.getCondition(), SwitchMBB);
	continue;
	}

	lowerWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB);
	}
	}
	Index: head/contrib/llvm/lib/CodeGen/StackColoring.cpp
	===================================================================
	--- head/contrib/llvm/lib/CodeGen/StackColoring.cpp (revision 322319)
	+++ head/contrib/llvm/lib/CodeGen/StackColoring.cpp (revision 322320)
	@@ -1,1236 +1,1280 @@
	//===-- StackColoring.cpp -------------------------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass implements the stack-coloring optimization that looks for
	// lifetime marker machine instructions (LIFETIME_START and LIFETIME_END),
	// which represent the possible lifetime of stack slots. It attempts to
	// merge disjoint stack slots and reduce the used stack space.
	// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
	//
	// TODO: In the future we plan to improve stack coloring in the following ways:
	// 1. Allow merging multiple small slots into a single larger slot at different
	// offsets.
	// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
	// spill slots.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/BitVector.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/CodeGen/LiveInterval.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineLoopInfo.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/Passes.h"
	#include "llvm/CodeGen/PseudoSourceValue.h"
	+#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/SlotIndexes.h"
	#include "llvm/CodeGen/StackProtector.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetRegisterInfo.h"

	using namespace llvm;

	#define DEBUG_TYPE "stack-coloring"

	/// Hidden command-line flag "-no-stack-coloring"; off by default.
	static cl::opt<bool>
	DisableColoring("no-stack-coloring",
	cl::init(false), cl::Hidden,
	cl::desc("Disable stack coloring"));

	/// The user may write code that uses allocas outside of the declared lifetime
	/// zone. This can happen when the user returns a reference to a local
	/// data-structure. We can detect these cases and decide not to optimize the
	/// code. If this flag is enabled, we try to save the user. This option
	/// is treated as overriding LifetimeStartOnFirstUse below.
	/// Hidden flag "-protect-from-escaped-allocas"; off by default.
	static cl::opt<bool>
	ProtectFromEscapedAllocas("protect-from-escaped-allocas",
	cl::init(false), cl::Hidden,
	cl::desc("Do not optimize lifetime zones that "
	"are broken"));

	/// Enable enhanced dataflow scheme for lifetime analysis (treat first
	/// use of stack slot as start of slot lifetime, as opposed to looking
	/// for LIFETIME_START marker). See "Implementation notes" below for
	/// more info.
	/// Hidden flag "-stackcoloring-lifetime-start-on-first-use"; on by default.
	static cl::opt<bool>
	LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
	cl::init(true), cl::Hidden,
	cl::desc("Treat stack lifetimes as starting on first use, not on START marker."));


	// Statistics counters maintained by this pass.
	STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
	STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
	STATISTIC(StackSlotMerged, "Number of stack slot merged.");
	STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");

	//===----------------------------------------------------------------------===//
	// StackColoring Pass
	//===----------------------------------------------------------------------===//
	//
	// Stack Coloring reduces stack usage by merging stack slots when they
	// can't be used together. For example, consider the following C program:
	//
	// void bar(char *, int);
	// void foo(bool var) {
	// A: {
	// char z[4096];
	// bar(z, 0);
	// }
	//
	// char *p;
	// char x[4096];
	// char y[4096];
	// if (var) {
	// p = x;
	// } else {
	// bar(y, 1);
	// p = y + 1024;
	// }
	// B:
	// bar(p, 2);
	// }
	//
	// Naively-compiled, this program would use 12k of stack space. However, the
	// stack slot corresponding to `z` is always destroyed before either of the
	// stack slots for `x` or `y` are used, and then `x` is only used if `var`
	// is true, while `y` is only used if `var` is false. So in no time are 2
	// of the stack slots used together, and therefore we can merge them,
	// compiling the function using only a single 4k alloca:
	//
	// void foo(bool var) { // equivalent
	// char x[4096];
	// char *p;
	// bar(x, 0);
	// if (var) {
	// p = x;
	// } else {
	// bar(x, 1);
	// p = x + 1024;
	// }
	// bar(p, 2);
	// }
	//
	// This is an important optimization if we want stack space to be under
	// control in large functions, both open-coded ones and ones created by
	// inlining.
	//
	// Implementation Notes:
	// ---------------------
	//
	// An important part of the above reasoning is that `z` can't be accessed
	// while the latter 2 calls to `bar` are running. This is justified because
	// `z`'s lifetime is over after we exit from block `A:`, so any further
	// accesses to it would be UB. The way we represent this information
	// in LLVM is by having frontends delimit blocks with `lifetime.start`
	// and `lifetime.end` intrinsics.
	//
	// The effect of these intrinsics seems to be as follows (maybe I should
	// specify this in the reference?):
	//
	// L1) at start, each stack-slot is marked as out-of-scope, unless no
	// lifetime intrinsic refers to that stack slot, in which case
	// it is marked as in-scope.
	// L2) on a `lifetime.start`, a stack slot is marked as in-scope and
	// the stack slot is overwritten with `undef`.
	// L3) on a `lifetime.end`, a stack slot is marked as out-of-scope.
	// L4) on function exit, all stack slots are marked as out-of-scope.
	// L5) `lifetime.end` is a no-op when called on a slot that is already
	// out-of-scope.
	// L6) memory accesses to out-of-scope stack slots are UB.
	// L7) when a stack-slot is marked as out-of-scope, all pointers to it
	// are invalidated, unless the slot is "degenerate". This is used to
	// justify not marking slots as in-use until the pointer to them is
	// used, but feels a bit hacky in the presence of things like LICM. See
	// the "Degenerate Slots" section for more details.
	//
	// Now, let's ground stack coloring on these rules. We'll define a slot
	// as in-use at a (dynamic) point in execution if it either can be
	// written to at that point, or if it has a live and non-undef content
	// at that point.
	//
	// Obviously, slots that are never in-use together can be merged, and
	// in our example `foo`, the slots for `x`, `y` and `z` are never
	// in-use together (of course, sometimes slots that are in-use together
	// might still be mergeable, but we don't care about that here).
	//
	// In this implementation, we successively merge pairs of slots that are
	// not in-use together. We could be smarter - for example, we could merge
	// a single large slot with 2 small slots, or we could construct the
	// interference graph and run a "smart" graph coloring algorithm, but with
	// that aside, how do we find out whether a pair of slots might be in-use
	// together?
	//
	// From our rules, we see that out-of-scope slots are never in-use,
	// and from (L7) we see that "non-degenerate" slots remain non-in-use
	// until their address is taken. Therefore, we can approximate slot activity
	// using dataflow.
	//
	// A subtle point: naively, we might try to figure out which pairs of
	// stack-slots interfere by propagating `S in-use` through the CFG for every
	// stack-slot `S`, and having `S` and `T` interfere if there is a CFG point in
	// which they are both in-use.
	//
	// That is sound, but overly conservative in some cases: in our (artificial)
	// example `foo`, either `x` or `y` might be in use at the label `B:`, but
	// as `x` is only in use if we came in from the `var` edge and `y` only
	// if we came from the `!var` edge, they still can't be in use together.
	// See PR32488 for an important real-life case.
	//
	// If we wanted to find all points of interference precisely, we could
	// propagate `S in-use` and `S&T in-use` predicates through the CFG. That
	// would be precise, but requires propagating `O(n^2)` dataflow facts.
	//
	// However, we aren't interested in the set of points of interference
	// between 2 stack slots, only whether there is such a point. So we
	// can rely on a little trick: for `S` and `T` to be in-use together,
	// one of them needs to become in-use while the other is in-use (or
	// they might both become in use simultaneously). We can check this
	// by also keeping track of the points at which a stack slot might start
	// being in-use.
	//
	// Exact first use:
	// ----------------
	//
	// Consider the following motivating example:
	//
	// int foo() {
	// char b1[1024], b2[1024];
	// if (...) {
	// char b3[1024];
	// <uses of b1, b3>;
	// return x;
	// } else {
	// char b4[1024], b5[1024];
	// <uses of b2, b4, b5>;
	// return y;
	// }
	// }
	//
	// In the code above, "b3" and "b4" are declared in distinct lexical
	// scopes, meaning that it is easy to prove that they can share the
	// same stack slot. Variables "b1" and "b2" are declared in the same
	// scope, meaning that from a lexical point of view, their lifetimes
	// overlap. From a control flow point of view, however, the two
	// variables are accessed in disjoint regions of the CFG, thus it
	// should be possible for them to share the same stack slot. An ideal
	// stack allocation for the function above would look like:
	//
	// slot 0: b1, b2
	// slot 1: b3, b4
	// slot 2: b5
	//
	// Achieving this allocation is tricky, however, due to the way
	// lifetime markers are inserted. Here is a simplified view of the
	// control flow graph for the code above:
	//
	//                +------  block 0 -------+
	//               0| LIFETIME_START b1, b2 |
	//               1| <test 'if' condition> |
	//                +-----------------------+
	//                   ./              \.
	//   +------  block 1 -------+   +------  block 2 -------+
	//  2| LIFETIME_START b3     |  5| LIFETIME_START b4, b5 |
	//  3| <uses of b1, b3>      |  6| <uses of b2, b4, b5>  |
	//  4| LIFETIME_END b3       |  7| LIFETIME_END b4, b5   |
	//   +-----------------------+   +-----------------------+
	//                   \.              /.
	//                +------  block 3 -------+
	//               8| <cleanupcode>         |
	//               9| LIFETIME_END b1, b2   |
	//              10| return                |
	//                +-----------------------+
	//
	// If we create live intervals for the variables above strictly based
	// on the lifetime markers, we'll get the set of intervals on the
	// left. If we ignore the lifetime start markers and instead treat a
	// variable's lifetime as beginning with the first reference to the
	// var, then we get the intervals on the right.
	//
	//            LIFETIME_START      First Use
	//     b1:    [0,9]               [3,4] [8,9]
	//     b2:    [0,9]               [6,9]
	//     b3:    [2,4]               [3,4]
	//     b4:    [5,7]               [6,7]
	//     b5:    [5,7]               [6,7]
	//
	// For the intervals on the left, the best we can do is overlap two
	// variables (b3 and b4, for example); this gives us a stack size of
	// 4*1024 bytes, not ideal. When treating first-use as the start of a
	// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
	// byte stack (better).
	//
	// Degenerate Slots:
	// -----------------
	//
	// Relying entirely on first-use of stack slots is problematic,
	// however, due to the fact that optimizations can sometimes migrate
	// uses of a variable outside of its lifetime start/end region. Here
	// is an example:
	//
	// int bar() {
	// char b1[1024], b2[1024];
	// if (...) {
	// <uses of b2>
	// return y;
	// } else {
	// <uses of b1>
	// while (...) {
	// char b3[1024];
	// <uses of b3>
	// }
	// }
	// }
	//
	// Before optimization, the control flow graph for the code above
	// might look like the following:
	//
	//                +------  block 0 -------+
	//               0| LIFETIME_START b1, b2 |
	//               1| <test 'if' condition> |
	//                +-----------------------+
	//                   ./              \.
	//   +------  block 1 -------+    +------- block 2 -------+
	//  2| <uses of b2>          |   3| <uses of b1>          |
	//   +-----------------------+    +-----------------------+
	//              |                            |
	//              |                 +------- block 3 -------+ <-\.
	//              |                4| <while condition>     |    |
	//              |                 +-----------------------+    |
	//              |               /          |                   |
	//              |              /  +------- block 4 -------+
	//              \             /  5| LIFETIME_START b3     |    |
	//               \           /   6| <uses of b3>          |    |
	//                \         /    7| LIFETIME_END b3       |    |
	//                 \        |    +------------------------+    |
	//                  \       |                 \                /
	//                +------  block 5 -----+      \---------------
	//               8| <cleanupcode>       |
	//               9| LIFETIME_END b1, b2 |
	//              10| return              |
	//                +---------------------+
	//
	// During optimization, however, it can happen that an instruction
	// computing an address in "b3" (for example, a loop-invariant GEP) is
	// hoisted up out of the loop from block 4 to block 2. [Note that
	// this is not an actual load from the stack, only an instruction that
	// computes the address to be loaded]. If this happens, there is now a
	// path leading from the first use of b3 to the return instruction
	// that does not encounter the b3 LIFETIME_END, hence b3's lifetime is
	// now larger than if we were computing live intervals strictly based
	// on lifetime markers. In the example above, this lengthened lifetime
	// would mean that it would appear illegal to overlap b3 with b2.
	//
	// To deal with such cases, the code in ::collectMarkers() below
	// tries to identify "degenerate" slots -- those slots where on a single
	// forward pass through the CFG we encounter a first reference to slot
	// K before we hit the slot K lifetime start marker. For such slots,
	// we fall back on using the lifetime start marker as the beginning of
	// the variable's lifetime. NB: with this implementation, slots can
	// appear degenerate in cases where there is unstructured control flow:
	//
	// if (q) goto mid;
	// if (x > 9) {
	// int b[100];
	// memcpy(&b[0], ...);
	// mid: b[k] = ...;
	// abc(&b);
	// }
	//
	// If in RPO ordering chosen to walk the CFG we happen to visit the b[k]
	// before visiting the memcpy block (which will contain the lifetime start
	// for "b"), then it will appear that 'b' has a degenerate lifetime.
	//

	namespace {
	/// StackColoring - A machine pass for merging disjoint stack allocations,
	/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
	class StackColoring : public MachineFunctionPass {
	MachineFrameInfo *MFI;
	MachineFunction *MF;

	/// A class representing liveness information for a single basic block.
	/// Each bit in the BitVector represents the liveness property
	/// for a different stack slot.
	struct BlockLifetimeInfo {
	/// Which slots BEGIN in this basic block.
	BitVector Begin;
	/// Which slots END in this basic block.
	BitVector End;
	/// Which slots are marked as LIVE_IN, coming into this basic block.
	BitVector LiveIn;
	/// Which slots are marked as LIVE_OUT, coming out of this basic block.
	BitVector LiveOut;
	};

	/// Maps active slots (per bit) for each basic block.
	typedef DenseMap<const MachineBasicBlock*, BlockLifetimeInfo> LivenessMap;
	LivenessMap BlockLiveness;

	/// Maps basic blocks to a serial number (assigned in collectMarkers()).
	DenseMap<const MachineBasicBlock*, int> BasicBlocks;
	/// Maps serial numbers to basic blocks (index == serial number).
	SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering;

	/// Maps slots to their use interval. Outside of this interval, slots
	/// values are either dead or `undef` and they will not be written to.
	SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
	/// Maps slots to the points where they can become in-use.
	SmallVector<SmallVector<SlotIndex, 4>, 16> LiveStarts;
	/// VNInfo is used for the construction of LiveIntervals.
	VNInfo::Allocator VNInfoAllocator;
	/// SlotIndex analysis object.
	SlotIndexes *Indexes;
	/// The stack protector object.
	StackProtector *SP;

	/// The list of lifetime markers found. These markers are to be removed
	/// once the coloring is done.
	SmallVector<MachineInstr*, 8> Markers;

	/// Record the FI slots for which we have seen some sort of
	/// lifetime marker (either start or end).
	BitVector InterestingSlots;

	/// FI slots that need to be handled conservatively (for these
	/// slots lifetime-start-on-first-use is disabled).
	BitVector ConservativeSlots;

	/// Number of iterations taken during data flow analysis.
	unsigned NumIterations;

	public:
	static char ID;
	/// Registers the pass with the global PassRegistry on construction.
	StackColoring() : MachineFunctionPass(ID) {
	initializeStackColoringPass(*PassRegistry::getPassRegistry());
	}
	void getAnalysisUsage(AnalysisUsage &AU) const override;
	bool runOnMachineFunction(MachineFunction &MF) override;

	private:
	/// Debug.
	void dump() const;
	void dumpIntervals() const;
	void dumpBB(MachineBasicBlock *MBB) const;
	void dumpBV(const char *tag, const BitVector &BV) const;

	/// Removes all of the lifetime marker instructions from the function.
	/// \returns true if any markers were removed.
	bool removeAllMarkers();

	/// Scan the machine function and find all of the lifetime markers.
	/// Record the findings in the per-block Begin and End vectors.
	/// \returns the number of markers found.
	unsigned collectMarkers(unsigned NumSlot);

	/// Perform the dataflow calculation and calculate the lifetime for each of
	/// the slots, based on the Begin/End vectors. Sets the LiveIn and LiveOut
	/// vectors in BlockLiveness, which represent which stack slots are live
	/// coming in and out of blocks.
	void calculateLocalLiveness();

	/// Returns TRUE if we're using the first-use-begins-lifetime method for
	/// this slot (if FALSE, then the start marker is treated as start of lifetime).
	/// First-use is off whenever LifetimeStartOnFirstUse is disabled,
	/// ProtectFromEscapedAllocas is enabled, or the slot is in ConservativeSlots.
	bool applyFirstUse(int Slot) {
	if (!LifetimeStartOnFirstUse \|\| ProtectFromEscapedAllocas)
	return false;
	if (ConservativeSlots.test(Slot))
	return false;
	return true;
	}

	/// Examines the specified instruction and returns TRUE if the instruction
	/// represents the start or end of an interesting lifetime. The slot or slots
	/// starting or ending are added to the vector "slots" and "isStart" is set
	/// accordingly.
	/// \returns True if inst contains a lifetime start or end
	bool isLifetimeStartOrEnd(const MachineInstr &MI,
	SmallVector<int, 4> &slots,
	bool &isStart);

	/// Construct the LiveIntervals for the slots.
	void calculateLiveIntervals(unsigned NumSlots);

	/// Go over the machine function and change instructions which use stack
	/// slots to use the joint slots.
	void remapInstructions(DenseMap<int, int> &SlotRemap);

	/// The input program may contain instructions which are not inside lifetime
	/// markers. This can happen due to a bug in the compiler or due to a bug in
	/// user code (for example, returning a reference to a local variable).
	/// This procedure checks all of the instructions in the function and
	/// invalidates lifetime ranges which do not contain all of the instructions
	/// which access that frame slot.
	void removeInvalidSlotRanges();

	/// Map entries which point to other entries to their destination.
	/// A->B->C becomes A->C.
	void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);

	/// Used in collectMarkers
	typedef DenseMap<const MachineBasicBlock*, BitVector> BlockBitVecMap;
	};
	} // end anonymous namespace

	// Definition of the static pass ID declared in the class, plus the
	// llvm::StackColoringID reference that aliases it.
	char StackColoring::ID = 0;
	char &llvm::StackColoringID = StackColoring::ID;

	// Pass registration under the name DEBUG_TYPE ("stack-coloring"), listing
	// SlotIndexes and StackProtector as dependencies.
	INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE,
	"Merge disjoint stack slots", false, false)
	INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
	INITIALIZE_PASS_DEPENDENCY(StackProtector)
	INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE,
	"Merge disjoint stack slots", false, false)

	/// Declares SlotIndexes and StackProtector as required analyses, then lets
	/// the MachineFunctionPass base class add its own requirements to \p AU.
	void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
	AU.addRequired<SlotIndexes>();
	AU.addRequired<StackProtector>();
	MachineFunctionPass::getAnalysisUsage(AU);
	}

	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
	/// Debug helper: writes \p tag followed by every bit of \p BV (in index
	/// order, space separated, inside braces) to the debug stream.
	LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
	                                            const BitVector &BV) const {
	  raw_ostream &OS = dbgs();
	  OS << tag << " : { ";
	  const unsigned NumBits = BV.size();
	  for (unsigned Bit = 0; Bit < NumBits; ++Bit) {
	    OS << BV.test(Bit) << " ";
	  }
	  OS << "}\n";
	}

	/// Debug helper: prints the Begin, End, LiveIn and LiveOut vectors recorded
	/// in BlockLiveness for \p MBB. The block must already have an entry.
	LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
	  auto Entry = BlockLiveness.find(MBB);
	  assert(Entry != BlockLiveness.end() && "Block not found");

	  const BlockLifetimeInfo &Info = Entry->second;
	  dumpBV("BEGIN", Info.Begin);
	  dumpBV("END", Info.End);
	  dumpBV("LIVE_IN", Info.LiveIn);
	  dumpBV("LIVE_OUT", Info.LiveOut);
	}

	/// Debug helper: visits the blocks of the current function in depth-first
	/// order, printing a header line and then the liveness vectors of each.
	LLVM_DUMP_METHOD void StackColoring::dump() const {
	  for (MachineBasicBlock *Block : depth_first(MF)) {
	    raw_ostream &OS = dbgs();
	    OS << "Inspecting block #" << Block->getNumber();
	    OS << " [" << Block->getName() << "]\n";
	    dumpBB(Block);
	  }
	}

	/// Debug helper: dumps every entry of Intervals, each preceded by a line
	/// giving its index.
	LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
	  unsigned Index = 0;
	  for (const auto &Interval : Intervals) {
	    dbgs() << "Interval[" << Index << "]:\n";
	    Interval->dump();
	    ++Index;
	  }
	}
	#endif

	static inline int getStartOrEndSlot(const MachineInstr &MI)
	{
	assert((MI.getOpcode() == TargetOpcode::LIFETIME_START \|\|
	MI.getOpcode() == TargetOpcode::LIFETIME_END) &&
	"Expected LIFETIME_START or LIFETIME_END op");
	const MachineOperand &MO = MI.getOperand(0);
	int Slot = MO.getIndex();
	if (Slot >= 0)
	return Slot;
	return -1;
	}

	//
	// Lifetimes are currently ended only by a LIFETIME_END op, and such an op
	// never starts a lifetime as well. Should a single instruction ever be
	// allowed to both begin and end lifetime(s), this interface (a single
	// isStart flag per instruction) will need to be reworked.
	//
	bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
	                                         SmallVector<int, 4> &slots,
	                                         bool &isStart)
	{
	  const bool IsStartMarker = MI.getOpcode() == TargetOpcode::LIFETIME_START;
	  const bool IsEndMarker = MI.getOpcode() == TargetOpcode::LIFETIME_END;

	  if (IsStartMarker || IsEndMarker) {
	    const int Slot = getStartOrEndSlot(MI);
	    // Markers on negative or uninteresting slots are ignored.
	    if (Slot < 0 || !InterestingSlots.test(Slot))
	      return false;

	    slots.push_back(Slot);
	    if (IsEndMarker) {
	      isStart = false;
	      return true;
	    }

	    // A START marker only opens the lifetime of slots that are not handled
	    // by the first-use scheme. For first-use slots nothing is reported here;
	    // note that the slot has nevertheless been appended to `slots`.
	    if (applyFirstUse(Slot))
	      return false;
	    isStart = true;
	    return true;
	  }

	  // Any other instruction matters only when first-use tracking is enabled,
	  // and debug values never count as uses.
	  if (!LifetimeStartOnFirstUse || ProtectFromEscapedAllocas ||
	      MI.isDebugValue())
	    return false;

	  // Collect every interesting frame-index operand whose slot starts its
	  // lifetime on first use.
	  bool SawTrackedUse = false;
	  for (const MachineOperand &MO : MI.operands()) {
	    if (!MO.isFI())
	      continue;
	    const int Slot = MO.getIndex();
	    if (Slot < 0 || !InterestingSlots.test(Slot) || !applyFirstUse(Slot))
	      continue;
	    slots.push_back(Slot);
	    SawTrackedUse = true;
	  }

	  if (!SawTrackedUse)
	    return false;
	  isStart = true;
	  return true;
	}

	/// Scans the function for lifetime markers and frame-index uses.
	///
	/// Step 1 fills InterestingSlots (slots that have a marker), ConservativeSlots
	/// (slots for which first-use lifetimes must not be applied) and Markers.
	/// Step 2 numbers the blocks and computes the per-block Begin/End vectors in
	/// BlockLiveness. Step 2 is skipped entirely when no marker is found.
	///
	/// \param NumSlot number of stack slots; used to size every per-slot vector.
	/// \returns the number of lifetime markers found (0 if there are none).
	unsigned StackColoring::collectMarkers(unsigned NumSlot)
	{
	unsigned MarkersFound = 0;
	// For each visited block: the slots that are between a START and an END
	// marker at the end of that block.
	BlockBitVecMap SeenStartMap;
	InterestingSlots.clear();
	InterestingSlots.resize(NumSlot);
	ConservativeSlots.clear();
	ConservativeSlots.resize(NumSlot);

	// number of start and end lifetime ops for each slot
	SmallVector<int, 8> NumStartLifetimes(NumSlot, 0);
	SmallVector<int, 8> NumEndLifetimes(NumSlot, 0);

	// Step 1: collect markers and populate the "InterestingSlots"
	// and "ConservativeSlots" sets.
	for (MachineBasicBlock *MBB : depth_first(MF)) {

	// Compute the set of slots for which we've seen a START marker but have
	// not yet seen an END marker at this point in the walk (e.g. on entry
	// to this bb).
	// Only predecessors that were already visited have an entry in
	// SeenStartMap and therefore contribute.
	BitVector BetweenStartEnd;
	BetweenStartEnd.resize(NumSlot);
	for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
	PE = MBB->pred_end(); PI != PE; ++PI) {
	BlockBitVecMap::const_iterator I = SeenStartMap.find(*PI);
	if (I != SeenStartMap.end()) {
	BetweenStartEnd \|= I->second;
	}
	}

	// Walk the instructions in the block to look for start/end ops.
	for (MachineInstr &MI : *MBB) {
	if (MI.getOpcode() == TargetOpcode::LIFETIME_START \|\|
	MI.getOpcode() == TargetOpcode::LIFETIME_END) {
	int Slot = getStartOrEndSlot(MI);
	// Markers with a negative frame index are neither counted nor recorded.
	if (Slot < 0)
	continue;
	InterestingSlots.set(Slot);
	if (MI.getOpcode() == TargetOpcode::LIFETIME_START) {
	BetweenStartEnd.set(Slot);
	NumStartLifetimes[Slot] += 1;
	} else {
	BetweenStartEnd.reset(Slot);
	NumEndLifetimes[Slot] += 1;
	}
	const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
	if (Allocation) {
	DEBUG(dbgs() << "Found a lifetime ");
	DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START
	? "start"
	: "end"));
	DEBUG(dbgs() << " marker for slot #" << Slot);
	DEBUG(dbgs() << " with allocation: " << Allocation->getName()
	<< "\n");
	}
	Markers.push_back(&MI);
	MarkersFound += 1;
	} else {
	// Not a marker: a frame-index operand that refers to a slot which is not
	// currently between START and END makes that slot conservative.
	for (const MachineOperand &MO : MI.operands()) {
	if (!MO.isFI())
	continue;
	int Slot = MO.getIndex();
	if (Slot < 0)
	continue;
	if (! BetweenStartEnd.test(Slot)) {
	ConservativeSlots.set(Slot);
	}
	}
	}
	}
	// Remember the state at the end of this block for its successors.
	BitVector &SeenStart = SeenStartMap[MBB];
	SeenStart \|= BetweenStartEnd;
	}
	if (!MarkersFound) {
	return 0;
	}

	// PR27903: slots with multiple start or end lifetime ops are not
	// safe to enable for "lifetime-start-on-first-use".
	for (unsigned slot = 0; slot < NumSlot; ++slot)
	if (NumStartLifetimes[slot] > 1 \|\| NumEndLifetimes[slot] > 1)
	ConservativeSlots.set(slot);
	DEBUG(dumpBV("Conservative slots", ConservativeSlots));

	// Step 2: compute begin/end sets for each block

	// NOTE: We use a depth-first iteration to ensure that we obtain a
	// deterministic numbering.
	for (MachineBasicBlock *MBB : depth_first(MF)) {

	// Assign a serial number to this basic block.
	BasicBlocks[MBB] = BasicBlockNumbering.size();
	BasicBlockNumbering.push_back(MBB);

	// Keep a reference to avoid repeated lookups.
	BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB];

	BlockInfo.Begin.resize(NumSlot);
	BlockInfo.End.resize(NumSlot);

	// Begin and End are kept mutually exclusive per slot: each event clears the
	// opposite bit, so the last event seen in the block decides.
	SmallVector<int, 4> slots;
	for (MachineInstr &MI : *MBB) {
	bool isStart = false;
	slots.clear();
	if (isLifetimeStartOrEnd(MI, slots, isStart)) {
	if (!isStart) {
	assert(slots.size() == 1 && "unexpected: MI ends multiple slots");
	int Slot = slots[0];
	if (BlockInfo.Begin.test(Slot)) {
	BlockInfo.Begin.reset(Slot);
	}
	BlockInfo.End.set(Slot);
	} else {
	for (auto Slot : slots) {
	DEBUG(dbgs() << "Found a use of slot #" << Slot);
	DEBUG(dbgs() << " at BB#" << MBB->getNumber() << " index ");
	DEBUG(Indexes->getInstructionIndex(MI).print(dbgs()));
	const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
	if (Allocation) {
	DEBUG(dbgs() << " with allocation: "<< Allocation->getName());
	}
	DEBUG(dbgs() << "\n");
	if (BlockInfo.End.test(Slot)) {
	BlockInfo.End.reset(Slot);
	}
	BlockInfo.Begin.set(Slot);
	}
	}
	}
	}
	}

	// Update statistics.
	NumMarkerSeen += MarkersFound;
	return MarkersFound;
	}

	/// Propagate slot liveness across the CFG until nothing changes any more.
	///
	/// For every block recorded in BasicBlockNumbering, each sweep computes
	///   LiveIn  = union of the LiveOut sets of all predecessors
	///   LiveOut = (LiveIn - End) | Begin
	/// and merges the results into the block's stored LiveIn/LiveOut sets, which
	/// therefore only ever grow. Sweeps are repeated until one of them adds no
	/// new bit; the number of sweeps is stored in NumIterations.
	void StackColoring::calculateLocalLiveness()
	{
	  unsigned NumIters = 0;
	  bool changed = true;
	  while (changed) {
	    changed = false;
	    ++NumIters;

	    for (const MachineBasicBlock *BB : BasicBlockNumbering) {

	      // Use an iterator to avoid repeated lookups.
	      LivenessMap::iterator BI = BlockLiveness.find(BB);
	      assert(BI != BlockLiveness.end() && "Block not found");
	      BlockLifetimeInfo &BlockInfo = BI->second;

	      // Compute LiveIn by unioning together the LiveOut sets of all preds.
	      BitVector LocalLiveIn;
	      for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(),
	           PE = BB->pred_end(); PI != PE; ++PI) {
	        LivenessMap::const_iterator I = BlockLiveness.find(*PI);
	        assert(I != BlockLiveness.end() && "Predecessor not found");
	        LocalLiveIn |= I->second.LiveOut;
	      }

	      // Compute LiveOut by subtracting out lifetimes that end in this
	      // block, then adding in lifetimes that begin in this block. If
	      // we have both BEGIN and END markers in the same basic block
	      // then we know that the BEGIN marker comes after the END,
	      // because we already handle the case where the BEGIN comes
	      // before the END when collecting the markers (and building the
	      // BEGIN/END vectors).
	      BitVector LocalLiveOut = LocalLiveIn;
	      LocalLiveOut.reset(BlockInfo.End);
	      LocalLiveOut |= BlockInfo.Begin;

	      // Update block LiveIn set, noting whether it has changed.
	      // test(RHS) asks whether the local set has a bit the stored set lacks.
	      if (LocalLiveIn.test(BlockInfo.LiveIn)) {
	        changed = true;
	        BlockInfo.LiveIn |= LocalLiveIn;
	      }

	      // Update block LiveOut set, noting whether it has changed.
	      if (LocalLiveOut.test(BlockInfo.LiveOut)) {
	        changed = true;
	        BlockInfo.LiveOut |= LocalLiveOut;
	      }
	    }
	  }// while changed.

	  NumIterations = NumIters;
	}

	void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
	SmallVector<SlotIndex, 16> Starts;
	SmallVector<bool, 16> DefinitelyInUse;

	// For each block, find which slots are active within this block
	// and update the live intervals.
	for (const MachineBasicBlock &MBB : *MF) {
	Starts.clear();
	Starts.resize(NumSlots);
	DefinitelyInUse.clear();
	DefinitelyInUse.resize(NumSlots);

	// Start the interval of the slots that we previously found to be 'in-use'.
	BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
	for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
	pos = MBBLiveness.LiveIn.find_next(pos)) {
	Starts[pos] = Indexes->getMBBStartIdx(&MBB);
	}

	// Create the interval for the basic blocks containing lifetime begin/end.
	for (const MachineInstr &MI : MBB) {

	SmallVector<int, 4> slots;
	bool IsStart = false;
	if (!isLifetimeStartOrEnd(MI, slots, IsStart))
	continue;
	SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
	for (auto Slot : slots) {
	if (IsStart) {
	// If a slot is already definitely in use, we don't have to emit
	// a new start marker because there is already a pre-existing
	// one.
	if (!DefinitelyInUse[Slot]) {
	LiveStarts[Slot].push_back(ThisIndex);
	DefinitelyInUse[Slot] = true;
	}
	if (!Starts[Slot].isValid())
	Starts[Slot] = ThisIndex;
	} else {
	if (Starts[Slot].isValid()) {
	VNInfo *VNI = Intervals[Slot]->getValNumInfo(0);
	Intervals[Slot]->addSegment(
	LiveInterval::Segment(Starts[Slot], ThisIndex, VNI));
	Starts[Slot] = SlotIndex(); // Invalidate the start index
	DefinitelyInUse[Slot] = false;
	}
	}
	}
	}

	// Finish up started segments
	for (unsigned i = 0; i < NumSlots; ++i) {
	if (!Starts[i].isValid())
	continue;

	SlotIndex EndIdx = Indexes->getMBBEndIdx(&MBB);
	VNInfo *VNI = Intervals[i]->getValNumInfo(0);
	Intervals[i]->addSegment(LiveInterval::Segment(Starts[i], EndIdx, VNI));
	}
	}
	}

	bool StackColoring::removeAllMarkers() {
	unsigned Count = 0;
	for (MachineInstr *MI : Markers) {
	MI->eraseFromParent();
	Count++;
	}
	Markers.clear();

	DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n");
	return Count;
	}

	/// Rewrite every reference to a remapped stack slot so that it names the slot
	/// given by \p SlotRemap (a map from old slot index to new slot index).
	///
	/// In order, this updates: the slots in the function's variable debug info,
	/// the IR-level allocas (uses of the old alloca are replaced by the new one,
	/// through a bitcast when the types differ), the stack protector's view of
	/// the allocas, the values of machine memory operands, the frame-index
	/// operands of all non-marker instructions, the AA metadata of memory
	/// operands that may refer to a merged alloca, and the catch-object frame
	/// indices in the WinEH function info.
	///
	/// NOTE(review): this listing appears to be a rendered diff; lines beginning
	/// with '+' or '-' look like added/removed lines rather than code. Several
	/// declarations below (e.g. `Inst`, `FromAI`, `Interval`) are later used with
	/// `->`, so their pointer declarators seem to have been lost in rendering --
	/// confirm against the upstream file.
	void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
	// Counters reported in the debug output at the end of the function.
	unsigned FixedInstr = 0;
	unsigned FixedMemOp = 0;
	unsigned FixedDbg = 0;

	// Remap debug information that refers to stack slots.
	for (auto &VI : MF->getVariableDbgInfo()) {
	if (!VI.Var)
	continue;
	if (SlotRemap.count(VI.Slot)) {
	DEBUG(dbgs() << "Remapping debug info for ["
	<< cast<DILocalVariable>(VI.Var)->getName() << "].\n");
	VI.Slot = SlotRemap[VI.Slot];
	FixedDbg++;
	}
	}

	// Keep a list of allocas which need to be remapped.
	DenseMap<const AllocaInst, const AllocaInst> Allocas;
	+
	+ // Keep a list of allocas which has been affected by the remap.
	+ SmallPtrSet<const AllocaInst*, 32> MergedAllocas;
	+
	for (const std::pair<int, int> &SI : SlotRemap) {
	const AllocaInst *From = MFI->getObjectAllocation(SI.first);
	const AllocaInst *To = MFI->getObjectAllocation(SI.second);
	assert(To && From && "Invalid allocation object");
	Allocas[From] = To;

	// AA might be used later for instruction scheduling, and we need it to be
	// able to deduce the correct aliasing relationships between pointers
	// derived from the alloca being remapped and the target of that remapping.
	// The only safe way, without directly informing AA about the remapping
	// somehow, is to directly update the IR to reflect the change being made
	// here.
	Instruction Inst = const_cast<AllocaInst >(To);
	if (From->getType() != To->getType()) {
	// Types differ: uses of From are redirected to a bitcast of To that has
	// From's type, inserted right after To.
	BitCastInst *Cast = new BitCastInst(Inst, From->getType());
	Cast->insertAfter(Inst);
	Inst = Cast;
	}

	+ // We keep both slots to maintain AliasAnalysis metadata later.
	+ MergedAllocas.insert(From);
	+ MergedAllocas.insert(To);
	+
	// Allow the stack protector to adjust its value map to account for the
	// upcoming replacement.
	SP->adjustForColoring(From, To);

	// The new alloca might not be valid in a llvm.dbg.declare for this
	// variable, so undef out the use to make the verifier happy.
	AllocaInst FromAI = const_cast<AllocaInst >(From);
	if (FromAI->isUsedByMetadata())
	ValueAsMetadata::handleRAUW(FromAI, UndefValue::get(FromAI->getType()));
	for (auto &Use : FromAI->uses()) {
	if (BitCastInst *BCI = dyn_cast<BitCastInst>(Use.get()))
	if (BCI->isUsedByMetadata())
	ValueAsMetadata::handleRAUW(BCI, UndefValue::get(BCI->getType()));
	}

	// Note that this will not replace uses in MMOs (which we'll update below),
	// or anywhere else (which is why we won't delete the original
	// instruction).
	FromAI->replaceAllUsesWith(Inst);
	}

	// Remap all instructions to the new stack slots.
	for (MachineBasicBlock &BB : *MF)
	for (MachineInstr &I : BB) {
	// Skip lifetime markers. We'll remove them soon.
	if (I.getOpcode() == TargetOpcode::LIFETIME_START \|\|
	I.getOpcode() == TargetOpcode::LIFETIME_END)
	continue;

	// Update the MachineMemOperand to use the new alloca.
	for (MachineMemOperand *MMO : I.memoperands()) {
	- // FIXME: In order to enable the use of TBAA when using AA in CodeGen,
	- // we'll also need to update the TBAA nodes in MMOs with values
	- // derived from the merged allocas. When doing this, we'll need to use
	- // the same variant of GetUnderlyingObjects that is used by the
	- // instruction scheduler (that can look through ptrtoint/inttoptr
	- // pairs).
	-
	// We've replaced IR-level uses of the remapped allocas, so we only
	// need to replace direct uses here.
	const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue());
	if (!AI)
	continue;

	if (!Allocas.count(AI))
	continue;

	MMO->setValue(Allocas[AI]);
	FixedMemOp++;
	}

	// Update all of the machine instruction operands.
	for (MachineOperand &MO : I.operands()) {
	if (!MO.isFI())
	continue;
	int FromSlot = MO.getIndex();

	// Don't touch arguments.
	if (FromSlot<0)
	continue;

	// Only look at mapped slots.
	if (!SlotRemap.count(FromSlot))
	continue;

	// In a debug build, check that the instruction that we are modifying is
	// inside the expected live range. If the instruction is not inside
	// the calculated range then it means that the alloca usage moved
	// outside of the lifetime markers, or that the user has a bug.
	// NOTE: Alloca address calculations which happen outside the lifetime
	// zone are okay, despite the fact that we don't have a good way
	// for validating all of the usages of the calculation.
	#ifndef NDEBUG
	bool TouchesMemory = I.mayLoad() \|\| I.mayStore();
	// If we don't protect the user from escaped allocas, don't bother
	// validating the instructions.
	if (!I.isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
	SlotIndex Index = Indexes->getInstructionIndex(I);
	const LiveInterval Interval = &Intervals[FromSlot];
	assert(Interval->find(Index) != Interval->end() &&
	"Found instruction usage outside of live range.");
	}
	#endif

	// Fix the machine instructions.
	int ToSlot = SlotRemap[FromSlot];
	MO.setIndex(ToSlot);
	FixedInstr++;
	}
	+
	+ // We adjust AliasAnalysis information for merged stack slots.
	+ MachineSDNode::mmo_iterator NewMemOps =
	+ MF->allocateMemRefsArray(I.getNumMemOperands());
	+ unsigned MemOpIdx = 0;
	+ bool ReplaceMemOps = false;
	+ for (MachineMemOperand *MMO : I.memoperands()) {
	+ // If this memory location can be a slot remapped here,
	+ // we remove AA information.
	+ bool MayHaveConflictingAAMD = false;
	+ if (MMO->getAAInfo()) {
	+ if (const Value *MMOV = MMO->getValue()) {
	+ SmallVector<Value *, 4> Objs;
	+ getUnderlyingObjectsForCodeGen(MMOV, Objs, MF->getDataLayout());
	+
	+ if (Objs.empty())
	+ MayHaveConflictingAAMD = true;
	+ else
	+ for (Value *V : Objs) {
	+ // If this memory location comes from a known stack slot
	+ // that is not remapped, we continue checking.
	+ // Otherwise, we need to invalidate AA information.
	+ const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(V);
	+ if (AI && MergedAllocas.count(AI)) {
	+ MayHaveConflictingAAMD = true;
	+ break;
	+ }
	+ }
	+ }
	+ }
	+ if (MayHaveConflictingAAMD) {
	+ NewMemOps[MemOpIdx++] = MF->getMachineMemOperand(MMO, AAMDNodes());
	+ ReplaceMemOps = true;
	+ }
	+ else
	+ NewMemOps[MemOpIdx++] = MMO;
	+ }
	+
	+ // If any memory operand is updated, set memory references of
	+ // this instruction.
	+ if (ReplaceMemOps)
	+ I.setMemRefs(std::make_pair(NewMemOps, I.getNumMemOperands()));
	}

	// Update the location of C++ catch objects for the MSVC personality routine.
	// A FrameIndex of INT_MAX is skipped rather than looked up in the map.
	if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo())
	for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap)
	for (WinEHHandlerType &H : TBME.HandlerArray)
	if (H.CatchObj.FrameIndex != INT_MAX &&
	SlotRemap.count(H.CatchObj.FrameIndex))
	H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex];

	DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n");
	DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n");
	DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
	}

	/// Discard the live interval of every slot that is accessed by a load or
	/// store lying outside that slot's computed interval.
	///
	/// Lifetime markers and debug values are ignored, as are instructions that
	/// neither load nor store. For each remaining frame-index operand with a
	/// non-negative slot and a non-empty interval, the interval is cleared when
	/// it does not cover the instruction's index, and EscapedAllocas is bumped.
	void StackColoring::removeInvalidSlotRanges() {
	  for (MachineBasicBlock &BB : *MF)
	    for (MachineInstr &I : BB) {
	      if (I.getOpcode() == TargetOpcode::LIFETIME_START ||
	          I.getOpcode() == TargetOpcode::LIFETIME_END || I.isDebugValue())
	        continue;

	      // Some intervals are suspicious! In some cases we find address
	      // calculations outside of the lifetime zone, but not actual memory
	      // read or write. Memory accesses outside of the lifetime zone are a clear
	      // violation, but address calculations are okay. This can happen when
	      // GEPs are hoisted outside of the lifetime zone.
	      // So, in here we only check instructions which can read or write memory.
	      if (!I.mayLoad() && !I.mayStore())
	        continue;

	      // Check all of the machine operands.
	      for (const MachineOperand &MO : I.operands()) {
	        if (!MO.isFI())
	          continue;

	        int Slot = MO.getIndex();

	        // Negative frame indices are not tracked in Intervals.
	        if (Slot<0)
	          continue;

	        // Nothing to invalidate (this also skips slots already cleared).
	        if (Intervals[Slot]->empty())
	          continue;

	        // Check that the used slot is inside the calculated lifetime range.
	        // If it is not, warn about it and invalidate the range.
	        LiveInterval *Interval = &*Intervals[Slot];
	        SlotIndex Index = Indexes->getInstructionIndex(I);
	        if (Interval->find(Index) == Interval->end()) {
	          Interval->clear();
	          DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
	          EscapedAllocas++;
	        }
	      }
	    }
	}

	/// Collapse chains in \p SlotRemap so that each remapped slot below
	/// \p NumSlots points directly at the end of its chain.
	///
	/// If slot A maps to B and B maps to C, A is rewritten to map to C (and so
	/// on, for as long as the current destination is itself a key of the map).
	void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
	                                   unsigned NumSlots) {
	  for (unsigned Slot = 0; Slot != NumSlots; ++Slot) {
	    // Slots that are not being remapped need no work.
	    if (!SlotRemap.count(Slot))
	      continue;

	    // Chase the chain, storing each intermediate destination as we go.
	    for (int Dest = SlotRemap[Slot]; SlotRemap.count(Dest);) {
	      Dest = SlotRemap[Dest];
	      SlotRemap[Slot] = Dest;
	    }
	  }
	}

	/// Pass entry point: merge stack slots whose lifetimes do not overlap.
	///
	/// Resets the per-function state, collects the lifetime markers, and bails
	/// out (after deleting the markers) when there are fewer than two markers,
	/// the total object size is under 16 bytes, coloring is disabled, or the
	/// function is to be skipped. Otherwise it builds a live interval per slot,
	/// optionally drops intervals of slots accessed outside their lifetime,
	/// greedily merges disjoint slots (largest first), and rewrites the function
	/// to use the surviving slots.
	///
	/// \returns false when the function has no stack slots; otherwise the result
	///          of removeAllMarkers().
	bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
	  DEBUG(dbgs() << "******** Stack Coloring ********\n"
	               << "********** Function: "
	               << ((const Value*)Func.getFunction())->getName() << '\n');
	  MF = &Func;
	  MFI = &MF->getFrameInfo();
	  Indexes = &getAnalysis<SlotIndexes>();
	  SP = &getAnalysis<StackProtector>();
	  BlockLiveness.clear();
	  BasicBlocks.clear();
	  BasicBlockNumbering.clear();
	  Markers.clear();
	  Intervals.clear();
	  LiveStarts.clear();
	  VNInfoAllocator.Reset();

	  unsigned NumSlots = MFI->getObjectIndexEnd();

	  // If there are no stack slots then there are no markers to remove.
	  if (!NumSlots)
	    return false;

	  SmallVector<int, 8> SortedSlots;
	  SortedSlots.reserve(NumSlots);
	  Intervals.reserve(NumSlots);
	  LiveStarts.resize(NumSlots);

	  unsigned NumMarkers = collectMarkers(NumSlots);

	  unsigned TotalSize = 0;
	  DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
	  DEBUG(dbgs()<<"Slot structure:\n");

	  for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
	    DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
	    TotalSize += MFI->getObjectSize(i);
	  }

	  DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");

	  // Don't continue because there are not enough lifetime markers, or the
	  // stack is too small, or we are told not to optimize the slots.
	  if (NumMarkers < 2 || TotalSize < 16 || DisableColoring ||
	      skipFunction(*Func.getFunction())) {
	    DEBUG(dbgs()<<"Will not try to merge slots.\n");
	    return removeAllMarkers();
	  }

	  // One interval per slot, each with a single value number (index 0) that
	  // all later segments are attached to.
	  for (unsigned i=0; i < NumSlots; ++i) {
	    std::unique_ptr<LiveInterval> LI(new LiveInterval(i, 0));
	    LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
	    Intervals.push_back(std::move(LI));
	    SortedSlots.push_back(i);
	  }

	  // Calculate the liveness of each block.
	  calculateLocalLiveness();
	  DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
	  DEBUG(dump());

	  // Propagate the liveness information.
	  calculateLiveIntervals(NumSlots);
	  DEBUG(dumpIntervals());

	  // Search for allocas which are used outside of the declared lifetime
	  // markers.
	  if (ProtectFromEscapedAllocas)
	    removeInvalidSlotRanges();

	  // Maps old slots to new slots.
	  DenseMap<int, int> SlotRemap;
	  unsigned RemovedSlots = 0;
	  unsigned ReducedSize = 0;

	  // Do not bother looking at empty intervals.
	  for (unsigned I = 0; I < NumSlots; ++I) {
	    if (Intervals[SortedSlots[I]]->empty())
	      SortedSlots[I] = -1;
	  }

	  // This is a simple greedy algorithm for merging allocas. First, sort the
	  // slots, placing the largest slots first. Next, perform an n^2 scan and look
	  // for disjoint slots. When you find disjoint slots, merge the smaller one
	  // into the bigger one and update the live interval. Remove the small alloca
	  // and continue.

	  // Sort the slots according to their size. Place unused slots at the end.
	  // Use stable sort to guarantee deterministic code generation.
	  std::stable_sort(SortedSlots.begin(), SortedSlots.end(),
	                   [this](int LHS, int RHS) {
	    // We use -1 to denote a uninteresting slot. Place these slots at the end.
	    if (LHS == -1) return false;
	    if (RHS == -1) return true;
	    // Sort according to size.
	    return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
	  });

	  // Sort each slot's start list; the merge step below relies on
	  // std::inplace_merge, which needs both halves sorted.
	  for (auto &s : LiveStarts)
	    std::sort(s.begin(), s.end());

	  bool Changed = true;
	  while (Changed) {
	    Changed = false;
	    for (unsigned I = 0; I < NumSlots; ++I) {
	      if (SortedSlots[I] == -1)
	        continue;

	      for (unsigned J=I+1; J < NumSlots; ++J) {
	        if (SortedSlots[J] == -1)
	          continue;

	        int FirstSlot = SortedSlots[I];
	        int SecondSlot = SortedSlots[J];
	        LiveInterval *First = &*Intervals[FirstSlot];
	        LiveInterval *Second = &*Intervals[SecondSlot];
	        auto &FirstS = LiveStarts[FirstSlot];
	        auto &SecondS = LiveStarts[SecondSlot];
	        assert (!First->empty() && !Second->empty() && "Found an empty range");

	        // Merge disjoint slots. This is a little bit tricky - see the
	        // Implementation Notes section for an explanation.
	        if (!First->isLiveAtIndexes(SecondS) &&
	            !Second->isLiveAtIndexes(FirstS)) {
	          Changed = true;
	          First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));

	          // Append the second slot's starts and re-establish sorted order.
	          int OldSize = FirstS.size();
	          FirstS.append(SecondS.begin(), SecondS.end());
	          auto Mid = FirstS.begin() + OldSize;
	          std::inplace_merge(FirstS.begin(), Mid, FirstS.end());

	          SlotRemap[SecondSlot] = FirstSlot;
	          SortedSlots[J] = -1;
	          DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
	                SecondSlot<<" together.\n");
	          unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
	                                           MFI->getObjectAlignment(SecondSlot));

	          assert(MFI->getObjectSize(FirstSlot) >=
	                 MFI->getObjectSize(SecondSlot) &&
	                 "Merging a small object into a larger one");

	          RemovedSlots+=1;
	          ReducedSize += MFI->getObjectSize(SecondSlot);
	          MFI->setObjectAlignment(FirstSlot, MaxAlignment);
	          MFI->RemoveStackObject(SecondSlot);
	        }
	      }
	    }
	  }// While changed.

	  // Record statistics.
	  StackSpaceSaved += ReducedSize;
	  StackSlotMerged += RemovedSlots;
	  DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<<
	        ReducedSize<<" bytes\n");

	  // Scan the entire function and update all machine operands that use frame
	  // indices to use the remapped frame index.
	  expungeSlotMap(SlotRemap, NumSlots);
	  remapInstructions(SlotRemap);

	  return removeAllMarkers();
	}
	Index: head/contrib/llvm/lib/IR/ConstantFold.cpp
	===================================================================
	--- head/contrib/llvm/lib/IR/ConstantFold.cpp (revision 322319)
	+++ head/contrib/llvm/lib/IR/ConstantFold.cpp (revision 322320)
	@@ -1,2275 +1,2279 @@
	//===- ConstantFold.cpp - LLVM constant folder ----------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements folding of constants for LLVM. This implements the
	// (internal) ConstantFold.h interface, which is used by the
	// ConstantExpr::get* methods to automatically fold constants when possible.
	//
	// The current constant folding implementation is implemented in two pieces: the
	// pieces that don't need DataLayout, and the pieces that do. This is to avoid
	// a dependence in IR on Target.
	//
	//===----------------------------------------------------------------------===//

	#include "ConstantFold.h"
	#include "llvm/ADT/APSInt.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Operator.h"
	#include "llvm/IR/PatternMatch.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/ManagedStatic.h"
	#include "llvm/Support/MathExtras.h"
	using namespace llvm;
	using namespace llvm::PatternMatch;

	//===----------------------------------------------------------------------===//
	// ConstantFold*Instruction Implementations
	//===----------------------------------------------------------------------===//

	/// Convert the specified vector Constant node to the specified vector type.
	/// At this point, we know that the elements of the input vector constant are
	/// all simple integer or FP values.
	///
	/// \param CV    the vector constant to convert.
	/// \param DstTy the vector type to convert to.
	/// \returns the converted constant, or nullptr when source and destination
	///          have a different number of elements.
	static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {

	  if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
	  if (CV->isNullValue()) return Constant::getNullValue(DstTy);

	  // If this cast changes element count then we can't handle it here:
	  // doing so requires endianness information. This should be handled by
	  // Analysis/ConstantFolding.cpp
	  unsigned NumElts = DstTy->getNumElements();
	  if (NumElts != CV->getType()->getVectorNumElements())
	    return nullptr;

	  Type *DstEltTy = DstTy->getElementType();

	  // Bitcast element by element, using i32 constants as extract indices.
	  SmallVector<Constant*, 16> Result;
	  Type *Ty = IntegerType::get(CV->getContext(), 32);
	  for (unsigned i = 0; i != NumElts; ++i) {
	    Constant *C =
	      ConstantExpr::getExtractElement(CV, ConstantInt::get(Ty, i));
	    C = ConstantExpr::getBitCast(C, DstEltTy);
	    Result.push_back(C);
	  }

	  return ConstantVector::get(Result);
	}

	/// This function determines which opcode to use to fold two constant cast
	/// expressions together. It uses CastInst::isEliminableCastPair to determine
	/// the opcode. Consequently its just a wrapper around that function.
	/// @brief Determine if it is valid to fold a cast of a cast
	static unsigned
	foldConstantCastPair(
	unsigned opc, ///< opcode of the second cast constant expression
	ConstantExpr *Op, ///< the first cast constant expression
	Type *DstTy ///< destination type of the first cast
	) {
	assert(Op && Op->isCast() && "Can't fold cast of cast without a cast!");
	assert(DstTy && DstTy->isFirstClassType() && "Invalid cast destination type");
	assert(CastInst::isCast(opc) && "Invalid cast opcode");

	// The types and opcodes for the two Cast constant expressions
	Type *SrcTy = Op->getOperand(0)->getType();
	Type *MidTy = Op->getType();
	Instruction::CastOps firstOp = Instruction::CastOps(Op->getOpcode());
	Instruction::CastOps secondOp = Instruction::CastOps(opc);

	// Assume that pointers are never more than 64 bits wide, and only use this
	// for the middle type. Otherwise we could end up folding away illegal
	// bitcasts between address spaces with different sizes.
	IntegerType *FakeIntPtrTy = Type::getInt64Ty(DstTy->getContext());

	// Let CastInst::isEliminableCastPair do the heavy lifting.
	return CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy,
	nullptr, FakeIntPtrTy, nullptr);
	}

	/// Try to fold a bitcast of the constant \p V to type \p DestTy.
	///
	/// Handled cases, in order: a cast to the same type (returns \p V); a pointer
	/// cast within one address space that lands on a leading element of the
	/// pointee (becomes an all-zero inbounds GEP); vector-to-vector and
	/// scalar-to-vector casts; a null pointer; an integer constant cast to an
	/// integer or to a floating-point type other than ppc_fp128; and a
	/// floating-point constant other than ppc_fp128 cast to an integer.
	///
	/// \returns the folded constant, or nullptr when no folding applies.
	static Constant *FoldBitCast(Constant *V, Type *DestTy) {
	  Type *SrcTy = V->getType();
	  if (SrcTy == DestTy)
	    return V; // no-op cast

	  // Check to see if we are casting a pointer to an aggregate to a pointer to
	  // the first element. If so, return the appropriate GEP instruction.
	  if (PointerType *PTy = dyn_cast<PointerType>(V->getType()))
	    if (PointerType *DPTy = dyn_cast<PointerType>(DestTy))
	      if (PTy->getAddressSpace() == DPTy->getAddressSpace()
	          && PTy->getElementType()->isSized()) {
	        SmallVector<Value*, 8> IdxList;
	        Value *Zero =
	          Constant::getNullValue(Type::getInt32Ty(DPTy->getContext()));
	        IdxList.push_back(Zero);
	        // Descend through first struct members / sequential element types
	        // until the destination pointee type is reached or we get stuck.
	        Type *ElTy = PTy->getElementType();
	        while (ElTy != DPTy->getElementType()) {
	          if (StructType *STy = dyn_cast<StructType>(ElTy)) {
	            if (STy->getNumElements() == 0) break;
	            ElTy = STy->getElementType(0);
	            IdxList.push_back(Zero);
	          } else if (SequentialType *STy =
	                     dyn_cast<SequentialType>(ElTy)) {
	            ElTy = STy->getElementType();
	            IdxList.push_back(Zero);
	          } else {
	            break;
	          }
	        }

	        if (ElTy == DPTy->getElementType())
	          // This GEP is inbounds because all indices are zero.
	          return ConstantExpr::getInBoundsGetElementPtr(PTy->getElementType(),
	                                                        V, IdxList);
	      }

	  // Handle casts from one vector constant to another. We know that the src
	  // and dest type have the same size (otherwise it's an illegal cast).
	  if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
	    if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
	      assert(DestPTy->getBitWidth() == SrcTy->getBitWidth() &&
	             "Not cast between same sized vectors!");
	      // SrcTy is otherwise only read inside the assert above.
	      SrcTy = nullptr;
	      // First, check for null. Undef is already handled.
	      if (isa<ConstantAggregateZero>(V))
	        return Constant::getNullValue(DestTy);

	      // Handle ConstantVector and ConstantAggregateVector.
	      return BitCastConstantVector(V, DestPTy);
	    }

	    // Canonicalize scalar-to-vector bitcasts into vector-to-vector bitcasts
	    // This allows for other simplifications (although some of them
	    // can only be handled by Analysis/ConstantFolding.cpp).
	    if (isa<ConstantInt>(V) || isa<ConstantFP>(V))
	      return ConstantExpr::getBitCast(ConstantVector::get(V), DestPTy);
	  }

	  // Finally, implement bitcast folding now. The code below doesn't handle
	  // bitcast right.
	  if (isa<ConstantPointerNull>(V)) // ptr->ptr cast.
	    return ConstantPointerNull::get(cast<PointerType>(DestTy));

	  // Handle integral constant input.
	  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	    if (DestTy->isIntegerTy())
	      // Integral -> Integral. This is a no-op because the bit widths must
	      // be the same. Consequently, we just fold to V.
	      return V;

	    // See note below regarding the PPC_FP128 restriction.
	    if (DestTy->isFloatingPointTy() && !DestTy->isPPC_FP128Ty())
	      return ConstantFP::get(DestTy->getContext(),
	                             APFloat(DestTy->getFltSemantics(),
	                                     CI->getValue()));

	    // Otherwise, can't fold this (vector?)
	    return nullptr;
	  }

	  // Handle ConstantFP input: FP -> Integral.
	  if (ConstantFP *FP = dyn_cast<ConstantFP>(V)) {
	    // PPC_FP128 is really the sum of two consecutive doubles, where the first
	    // double is always stored first in memory, regardless of the target
	    // endianness. The memory layout of i128, however, depends on the target
	    // endianness, and so we can't fold this without target endianness
	    // information. This should instead be handled by
	    // Analysis/ConstantFolding.cpp
	    if (FP->getType()->isPPC_FP128Ty())
	      return nullptr;

	    // Make sure dest type is compatible with the folded integer constant.
	    if (!DestTy->isIntegerTy())
	      return nullptr;

	    return ConstantInt::get(FP->getContext(),
	                            FP->getValueAPF().bitcastToAPInt());
	  }

	  return nullptr;
	}


	/// C is an integer constant which only has a subset of its bytes used.
	/// The bytes used are indicated by ByteStart (which is the first byte used,
	/// counting from the least significant byte) and ByteSize, which is the number
	/// of bytes used.
	///
	/// This function analyzes the specified constant to see if the specified byte
	/// range can be returned as a simplified constant. If so, the constant is
	/// returned, otherwise null is returned.
	///
	/// Shift amounts are kept as APInt throughout so that amounts wider than 64
	/// bits, or larger than the operand, are compared without truncation or
	/// unsigned wrap-around.
	static Constant *ExtractConstantBytes(Constant *C, unsigned ByteStart,
	                                      unsigned ByteSize) {
	  assert(C->getType()->isIntegerTy() &&
	         (cast<IntegerType>(C->getType())->getBitWidth() & 7) == 0 &&
	         "Non-byte sized integer input");
	  unsigned CSize = cast<IntegerType>(C->getType())->getBitWidth()/8;
	  assert(ByteSize && "Must be accessing some piece");
	  assert(ByteStart+ByteSize <= CSize && "Extracting invalid piece from input");
	  assert(ByteSize != CSize && "Should not extract everything");

	  // Constant Integers are simple.
	  if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) {
	    APInt V = CI->getValue();
	    if (ByteStart)
	      V.lshrInPlace(ByteStart*8);
	    V = V.trunc(ByteSize*8);
	    return ConstantInt::get(CI->getContext(), V);
	  }

	  // If the input is a constant expr, we might be able to recursively simplify.
	  // If not, we definitely can't do anything.
	  ConstantExpr *CE = dyn_cast<ConstantExpr>(C);
	  if (!CE) return nullptr;

	  switch (CE->getOpcode()) {
	  default: return nullptr;
	  case Instruction::Or: {
	    Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize);
	    if (!RHS)
	      return nullptr;

	    // X | -1 -> -1.
	    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS))
	      if (RHSC->isMinusOne())
	        return RHSC;

	    Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
	    if (!LHS)
	      return nullptr;
	    return ConstantExpr::getOr(LHS, RHS);
	  }
	  case Instruction::And: {
	    Constant *RHS = ExtractConstantBytes(CE->getOperand(1), ByteStart,ByteSize);
	    if (!RHS)
	      return nullptr;

	    // X & 0 -> 0.
	    if (RHS->isNullValue())
	      return RHS;

	    Constant *LHS = ExtractConstantBytes(CE->getOperand(0), ByteStart,ByteSize);
	    if (!LHS)
	      return nullptr;
	    return ConstantExpr::getAnd(LHS, RHS);
	  }
	  case Instruction::LShr: {
	    ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
	    if (!Amt)
	      return nullptr;
	    // Shift amount, first in bits and (after the check below) in bytes.
	    APInt ShAmt = Amt->getValue();
	    // Cannot analyze non-byte shifts.
	    if ((ShAmt & 7) != 0)
	      return nullptr;
	    ShAmt.lshrInPlace(3);

	    // If the extract is known to be all zeros, return zero.
	    // (ByteStart < CSize is guaranteed by the asserts above.)
	    if (ShAmt.uge(CSize - ByteStart))
	      return Constant::getNullValue(IntegerType::get(CE->getContext(),
	                                                     ByteSize*8));
	    // If the extract is known to be fully in the input, extract it.
	    if (ShAmt.ule(CSize - (ByteStart + ByteSize)))
	      return ExtractConstantBytes(CE->getOperand(0),
	                                  ByteStart + ShAmt.getZExtValue(), ByteSize);

	    // TODO: Handle the 'partially zero' case.
	    return nullptr;
	  }

	  case Instruction::Shl: {
	    ConstantInt *Amt = dyn_cast<ConstantInt>(CE->getOperand(1));
	    if (!Amt)
	      return nullptr;
	    // Shift amount, first in bits and (after the check below) in bytes.
	    APInt ShAmt = Amt->getValue();
	    // Cannot analyze non-byte shifts.
	    if ((ShAmt & 7) != 0)
	      return nullptr;
	    ShAmt.lshrInPlace(3);

	    // If the extract is known to be all zeros, return zero.
	    if (ShAmt.uge(ByteStart + ByteSize))
	      return Constant::getNullValue(IntegerType::get(CE->getContext(),
	                                                     ByteSize*8));
	    // If the extract is known to be fully in the input, extract it.
	    if (ShAmt.ule(ByteStart))
	      return ExtractConstantBytes(CE->getOperand(0),
	                                  ByteStart - ShAmt.getZExtValue(), ByteSize);

	    // TODO: Handle the 'partially zero' case.
	    return nullptr;
	  }

	  case Instruction::ZExt: {
	    unsigned SrcBitSize =
	      cast<IntegerType>(CE->getOperand(0)->getType())->getBitWidth();

	    // If extracting something that is completely zero, return 0.
	    if (ByteStart*8 >= SrcBitSize)
	      return Constant::getNullValue(IntegerType::get(CE->getContext(),
	                                                     ByteSize*8));

	    // If exactly extracting the input, return it.
	    if (ByteStart == 0 && ByteSize*8 == SrcBitSize)
	      return CE->getOperand(0);

	    // If extracting something completely in the input, if the input is a
	    // multiple of 8 bits, recurse.
	    if ((SrcBitSize&7) == 0 && (ByteStart+ByteSize)*8 <= SrcBitSize)
	      return ExtractConstantBytes(CE->getOperand(0), ByteStart, ByteSize);

	    // Otherwise, if extracting a subset of the input, which is not multiple of
	    // 8 bits, do a shift and trunc to get the bits.
	    if ((ByteStart+ByteSize)*8 < SrcBitSize) {
	      assert((SrcBitSize&7) && "Shouldn't get byte sized case here");
	      Constant *Res = CE->getOperand(0);
	      if (ByteStart)
	        Res = ConstantExpr::getLShr(Res,
	                                 ConstantInt::get(Res->getType(), ByteStart*8));
	      return ConstantExpr::getTrunc(Res, IntegerType::get(C->getContext(),
	                                                          ByteSize*8));
	    }

	    // TODO: Handle the 'partially zero' case.
	    return nullptr;
	  }
	  }
	}

	/// Return a ConstantExpr with type DestTy for sizeof on Ty, with any known
	/// factors factored out. If Folded is false, return null if no factoring was
	/// possible, to avoid endlessly bouncing an unfoldable expression back into the
	/// top-level folder.
	///
	/// Factoring rules applied here: an array becomes element-size times element
	/// count; a non-packed struct becomes zero when empty, or member-size times
	/// member count when every member folds to the same size constant; a pointer
	/// whose pointee is not i1 is rewritten to an i1 pointer in the same address
	/// space.
	static Constant *getFoldedSizeOf(Type *Ty, Type *DestTy, bool Folded) {
	  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
	    Constant *N = ConstantInt::get(DestTy, ATy->getNumElements());
	    Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
	    return ConstantExpr::getNUWMul(E, N);
	  }

	  if (StructType *STy = dyn_cast<StructType>(Ty))
	    if (!STy->isPacked()) {
	      unsigned NumElems = STy->getNumElements();
	      // An empty struct has size zero.
	      if (NumElems == 0)
	        return ConstantExpr::getNullValue(DestTy);
	      // Check for a struct with all members having the same size.
	      // Sizes are compared as Constant pointers, i.e. by identity.
	      Constant *MemberSize =
	        getFoldedSizeOf(STy->getElementType(0), DestTy, true);
	      bool AllSame = true;
	      for (unsigned i = 1; i != NumElems; ++i)
	        if (MemberSize !=
	            getFoldedSizeOf(STy->getElementType(i), DestTy, true)) {
	          AllSame = false;
	          break;
	        }
	      if (AllSame) {
	        Constant *N = ConstantInt::get(DestTy, NumElems);
	        return ConstantExpr::getNUWMul(MemberSize, N);
	      }
	    }

	  // Pointer size doesn't depend on the pointee type, so canonicalize them
	  // to an arbitrary pointee.
	  if (PointerType *PTy = dyn_cast<PointerType>(Ty))
	    if (!PTy->getElementType()->isIntegerTy(1))
	      return
	        getFoldedSizeOf(PointerType::get(IntegerType::get(PTy->getContext(), 1),
	                                         PTy->getAddressSpace()),
	                        DestTy, true);

	  // If there's no interesting folding happening, bail so that we don't create
	  // a constant that looks like it needs folding but really doesn't.
	  if (!Folded)
	    return nullptr;

	  // Base case: Get a regular sizeof expression.
	  Constant *C = ConstantExpr::getSizeOf(Ty);
	  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
	                                                    DestTy, false),
	                            C, DestTy);
	  return C;
	}

	/// Return a ConstantExpr with type DestTy for alignof on Ty, with any known
	/// factors factored out. If Folded is false, return null if no factoring was
	/// possible, to avoid endlessly bouncing an unfoldable expression back into the
	/// top-level folder.
	///
	/// \param Ty      The type whose alignment is being expressed.
	/// \param DestTy  The type of the constant that is returned.
	/// \param Folded  True when the caller has already simplified something, so
	///                that the plain alignof expression is an acceptable result.
	/// \returns The alignment expression, or null when nothing was factored and
	///          Folded is false.
	static Constant *getFoldedAlignOf(Type *Ty, Type *DestTy, bool Folded) {
	  // The alignment of an array is equal to the alignment of the
	  // array element. Note that this is not always true for vectors.
	  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
	    Constant *C = ConstantExpr::getAlignOf(ATy->getElementType());
	    C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
	                                                      DestTy,
	                                                      false),
	                              C, DestTy);
	    return C;
	  }

	  if (StructType *STy = dyn_cast<StructType>(Ty)) {
	    // Packed structs always have an alignment of 1.
	    if (STy->isPacked())
	      return ConstantInt::get(DestTy, 1);

	    // Otherwise, struct alignment is the maximum alignment of any member.
	    // Without target data, we can't compare much, but we can check to see
	    // if all the members have the same alignment.
	    unsigned NumElems = STy->getNumElements();
	    // An empty struct has minimal alignment.
	    if (NumElems == 0)
	      return ConstantInt::get(DestTy, 1);
	    // Check for a struct with all members having the same alignment.
	    Constant *MemberAlign =
	      getFoldedAlignOf(STy->getElementType(0), DestTy, true);
	    bool AllSame = true;
	    for (unsigned i = 1; i != NumElems; ++i)
	      if (MemberAlign != getFoldedAlignOf(STy->getElementType(i), DestTy, true)) {
	        AllSame = false;
	        break;
	      }
	    if (AllSame)
	      return MemberAlign;
	  }

	  // Pointer alignment doesn't depend on the pointee type, so canonicalize them
	  // to an arbitrary pointee.
	  if (PointerType *PTy = dyn_cast<PointerType>(Ty))
	    if (!PTy->getElementType()->isIntegerTy(1))
	      return
	        getFoldedAlignOf(PointerType::get(IntegerType::get(PTy->getContext(),
	                                                           1),
	                                          PTy->getAddressSpace()),
	                         DestTy, true);

	  // If there's no interesting folding happening, bail so that we don't create
	  // a constant that looks like it needs folding but really doesn't.
	  if (!Folded)
	    return nullptr;

	  // Base case: Get a regular alignof expression.
	  Constant *C = ConstantExpr::getAlignOf(Ty);
	  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
	                                                    DestTy, false),
	                            C, DestTy);
	  return C;
	}

	/// Return a ConstantExpr with type DestTy for offsetof on Ty and FieldNo, with
	/// any known factors factored out. If Folded is false, return null if no
	/// factoring was possible, to avoid endlessly bouncing an unfoldable expression
	/// back into the top-level folder.
	///
	/// \param Ty       The aggregate type being indexed.
	/// \param FieldNo  The constant index of the field or element.
	/// \param DestTy   The type of the constant that is returned.
	/// \param Folded   True when the caller has already simplified something, so
	///                 that the plain offsetof expression is an acceptable result.
	/// \returns The offset expression, or null for an empty non-packed struct or
	///          when nothing was factored and Folded is false.
	static Constant *getFoldedOffsetOf(Type *Ty, Constant *FieldNo, Type *DestTy,
	                                   bool Folded) {
	  // The offset of element FieldNo in an array is FieldNo * sizeof(element).
	  if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
	    Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo, false,
	                                                                DestTy, false),
	                                        FieldNo, DestTy);
	    Constant *E = getFoldedSizeOf(ATy->getElementType(), DestTy, true);
	    return ConstantExpr::getNUWMul(E, N);
	  }

	  if (StructType *STy = dyn_cast<StructType>(Ty))
	    if (!STy->isPacked()) {
	      unsigned NumElems = STy->getNumElements();
	      // An empty struct has no members.
	      if (NumElems == 0)
	        return nullptr;
	      // Check for a struct with all members having the same size.
	      Constant *MemberSize =
	        getFoldedSizeOf(STy->getElementType(0), DestTy, true);
	      bool AllSame = true;
	      for (unsigned i = 1; i != NumElems; ++i)
	        if (MemberSize !=
	            getFoldedSizeOf(STy->getElementType(i), DestTy, true)) {
	          AllSame = false;
	          break;
	        }
	      // With identical member sizes the offset is FieldNo * MemberSize.
	      if (AllSame) {
	        Constant *N = ConstantExpr::getCast(CastInst::getCastOpcode(FieldNo,
	                                                                    false,
	                                                                    DestTy,
	                                                                    false),
	                                            FieldNo, DestTy);
	        return ConstantExpr::getNUWMul(MemberSize, N);
	      }
	    }

	  // If there's no interesting folding happening, bail so that we don't create
	  // a constant that looks like it needs folding but really doesn't.
	  if (!Folded)
	    return nullptr;

	  // Base case: Get a regular offsetof expression.
	  Constant *C = ConstantExpr::getOffsetOf(Ty, FieldNo);
	  C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
	                                                    DestTy, false),
	                            C, DestTy);
	  return C;
	}

	/// Try to fold the cast with opcode \p opc of the constant \p V to \p DestTy.
	///
	/// \param opc     The cast opcode (one of the Instruction cast opcodes).
	/// \param V       The constant operand being cast.
	/// \param DestTy  The destination type of the cast.
	/// \returns The folded constant, or null when this routine cannot fold the
	///          cast.
	Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
	                                            Type *DestTy) {
	  if (isa<UndefValue>(V)) {
	    // zext(undef) = 0, because the top bits will be zero.
	    // sext(undef) = 0, because the top bits will all be the same.
	    // [us]itofp(undef) = 0, because the result value is bounded.
	    if (opc == Instruction::ZExt || opc == Instruction::SExt ||
	        opc == Instruction::UIToFP || opc == Instruction::SIToFP)
	      return Constant::getNullValue(DestTy);
	    return UndefValue::get(DestTy);
	  }

	  // A null operand casts to null, except for x86_mmx destinations and
	  // address space casts, which are excluded here.
	  if (V->isNullValue() && !DestTy->isX86_MMXTy() &&
	      opc != Instruction::AddrSpaceCast)
	    return Constant::getNullValue(DestTy);

	  // If the cast operand is a constant expression, there's a few things we can
	  // do to try to simplify it.
	  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
	    if (CE->isCast()) {
	      // Try hard to fold cast of cast because they are often eliminable.
	      if (unsigned newOpc = foldConstantCastPair(opc, CE, DestTy))
	        return ConstantExpr::getCast(newOpc, CE->getOperand(0), DestTy);
	    } else if (CE->getOpcode() == Instruction::GetElementPtr &&
	               // Do not fold addrspacecast (gep 0, .., 0). It might make the
	               // addrspacecast uncanonicalized.
	               opc != Instruction::AddrSpaceCast &&
	               // Do not fold bitcast (gep) with inrange index, as this loses
	               // information.
	               !cast<GEPOperator>(CE)->getInRangeIndex().hasValue()) {
	      // If all of the indexes in the GEP are null values, there is no pointer
	      // adjustment going on. We might as well cast the source pointer.
	      bool isAllNull = true;
	      for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
	        if (!CE->getOperand(i)->isNullValue()) {
	          isAllNull = false;
	          break;
	        }
	      if (isAllNull)
	        // This is casting one pointer type to another, always BitCast
	        return ConstantExpr::getPointerCast(CE->getOperand(0), DestTy);
	    }
	  }

	  // If the cast operand is a constant vector, perform the cast by
	  // operating on each element. In the case of bitcasts, the element
	  // count may be mismatched; don't attempt to handle that here.
	  if ((isa<ConstantVector>(V) || isa<ConstantDataVector>(V)) &&
	      DestTy->isVectorTy() &&
	      DestTy->getVectorNumElements() == V->getType()->getVectorNumElements()) {
	    SmallVector<Constant*, 16> res;
	    VectorType *DestVecTy = cast<VectorType>(DestTy);
	    Type *DstEltTy = DestVecTy->getElementType();
	    Type *Ty = IntegerType::get(V->getContext(), 32);
	    for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) {
	      Constant *C =
	        ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i));
	      res.push_back(ConstantExpr::getCast(opc, C, DstEltTy));
	    }
	    return ConstantVector::get(res);
	  }

	  // We actually have to do a cast now. Perform the cast according to the
	  // opcode specified.
	  switch (opc) {
	  default:
	    llvm_unreachable("Failed to cast constant expression");
	  case Instruction::FPTrunc:
	  case Instruction::FPExt:
	    if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
	      bool ignored;
	      APFloat Val = FPC->getValueAPF();
	      // Convert to the float semantics that match the destination type.
	      Val.convert(DestTy->isHalfTy() ? APFloat::IEEEhalf() :
	                  DestTy->isFloatTy() ? APFloat::IEEEsingle() :
	                  DestTy->isDoubleTy() ? APFloat::IEEEdouble() :
	                  DestTy->isX86_FP80Ty() ? APFloat::x87DoubleExtended() :
	                  DestTy->isFP128Ty() ? APFloat::IEEEquad() :
	                  DestTy->isPPC_FP128Ty() ? APFloat::PPCDoubleDouble() :
	                  APFloat::Bogus(),
	                  APFloat::rmNearestTiesToEven, &ignored);
	      return ConstantFP::get(V->getContext(), Val);
	    }
	    return nullptr; // Can't fold.
	  case Instruction::FPToUI:
	  case Instruction::FPToSI:
	    if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
	      // Named FPVal rather than V so the function parameter is not shadowed.
	      const APFloat &FPVal = FPC->getValueAPF();
	      bool ignored;
	      uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
	      APSInt IntVal(DestBitWidth, opc == Instruction::FPToUI);
	      if (APFloat::opInvalidOp ==
	          FPVal.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored)) {
	        // Undefined behavior invoked - the destination type can't represent
	        // the input constant.
	        return UndefValue::get(DestTy);
	      }
	      return ConstantInt::get(FPC->getContext(), IntVal);
	    }
	    return nullptr; // Can't fold.
	  case Instruction::IntToPtr:   //always treated as unsigned
	    if (V->isNullValue())       // Is it an integral null value?
	      return ConstantPointerNull::get(cast<PointerType>(DestTy));
	    return nullptr;                   // Other pointer types cannot be casted
	  case Instruction::PtrToInt:   // always treated as unsigned
	    // Is it a null pointer value?
	    if (V->isNullValue())
	      return ConstantInt::get(DestTy, 0);
	    // If this is a sizeof-like expression, pull out multiplications by
	    // known factors to expose them to subsequent folding. If it's an
	    // alignof-like expression, factor out known factors.
	    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
	      if (CE->getOpcode() == Instruction::GetElementPtr &&
	          CE->getOperand(0)->isNullValue()) {
	        GEPOperator *GEPO = cast<GEPOperator>(CE);
	        Type *Ty = GEPO->getSourceElementType();
	        if (CE->getNumOperands() == 2) {
	          // Handle a sizeof-like expression.
	          Constant *Idx = CE->getOperand(1);
	          bool isOne = isa<ConstantInt>(Idx) && cast<ConstantInt>(Idx)->isOne();
	          if (Constant *C = getFoldedSizeOf(Ty, DestTy, !isOne)) {
	            Idx = ConstantExpr::getCast(CastInst::getCastOpcode(Idx, true,
	                                                                DestTy, false),
	                                        Idx, DestTy);
	            return ConstantExpr::getMul(C, Idx);
	          }
	        } else if (CE->getNumOperands() == 3 &&
	                   CE->getOperand(1)->isNullValue()) {
	          // Handle an alignof-like expression.
	          if (StructType *STy = dyn_cast<StructType>(Ty))
	            if (!STy->isPacked()) {
	              ConstantInt *CI = cast<ConstantInt>(CE->getOperand(2));
	              if (CI->isOne() &&
	                  STy->getNumElements() == 2 &&
	                  STy->getElementType(0)->isIntegerTy(1)) {
	                return getFoldedAlignOf(STy->getElementType(1), DestTy, false);
	              }
	            }
	          // Handle an offsetof-like expression.
	          if (Ty->isStructTy() || Ty->isArrayTy()) {
	            if (Constant *C = getFoldedOffsetOf(Ty, CE->getOperand(2),
	                                                DestTy, false))
	              return C;
	          }
	        }
	      }
	    // Other pointer types cannot be casted
	    return nullptr;
	  case Instruction::UIToFP:
	  case Instruction::SIToFP:
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	      const APInt &api = CI->getValue();
	      APFloat apf(DestTy->getFltSemantics(),
	                  APInt::getNullValue(DestTy->getPrimitiveSizeInBits()));
	      if (APFloat::opOverflow &
	          apf.convertFromAPInt(api, opc==Instruction::SIToFP,
	                               APFloat::rmNearestTiesToEven)) {
	        // Undefined behavior invoked - the destination type can't represent
	        // the input constant.
	        return UndefValue::get(DestTy);
	      }
	      return ConstantFP::get(V->getContext(), apf);
	    }
	    return nullptr;
	  case Instruction::ZExt:
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	      uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
	      return ConstantInt::get(V->getContext(),
	                              CI->getValue().zext(BitWidth));
	    }
	    return nullptr;
	  case Instruction::SExt:
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	      uint32_t BitWidth = cast<IntegerType>(DestTy)->getBitWidth();
	      return ConstantInt::get(V->getContext(),
	                              CI->getValue().sext(BitWidth));
	    }
	    return nullptr;
	  case Instruction::Trunc: {
	    // Vector truncations are not folded here.
	    if (V->getType()->isVectorTy())
	      return nullptr;

	    uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
	    if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
	      return ConstantInt::get(V->getContext(),
	                              CI->getValue().trunc(DestBitWidth));
	    }

	    // The input must be a constantexpr. See if we can simplify this based on
	    // the bytes we are demanding. Only do this if the source and dest are an
	    // even multiple of a byte.
	    if ((DestBitWidth & 7) == 0 &&
	        (cast<IntegerType>(V->getType())->getBitWidth() & 7) == 0)
	      if (Constant *Res = ExtractConstantBytes(V, 0, DestBitWidth / 8))
	        return Res;

	    return nullptr;
	  }
	  case Instruction::BitCast:
	    return FoldBitCast(V, DestTy);
	  case Instruction::AddrSpaceCast:
	    return nullptr;
	  }
	}

	/// Try to fold a select of \p V1 (true value) and \p V2 (false value) on the
	/// constant condition \p Cond.
	///
	/// \returns The folded constant, or null when this routine cannot fold the
	///          select.
	Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
	                                              Constant *V1, Constant *V2) {
	  // Check for i1 and vector true/false conditions.
	  if (Cond->isNullValue()) return V2;
	  if (Cond->isAllOnesValue()) return V1;

	  // If the condition is a vector constant, fold the result elementwise.
	  if (ConstantVector *CondV = dyn_cast<ConstantVector>(Cond)) {
	    SmallVector<Constant*, 16> Result;
	    Type *Ty = IntegerType::get(CondV->getContext(), 32);
	    for (unsigned i = 0, e = V1->getType()->getVectorNumElements(); i != e;++i){
	      Constant *V;
	      Constant *V1Element = ConstantExpr::getExtractElement(V1,
	                                                    ConstantInt::get(Ty, i));
	      Constant *V2Element = ConstantExpr::getExtractElement(V2,
	                                                    ConstantInt::get(Ty, i));
	      // The element is used unchecked below, so use cast<> rather than
	      // dyn_cast<>, and give it a name that does not shadow the parameter.
	      Constant *CondElt = cast<Constant>(CondV->getOperand(i));
	      if (V1Element == V2Element) {
	        V = V1Element;
	      } else if (isa<UndefValue>(CondElt)) {
	        V = isa<UndefValue>(V1Element) ? V1Element : V2Element;
	      } else {
	        // Give up on the elementwise fold for a non-ConstantInt condition.
	        if (!isa<ConstantInt>(CondElt)) break;
	        V = CondElt->isNullValue() ? V2Element : V1Element;
	      }
	      Result.push_back(V);
	    }

	    // If we were able to build the vector, return it.
	    if (Result.size() == V1->getType()->getVectorNumElements())
	      return ConstantVector::get(Result);
	  }

	  if (isa<UndefValue>(Cond)) {
	    if (isa<UndefValue>(V1)) return V1;
	    return V2;
	  }
	  if (isa<UndefValue>(V1)) return V2;
	  if (isa<UndefValue>(V2)) return V1;
	  if (V1 == V2) return V1;

	  // select C, (select C, A, B), V2 -> select C, A, V2
	  if (ConstantExpr *TrueVal = dyn_cast<ConstantExpr>(V1)) {
	    if (TrueVal->getOpcode() == Instruction::Select)
	      if (TrueVal->getOperand(0) == Cond)
	        return ConstantExpr::getSelect(Cond, TrueVal->getOperand(1), V2);
	  }
	  // select C, V1, (select C, A, B) -> select C, V1, B
	  if (ConstantExpr *FalseVal = dyn_cast<ConstantExpr>(V2)) {
	    if (FalseVal->getOpcode() == Instruction::Select)
	      if (FalseVal->getOperand(0) == Cond)
	        return ConstantExpr::getSelect(Cond, V1, FalseVal->getOperand(2));
	  }

	  return nullptr;
	}

	/// Try to fold an extractelement of index \p Idx from the constant vector
	/// \p Val.
	///
	/// \returns The folded element, or null when \p Idx is neither undef nor a
	///          ConstantInt and \p Val is neither undef nor null.
	Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val,
	                                                      Constant *Idx) {
	  if (isa<UndefValue>(Val))  // ee(undef, x) -> undef
	    return UndefValue::get(Val->getType()->getVectorElementType());
	  if (Val->isNullValue())  // ee(zero, x) -> zero
	    return Constant::getNullValue(Val->getType()->getVectorElementType());
	  // ee({w,x,y,z}, undef) -> undef
	  if (isa<UndefValue>(Idx))
	    return UndefValue::get(Val->getType()->getVectorElementType());

	  if (ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx)) {
	    // ee({w,x,y,z}, wrong_value) -> undef
	    if (CIdx->uge(Val->getType()->getVectorNumElements()))
	      return UndefValue::get(Val->getType()->getVectorElementType());
	    return Val->getAggregateElement(CIdx->getZExtValue());
	  }
	  return nullptr;
	}

	/// Try to fold an insertelement of \p Elt at index \p Idx into the constant
	/// vector \p Val.
	///
	/// \returns Undef for an undef or out-of-range index, the rebuilt vector for
	///          an in-range ConstantInt index, or null for any other index.
	Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
	                                                     Constant *Elt,
	                                                     Constant *Idx) {
	  if (isa<UndefValue>(Idx))
	    return UndefValue::get(Val->getType());

	  ConstantInt *CIdx = dyn_cast<ConstantInt>(Idx);
	  if (!CIdx) return nullptr;

	  unsigned NumElts = Val->getType()->getVectorNumElements();
	  if (CIdx->uge(NumElts))
	    return UndefValue::get(Val->getType());

	  // Rebuild the vector element by element, substituting Elt at IdxVal.
	  SmallVector<Constant*, 16> Result;
	  Result.reserve(NumElts);
	  auto *Ty = Type::getInt32Ty(Val->getContext());
	  uint64_t IdxVal = CIdx->getZExtValue();
	  for (unsigned i = 0; i != NumElts; ++i) {
	    if (i == IdxVal) {
	      Result.push_back(Elt);
	      continue;
	    }

	    Constant *C = ConstantExpr::getExtractElement(Val, ConstantInt::get(Ty, i));
	    Result.push_back(C);
	  }

	  return ConstantVector::get(Result);
	}

	/// Try to fold a shufflevector of the constant vectors \p V1 and \p V2 with
	/// the constant shuffle mask \p Mask.
	///
	/// \returns Undef for an undef mask, null when the mask is a ConstantExpr, and
	///          otherwise the vector built from the selected elements.
	Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1,
	                                                     Constant *V2,
	                                                     Constant *Mask) {
	  unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
	  Type *EltTy = V1->getType()->getVectorElementType();

	  // Undefined shuffle mask -> undefined value.
	  if (isa<UndefValue>(Mask))
	    return UndefValue::get(VectorType::get(EltTy, MaskNumElts));

	  // Don't break the bitcode reader hack.
	  if (isa<ConstantExpr>(Mask)) return nullptr;

	  unsigned SrcNumElts = V1->getType()->getVectorNumElements();

	  // Loop over the shuffle mask, evaluating each element.
	  SmallVector<Constant*, 32> Result;
	  for (unsigned i = 0; i != MaskNumElts; ++i) {
	    int Elt = ShuffleVectorInst::getMaskValue(Mask, i);
	    // A mask value of -1 yields an undef result element.
	    if (Elt == -1) {
	      Result.push_back(UndefValue::get(EltTy));
	      continue;
	    }
	    Constant *InElt;
	    if (unsigned(Elt) >= SrcNumElts*2)
	      // Past the end of both inputs.
	      InElt = UndefValue::get(EltTy);
	    else if (unsigned(Elt) >= SrcNumElts) {
	      // Indices in [SrcNumElts, 2*SrcNumElts) select from V2.
	      Type *Ty = IntegerType::get(V2->getContext(), 32);
	      InElt =
	        ConstantExpr::getExtractElement(V2,
	                                        ConstantInt::get(Ty, Elt - SrcNumElts));
	    } else {
	      // Indices in [0, SrcNumElts) select from V1.
	      Type *Ty = IntegerType::get(V1->getContext(), 32);
	      InElt = ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, Elt));
	    }
	    Result.push_back(InElt);
	  }

	  return ConstantVector::get(Result);
	}

	/// Try to fold an extractvalue that walks the index path \p Idxs into the
	/// constant aggregate \p Agg.
	///
	/// \returns The addressed element, or null when getAggregateElement returns
	///          null somewhere along the path.
	Constant *llvm::ConstantFoldExtractValueInstruction(Constant *Agg,
	                                                    ArrayRef<unsigned> Idxs) {
	  // Base case: no indices, so return the entire value.
	  if (Idxs.empty())
	    return Agg;

	  // Step into the first index and recurse on the remaining ones.
	  if (Constant *C = Agg->getAggregateElement(Idxs[0]))
	    return ConstantFoldExtractValueInstruction(C, Idxs.slice(1));

	  return nullptr;
	}

	/// Try to fold an insertvalue of \p Val at the index path \p Idxs into the
	/// constant aggregate \p Agg.
	///
	/// \returns The rebuilt aggregate, or null when getAggregateElement returns
	///          null for some element of \p Agg.
	Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
	                                                   Constant *Val,
	                                                   ArrayRef<unsigned> Idxs) {
	  // Base case: no indices, so replace the entire value.
	  if (Idxs.empty())
	    return Val;

	  unsigned NumElts;
	  if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
	    NumElts = ST->getNumElements();
	  else
	    NumElts = cast<SequentialType>(Agg->getType())->getNumElements();

	  // Copy every element, recursing into the one selected by Idxs[0].
	  SmallVector<Constant*, 32> Result;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    Constant *C = Agg->getAggregateElement(i);
	    if (!C) return nullptr;

	    if (Idxs[0] == i)
	      C = ConstantFoldInsertValueInstruction(C, Val, Idxs.slice(1));

	    Result.push_back(C);
	  }

	  // Rebuild a constant of the same kind as the input aggregate.
	  if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
	    return ConstantStruct::get(ST, Result);
	  if (ArrayType *AT = dyn_cast<ArrayType>(Agg->getType()))
	    return ConstantArray::get(AT, Result);
	  return ConstantVector::get(Result);
	}


	/// Try to fold the binary operation \p Opcode applied to the constants \p C1
	/// and \p C2.
	///
	/// \param Opcode  A binary opcode; asserted via Instruction::isBinaryOp.
	/// \param C1      The left-hand constant operand.
	/// \param C2      The right-hand constant operand.
	/// \returns The folded constant, or null when this routine does not know how
	///          to fold the operation.
	Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
	                                              Constant *C1, Constant *C2) {
	  assert(Instruction::isBinaryOp(Opcode) && "Non-binary instruction detected");

	  // Handle UndefValue up front.
	  if (isa<UndefValue>(C1) || isa<UndefValue>(C2)) {
	    switch (static_cast<Instruction::BinaryOps>(Opcode)) {
	    case Instruction::Xor:
	      if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
	        // Handle undef ^ undef -> 0 special case. This is a common
	        // idiom (misuse).
	        return Constant::getNullValue(C1->getType());
	      LLVM_FALLTHROUGH;
	    case Instruction::Add:
	    case Instruction::Sub:
	      return UndefValue::get(C1->getType());
	    case Instruction::And:
	      if (isa<UndefValue>(C1) && isa<UndefValue>(C2)) // undef & undef -> undef
	        return C1;
	      return Constant::getNullValue(C1->getType());   // undef & X -> 0
	    case Instruction::Mul: {
	      // undef * undef -> undef
	      if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
	        return C1;
	      const APInt *CV;
	      // X * undef -> undef   if X is odd
	      if (match(C1, m_APInt(CV)) || match(C2, m_APInt(CV)))
	        if ((*CV)[0])
	          return UndefValue::get(C1->getType());

	      // X * undef -> 0       otherwise
	      return Constant::getNullValue(C1->getType());
	    }
	    case Instruction::SDiv:
	    case Instruction::UDiv:
	      // X / undef -> undef
	      if (isa<UndefValue>(C2))
	        return C2;
	      // undef / 0 -> undef
	      // undef / 1 -> undef
	      if (match(C2, m_Zero()) || match(C2, m_One()))
	        return C1;
	      // undef / X -> 0       otherwise
	      return Constant::getNullValue(C1->getType());
	    case Instruction::URem:
	    case Instruction::SRem:
	      // X % undef -> undef
	      if (match(C2, m_Undef()))
	        return C2;
	      // undef % 0 -> undef
	      if (match(C2, m_Zero()))
	        return C1;
	      // undef % X -> 0       otherwise
	      return Constant::getNullValue(C1->getType());
	    case Instruction::Or:                          // X | undef -> -1
	      if (isa<UndefValue>(C1) && isa<UndefValue>(C2)) // undef | undef -> undef
	        return C1;
	      return Constant::getAllOnesValue(C1->getType()); // undef | X -> ~0
	    case Instruction::LShr:
	      // X >>l undef -> undef
	      if (isa<UndefValue>(C2))
	        return C2;
	      // undef >>l 0 -> undef
	      if (match(C2, m_Zero()))
	        return C1;
	      // undef >>l X -> 0
	      return Constant::getNullValue(C1->getType());
	    case Instruction::AShr:
	      // X >>a undef -> undef
	      if (isa<UndefValue>(C2))
	        return C2;
	      // undef >>a 0 -> undef
	      if (match(C2, m_Zero()))
	        return C1;
	      // TODO: undef >>a X -> undef if the shift is exact
	      // undef >>a X -> 0
	      return Constant::getNullValue(C1->getType());
	    case Instruction::Shl:
	      // X << undef -> undef
	      if (isa<UndefValue>(C2))
	        return C2;
	      // undef << 0 -> undef
	      if (match(C2, m_Zero()))
	        return C1;
	      // undef << X -> 0
	      return Constant::getNullValue(C1->getType());
	    case Instruction::FAdd:
	    case Instruction::FSub:
	    case Instruction::FMul:
	    case Instruction::FDiv:
	    case Instruction::FRem:
	      // TODO: UNDEF handling for binary float instructions.
	      return nullptr;
	    case Instruction::BinaryOpsEnd:
	      llvm_unreachable("Invalid BinaryOp");
	    }
	  }

	  // At this point neither constant should be an UndefValue.
	  assert(!isa<UndefValue>(C1) && !isa<UndefValue>(C2) &&
	         "Unexpected UndefValue");

	  // Handle simplifications when the RHS is a constant int.
	  if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
	    switch (Opcode) {
	    case Instruction::Add:
	      if (CI2->isZero()) return C1;                             // X + 0 == X
	      break;
	    case Instruction::Sub:
	      if (CI2->isZero()) return C1;                             // X - 0 == X
	      break;
	    case Instruction::Mul:
	      if (CI2->isZero()) return C2;                             // X * 0 == 0
	      if (CI2->isOne())
	        return C1;                                              // X * 1 == X
	      break;
	    case Instruction::UDiv:
	    case Instruction::SDiv:
	      if (CI2->isOne())
	        return C1;                                            // X / 1 == X
	      if (CI2->isZero())
	        return UndefValue::get(CI2->getType());               // X / 0 == undef
	      break;
	    case Instruction::URem:
	    case Instruction::SRem:
	      if (CI2->isOne())
	        return Constant::getNullValue(CI2->getType());        // X % 1 == 0
	      if (CI2->isZero())
	        return UndefValue::get(CI2->getType());               // X % 0 == undef
	      break;
	    case Instruction::And:
	      if (CI2->isZero()) return C2;                           // X & 0 == 0
	      if (CI2->isMinusOne())
	        return C1;                                            // X & -1 == X

	      if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
	        // (zext i32 to i64) & 4294967295 -> (zext i32 to i64)
	        if (CE1->getOpcode() == Instruction::ZExt) {
	          unsigned DstWidth = CI2->getType()->getBitWidth();
	          unsigned SrcWidth =
	            CE1->getOperand(0)->getType()->getPrimitiveSizeInBits();
	          APInt PossiblySetBits(APInt::getLowBitsSet(DstWidth, SrcWidth));
	          if ((PossiblySetBits & CI2->getValue()) == PossiblySetBits)
	            return C1;
	        }

	        // If and'ing the address of a global with a constant, fold it.
	        if (CE1->getOpcode() == Instruction::PtrToInt &&
	            isa<GlobalValue>(CE1->getOperand(0))) {
	          GlobalValue *GV = cast<GlobalValue>(CE1->getOperand(0));

	          // Functions are at least 4-byte aligned.
	          unsigned GVAlign = GV->getAlignment();
	          if (isa<Function>(GV))
	            GVAlign = std::max(GVAlign, 4U);

	          if (GVAlign > 1) {
	            unsigned DstWidth = CI2->getType()->getBitWidth();
	            unsigned SrcWidth = std::min(DstWidth, Log2_32(GVAlign));
	            APInt BitsNotSet(APInt::getLowBitsSet(DstWidth, SrcWidth));

	            // If checking bits we know are clear, return zero.
	            if ((CI2->getValue() & BitsNotSet) == CI2->getValue())
	              return Constant::getNullValue(CI2->getType());
	          }
	        }
	      }
	      break;
	    case Instruction::Or:
	      if (CI2->isZero()) return C1;        // X | 0 == X
	      if (CI2->isMinusOne())
	        return C2;                         // X | -1 == -1
	      break;
	    case Instruction::Xor:
	      if (CI2->isZero()) return C1;        // X ^ 0 == X

	      if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
	        switch (CE1->getOpcode()) {
	        default: break;
	        case Instruction::ICmp:
	        case Instruction::FCmp:
	          // cmp pred ^ true -> cmp !pred
	          assert(CI2->isOne());
	          CmpInst::Predicate pred = (CmpInst::Predicate)CE1->getPredicate();
	          pred = CmpInst::getInversePredicate(pred);
	          return ConstantExpr::getCompare(pred, CE1->getOperand(0),
	                                          CE1->getOperand(1));
	        }
	      }
	      break;
	    case Instruction::AShr:
	      // ashr (zext C to Ty), C2 -> lshr (zext C, CSA), C2
	      if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1))
	        if (CE1->getOpcode() == Instruction::ZExt)  // Top bits known zero.
	          return ConstantExpr::getLShr(C1, C2);
	      break;
	    }
	  } else if (isa<ConstantInt>(C1)) {
	    // If C1 is a ConstantInt and C2 is not, swap the operands.
	    if (Instruction::isCommutative(Opcode))
	      return ConstantExpr::get(Opcode, C2, C1);
	  }

	  if (ConstantInt *CI1 = dyn_cast<ConstantInt>(C1)) {
	    // Both operands are integer constants: compute the result with APInt.
	    if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
	      const APInt &C1V = CI1->getValue();
	      const APInt &C2V = CI2->getValue();
	      switch (Opcode) {
	      default:
	        break;
	      case Instruction::Add:
	        return ConstantInt::get(CI1->getContext(), C1V + C2V);
	      case Instruction::Sub:
	        return ConstantInt::get(CI1->getContext(), C1V - C2V);
	      case Instruction::Mul:
	        return ConstantInt::get(CI1->getContext(), C1V * C2V);
	      case Instruction::UDiv:
	        assert(!CI2->isZero() && "Div by zero handled above");
	        return ConstantInt::get(CI1->getContext(), C1V.udiv(C2V));
	      case Instruction::SDiv:
	        assert(!CI2->isZero() && "Div by zero handled above");
	        if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
	          return UndefValue::get(CI1->getType());   // MIN_INT / -1 -> undef
	        return ConstantInt::get(CI1->getContext(), C1V.sdiv(C2V));
	      case Instruction::URem:
	        assert(!CI2->isZero() && "Div by zero handled above");
	        return ConstantInt::get(CI1->getContext(), C1V.urem(C2V));
	      case Instruction::SRem:
	        assert(!CI2->isZero() && "Div by zero handled above");
	        if (C2V.isAllOnesValue() && C1V.isMinSignedValue())
	          return UndefValue::get(CI1->getType());   // MIN_INT % -1 -> undef
	        return ConstantInt::get(CI1->getContext(), C1V.srem(C2V));
	      case Instruction::And:
	        return ConstantInt::get(CI1->getContext(), C1V & C2V);
	      case Instruction::Or:
	        return ConstantInt::get(CI1->getContext(), C1V | C2V);
	      case Instruction::Xor:
	        return ConstantInt::get(CI1->getContext(), C1V ^ C2V);
	      case Instruction::Shl:
	        if (C2V.ult(C1V.getBitWidth()))
	          return ConstantInt::get(CI1->getContext(), C1V.shl(C2V));
	        return UndefValue::get(C1->getType()); // too big shift is undef
	      case Instruction::LShr:
	        if (C2V.ult(C1V.getBitWidth()))
	          return ConstantInt::get(CI1->getContext(), C1V.lshr(C2V));
	        return UndefValue::get(C1->getType()); // too big shift is undef
	      case Instruction::AShr:
	        if (C2V.ult(C1V.getBitWidth()))
	          return ConstantInt::get(CI1->getContext(), C1V.ashr(C2V));
	        return UndefValue::get(C1->getType()); // too big shift is undef
	      }
	    }

	    // A zero left-hand side folds to zero for divisions, remainders and
	    // shifts regardless of the right-hand side.
	    switch (Opcode) {
	    case Instruction::SDiv:
	    case Instruction::UDiv:
	    case Instruction::URem:
	    case Instruction::SRem:
	    case Instruction::LShr:
	    case Instruction::AShr:
	    case Instruction::Shl:
	      if (CI1->isZero()) return C1;
	      break;
	    default:
	      break;
	    }
	  } else if (ConstantFP *CFP1 = dyn_cast<ConstantFP>(C1)) {
	    // Both operands are floating-point constants: compute with APFloat.
	    if (ConstantFP *CFP2 = dyn_cast<ConstantFP>(C2)) {
	      const APFloat &C1V = CFP1->getValueAPF();
	      const APFloat &C2V = CFP2->getValueAPF();
	      APFloat C3V = C1V;  // copy for modification
	      switch (Opcode) {
	      default:
	        break;
	      case Instruction::FAdd:
	        (void)C3V.add(C2V, APFloat::rmNearestTiesToEven);
	        return ConstantFP::get(C1->getContext(), C3V);
	      case Instruction::FSub:
	        (void)C3V.subtract(C2V, APFloat::rmNearestTiesToEven);
	        return ConstantFP::get(C1->getContext(), C3V);
	      case Instruction::FMul:
	        (void)C3V.multiply(C2V, APFloat::rmNearestTiesToEven);
	        return ConstantFP::get(C1->getContext(), C3V);
	      case Instruction::FDiv:
	        (void)C3V.divide(C2V, APFloat::rmNearestTiesToEven);
	        return ConstantFP::get(C1->getContext(), C3V);
	      case Instruction::FRem:
	        (void)C3V.mod(C2V);
	        return ConstantFP::get(C1->getContext(), C3V);
	      }
	    }
	  } else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
	    // Perform elementwise folding.
	    SmallVector<Constant*, 16> Result;
	    Type *Ty = IntegerType::get(VTy->getContext(), 32);
	    for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
	      Constant *ExtractIdx = ConstantInt::get(Ty, i);
	      Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx);
	      Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx);

	      // If any element of a divisor vector is zero, the whole op is undef.
	      if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
	           Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
	          RHS->isNullValue())
	        return UndefValue::get(VTy);

	      Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
	    }

	    return ConstantVector::get(Result);
	  }

	  if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
	    // There are many possible foldings we could do here. We should probably
	    // at least fold add of a pointer with an integer into the appropriate
	    // getelementptr. This will improve alias analysis a bit.

	    // Given ((a + b) + c), if (b + c) folds to something interesting, return
	    // (a + (b + c)).
	    if (Instruction::isAssociative(Opcode) && CE1->getOpcode() == Opcode) {
	      Constant *T = ConstantExpr::get(Opcode, CE1->getOperand(1), C2);
	      if (!isa<ConstantExpr>(T) || cast<ConstantExpr>(T)->getOpcode() != Opcode)
	        return ConstantExpr::get(Opcode, CE1->getOperand(0), T);
	    }
	  } else if (isa<ConstantExpr>(C2)) {
	    // If C2 is a constant expr and C1 isn't, flop them around and fold the
	    // other way if possible.
	    if (Instruction::isCommutative(Opcode))
	      return ConstantFoldBinaryInstruction(Opcode, C2, C1);
	  }

	  // i1 can be simplified in many cases.
	  if (C1->getType()->isIntegerTy(1)) {
	    switch (Opcode) {
	    case Instruction::Add:
	    case Instruction::Sub:
	      return ConstantExpr::getXor(C1, C2);
	    case Instruction::Mul:
	      return ConstantExpr::getAnd(C1, C2);
	    case Instruction::Shl:
	    case Instruction::LShr:
	    case Instruction::AShr:
	      // We can assume that C2 == 0. If it were one the result would be
	      // undefined because the shift value is as large as the bitwidth.
	      return C1;
	    case Instruction::SDiv:
	    case Instruction::UDiv:
	      // We can assume that C2 == 1. If it were zero the result would be
	      // undefined through division by zero.
	      return C1;
	    case Instruction::URem:
	    case Instruction::SRem:
	      // We can assume that C2 == 1. If it were zero the result would be
	      // undefined through division by zero.
	      return ConstantInt::getFalse(C1->getContext());
	    default:
	      break;
	    }
	  }

	  // We don't know how to fold this.
	  return nullptr;
	}

	/// Conservatively report whether \p Ty might occupy zero bytes.
	///
	/// An array is treated like its element type, an opaque struct is reported as
	/// possibly zero-sized because its layout is unknown, and any other struct is
	/// possibly zero-sized only when every member is. All remaining types yield
	/// false.
	static bool isMaybeZeroSizedType(Type *Ty) {
	  // Arrays defer entirely to their element type, so peel them off first.
	  while (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty))
	    Ty = ArrTy->getElementType();

	  StructType *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return false;

	  // Nothing is known about an opaque struct's contents.
	  if (StructTy->isOpaque())
	    return true;

	  // A single member that is definitely non-empty settles the question.
	  unsigned NumFields = StructTy->getNumElements();
	  for (unsigned Field = 0; Field != NumFields; ++Field) {
	    if (!isMaybeZeroSizedType(StructTy->getElementType(Field)))
	      return false;
	  }
	  return true;
	}

	/// Compare the two constants as though they were getelementptr indices.
	/// This allows coercion of the types to be the same thing.
	///
	/// If the two constants are the "same" (after coercion), return 0. If the
	/// first is less than the second, return -1, if the second is less than the
	/// first, return 1. If the constants are not integral, return -2.
	///
	static int IdxCompare(Constant C1, Constant C2, Type *ElTy) {
	if (C1 == C2) return 0;

	// Ok, we found a different index. If they are not ConstantInt, we can't do
	// anything with them.
	if (!isa<ConstantInt>(C1) \|\| !isa<ConstantInt>(C2))
	return -2; // don't know!

	// We cannot compare the indices if they don't fit in an int64_t.
	if (cast<ConstantInt>(C1)->getValue().getActiveBits() > 64 \|\|
	cast<ConstantInt>(C2)->getValue().getActiveBits() > 64)
	return -2; // don't know!

	// Ok, we have two differing integer indices. Sign extend them to be the same
	// type.
	int64_t C1Val = cast<ConstantInt>(C1)->getSExtValue();
	int64_t C2Val = cast<ConstantInt>(C2)->getSExtValue();

	if (C1Val == C2Val) return 0; // They are equal

	// If the type being indexed over is really just a zero sized type, there is
	// no pointer difference being made here.
	if (isMaybeZeroSizedType(ElTy))
	return -2; // dunno.

	// If they are really different, now that they are the same type, then we
	// found a difference!
	if (C1Val < C2Val)
	return -1;
	else
	return 1;
	}

	/// This function determines if there is anything we can decide about the two
	/// constants provided. This doesn't need to handle simple things like
	/// ConstantFP comparisons, but should instead handle ConstantExprs.
	/// If we can determine that the two constants have a particular relation to
	/// each other, we should return the corresponding FCmpInst predicate,
	/// otherwise return FCmpInst::BAD_FCMP_PREDICATE. This is used below in
	/// ConstantFoldCompareInstruction.
	///
	/// To simplify this code we canonicalize the relation so that the first
	/// operand is always the most "complex" of the two. We consider ConstantFP
	/// to be the simplest, and ConstantExprs to be the most complex.
	///
	/// Both operands must have the same type (asserted).
	static FCmpInst::Predicate evaluateFCmpRelation(Constant *V1, Constant *V2) {
	  assert(V1->getType() == V2->getType() &&
	         "Cannot compare values of different types!");

	  // Identical operands: a constant expression may still evaluate to NaN, and
	  // NaN does not compare ordered-equal to itself, so the strongest relation
	  // we can claim is "unordered or equal".
	  if (V1 == V2) return FCmpInst::FCMP_UEQ;

	  if (!isa<ConstantExpr>(V1)) {
	    if (!isa<ConstantExpr>(V2)) {
	      // Simple case, use the standard constant folder: probe the three
	      // ordered relations in turn and report the first that folds to true.
	      static const FCmpInst::Predicate Candidates[] = {
	          FCmpInst::FCMP_OEQ, FCmpInst::FCMP_OLT, FCmpInst::FCMP_OGT};
	      for (FCmpInst::Predicate Candidate : Candidates) {
	        ConstantInt *R =
	            dyn_cast<ConstantInt>(ConstantExpr::getFCmp(Candidate, V1, V2));
	        if (R && !R->isZero())
	          return Candidate;
	      }

	      // Nothing more we can do
	      return FCmpInst::BAD_FCMP_PREDICATE;
	    }

	    // If the first operand is simple and second is ConstantExpr, swap operands.
	    FCmpInst::Predicate SwappedRelation = evaluateFCmpRelation(V2, V1);
	    if (SwappedRelation != FCmpInst::BAD_FCMP_PREDICATE)
	      return FCmpInst::getSwappedPredicate(SwappedRelation);
	  } else {
	    // Ok, the LHS is known to be a constantexpr. The RHS can be any of a
	    // constantexpr or a simple constant.
	    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
	    switch (CE1->getOpcode()) {
	    case Instruction::FPTrunc:
	    case Instruction::FPExt:
	    case Instruction::UIToFP:
	    case Instruction::SIToFP:
	      // We might be able to do something with these but we don't right now.
	      break;
	    default:
	      break;
	    }
	  }
	  // There are MANY other foldings that we could perform here. They will
	  // probably be added on demand, as they seem needed.
	  return FCmpInst::BAD_FCMP_PREDICATE;
	}

	/// Decide whether the addresses of two globals can be proven to differ.
	///
	/// Returns ICmpInst::ICMP_NE when neither global is an alias and neither is
	/// "unsafe for equality" (external-weak or weak-any linkage, or a global
	/// variable whose value type is unsized or empty). In every other case
	/// returns ICmpInst::BAD_ICMP_PREDICATE, meaning the addresses might
	/// coincide.
	static ICmpInst::Predicate areGlobalsPotentiallyEqual(const GlobalValue *GV1,
	                                                      const GlobalValue *GV2) {
	  auto isGlobalUnsafeForEquality = [](const GlobalValue *GV) {
	    if (GV->hasExternalWeakLinkage() || GV->hasWeakAnyLinkage())
	      return true;
	    if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
	      Type *Ty = GVar->getValueType();
	      // A global with opaque type might end up being zero sized.
	      if (!Ty->isSized())
	        return true;
	      // A global with an empty type might lie at the address of any other
	      // global.
	      if (Ty->isEmptyTy())
	        return true;
	    }
	    return false;
	  };

	  // Don't try to decide equality of aliases.
	  if (isa<GlobalAlias>(GV1) || isa<GlobalAlias>(GV2))
	    return ICmpInst::BAD_ICMP_PREDICATE;

	  if (isGlobalUnsafeForEquality(GV1) || isGlobalUnsafeForEquality(GV2))
	    return ICmpInst::BAD_ICMP_PREDICATE;

	  return ICmpInst::ICMP_NE;
	}

	/// This function determines if there is anything we can decide about the two
	/// constants provided. This doesn't need to handle simple things like integer
	/// comparisons, but should instead handle ConstantExprs and GlobalValues.
	/// If we can determine that the two constants have a particular relation to
	/// each other, we should return the corresponding ICmp predicate, otherwise
	/// return ICmpInst::BAD_ICMP_PREDICATE.
	///
	/// To simplify this code we canonicalize the relation so that the first
	/// operand is always the most "complex" of the two. We consider simple
	/// constants (like ConstantInt) to be the simplest, followed by
	/// GlobalValues, followed by ConstantExpr's (the most complex).
	///
	/// Both operands must have the same type (asserted). isSigned selects
	/// whether ordering results are reported with the signed or the unsigned
	/// predicates.
	static ICmpInst::Predicate evaluateICmpRelation(Constant *V1, Constant *V2,
	                                                bool isSigned) {
	  assert(V1->getType() == V2->getType() &&
	         "Cannot compare different types of values!");
	  if (V1 == V2) return ICmpInst::ICMP_EQ;

	  if (!isa<ConstantExpr>(V1) && !isa<GlobalValue>(V1) &&
	      !isa<BlockAddress>(V1)) {
	    if (!isa<GlobalValue>(V2) && !isa<ConstantExpr>(V2) &&
	        !isa<BlockAddress>(V2)) {
	      // We distilled this down to a simple case, use the standard constant
	      // folder.
	      ConstantInt *R = nullptr;
	      ICmpInst::Predicate pred = ICmpInst::ICMP_EQ;
	      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
	      if (R && !R->isZero())
	        return pred;
	      pred = isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
	      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
	      if (R && !R->isZero())
	        return pred;
	      pred = isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	      R = dyn_cast<ConstantInt>(ConstantExpr::getICmp(pred, V1, V2));
	      if (R && !R->isZero())
	        return pred;

	      // If we couldn't figure it out, bail.
	      return ICmpInst::BAD_ICMP_PREDICATE;
	    }

	    // If the first operand is simple, swap operands.
	    ICmpInst::Predicate SwappedRelation =
	        evaluateICmpRelation(V2, V1, isSigned);
	    if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
	      return ICmpInst::getSwappedPredicate(SwappedRelation);

	  } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(V1)) {
	    if (isa<ConstantExpr>(V2)) { // Swap as necessary.
	      ICmpInst::Predicate SwappedRelation =
	          evaluateICmpRelation(V2, V1, isSigned);
	      if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
	        return ICmpInst::getSwappedPredicate(SwappedRelation);
	      return ICmpInst::BAD_ICMP_PREDICATE;
	    }

	    // Now we know that the RHS is a GlobalValue, BlockAddress or simple
	    // constant (which, since the types must match, means that it's a
	    // ConstantPointerNull).
	    if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
	      return areGlobalsPotentiallyEqual(GV, GV2);
	    } else if (isa<BlockAddress>(V2)) {
	      return ICmpInst::ICMP_NE; // Globals never equal labels.
	    } else {
	      assert(isa<ConstantPointerNull>(V2) && "Canonicalization guarantee!");
	      // GlobalVals can never be null unless they have external weak linkage.
	      // We don't try to evaluate aliases here.
	      if (!GV->hasExternalWeakLinkage() && !isa<GlobalAlias>(GV))
	        return ICmpInst::ICMP_NE;
	    }
	  } else if (const BlockAddress *BA = dyn_cast<BlockAddress>(V1)) {
	    if (isa<ConstantExpr>(V2)) { // Swap as necessary.
	      ICmpInst::Predicate SwappedRelation =
	          evaluateICmpRelation(V2, V1, isSigned);
	      if (SwappedRelation != ICmpInst::BAD_ICMP_PREDICATE)
	        return ICmpInst::getSwappedPredicate(SwappedRelation);
	      return ICmpInst::BAD_ICMP_PREDICATE;
	    }

	    // Now we know that the RHS is a GlobalValue, BlockAddress or simple
	    // constant (which, since the types must match, means that it is a
	    // ConstantPointerNull).
	    if (const BlockAddress *BA2 = dyn_cast<BlockAddress>(V2)) {
	      // Block address in another function can't equal this one, but block
	      // addresses in the current function might be the same if blocks are
	      // empty.
	      if (BA2->getFunction() != BA->getFunction())
	        return ICmpInst::ICMP_NE;
	    } else {
	      // Block addresses aren't null, don't equal the address of globals.
	      assert((isa<ConstantPointerNull>(V2) || isa<GlobalValue>(V2)) &&
	             "Canonicalization guarantee!");
	      return ICmpInst::ICMP_NE;
	    }
	  } else {
	    // Ok, the LHS is known to be a constantexpr. The RHS can be any of a
	    // constantexpr, a global, block address, or a simple constant.
	    ConstantExpr *CE1 = cast<ConstantExpr>(V1);
	    Constant *CE1Op0 = CE1->getOperand(0);

	    switch (CE1->getOpcode()) {
	    case Instruction::Trunc:
	    case Instruction::FPTrunc:
	    case Instruction::FPExt:
	    case Instruction::FPToUI:
	    case Instruction::FPToSI:
	      break; // We can't evaluate floating point casts or truncations.

	    case Instruction::UIToFP:
	    case Instruction::SIToFP:
	    case Instruction::BitCast:
	    case Instruction::ZExt:
	    case Instruction::SExt:
	      // We can't evaluate floating point casts or truncations.
	      if (CE1Op0->getType()->isFloatingPointTy())
	        break;

	      // If the cast is not actually changing bits, and the second operand is a
	      // null pointer, do the comparison with the pre-casted value.
	      if (V2->isNullValue() &&
	          (CE1->getType()->isPointerTy() || CE1->getType()->isIntegerTy())) {
	        // An extension fixes the signedness of the comparison on the
	        // narrower source value.
	        if (CE1->getOpcode() == Instruction::ZExt) isSigned = false;
	        if (CE1->getOpcode() == Instruction::SExt) isSigned = true;
	        return evaluateICmpRelation(CE1Op0,
	                                    Constant::getNullValue(CE1Op0->getType()),
	                                    isSigned);
	      }
	      break;

	    case Instruction::GetElementPtr: {
	      GEPOperator *CE1GEP = cast<GEPOperator>(CE1);
	      // Ok, since this is a getelementptr, we know that the constant has a
	      // pointer type. Check the various cases.
	      if (isa<ConstantPointerNull>(V2)) {
	        // If we are comparing a GEP to a null pointer, check to see if the base
	        // of the GEP equals the null pointer.
	        if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
	          if (GV->hasExternalWeakLinkage())
	            // Weak linkage GVals could be zero or not. We're comparing that
	            // to null pointer so its greater-or-equal
	            return isSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
	          else
	            // If its not weak linkage, the GVal must have a non-zero address
	            // so the result is greater-than
	            return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	        } else if (isa<ConstantPointerNull>(CE1Op0)) {
	          // If we are indexing from a null pointer, check to see if we have any
	          // non-zero indices.
	          for (unsigned i = 1, e = CE1->getNumOperands(); i != e; ++i)
	            if (!CE1->getOperand(i)->isNullValue())
	              // Offsetting from null, must not be equal.
	              return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	          // Only zero indexes from null, must still be zero.
	          return ICmpInst::ICMP_EQ;
	        }
	        // Otherwise, we can't really say if the first operand is null or not.
	      } else if (const GlobalValue *GV2 = dyn_cast<GlobalValue>(V2)) {
	        if (isa<ConstantPointerNull>(CE1Op0)) {
	          if (GV2->hasExternalWeakLinkage())
	            // Weak linkage GVals could be zero or not. We're comparing it to
	            // a null pointer, so its less-or-equal
	            return isSigned ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
	          else
	            // If its not weak linkage, the GVal must have a non-zero address
	            // so the result is less-than
	            return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
	        } else if (const GlobalValue *GV = dyn_cast<GlobalValue>(CE1Op0)) {
	          if (GV == GV2) {
	            // If this is a getelementptr of the same global, then it must be
	            // different. Because the types must match, the getelementptr could
	            // only have at most one index, and because we fold getelementptr's
	            // with a single zero index, it must be nonzero.
	            assert(CE1->getNumOperands() == 2 &&
	                   !CE1->getOperand(1)->isNullValue() &&
	                   "Surprising getelementptr!");
	            return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	          } else {
	            if (CE1GEP->hasAllZeroIndices())
	              return areGlobalsPotentiallyEqual(GV, GV2);
	            return ICmpInst::BAD_ICMP_PREDICATE;
	          }
	        }
	      } else if (ConstantExpr *CE2 = dyn_cast<ConstantExpr>(V2)) {
	        // The RHS is not guaranteed to be a constant expression here (it may
	        // be a block address, for instance), so test for it instead of
	        // asserting; anything else falls out as "unknown" below.
	        Constant *CE2Op0 = CE2->getOperand(0);

	        // There are MANY other foldings that we could perform here. They will
	        // probably be added on demand, as they seem needed.
	        switch (CE2->getOpcode()) {
	        default: break;
	        case Instruction::GetElementPtr:
	          // By far the most common case to handle is when the base pointers are
	          // obviously to the same global.
	          if (isa<GlobalValue>(CE1Op0) && isa<GlobalValue>(CE2Op0)) {
	            // Don't know relative ordering, but check for inequality.
	            if (CE1Op0 != CE2Op0) {
	              GEPOperator *CE2GEP = cast<GEPOperator>(CE2);
	              if (CE1GEP->hasAllZeroIndices() && CE2GEP->hasAllZeroIndices())
	                return areGlobalsPotentiallyEqual(cast<GlobalValue>(CE1Op0),
	                                                  cast<GlobalValue>(CE2Op0));
	              return ICmpInst::BAD_ICMP_PREDICATE;
	            }
	            // Ok, we know that both getelementptr instructions are based on the
	            // same global. From this, we can precisely determine the relative
	            // ordering of the resultant pointers.
	            unsigned i = 1;

	            // The logic below assumes that the result of the comparison
	            // can be determined by finding the first index that differs.
	            // This doesn't work if there is over-indexing in any
	            // subsequent indices, so check for that case first.
	            if (!CE1->isGEPWithNoNotionalOverIndexing() ||
	                !CE2->isGEPWithNoNotionalOverIndexing())
	              return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.

	            // Compare all of the operands the GEP's have in common.
	            gep_type_iterator GTI = gep_type_begin(CE1);
	            for (;i != CE1->getNumOperands() && i != CE2->getNumOperands();
	                 ++i, ++GTI)
	              switch (IdxCompare(CE1->getOperand(i),
	                                 CE2->getOperand(i), GTI.getIndexedType())) {
	              case -1: return isSigned ? ICmpInst::ICMP_SLT:ICmpInst::ICMP_ULT;
	              case 1: return isSigned ? ICmpInst::ICMP_SGT:ICmpInst::ICMP_UGT;
	              case -2: return ICmpInst::BAD_ICMP_PREDICATE;
	              }

	            // Ok, we ran out of things they have in common. If any leftovers
	            // are non-zero then we have a difference, otherwise we are equal.
	            for (; i < CE1->getNumOperands(); ++i)
	              if (!CE1->getOperand(i)->isNullValue()) {
	                if (isa<ConstantInt>(CE1->getOperand(i)))
	                  return isSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
	                else
	                  return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
	              }

	            for (; i < CE2->getNumOperands(); ++i)
	              if (!CE2->getOperand(i)->isNullValue()) {
	                if (isa<ConstantInt>(CE2->getOperand(i)))
	                  return isSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
	                else
	                  return ICmpInst::BAD_ICMP_PREDICATE; // Might be equal.
	              }
	            return ICmpInst::ICMP_EQ;
	          }
	        }
	      }
	      // Nothing decided for this getelementptr; fall into the default case.
	    }
	    default:
	      break;
	    }
	  }

	  return ICmpInst::BAD_ICMP_PREDICATE;
	}

	Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
	Constant C1, Constant C2) {
	Type *ResultTy;
	if (VectorType *VT = dyn_cast<VectorType>(C1->getType()))
	ResultTy = VectorType::get(Type::getInt1Ty(C1->getContext()),
	VT->getNumElements());
	else
	ResultTy = Type::getInt1Ty(C1->getContext());

	// Fold FCMP_FALSE/FCMP_TRUE unconditionally.
	if (pred == FCmpInst::FCMP_FALSE)
	return Constant::getNullValue(ResultTy);

	if (pred == FCmpInst::FCMP_TRUE)
	return Constant::getAllOnesValue(ResultTy);

	// Handle some degenerate cases first
	if (isa<UndefValue>(C1) \|\| isa<UndefValue>(C2)) {
	CmpInst::Predicate Predicate = CmpInst::Predicate(pred);
	bool isIntegerPredicate = ICmpInst::isIntPredicate(Predicate);
	// For EQ and NE, we can always pick a value for the undef to make the
	// predicate pass or fail, so we can return undef.
	// Also, if both operands are undef, we can return undef for int comparison.
	if (ICmpInst::isEquality(Predicate) \|\| (isIntegerPredicate && C1 == C2))
	return UndefValue::get(ResultTy);

	// Otherwise, for integer compare, pick the same value as the non-undef
	// operand, and fold it to true or false.
	if (isIntegerPredicate)
	return ConstantInt::get(ResultTy, CmpInst::isTrueWhenEqual(Predicate));

	// Choosing NaN for the undef will always make unordered comparison succeed
	// and ordered comparison fails.
	return ConstantInt::get(ResultTy, CmpInst::isUnordered(Predicate));
	}

	// icmp eq/ne(null,GV) -> false/true
	if (C1->isNullValue()) {
	if (const GlobalValue *GV = dyn_cast<GlobalValue>(C2))
	// Don't try to evaluate aliases. External weak GV can be null.
	if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
	if (pred == ICmpInst::ICMP_EQ)
	return ConstantInt::getFalse(C1->getContext());
	else if (pred == ICmpInst::ICMP_NE)
	return ConstantInt::getTrue(C1->getContext());
	}
	// icmp eq/ne(GV,null) -> false/true
	} else if (C2->isNullValue()) {
	if (const GlobalValue *GV = dyn_cast<GlobalValue>(C1))
	// Don't try to evaluate aliases. External weak GV can be null.
	if (!isa<GlobalAlias>(GV) && !GV->hasExternalWeakLinkage()) {
	if (pred == ICmpInst::ICMP_EQ)
	return ConstantInt::getFalse(C1->getContext());
	else if (pred == ICmpInst::ICMP_NE)
	return ConstantInt::getTrue(C1->getContext());
	}
	}

	// If the comparison is a comparison between two i1's, simplify it.
	if (C1->getType()->isIntegerTy(1)) {
	switch(pred) {
	case ICmpInst::ICMP_EQ:
	if (isa<ConstantInt>(C2))
	return ConstantExpr::getXor(C1, ConstantExpr::getNot(C2));
	return ConstantExpr::getXor(ConstantExpr::getNot(C1), C2);
	case ICmpInst::ICMP_NE:
	return ConstantExpr::getXor(C1, C2);
	default:
	break;
	}
	}

	if (isa<ConstantInt>(C1) && isa<ConstantInt>(C2)) {
	const APInt &V1 = cast<ConstantInt>(C1)->getValue();
	const APInt &V2 = cast<ConstantInt>(C2)->getValue();
	switch (pred) {
	default: llvm_unreachable("Invalid ICmp Predicate");
	case ICmpInst::ICMP_EQ: return ConstantInt::get(ResultTy, V1 == V2);
	case ICmpInst::ICMP_NE: return ConstantInt::get(ResultTy, V1 != V2);
	case ICmpInst::ICMP_SLT: return ConstantInt::get(ResultTy, V1.slt(V2));
	case ICmpInst::ICMP_SGT: return ConstantInt::get(ResultTy, V1.sgt(V2));
	case ICmpInst::ICMP_SLE: return ConstantInt::get(ResultTy, V1.sle(V2));
	case ICmpInst::ICMP_SGE: return ConstantInt::get(ResultTy, V1.sge(V2));
	case ICmpInst::ICMP_ULT: return ConstantInt::get(ResultTy, V1.ult(V2));
	case ICmpInst::ICMP_UGT: return ConstantInt::get(ResultTy, V1.ugt(V2));
	case ICmpInst::ICMP_ULE: return ConstantInt::get(ResultTy, V1.ule(V2));
	case ICmpInst::ICMP_UGE: return ConstantInt::get(ResultTy, V1.uge(V2));
	}
	} else if (isa<ConstantFP>(C1) && isa<ConstantFP>(C2)) {
	const APFloat &C1V = cast<ConstantFP>(C1)->getValueAPF();
	const APFloat &C2V = cast<ConstantFP>(C2)->getValueAPF();
	APFloat::cmpResult R = C1V.compare(C2V);
	switch (pred) {
	default: llvm_unreachable("Invalid FCmp Predicate");
	case FCmpInst::FCMP_FALSE: return Constant::getNullValue(ResultTy);
	case FCmpInst::FCMP_TRUE: return Constant::getAllOnesValue(ResultTy);
	case FCmpInst::FCMP_UNO:
	return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered);
	case FCmpInst::FCMP_ORD:
	return ConstantInt::get(ResultTy, R!=APFloat::cmpUnordered);
	case FCmpInst::FCMP_UEQ:
	return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered \|\|
	R==APFloat::cmpEqual);
	case FCmpInst::FCMP_OEQ:
	return ConstantInt::get(ResultTy, R==APFloat::cmpEqual);
	case FCmpInst::FCMP_UNE:
	return ConstantInt::get(ResultTy, R!=APFloat::cmpEqual);
	case FCmpInst::FCMP_ONE:
	return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan \|\|
	R==APFloat::cmpGreaterThan);
	case FCmpInst::FCMP_ULT:
	return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered \|\|
	R==APFloat::cmpLessThan);
	case FCmpInst::FCMP_OLT:
	return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan);
	case FCmpInst::FCMP_UGT:
	return ConstantInt::get(ResultTy, R==APFloat::cmpUnordered \|\|
	R==APFloat::cmpGreaterThan);
	case FCmpInst::FCMP_OGT:
	return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan);
	case FCmpInst::FCMP_ULE:
	return ConstantInt::get(ResultTy, R!=APFloat::cmpGreaterThan);
	case FCmpInst::FCMP_OLE:
	return ConstantInt::get(ResultTy, R==APFloat::cmpLessThan \|\|
	R==APFloat::cmpEqual);
	case FCmpInst::FCMP_UGE:
	return ConstantInt::get(ResultTy, R!=APFloat::cmpLessThan);
	case FCmpInst::FCMP_OGE:
	return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan \|\|
	R==APFloat::cmpEqual);
	}
	} else if (C1->getType()->isVectorTy()) {
	// If we can constant fold the comparison of each element, constant fold
	// the whole vector comparison.
	SmallVector<Constant*, 4> ResElts;
	Type *Ty = IntegerType::get(C1->getContext(), 32);
	// Compare the elements, producing an i1 result or constant expr.
	for (unsigned i = 0, e = C1->getType()->getVectorNumElements(); i != e;++i){
	Constant *C1E =
	ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
	Constant *C2E =
	ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));

	ResElts.push_back(ConstantExpr::getCompare(pred, C1E, C2E));
	}

	return ConstantVector::get(ResElts);
	}

	if (C1->getType()->isFloatingPointTy() &&
	// Only call evaluateFCmpRelation if we have a constant expr to avoid
	// infinite recursive loop
	(isa<ConstantExpr>(C1) \|\| isa<ConstantExpr>(C2))) {
	int Result = -1; // -1 = unknown, 0 = known false, 1 = known true.
	switch (evaluateFCmpRelation(C1, C2)) {
	default: llvm_unreachable("Unknown relation!");
	case FCmpInst::FCMP_UNO:
	case FCmpInst::FCMP_ORD:
	case FCmpInst::FCMP_UEQ:
	case FCmpInst::FCMP_UNE:
	case FCmpInst::FCMP_ULT:
	case FCmpInst::FCMP_UGT:
	case FCmpInst::FCMP_ULE:
	case FCmpInst::FCMP_UGE:
	case FCmpInst::FCMP_TRUE:
	case FCmpInst::FCMP_FALSE:
	case FCmpInst::BAD_FCMP_PREDICATE:
	break; // Couldn't determine anything about these constants.
	case FCmpInst::FCMP_OEQ: // We know that C1 == C2
	Result = (pred == FCmpInst::FCMP_UEQ \|\| pred == FCmpInst::FCMP_OEQ \|\|
	pred == FCmpInst::FCMP_ULE \|\| pred == FCmpInst::FCMP_OLE \|\|
	pred == FCmpInst::FCMP_UGE \|\| pred == FCmpInst::FCMP_OGE);
	break;
	case FCmpInst::FCMP_OLT: // We know that C1 < C2
	Result = (pred == FCmpInst::FCMP_UNE \|\| pred == FCmpInst::FCMP_ONE \|\|
	pred == FCmpInst::FCMP_ULT \|\| pred == FCmpInst::FCMP_OLT \|\|
	pred == FCmpInst::FCMP_ULE \|\| pred == FCmpInst::FCMP_OLE);
	break;
	case FCmpInst::FCMP_OGT: // We know that C1 > C2
	Result = (pred == FCmpInst::FCMP_UNE \|\| pred == FCmpInst::FCMP_ONE \|\|
	pred == FCmpInst::FCMP_UGT \|\| pred == FCmpInst::FCMP_OGT \|\|
	pred == FCmpInst::FCMP_UGE \|\| pred == FCmpInst::FCMP_OGE);
	break;
	case FCmpInst::FCMP_OLE: // We know that C1 <= C2
	// We can only partially decide this relation.
	if (pred == FCmpInst::FCMP_UGT \|\| pred == FCmpInst::FCMP_OGT)
	Result = 0;
	else if (pred == FCmpInst::FCMP_ULT \|\| pred == FCmpInst::FCMP_OLT)
	Result = 1;
	break;
	case FCmpInst::FCMP_OGE: // We known that C1 >= C2
	// We can only partially decide this relation.
	if (pred == FCmpInst::FCMP_ULT \|\| pred == FCmpInst::FCMP_OLT)
	Result = 0;
	else if (pred == FCmpInst::FCMP_UGT \|\| pred == FCmpInst::FCMP_OGT)
	Result = 1;
	break;
	case FCmpInst::FCMP_ONE: // We know that C1 != C2
	// We can only partially decide this relation.
	if (pred == FCmpInst::FCMP_OEQ \|\| pred == FCmpInst::FCMP_UEQ)
	Result = 0;
	else if (pred == FCmpInst::FCMP_ONE \|\| pred == FCmpInst::FCMP_UNE)
	Result = 1;
	break;
	}

	// If we evaluated the result, return it now.
	if (Result != -1)
	return ConstantInt::get(ResultTy, Result);

	} else {
	// Evaluate the relation between the two constants, per the predicate.
	int Result = -1; // -1 = unknown, 0 = known false, 1 = known true.
	switch (evaluateICmpRelation(C1, C2,
	CmpInst::isSigned((CmpInst::Predicate)pred))) {
	default: llvm_unreachable("Unknown relational!");
	case ICmpInst::BAD_ICMP_PREDICATE:
	break; // Couldn't determine anything about these constants.
	case ICmpInst::ICMP_EQ: // We know the constants are equal!
	// If we know the constants are equal, we can decide the result of this
	// computation precisely.
	Result = ICmpInst::isTrueWhenEqual((ICmpInst::Predicate)pred);
	break;
	case ICmpInst::ICMP_ULT:
	switch (pred) {
	case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULE:
	Result = 1; break;
	case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGE:
	Result = 0; break;
	}
	break;
	case ICmpInst::ICMP_SLT:
	switch (pred) {
	case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SLE:
	Result = 1; break;
	case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SGE:
	Result = 0; break;
	}
	break;
	case ICmpInst::ICMP_UGT:
	switch (pred) {
	case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGE:
	Result = 1; break;
	case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULE:
	Result = 0; break;
	}
	break;
	case ICmpInst::ICMP_SGT:
	switch (pred) {
	case ICmpInst::ICMP_SGT: case ICmpInst::ICMP_NE: case ICmpInst::ICMP_SGE:
	Result = 1; break;
	case ICmpInst::ICMP_SLT: case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_SLE:
	Result = 0; break;
	}
	break;
	case ICmpInst::ICMP_ULE:
	if (pred == ICmpInst::ICMP_UGT) Result = 0;
	if (pred == ICmpInst::ICMP_ULT \|\| pred == ICmpInst::ICMP_ULE) Result = 1;
	break;
	case ICmpInst::ICMP_SLE:
	if (pred == ICmpInst::ICMP_SGT) Result = 0;
	if (pred == ICmpInst::ICMP_SLT \|\| pred == ICmpInst::ICMP_SLE) Result = 1;
	break;
	case ICmpInst::ICMP_UGE:
	if (pred == ICmpInst::ICMP_ULT) Result = 0;
	if (pred == ICmpInst::ICMP_UGT \|\| pred == ICmpInst::ICMP_UGE) Result = 1;
	break;
	case ICmpInst::ICMP_SGE:
	if (pred == ICmpInst::ICMP_SLT) Result = 0;
	if (pred == ICmpInst::ICMP_SGT \|\| pred == ICmpInst::ICMP_SGE) Result = 1;
	break;
	case ICmpInst::ICMP_NE:
	if (pred == ICmpInst::ICMP_EQ) Result = 0;
	if (pred == ICmpInst::ICMP_NE) Result = 1;
	break;
	}

	// If we evaluated the result, return it now.
	if (Result != -1)
	return ConstantInt::get(ResultTy, Result);

	// If the right hand side is a bitcast, try using its inverse to simplify
	// it by moving it to the left hand side. We can't do this if it would turn
	// a vector compare into a scalar compare or visa versa.
	if (ConstantExpr *CE2 = dyn_cast<ConstantExpr>(C2)) {
	Constant *CE2Op0 = CE2->getOperand(0);
	if (CE2->getOpcode() == Instruction::BitCast &&
	CE2->getType()->isVectorTy() == CE2Op0->getType()->isVectorTy()) {
	Constant *Inverse = ConstantExpr::getBitCast(C1, CE2Op0->getType());
	return ConstantExpr::getICmp(pred, Inverse, CE2Op0);
	}
	}

	// If the left hand side is an extension, try eliminating it.
	if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(C1)) {
	if ((CE1->getOpcode() == Instruction::SExt &&
	ICmpInst::isSigned((ICmpInst::Predicate)pred)) \|\|
	(CE1->getOpcode() == Instruction::ZExt &&
	!ICmpInst::isSigned((ICmpInst::Predicate)pred))){
	Constant *CE1Op0 = CE1->getOperand(0);
	Constant *CE1Inverse = ConstantExpr::getTrunc(CE1, CE1Op0->getType());
	if (CE1Inverse == CE1Op0) {
	// Check whether we can safely truncate the right hand side.
	Constant *C2Inverse = ConstantExpr::getTrunc(C2, CE1Op0->getType());
	if (ConstantExpr::getCast(CE1->getOpcode(), C2Inverse,
	C2->getType()) == C2)
	return ConstantExpr::getICmp(pred, CE1Inverse, C2Inverse);
	}
	}
	}

	if ((!isa<ConstantExpr>(C1) && isa<ConstantExpr>(C2)) \|\|
	(C1->isNullValue() && !C2->isNullValue())) {
	// If C2 is a constant expr and C1 isn't, flip them around and fold the
	// other way if possible.
	// Also, if C1 is null and C2 isn't, flip them around.
	pred = ICmpInst::getSwappedPredicate((ICmpInst::Predicate)pred);
	return ConstantExpr::getICmp(pred, C2, C1);
	}
	}
	return nullptr;
	}

	/// Test whether the given sequence of normalized indices is "inbounds".
	///
	/// The sequence is accepted when it is empty, when its first index is zero,
	/// or when its first index is the integer one and every later index is zero
	/// (the one-past-the-end rule). Anything else, including a non-zero first
	/// index that is not a ConstantInt, is reported as not provably in bounds.
	template<typename IndexTy>
	static bool isInBoundsIndices(ArrayRef<IndexTy> Idxs) {
	  // No indices means nothing that could be out of bounds.
	  if (Idxs.empty()) return true;

	  // If the first index is zero, it's in bounds.
	  if (cast<Constant>(Idxs[0])->isNullValue()) return true;

	  // If the first index is one and all the rest are zero, it's in bounds,
	  // by the one-past-the-end rule. The first index is only known to be some
	  // Constant at this point, so test for ConstantInt rather than asserting
	  // it and answer conservatively for any other kind of constant.
	  const ConstantInt *FirstIdx = dyn_cast<ConstantInt>(Idxs[0]);
	  if (!FirstIdx || !FirstIdx->isOne())
	    return false;
	  for (unsigned i = 1, e = Idxs.size(); i != e; ++i)
	    if (!cast<Constant>(Idxs[i])->isNullValue())
	      return false;
	  return true;
	}

	/// Test whether a given ConstantInt is in-range for a SequentialType.
	///
	/// NumElements is the element count of the sequential type; a count of zero
	/// imposes no upper bound. Returns true only when CI, read as a signed
	/// value, is non-negative and (for a non-zero count) less than NumElements.
	static bool isIndexInRangeOfArrayType(uint64_t NumElements,
	                                      const ConstantInt *CI) {
	  // We cannot bounds check the index if it doesn't fit in an int64_t. The
	  // value is read with getSExtValue() below, so measure its signed width: a
	  // positive value that needs all 64 bits for its magnitude does not fit.
	  if (CI->getValue().getMinSignedBits() > 64)
	    return false;

	  // A negative index or an index past the end of our sequential type is
	  // considered out-of-range.
	  int64_t IndexVal = CI->getSExtValue();
	  if (IndexVal < 0)
	    return false;
	  if (NumElements > 0 && (uint64_t)IndexVal >= NumElements)
	    return false;

	  // Otherwise, it is in-range.
	  return true;
	}

	Constant llvm::ConstantFoldGetElementPtr(Type PointeeTy, Constant *C,
	bool InBounds,
	Optional<unsigned> InRangeIndex,
	ArrayRef<Value *> Idxs) {
	if (Idxs.empty()) return C;

	if (isa<UndefValue>(C)) {
	Type *GEPTy = GetElementPtrInst::getGEPReturnType(
	C, makeArrayRef((Value * const *)Idxs.data(), Idxs.size()));
	return UndefValue::get(GEPTy);
	}

	Constant *Idx0 = cast<Constant>(Idxs[0]);
	if (Idxs.size() == 1 && (Idx0->isNullValue() \|\| isa<UndefValue>(Idx0)))
	return C;

	if (C->isNullValue()) {
	bool isNull = true;
	for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
	if (!isa<UndefValue>(Idxs[i]) &&
	!cast<Constant>(Idxs[i])->isNullValue()) {
	isNull = false;
	break;
	}
	if (isNull) {
	PointerType *PtrTy = cast<PointerType>(C->getType()->getScalarType());
	Type *Ty = GetElementPtrInst::getIndexedType(PointeeTy, Idxs);

	assert(Ty && "Invalid indices for GEP!");
	Type *GEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
	if (VectorType *VT = dyn_cast<VectorType>(C->getType()))
	GEPTy = VectorType::get(GEPTy, VT->getNumElements());
	return Constant::getNullValue(GEPTy);
	}
	}

	if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
	// Combine Indices - If the source pointer to this getelementptr instruction
	// is a getelementptr instruction, combine the indices of the two
	// getelementptr instructions into a single instruction.
	//
	if (CE->getOpcode() == Instruction::GetElementPtr) {
	gep_type_iterator LastI = gep_type_end(CE);
	for (gep_type_iterator I = gep_type_begin(CE), E = gep_type_end(CE);
	I != E; ++I)
	LastI = I;

	// We cannot combine indices if doing so would take us outside of an
	// array or vector. Doing otherwise could trick us if we evaluated such a
	// GEP as part of a load.
	//
	// e.g. Consider if the original GEP was:
	// i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
	// i32 0, i32 0, i64 0)
	//
	// If we then tried to offset it by '8' to get to the third element,
	// an i8, we should not get:
	// i8* getelementptr ({ [2 x i8], i32, i8, [3 x i8] }* @main.c,
	// i32 0, i32 0, i64 8)
	//
	// This GEP tries to index array element '8' which runs out-of-bounds.
	// Subsequent evaluation would get confused and produce erroneous results.
	//
	// The following prohibits such a GEP from being formed by checking to see
	- // if the index is in-range with respect to an array or vector.
	+ // if the index is in-range with respect to an array.
	+ // TODO: This code may be extended to handle vectors as well.
	bool PerformFold = false;
	if (Idx0->isNullValue())
	PerformFold = true;
	else if (LastI.isSequential())
	if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx0))
	- PerformFold =
	- !LastI.isBoundedSequential() \|\|
	- isIndexInRangeOfArrayType(LastI.getSequentialNumElements(), CI);
	+ PerformFold = (!LastI.isBoundedSequential() \|\|
	+ isIndexInRangeOfArrayType(
	+ LastI.getSequentialNumElements(), CI)) &&
	+ !CE->getOperand(CE->getNumOperands() - 1)
	+ ->getType()
	+ ->isVectorTy();

	if (PerformFold) {
	SmallVector<Value*, 16> NewIndices;
	NewIndices.reserve(Idxs.size() + CE->getNumOperands());
	NewIndices.append(CE->op_begin() + 1, CE->op_end() - 1);

	// Add the last index of the source with the first index of the new GEP.
	// Make sure to handle the case when they are actually different types.
	Constant *Combined = CE->getOperand(CE->getNumOperands()-1);
	// Otherwise it must be an array.
	if (!Idx0->isNullValue()) {
	Type *IdxTy = Combined->getType();
	if (IdxTy != Idx0->getType()) {
	unsigned CommonExtendedWidth =
	std::max(IdxTy->getIntegerBitWidth(),
	Idx0->getType()->getIntegerBitWidth());
	CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);

	Type *CommonTy =
	Type::getIntNTy(IdxTy->getContext(), CommonExtendedWidth);
	Constant *C1 = ConstantExpr::getSExtOrBitCast(Idx0, CommonTy);
	Constant *C2 = ConstantExpr::getSExtOrBitCast(Combined, CommonTy);
	Combined = ConstantExpr::get(Instruction::Add, C1, C2);
	} else {
	Combined =
	ConstantExpr::get(Instruction::Add, Idx0, Combined);
	}
	}

	NewIndices.push_back(Combined);
	NewIndices.append(Idxs.begin() + 1, Idxs.end());

	// The combined GEP normally inherits its index inrange attribute from
	// the inner GEP, but if the inner GEP's last index was adjusted by the
	// outer GEP, any inbounds attribute on that index is invalidated.
	Optional<unsigned> IRIndex = cast<GEPOperator>(CE)->getInRangeIndex();
	if (IRIndex && *IRIndex == CE->getNumOperands() - 2 && !Idx0->isNullValue())
	IRIndex = None;

	return ConstantExpr::getGetElementPtr(
	cast<GEPOperator>(CE)->getSourceElementType(), CE->getOperand(0),
	NewIndices, InBounds && cast<GEPOperator>(CE)->isInBounds(),
	IRIndex);
	}
	}

	// Attempt to fold casts to the same type away. For example, folding:
	//
	// i32* getelementptr ([2 x i32]* bitcast ([3 x i32]* %X to [2 x i32]*),
	// i64 0, i64 0)
	// into:
	//
	// i32* getelementptr ([3 x i32]* %X, i64 0, i64 0)
	//
	// Don't fold if the cast is changing address spaces.
	if (CE->isCast() && Idxs.size() > 1 && Idx0->isNullValue()) {
	PointerType *SrcPtrTy =
	dyn_cast<PointerType>(CE->getOperand(0)->getType());
	PointerType *DstPtrTy = dyn_cast<PointerType>(CE->getType());
	if (SrcPtrTy && DstPtrTy) {
	ArrayType *SrcArrayTy =
	dyn_cast<ArrayType>(SrcPtrTy->getElementType());
	ArrayType *DstArrayTy =
	dyn_cast<ArrayType>(DstPtrTy->getElementType());
	if (SrcArrayTy && DstArrayTy
	&& SrcArrayTy->getElementType() == DstArrayTy->getElementType()
	&& SrcPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace())
	return ConstantExpr::getGetElementPtr(SrcArrayTy,
	(Constant *)CE->getOperand(0),
	Idxs, InBounds, InRangeIndex);
	}
	}
	}

	// Check to see if any array indices are not within the corresponding
	// notional array or vector bounds. If so, try to determine if they can be
	// factored out into preceding dimensions.
	SmallVector<Constant *, 8> NewIdxs;
	Type *Ty = PointeeTy;
	Type *Prev = C->getType();
	bool Unknown = !isa<ConstantInt>(Idxs[0]);
	for (unsigned i = 1, e = Idxs.size(); i != e;
	Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
	auto *CI = dyn_cast<ConstantInt>(Idxs[i]);
	if (!CI) {
	// We don't know if it's in range or not.
	Unknown = true;
	continue;
	}
	if (InRangeIndex && i == *InRangeIndex + 1) {
	// If an index is marked inrange, we cannot apply this canonicalization to
	// the following index, as that will cause the inrange index to point to
	// the wrong element.
	continue;
	}
	if (isa<StructType>(Ty)) {
	// The verify makes sure that GEPs into a struct are in range.
	continue;
	}
	auto *STy = cast<SequentialType>(Ty);
	if (isa<VectorType>(STy)) {
	// There can be awkward padding in after a non-power of two vector.
	Unknown = true;
	continue;
	}
	if (isIndexInRangeOfArrayType(STy->getNumElements(), CI))
	// It's in range, skip to the next index.
	continue;
	if (isa<StructType>(Prev)) {
	// It's out of range, but the prior dimension is a struct
	// so we can't do anything about it.
	Unknown = true;
	continue;
	}
	if (CI->getSExtValue() < 0) {
	// It's out of range and negative, don't try to factor it.
	Unknown = true;
	continue;
	}
	// It's out of range, but we can factor it into the prior
	// dimension.
	NewIdxs.resize(Idxs.size());
	// Determine the number of elements in our sequential type.
	uint64_t NumElements = STy->getArrayNumElements();

	ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
	NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);

	Constant *PrevIdx = NewIdxs[i-1] ? NewIdxs[i-1] :
	cast<Constant>(Idxs[i - 1]);
	Constant *Div = ConstantExpr::getSDiv(CI, Factor);

	unsigned CommonExtendedWidth =
	std::max(PrevIdx->getType()->getIntegerBitWidth(),
	Div->getType()->getIntegerBitWidth());
	CommonExtendedWidth = std::max(CommonExtendedWidth, 64U);

	// Before adding, extend both operands to i64 to avoid
	// overflow trouble.
	if (!PrevIdx->getType()->isIntegerTy(CommonExtendedWidth))
	PrevIdx = ConstantExpr::getSExt(
	PrevIdx, Type::getIntNTy(Div->getContext(), CommonExtendedWidth));
	if (!Div->getType()->isIntegerTy(CommonExtendedWidth))
	Div = ConstantExpr::getSExt(
	Div, Type::getIntNTy(Div->getContext(), CommonExtendedWidth));

	NewIdxs[i - 1] = ConstantExpr::getAdd(PrevIdx, Div);
	}

	// If we did any factoring, start over with the adjusted indices.
	if (!NewIdxs.empty()) {
	for (unsigned i = 0, e = Idxs.size(); i != e; ++i)
	if (!NewIdxs[i]) NewIdxs[i] = cast<Constant>(Idxs[i]);
	return ConstantExpr::getGetElementPtr(PointeeTy, C, NewIdxs, InBounds,
	InRangeIndex);
	}

	// If all indices are known integers and normalized, we can do a simple
	// check for the "inbounds" property.
	if (!Unknown && !InBounds)
	if (auto *GV = dyn_cast<GlobalVariable>(C))
	if (!GV->hasExternalWeakLinkage() && isInBoundsIndices(Idxs))
	return ConstantExpr::getGetElementPtr(PointeeTy, C, Idxs,
	/InBounds=/true, InRangeIndex);

	return nullptr;
	}
	Index: head/contrib/llvm/lib/Object/COFFImportFile.cpp
	===================================================================
	--- head/contrib/llvm/lib/Object/COFFImportFile.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Object/COFFImportFile.cpp (revision 322320)
	@@ -1,615 +1,612 @@
	//===- COFFImportFile.cpp - COFF short import file implementation ---------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the writeImportLibrary function.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Object/COFFImportFile.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/Object/Archive.h"
	#include "llvm/Object/ArchiveWriter.h"
	#include "llvm/Object/COFF.h"
	#include "llvm/Support/Error.h"
	#include "llvm/Support/Path.h"

	#include <cstdint>
	#include <map>
	#include <set>
	#include <string>
	#include <vector>

	using namespace llvm::COFF;
	using namespace llvm::object;
	using namespace llvm;

	namespace llvm {
	namespace object {

	/// Tells whether \p Machine is one of the 32-bit COFF targets handled here.
	///
	/// \returns true for ARMNT and I386, false for AMD64. Any other machine type
	/// hits llvm_unreachable.
	static bool is32bit(MachineTypes Machine) {
	  switch (Machine) {
	  case IMAGE_FILE_MACHINE_I386:
	  case IMAGE_FILE_MACHINE_ARMNT:
	    return true;
	  case IMAGE_FILE_MACHINE_AMD64:
	    return false;
	  default:
	    llvm_unreachable("unsupported machine");
	  }
	}

	/// Maps \p Machine to the relocation type constant used for the relocations
	/// emitted in this file (the *_ADDR32NB / DIR32NB constant of that target).
	///
	/// Any machine type other than I386, ARMNT or AMD64 hits llvm_unreachable.
	static uint16_t getImgRelRelocation(MachineTypes Machine) {
	  switch (Machine) {
	  case IMAGE_FILE_MACHINE_I386:
	    return IMAGE_REL_I386_DIR32NB;
	  case IMAGE_FILE_MACHINE_ARMNT:
	    return IMAGE_REL_ARM_ADDR32NB;
	  case IMAGE_FILE_MACHINE_AMD64:
	    return IMAGE_REL_AMD64_ADDR32NB;
	  default:
	    llvm_unreachable("unsupported machine");
	  }
	}

	/// Appends the raw object representation of \p Data (exactly sizeof(T) bytes)
	/// to the end of \p B.
	template <class T> static void append(std::vector<uint8_t> &B, const T &Data) {
	  const uint8_t *First = reinterpret_cast<const uint8_t *>(&Data);
	  B.insert(B.end(), First, First + sizeof(T));
	}

	/// Appends a COFF string table containing \p Strings to \p B.
	///
	/// The COFF string table consists of a 4-byte value which is the size of the
	/// table, including the length field itself. This value is followed by the
	/// string content itself, which is an array of null-terminated C-style
	/// strings. The termination is important as they are referenced to by offset
	/// by the symbol entity in the file format.
	static void writeStringTable(std::vector<uint8_t> &B,
	                             ArrayRef<const std::string> Strings) {
	  // Start of the table, i.e. where the length field lives.
	  size_t Offset = B.size();

	  // Reserve the length field right away. It is filled in at the end, once the
	  // string content has been emitted; reserving it here keeps that backfill in
	  // bounds even when Strings is empty.
	  size_t Pos = Offset + sizeof(uint32_t);
	  B.resize(Pos);

	  for (const auto &S : Strings) {
	    B.resize(Pos + S.length() + 1);
	    strcpy(reinterpret_cast<char *>(&B[Pos]), S.c_str());
	    Pos += S.length() + 1;
	  }

	  // Backfill the length of the table now that it has been computed.
	  support::ulittle32_t Length(B.size() - Offset);
	  support::endian::write32le(&B[Offset], Length);
	}

	/// Chooses the import name type for a symbol.
	///
	/// \returns IMPORT_NAME_UNDECORATE when \p Sym and \p ExtName differ;
	/// otherwise IMPORT_NAME_NOPREFIX for an I386 symbol starting with "_", and
	/// IMPORT_NAME in every remaining case.
	static ImportNameType getNameType(StringRef Sym, StringRef ExtName,
	                                  MachineTypes Machine) {
	  const bool SameName = Sym == ExtName;
	  if (!SameName)
	    return IMPORT_NAME_UNDECORATE;

	  const bool HasI386Underscore =
	      Machine == IMAGE_FILE_MACHINE_I386 && Sym.startswith("_");
	  return HasI386Underscore ? IMPORT_NAME_NOPREFIX : IMPORT_NAME;
	}

	/// Replaces the first occurrence of \p From in \p S with \p To.
	///
	/// If \p From is not found and both \p From and \p To start with "_", the
	/// search is retried with that leading character stripped from both.
	///
	/// \returns the rewritten string, or a StringError carrying
	/// object_error::parse_failed when no occurrence is found.
	static Expected<std::string> replace(StringRef S, StringRef From,
	                                     StringRef To) {
	  size_t Start = S.find(From);

	  // From and To may be mangled, but substrings in S may not.
	  if (Start == StringRef::npos && From.startswith("_") &&
	      To.startswith("_")) {
	    From = From.substr(1);
	    To = To.substr(1);
	    Start = S.find(From);
	  }

	  if (Start != StringRef::npos) {
	    StringRef Head = S.substr(0, Start);
	    StringRef Tail = S.substr(Start + From.size());
	    return (Twine(Head) + To + Tail).str();
	  }

	  return make_error<StringError>(
	      StringRef(Twine(S + ": replacing '" + From +
	      "' with '" + To + "' failed").str()), object_error::parse_failed);
	}

	// Name of the single external symbol emitted by createNullImportDescriptor;
	// createImportDescriptor also lists it in its string table.
	static const std::string NullImportDescriptorSymbolName =
	"__NULL_IMPORT_DESCRIPTOR";

	namespace {
	// This class constructs various small object files necessary to support linking
	// symbols imported from a DLL. The contents are pretty strictly defined and
	// nearly entirely static. The details of the structures files are defined in
	// WINNT.h and the PE/COFF specification.
	class ObjectFactory {
	// Little-endian 16/32-bit integer wrappers used for all on-disk fields.
	using u16 = support::ulittle16_t;
	using u32 = support::ulittle32_t;
	// Target machine written into the emitted headers.
	MachineTypes Machine;
	// Backing storage for members built by createShortImport and
	// createWeakExternal.
	BumpPtrAllocator Alloc;
	// Name passed to the constructor; used as the identifier of every member
	// and written as the DLL name into the import descriptor and short imports.
	StringRef ImportName;
	// ImportName with its last four characters dropped.
	StringRef Library;
	// "__IMPORT_DESCRIPTOR_" followed by Library.
	std::string ImportDescriptorSymbolName;
	// "\x7f" followed by Library and "_NULL_THUNK_DATA".
	std::string NullThunkSymbolName;

	public:
	// S is the import name and M the target machine. Library is derived with
	// drop_back(4), so S must be at least four characters long -- presumably it
	// ends in a 4-character extension such as ".dll"; confirm against callers.
	ObjectFactory(StringRef S, MachineTypes M)
	: Machine(M), ImportName(S), Library(S.drop_back(4)),
	ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()),
	NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {}

	// Creates an Import Descriptor. This is a small object file which contains a
	// reference to the terminators and contains the library name (entry) for the
	// import name table. It will force the linker to construct the necessary
	// structure to import symbols from the DLL.
	NewArchiveMember createImportDescriptor(std::vector<uint8_t> &Buffer);

	// Creates a NULL import descriptor. This is a small object file which
	// contains a NULL import descriptor. It is used to terminate the imports
	// from a specific DLL.
	NewArchiveMember createNullImportDescriptor(std::vector<uint8_t> &Buffer);

	// Create a NULL Thunk Entry. This is a small object file which contains a
	// NULL Import Address Table entry and a NULL Import Lookup Table Entry. It
	// is used to terminate the IAT and ILT.
	NewArchiveMember createNullThunk(std::vector<uint8_t> &Buffer);

	// Create a short import file which is described in PE/COFF spec 7. Import
	// Library Format.
	NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal,
	ImportType Type, ImportNameType NameType);

	// Create a weak external file which is described in PE/COFF Aux Format 3.
	NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp);
	};
	} // namespace

	/// Builds the import descriptor object file in \p Buffer.
	///
	/// The bytes are appended in this order: COFF header, two section headers
	/// (.idata$2 and .idata$6), the zeroed import directory entry, its three
	/// relocations, the NUL-terminated import name, seven symbols and the string
	/// table.
	///
	/// \returns an archive member that refers to \p Buffer's storage, so
	/// \p Buffer must outlive the returned member.
	NewArchiveMember
	ObjectFactory::createImportDescriptor(std::vector<uint8_t> &Buffer) {
	  const uint32_t NumberOfSections = 2;
	  const uint32_t NumberOfSymbols = 7;
	  const uint32_t NumberOfRelocations = 3;

	  // COFF Header
	  coff_file_header Header{
	      u16(Machine),
	      u16(NumberOfSections),
	      u32(0),
	      u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
	          // .idata$2
	          sizeof(coff_import_directory_table_entry) +
	          NumberOfRelocations * sizeof(coff_relocation) +
	          // .idata$6
	          (ImportName.size() + 1)),
	      u32(NumberOfSymbols),
	      u16(0),
	      u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
	  };
	  append(Buffer, Header);

	  // Section Header Table
	  const coff_section SectionTable[NumberOfSections] = {
	      {{'.', 'i', 'd', 'a', 't', 'a', '$', '2'},
	       u32(0),
	       u32(0),
	       u32(sizeof(coff_import_directory_table_entry)),
	       u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
	       u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
	           sizeof(coff_import_directory_table_entry)),
	       u32(0),
	       u16(NumberOfRelocations),
	       u16(0),
	       u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
	           IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
	      {{'.', 'i', 'd', 'a', 't', 'a', '$', '6'},
	       u32(0),
	       u32(0),
	       u32(ImportName.size() + 1),
	       u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
	           sizeof(coff_import_directory_table_entry) +
	           NumberOfRelocations * sizeof(coff_relocation)),
	       u32(0),
	       u32(0),
	       u16(0),
	       u16(0),
	       u32(IMAGE_SCN_ALIGN_2BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
	           IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
	  };
	  append(Buffer, SectionTable);

	  // .idata$2
	  const coff_import_directory_table_entry ImportDescriptor{
	      u32(0), u32(0), u32(0), u32(0), u32(0),
	  };
	  append(Buffer, ImportDescriptor);

	  // The second field of each relocation is an index into the symbol table
	  // below: 2 is .idata$6, 3 is .idata$4 and 4 is .idata$5.
	  const coff_relocation RelocationTable[NumberOfRelocations] = {
	      {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2),
	       u16(getImgRelRelocation(Machine))},
	      {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)),
	       u32(3), u16(getImgRelRelocation(Machine))},
	      {u32(offsetof(coff_import_directory_table_entry, ImportAddressTableRVA)),
	       u32(4), u16(getImgRelRelocation(Machine))},
	  };
	  append(Buffer, RelocationTable);

	  // .idata$6
	  auto S = Buffer.size();
	  Buffer.resize(S + ImportName.size() + 1);
	  memcpy(&Buffer[S], ImportName.data(), ImportName.size());
	  Buffer[S + ImportName.size()] = '\0';

	  // Symbol Table
	  coff_symbol16 SymbolTable[NumberOfSymbols] = {
	      {{{0, 0, 0, 0, 0, 0, 0, 0}},
	       u32(0),
	       u16(1),
	       u16(0),
	       IMAGE_SYM_CLASS_EXTERNAL,
	       0},
	      {{{'.', 'i', 'd', 'a', 't', 'a', '$', '2'}},
	       u32(0),
	       u16(1),
	       u16(0),
	       IMAGE_SYM_CLASS_SECTION,
	       0},
	      {{{'.', 'i', 'd', 'a', 't', 'a', '$', '6'}},
	       u32(0),
	       u16(2),
	       u16(0),
	       IMAGE_SYM_CLASS_STATIC,
	       0},
	      {{{'.', 'i', 'd', 'a', 't', 'a', '$', '4'}},
	       u32(0),
	       u16(0),
	       u16(0),
	       IMAGE_SYM_CLASS_SECTION,
	       0},
	      {{{'.', 'i', 'd', 'a', 't', 'a', '$', '5'}},
	       u32(0),
	       u16(0),
	       u16(0),
	       IMAGE_SYM_CLASS_SECTION,
	       0},
	      {{{0, 0, 0, 0, 0, 0, 0, 0}},
	       u32(0),
	       u16(0),
	       u16(0),
	       IMAGE_SYM_CLASS_EXTERNAL,
	       0},
	      {{{0, 0, 0, 0, 0, 0, 0, 0}},
	       u32(0),
	       u16(0),
	       u16(0),
	       IMAGE_SYM_CLASS_EXTERNAL,
	       0},
	  };
	  // TODO: Name.Offset.Offset here and in the all similar places below
	  // suggests a names refactoring. Maybe StringTableOffset.Value?
	  // Symbols 0, 5 and 6 are named through the string table; the offsets below
	  // follow the order in which the strings are written further down (each
	  // string is preceded by the 4-byte length field and NUL-terminated).
	  SymbolTable[0].Name.Offset.Offset =
	      sizeof(uint32_t);
	  SymbolTable[5].Name.Offset.Offset =
	      sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1;
	  SymbolTable[6].Name.Offset.Offset =
	      sizeof(uint32_t) + ImportDescriptorSymbolName.length() + 1 +
	      NullImportDescriptorSymbolName.length() + 1;
	  append(Buffer, SymbolTable);

	  // String Table
	  writeStringTable(Buffer,
	                   {ImportDescriptorSymbolName, NullImportDescriptorSymbolName,
	                    NullThunkSymbolName});

	  StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
	  return {MemoryBufferRef(F, ImportName)};
	}

	/// Builds the NULL import descriptor object file in \p Buffer.
	///
	/// The bytes are appended in this order: COFF header, one section header
	/// (.idata$3), a zeroed import directory entry, one external symbol named
	/// NullImportDescriptorSymbolName and the string table.
	///
	/// \returns an archive member that refers to \p Buffer's storage, so
	/// \p Buffer must outlive the returned member.
	NewArchiveMember
	ObjectFactory::createNullImportDescriptor(std::vector<uint8_t> &Buffer) {
	  const uint32_t NumberOfSections = 1;
	  const uint32_t NumberOfSymbols = 1;

	  // COFF Header
	  coff_file_header Header{
	      u16(Machine),
	      u16(NumberOfSections),
	      u32(0),
	      u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
	          // .idata$3
	          sizeof(coff_import_directory_table_entry)),
	      u32(NumberOfSymbols),
	      u16(0),
	      u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
	  };
	  append(Buffer, Header);

	  // Section Header Table
	  const coff_section SectionTable[NumberOfSections] = {
	      {{'.', 'i', 'd', 'a', 't', 'a', '$', '3'},
	       u32(0),
	       u32(0),
	       u32(sizeof(coff_import_directory_table_entry)),
	       u32(sizeof(coff_file_header) +
	           (NumberOfSections * sizeof(coff_section))),
	       u32(0),
	       u32(0),
	       u16(0),
	       u16(0),
	       u32(IMAGE_SCN_ALIGN_4BYTES | IMAGE_SCN_CNT_INITIALIZED_DATA |
	           IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)},
	  };
	  append(Buffer, SectionTable);

	  // .idata$3
	  const coff_import_directory_table_entry ImportDescriptor{
	      u32(0), u32(0), u32(0), u32(0), u32(0),
	  };
	  append(Buffer, ImportDescriptor);

	  // Symbol Table
	  coff_symbol16 SymbolTable[NumberOfSymbols] = {
	      {{{0, 0, 0, 0, 0, 0, 0, 0}},
	       u32(0),
	       u16(1),
	       u16(0),
	       IMAGE_SYM_CLASS_EXTERNAL,
	       0},
	  };
	  // The name is the first string of the string table, right after the
	  // 4-byte length field.
	  SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
	  append(Buffer, SymbolTable);

	  // String Table
	  writeStringTable(Buffer, {NullImportDescriptorSymbolName});

	  StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
	  return {MemoryBufferRef(F, ImportName)};
	}

	/// Builds the NULL thunk object file in \p Buffer.
	///
	/// The bytes are appended in this order: COFF header, two section headers
	/// (.idata$5 and .idata$4), one zeroed entry per section (4 bytes on 32-bit
	/// machines, 8 bytes otherwise), one external symbol named
	/// NullThunkSymbolName and the string table.
	///
	/// \returns an archive member that refers to \p Buffer's storage, so
	/// \p Buffer must outlive the returned member.
	NewArchiveMember ObjectFactory::createNullThunk(std::vector<uint8_t> &Buffer) {
	  const uint32_t NumberOfSections = 2;
	  const uint32_t NumberOfSymbols = 1;
	  uint32_t VASize = is32bit(Machine) ? 4 : 8;

	  // COFF Header
	  coff_file_header Header{
	      u16(Machine),
	      u16(NumberOfSections),
	      u32(0),
	      u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) +
	          // .idata$5
	          VASize +
	          // .idata$4
	          VASize),
	      u32(NumberOfSymbols),
	      u16(0),
	      u16(is32bit(Machine) ? IMAGE_FILE_32BIT_MACHINE : 0),
	  };
	  append(Buffer, Header);

	  // Section Header Table
	  const coff_section SectionTable[NumberOfSections] = {
	      {{'.', 'i', 'd', 'a', 't', 'a', '$', '5'},
	       u32(0),
	       u32(0),
	       u32(VASize),
	       u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section)),
	       u32(0),
	       u32(0),
	       u16(0),
	       u16(0),
	       u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
	                             : IMAGE_SCN_ALIGN_8BYTES) |
	           IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
	           IMAGE_SCN_MEM_WRITE)},
	      {{'.', 'i', 'd', 'a', 't', 'a', '$', '4'},
	       u32(0),
	       u32(0),
	       u32(VASize),
	       u32(sizeof(coff_file_header) + NumberOfSections * sizeof(coff_section) +
	           VASize),
	       u32(0),
	       u32(0),
	       u16(0),
	       u16(0),
	       u32((is32bit(Machine) ? IMAGE_SCN_ALIGN_4BYTES
	                             : IMAGE_SCN_ALIGN_8BYTES) |
	           IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ |
	           IMAGE_SCN_MEM_WRITE)},
	  };
	  append(Buffer, SectionTable);

	  // .idata$5, IAT: one zeroed entry of VASize bytes.
	  append(Buffer, u32(0));
	  if (!is32bit(Machine))
	    append(Buffer, u32(0));

	  // .idata$4, ILT: one zeroed entry of VASize bytes.
	  append(Buffer, u32(0));
	  if (!is32bit(Machine))
	    append(Buffer, u32(0));

	  // Symbol Table
	  coff_symbol16 SymbolTable[NumberOfSymbols] = {
	      {{{0, 0, 0, 0, 0, 0, 0, 0}},
	       u32(0),
	       u16(1),
	       u16(0),
	       IMAGE_SYM_CLASS_EXTERNAL,
	       0},
	  };
	  // The name is the first string of the string table, right after the
	  // 4-byte length field.
	  SymbolTable[0].Name.Offset.Offset = sizeof(uint32_t);
	  append(Buffer, SymbolTable);

	  // String Table
	  writeStringTable(Buffer, {NullThunkSymbolName});

	  StringRef F{reinterpret_cast<const char *>(Buffer.data()), Buffer.size()};
	  return {MemoryBufferRef{F, ImportName}};
	}

	/// Builds a short import member for \p Sym.
	///
	/// The member is a coff_import_header followed by the NUL-terminated symbol
	/// name and the NUL-terminated import (DLL) name. Its storage comes from the
	/// factory's allocator.
	///
	/// \param Sym        symbol name written after the header.
	/// \param Ordinal    stored as OrdinalHint only when it is non-zero.
	/// \param ImportType low two bits of TypeInfo.
	/// \param NameType   stored in TypeInfo shifted left by two.
	NewArchiveMember ObjectFactory::createShortImport(StringRef Sym,
	                                                  uint16_t Ordinal,
	                                                  ImportType ImportType,
	                                                  ImportNameType NameType) {
	  size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs
	  size_t Size = sizeof(coff_import_header) + ImpSize;
	  char *Buf = Alloc.Allocate<char>(Size);
	  // Zero everything up front: this supplies both NUL terminators and every
	  // header field that is not assigned below.
	  memset(Buf, 0, Size);
	  char *P = Buf;

	  // Write short import library.
	  auto *Imp = reinterpret_cast<coff_import_header *>(P);
	  P += sizeof(*Imp);
	  Imp->Sig2 = 0xFFFF;
	  Imp->Machine = Machine;
	  Imp->SizeOfData = ImpSize;
	  if (Ordinal > 0)
	    Imp->OrdinalHint = Ordinal;
	  Imp->TypeInfo = (NameType << 2) | ImportType;

	  // Write symbol name and DLL name.
	  memcpy(P, Sym.data(), Sym.size());
	  P += Sym.size() + 1;
	  memcpy(P, ImportName.data(), ImportName.size());

	  return {MemoryBufferRef(StringRef(Buf, Size), ImportName)};
	}

	// Builds a weak external object: a COFF header, one .drectve section header
	// with no data, five symbol records and a string table holding the two
	// names. When Imp is set, both names carry the "__imp_" prefix.
	// This block is a diff hunk: lines marked '-' are the old revision and lines
	// marked '+' the new one.
	NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym,
	StringRef Weak, bool Imp) {
	std::vector<uint8_t> Buffer;
	const uint32_t NumberOfSections = 1;
	const uint32_t NumberOfSymbols = 5;

	// COFF Header
	coff_file_header Header{
	u16(0),
	u16(NumberOfSections),
	u32(0),
	u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section))),
	u32(NumberOfSymbols),
	u16(0),
	u16(0),
	};
	append(Buffer, Header);

	// Section Header Table
	const coff_section SectionTable[NumberOfSections] = {
	{{'.', 'd', 'r', 'e', 'c', 't', 'v', 'e'},
	u32(0),
	u32(0),
	u32(0),
	u32(0),
	u32(0),
	u32(0),
	u16(0),
	u16(0),
	u32(IMAGE_SCN_LNK_INFO \| IMAGE_SCN_LNK_REMOVE)}};
	append(Buffer, SectionTable);

	// Symbol Table
	// Entries 2 and 3 are named through the string table. Entry 3 is the weak
	// external and announces one auxiliary record, which is entry 4.
	coff_symbol16 SymbolTable[NumberOfSymbols] = {
	{{{'@', 'c', 'o', 'm', 'p', '.', 'i', 'd'}},
	u32(0),
	u16(0xFFFF),
	u16(0),
	IMAGE_SYM_CLASS_STATIC,
	0},
	{{{'@', 'f', 'e', 'a', 't', '.', '0', '0'}},
	u32(0),
	u16(0xFFFF),
	u16(0),
	IMAGE_SYM_CLASS_STATIC,
	0},
	{{{0, 0, 0, 0, 0, 0, 0, 0}},
	u32(0),
	u16(0),
	u16(0),
	IMAGE_SYM_CLASS_EXTERNAL,
	0},
	{{{0, 0, 0, 0, 0, 0, 0, 0}},
	u32(0),
	u16(0),
	u16(0),
	IMAGE_SYM_CLASS_WEAK_EXTERNAL,
	1},
	// NOTE(review): the raw bytes 2,0,0,0 / 3,0,0,0 presumably encode the
	// auxiliary record's symbol index (2) and characteristics (3) -- confirm
	// against PE/COFF Aux Format 3.
	{{{2, 0, 0, 0, 3, 0, 0, 0}}, u32(0), u16(0), u16(0), uint8_t(0), 0},
	};
	// The first string starts right after the 4-byte length field.
	SymbolTable[2].Name.Offset.Offset = sizeof(uint32_t);

	//__imp_ String Table
	- if (Imp) {
	- SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 7;
	- writeStringTable(Buffer, {std::string("__imp_").append(Sym),
	- std::string("__imp_").append(Weak)});
	- } else {
	- SymbolTable[3].Name.Offset.Offset = sizeof(uint32_t) + Sym.size() + 1;
	- writeStringTable(Buffer, {Sym, Weak});
	- }
	+ StringRef Prefix = Imp ? "__imp_" : "";
	+ SymbolTable[3].Name.Offset.Offset =
	+ sizeof(uint32_t) + Sym.size() + Prefix.size() + 1;
	append(Buffer, SymbolTable);
	+ writeStringTable(Buffer, {(Prefix + Sym).str(),
	+ (Prefix + Weak).str()});

	// Copied here so we can still use writeStringTable
	char *Buf = Alloc.Allocate<char>(Buffer.size());
	memcpy(Buf, Buffer.data(), Buffer.size());
	return {MemoryBufferRef(StringRef(Buf, Buffer.size()), ImportName)};
	}

	/// Writes an import library for \p ImportName to \p Path.
	///
	/// The archive always starts with the import descriptor, the NULL import
	/// descriptor and the NULL thunk. Each non-private export then contributes
	/// either two weak external members (plain and "__imp_" prefixed) or one
	/// short import member.
	///
	/// \returns the error produced while renaming a symbol, if any; otherwise
	/// the error code reported by writeArchive.
	std::error_code writeImportLibrary(StringRef ImportName, StringRef Path,
	                                   ArrayRef<COFFShortExport> Exports,
	                                   MachineTypes Machine) {

	  std::vector<NewArchiveMember> Members;
	  ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine);

	  // These buffers back the first three members and therefore have to stay
	  // alive until the archive has been written.
	  std::vector<uint8_t> ImportDescriptor;
	  Members.push_back(OF.createImportDescriptor(ImportDescriptor));

	  std::vector<uint8_t> NullImportDescriptor;
	  Members.push_back(OF.createNullImportDescriptor(NullImportDescriptor));

	  std::vector<uint8_t> NullThunk;
	  Members.push_back(OF.createNullThunk(NullThunk));

	  for (COFFShortExport E : Exports) {
	    if (E.Private)
	      continue;

	    if (E.isWeak()) {
	      Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, false));
	      Members.push_back(OF.createWeakExternal(E.Name, E.ExtName, true));
	      continue;
	    }

	    // Constant takes precedence over Data when both flags are set.
	    ImportType ImportType = IMPORT_CODE;
	    if (E.Data)
	      ImportType = IMPORT_DATA;
	    if (E.Constant)
	      ImportType = IMPORT_CONST;

	    // Weak exports were handled above and skipped with `continue`.
	    StringRef SymbolName = E.isWeak() ? E.ExtName : E.Name;
	    ImportNameType NameType = getNameType(SymbolName, E.Name, Machine);
	    Expected<std::string> Name = E.ExtName.empty()
	                                     ? SymbolName
	                                     : replace(SymbolName, E.Name, E.ExtName);

	    if (!Name) {
	      return errorToErrorCode(Name.takeError());
	    }

	    Members.push_back(
	        OF.createShortImport(*Name, E.Ordinal, ImportType, NameType));
	  }

	  std::pair<StringRef, std::error_code> Result =
	      writeArchive(Path, Members, /*WriteSymtab*/ true, object::Archive::K_GNU,
	                   /*Deterministic*/ true, /*Thin*/ false);

	  return Result.second;
	}

	} // namespace object
	} // namespace llvm
	Index: head/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp (revision 322320)
	@@ -1,980 +1,992 @@
	//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains a pass that expands pseudo instructions into target
	// instructions to allow proper scheduling and other late optimizations. This
	// pass should be run after register allocation but before the post-regalloc
	// scheduling pass.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64InstrInfo.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/CodeGen/LivePhysRegs.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/Support/MathExtras.h"
	using namespace llvm;

	#define AARCH64_EXPAND_PSEUDO_NAME "AArch64 pseudo instruction expansion pass"

	namespace {
	// Machine function pass that expands AArch64 pseudo instructions into target
	// instructions (see the file header for where it is meant to run).
	class AArch64ExpandPseudo : public MachineFunctionPass {
	public:
	// Pass identification; defined right after the class.
	static char ID;
	// Registers the pass with the global pass registry on construction.
	AArch64ExpandPseudo() : MachineFunctionPass(ID) {
	initializeAArch64ExpandPseudoPass(*PassRegistry::getPassRegistry());
	}

	// Instruction info handed to the expansion helpers. Not initialized by the
	// constructor -- presumably set in runOnMachineFunction; confirm there.
	const AArch64InstrInfo *TII;

	bool runOnMachineFunction(MachineFunction &Fn) override;

	StringRef getPassName() const override { return AARCH64_EXPAND_PSEUDO_NAME; }

	private:
	// Expansion helpers, declared here and defined later in the file. The names
	// indicate the granularity: a whole block, a single instruction, an
	// immediate move, and the compare-and-swap pseudos.
	bool expandMBB(MachineBasicBlock &MBB);
	bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	MachineBasicBlock::iterator &NextMBBI);
	bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	unsigned BitSize);

	bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
	unsigned ExtendImm, unsigned ZeroReg,
	MachineBasicBlock::iterator &NextMBBI);
	bool expandCMP_SWAP_128(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	MachineBasicBlock::iterator &NextMBBI);
	};
	char AArch64ExpandPseudo::ID = 0;
	}

	// Registers the pass under the command-line name "aarch64-expand-pseudo".
	INITIALIZE_PASS(AArch64ExpandPseudo, "aarch64-expand-pseudo",
	AARCH64_EXPAND_PSEUDO_NAME, false, false)

	/// \brief Hand the operands that \p OldMI carries beyond those declared in
	/// its instruction description over to the expanded instructions: uses are
	/// added to \p UseMI, everything else to \p DefMI.
	static void transferImpOps(MachineInstr &OldMI, MachineInstrBuilder &UseMI,
	                           MachineInstrBuilder &DefMI) {
	  const unsigned FirstExtra = OldMI.getDesc().getNumOperands();
	  const unsigned NumOperands = OldMI.getNumOperands();

	  for (unsigned OpIdx = FirstExtra; OpIdx != NumOperands; ++OpIdx) {
	    const MachineOperand &MO = OldMI.getOperand(OpIdx);
	    assert(MO.isReg() && MO.getReg());
	    MachineInstrBuilder &Target = MO.isUse() ? UseMI : DefMI;
	    Target.add(MO);
	  }
	}

	/// \brief Returns the 16-bit chunk number \p ChunkIdx of \p Imm, where chunk
	/// 0 holds the least significant bits. \p ChunkIdx must be below 4.
	static uint64_t getChunk(uint64_t Imm, unsigned ChunkIdx) {
	  assert(ChunkIdx < 4 && "Out of range chunk index specified!");

	  const unsigned Shift = ChunkIdx * 16;
	  const uint64_t Shifted = Imm >> Shift;
	  return Shifted & 0xFFFF;
	}

	/// \brief Helper function which replicates a 16-bit chunk within a 64-bit
	/// value. Indices correspond to element numbers in a v4i16.
	///
	/// \returns \p Imm with chunk \p ToIdx overwritten by chunk \p FromIdx; all
	/// other chunks are unchanged. Both indices must be below 4.
	static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
	  assert((FromIdx < 4) && (ToIdx < 4) && "Out of range chunk index specified!");
	  const unsigned ShiftAmt = ToIdx * 16;
	  // Unsigned on purpose: shifting a signed 0xFFFF by 48 would reach the sign
	  // bit.
	  const uint64_t Mask = 0xFFFF;

	  // Replicate the source chunk to the destination position.
	  const uint64_t Chunk = getChunk(Imm, FromIdx) << ShiftAmt;
	  // Clear the destination chunk.
	  Imm &= ~(Mask << ShiftAmt);
	  // Insert the replicated chunk.
	  return Imm | Chunk;
	}

	/// \brief Helper function which tries to materialize a 64-bit value with an
	/// ORR + MOVK instruction sequence.
	///
	/// If \p OrrImm can be encoded as a 64-bit logical immediate, an ORRXri with
	/// that immediate followed by a MOVKXi inserting chunk \p ChunkIdx of
	/// \p UImm is emitted before \p MBBI, and \p MI is erased.
	///
	/// \returns true if the sequence was emitted, false (with nothing changed)
	/// otherwise.
	static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
	                       MachineBasicBlock &MBB,
	                       MachineBasicBlock::iterator &MBBI,
	                       const AArch64InstrInfo *TII, unsigned ChunkIdx) {
	  assert(ChunkIdx < 4 && "Out of range chunk index specified!");
	  const unsigned ShiftAmt = ChunkIdx * 16;

	  uint64_t Encoding;
	  if (!AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding))
	    return false;

	  // Create the ORR-immediate instruction.
	  MachineInstrBuilder MIB =
	      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
	          .add(MI.getOperand(0))
	          .addReg(AArch64::XZR)
	          .addImm(Encoding);

	  // Create the MOVK instruction.
	  const unsigned Imm16 = getChunk(UImm, ChunkIdx);
	  const unsigned DstReg = MI.getOperand(0).getReg();
	  const bool DstIsDead = MI.getOperand(0).isDead();
	  MachineInstrBuilder MIB1 =
	      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
	          .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
	          .addReg(DstReg)
	          .addImm(Imm16)
	          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));

	  transferImpOps(MI, MIB, MIB1);
	  MI.eraseFromParent();
	  return true;
	}

	/// \brief Check whether the given 16-bit chunk replicated to full 64-bit width
	/// can be materialized with an ORR instruction.
	///
	/// \p Encoding is passed through to processLogicalImmediate, which fills in
	/// the encoded immediate.
	static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
	  Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;

	  return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
	}

	/// \brief Check for identical 16-bit chunks within the constant and if so
	/// materialize them with a single ORR instruction. The remaining one or two
	/// 16-bit chunks will be materialized with MOVK instructions.
	///
	/// This allows us to materialize constants like |A|B|A|A| or |A|B|C|A| (order
	/// of the chunks doesn't matter), assuming |A|A|A|A| can be materialized with
	/// an ORR instruction.
	///
	/// \returns true if the replacement sequence was emitted before \p MBBI and
	/// \p MI was erased, false (with nothing changed) otherwise.
	static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
	                                 MachineBasicBlock &MBB,
	                                 MachineBasicBlock::iterator &MBBI,
	                                 const AArch64InstrInfo *TII) {
	  typedef DenseMap<uint64_t, unsigned> CountMap;
	  CountMap Counts;

	  // Scan the constant and count how often every chunk occurs.
	  for (unsigned Idx = 0; Idx < 4; ++Idx)
	    ++Counts[getChunk(UImm, Idx)];

	  // Traverse the chunks to find one which occurs more than once.
	  for (CountMap::const_iterator Chunk = Counts.begin(), End = Counts.end();
	       Chunk != End; ++Chunk) {
	    const uint64_t ChunkVal = Chunk->first;
	    const unsigned Count = Chunk->second;

	    uint64_t Encoding = 0;

	    // We are looking for chunks which have two or three instances and can be
	    // materialized with an ORR instruction.
	    if ((Count != 2 && Count != 3) || !canUseOrr(ChunkVal, Encoding))
	      continue;

	    const bool CountThree = Count == 3;
	    // Create the ORR-immediate instruction.
	    MachineInstrBuilder MIB =
	        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
	            .add(MI.getOperand(0))
	            .addReg(AArch64::XZR)
	            .addImm(Encoding);

	    const unsigned DstReg = MI.getOperand(0).getReg();
	    const bool DstIsDead = MI.getOperand(0).isDead();

	    unsigned ShiftAmt = 0;
	    uint64_t Imm16 = 0;
	    // Find the first chunk not materialized with the ORR instruction.
	    for (; ShiftAmt < 64; ShiftAmt += 16) {
	      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;

	      if (Imm16 != ChunkVal)
	        break;
	    }

	    // Create the first MOVK instruction. It may only carry the dead flag
	    // when it is also the last instruction of the sequence.
	    MachineInstrBuilder MIB1 =
	        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
	            .addReg(DstReg,
	                    RegState::Define | getDeadRegState(DstIsDead && CountThree))
	            .addReg(DstReg)
	            .addImm(Imm16)
	            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));

	    // In case we have three instances the whole constant is now materialized
	    // and we can exit.
	    if (CountThree) {
	      transferImpOps(MI, MIB, MIB1);
	      MI.eraseFromParent();
	      return true;
	    }

	    // Find the remaining chunk which needs to be materialized.
	    for (ShiftAmt += 16; ShiftAmt < 64; ShiftAmt += 16) {
	      Imm16 = (UImm >> ShiftAmt) & 0xFFFF;

	      if (Imm16 != ChunkVal)
	        break;
	    }

	    // Create the second MOVK instruction.
	    MachineInstrBuilder MIB2 =
	        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
	            .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
	            .addReg(DstReg)
	            .addImm(Imm16)
	            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));

	    transferImpOps(MI, MIB, MIB2);
	    MI.eraseFromParent();
	    return true;
	  }

	  return false;
	}

	/// \brief Check whether this chunk matches the pattern '1...0...'. This pattern
	/// starts a contiguous sequence of ones if we look at the bits from the LSB
	/// towards the MSB.
	///
	/// All-zero and all-one values are rejected; otherwise the result is whether
	/// the complement of \p Chunk satisfies isMask_64.
	static bool isStartChunk(uint64_t Chunk) {
	  if (Chunk == 0 || Chunk == UINT64_MAX)
	    return false;

	  return isMask_64(~Chunk);
	}

	/// \brief Check whether this chunk matches the pattern '0...1...' This pattern
	/// ends a contiguous sequence of ones if we look at the bits from the LSB
	/// towards the MSB.
	///
	/// All-zero and all-one values are rejected; otherwise the result is whether
	/// \p Chunk satisfies isMask_64.
	static bool isEndChunk(uint64_t Chunk) {
	  if (Chunk == 0 || Chunk == UINT64_MAX)
	    return false;

	  return isMask_64(Chunk);
	}

	/// \brief Clear or set all bits in the chunk at the given index.
	///
	/// \param Imm   the 64-bit value to update.
	/// \param Idx   index of the 16-bit chunk; chunk 0 holds the least
	///              significant bits.
	/// \param Clear when true the chunk is zeroed, otherwise all of its bits are
	///              set.
	/// \returns the updated value; every other chunk is left untouched.
	static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
	  const uint64_t Mask = 0xFFFF;

	  if (Clear)
	    // Clear chunk in the immediate.
	    Imm &= ~(Mask << (Idx * 16));
	  else
	    // Set all bits in the immediate for the particular chunk.
	    Imm |= Mask << (Idx * 16);

	  return Imm;
	}

	/// \brief Check whether the constant contains a sequence of contiguous ones,
	/// which might be interrupted by one or two chunks. If so, materialize the
	/// sequence of contiguous ones with an ORR instruction.
	/// Materialize the chunks which are either interrupting the sequence or outside
	/// of the sequence with a MOVK instruction.
	///
	/// Assuming S is a chunk which starts the sequence (1...0...), E is a chunk
	/// which ends the sequence (0...1...). Then we are looking for constants which
	/// contain at least one S and E chunk.
	/// E.g. \|E\|A\|B\|S\|, \|A\|E\|B\|S\| or \|A\|B\|E\|S\|.
	///
	/// We are also looking for constants like \|S\|A\|B\|E\| where the contiguous
	/// sequence of ones wraps around the MSB into the LSB.
	///
	/// \param UImm the 64-bit constant to materialize.
	/// \param MI the pseudo instruction being expanded; operand 0 supplies the
	/// destination register and MI is erased on success.
	/// \param MBB the block the new instructions are inserted into.
	/// \param MBBI insertion point for the new instructions.
	/// \param TII used to look up the ORRXri / MOVKXi descriptions.
	/// \returns false, without emitting anything, when no start or end chunk is
	/// found; otherwise true after emitting one ORR plus one or two MOVKs.
	static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
	MachineBasicBlock &MBB,
	MachineBasicBlock::iterator &MBBI,
	const AArch64InstrInfo *TII) {
	const int NotSet = -1;
	const uint64_t Mask = 0xFFFF;

	int StartIdx = NotSet;
	int EndIdx = NotSet;
	// Try to find the chunks which start/end a contiguous sequence of ones.
	// There is no early exit, so if several chunks qualify the highest index
	// wins for each of StartIdx and EndIdx.
	for (int Idx = 0; Idx < 4; ++Idx) {
	int64_t Chunk = getChunk(UImm, Idx);
	// Sign extend the 16-bit chunk to 64-bit.
	Chunk = (Chunk << 48) >> 48;

	if (isStartChunk(Chunk))
	StartIdx = Idx;
	else if (isEndChunk(Chunk))
	EndIdx = Idx;
	}

	// Early exit in case we can't find a start/end chunk.
	if (StartIdx == NotSet \|\| EndIdx == NotSet)
	return false;

	// Outside of the contiguous sequence of ones everything needs to be zero.
	uint64_t Outside = 0;
	// Chunks between the start and end chunk need to have all their bits set.
	uint64_t Inside = Mask;

	// If our contiguous sequence of ones wraps around from the MSB into the LSB,
	// just swap indices and pretend we are materializing a contiguous sequence
	// of zeros surrounded by a contiguous sequence of ones.
	if (StartIdx > EndIdx) {
	std::swap(StartIdx, EndIdx);
	std::swap(Outside, Inside);
	}

	// OrrImm starts as the full constant and has every non-conforming chunk
	// overwritten below so that it forms one contiguous run.
	uint64_t OrrImm = UImm;
	int FirstMovkIdx = NotSet;
	int SecondMovkIdx = NotSet;

	// Find out which chunks we need to patch up to obtain a contiguous sequence
	// of ones.
	// The chunks at StartIdx and EndIdx match neither condition below, so at
	// most the two remaining chunks can be recorded for patching.
	for (int Idx = 0; Idx < 4; ++Idx) {
	const uint64_t Chunk = getChunk(UImm, Idx);

	// Check whether we are looking at a chunk which is not part of the
	// contiguous sequence of ones.
	if ((Idx < StartIdx \|\| EndIdx < Idx) && Chunk != Outside) {
	// Clear the chunk when the outside is expected to be zero, set it otherwise.
	OrrImm = updateImm(OrrImm, Idx, Outside == 0);

	// Remember the index we need to patch.
	if (FirstMovkIdx == NotSet)
	FirstMovkIdx = Idx;
	else
	SecondMovkIdx = Idx;

	// Check whether we are looking a chunk which is part of the contiguous
	// sequence of ones.
	} else if (Idx > StartIdx && Idx < EndIdx && Chunk != Inside) {
	// Set the chunk when the inside is expected to be all ones, clear it
	// otherwise (wrapped case).
	OrrImm = updateImm(OrrImm, Idx, Inside != Mask);

	// Remember the index we need to patch.
	if (FirstMovkIdx == NotSet)
	FirstMovkIdx = Idx;
	else
	SecondMovkIdx = Idx;
	}
	}
	assert(FirstMovkIdx != NotSet && "Constant materializable with single ORR!");

	// Create the ORR-immediate instruction.
	// NOTE(review): the result of processLogicalImmediate() is not checked here;
	// presumably OrrImm is always encodable at this point -- confirm.
	uint64_t Encoding = 0;
	AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
	MachineInstrBuilder MIB =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
	.add(MI.getOperand(0))
	.addReg(AArch64::XZR)
	.addImm(Encoding);

	const unsigned DstReg = MI.getOperand(0).getReg();
	const bool DstIsDead = MI.getOperand(0).isDead();

	const bool SingleMovk = SecondMovkIdx == NotSet;
	// Create the first MOVK instruction.
	// The destination is only flagged dead here when this is also the last
	// instruction of the sequence.
	MachineInstrBuilder MIB1 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
	.addReg(DstReg,
	RegState::Define \| getDeadRegState(DstIsDead && SingleMovk))
	.addReg(DstReg)
	.addImm(getChunk(UImm, FirstMovkIdx))
	.addImm(
	AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));

	// Early exit in case we only need to emit a single MOVK instruction.
	if (SingleMovk) {
	transferImpOps(MI, MIB, MIB1);
	MI.eraseFromParent();
	return true;
	}

	// Create the second MOVK instruction.
	MachineInstrBuilder MIB2 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
	.addReg(DstReg, RegState::Define \| getDeadRegState(DstIsDead))
	.addReg(DstReg)
	.addImm(getChunk(UImm, SecondMovkIdx))
	.addImm(
	AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));

	// Hand the first (ORR) and last (second MOVK) new instructions to
	// transferImpOps() before the pseudo is removed.
	transferImpOps(MI, MIB, MIB2);
	MI.eraseFromParent();
	return true;
	}

	/// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
	/// real move-immediate instructions to synthesize the immediate.
	///
	/// The strategies are tried in this order: a single ORR-immediate, ORR + MOVK
	/// via chunk replication (64-bit only), ORR + MOVK(s) via identical chunks
	/// (64-bit only), ORR + MOVK(s) via a sequence of ones (64-bit only), and
	/// finally MOVZ/MOVN followed by MOVKs.
	///
	/// \param MBB the block containing the pseudo.
	/// \param MBBI iterator to the pseudo; operand 0 is the destination register
	/// and operand 1 the immediate.
	/// \param BitSize 32 or 64, as passed by expandMI().
	/// \returns true; every path through this function erases the pseudo.
	bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	unsigned BitSize) {
	MachineInstr &MI = *MBBI;
	unsigned DstReg = MI.getOperand(0).getReg();
	uint64_t Imm = MI.getOperand(1).getImm();
	const unsigned Mask = 0xFFFF;

	if (DstReg == AArch64::XZR \|\| DstReg == AArch64::WZR) {
	// Useless def, and we don't want to risk creating an invalid ORR (which
	// would really write to sp).
	MI.eraseFromParent();
	return true;
	}

	// Try a MOVI instruction (aka ORR-immediate with the zero register).
	// UImm is Imm with every bit above BitSize cleared.
	uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
	uint64_t Encoding;
	if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
	unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
	MachineInstrBuilder MIB =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
	.add(MI.getOperand(0))
	.addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
	.addImm(Encoding);
	transferImpOps(MI, MIB, MIB);
	MI.eraseFromParent();
	return true;
	}

	// Scan the immediate and count the number of 16-bit chunks which are either
	// all ones or all zeros.
	// Only the chunks below BitSize are visited (two for 32-bit, four for
	// 64-bit).
	unsigned OneChunks = 0;
	unsigned ZeroChunks = 0;
	for (unsigned Shift = 0; Shift < BitSize; Shift += 16) {
	const unsigned Chunk = (Imm >> Shift) & Mask;
	if (Chunk == Mask)
	OneChunks++;
	else if (Chunk == 0)
	ZeroChunks++;
	}

	// Since we can't materialize the constant with a single ORR instruction,
	// let's see whether we can materialize 3/4 of the constant with an ORR
	// instruction and use an additional MOVK instruction to materialize the
	// remaining 1/4.
	//
	// We are looking for constants with a pattern like: \|A\|X\|B\|X\| or \|X\|A\|X\|B\|.
	//
	// E.g. assuming \|A\|X\|A\|X\| is a pattern which can be materialized with ORR,
	// we would create the following instruction sequence:
	//
	// ORR x0, xzr, \|A\|X\|A\|X\|
	// MOVK x0, \|B\|, LSL #16
	//
	// Only look at 64-bit constants which can't be materialized with a single
	// instruction e.g. which have less than either three all zero or all one
	// chunks.
	//
	// Ignore 32-bit constants here, they always can be materialized with a
	// MOVZ/MOVN + MOVK pair. Since the 32-bit constant can't be materialized
	// with a single ORR, the best sequence we can achieve is a ORR + MOVK pair.
	// Thus we fall back to the default code below which in the best case creates
	// a single MOVZ/MOVN instruction (in case one chunk is all zero or all one).
	//
	if (BitSize == 64 && OneChunks < 3 && ZeroChunks < 3) {
	// If we interpret the 64-bit constant as a v4i16, are elements 0 and 2
	// identical?
	if (getChunk(UImm, 0) == getChunk(UImm, 2)) {
	// See if we can come up with a constant which can be materialized with
	// ORR-immediate by replicating element 3 into element 1.
	uint64_t OrrImm = replicateChunk(UImm, 3, 1);
	if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 1))
	return true;

	// See if we can come up with a constant which can be materialized with
	// ORR-immediate by replicating element 1 into element 3.
	OrrImm = replicateChunk(UImm, 1, 3);
	if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 3))
	return true;

	// If we interpret the 64-bit constant as a v4i16, are elements 1 and 3
	// identical?
	} else if (getChunk(UImm, 1) == getChunk(UImm, 3)) {
	// See if we can come up with a constant which can be materialized with
	// ORR-immediate by replicating element 2 into element 0.
	uint64_t OrrImm = replicateChunk(UImm, 2, 0);
	if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 0))
	return true;

	// See if we can come up with a constant which can be materialized with
	// ORR-immediate by replicating element 0 into element 2.
	OrrImm = replicateChunk(UImm, 0, 2);
	if (tryOrrMovk(UImm, OrrImm, MI, MBB, MBBI, TII, 2))
	return true;
	}
	}

	// Check for identical 16-bit chunks within the constant and if so materialize
	// them with a single ORR instruction. The remaining one or two 16-bit chunks
	// will be materialized with MOVK instructions.
	if (BitSize == 64 && tryToreplicateChunks(UImm, MI, MBB, MBBI, TII))
	return true;

	// Check whether the constant contains a sequence of contiguous ones, which
	// might be interrupted by one or two chunks. If so, materialize the sequence
	// of contiguous ones with an ORR instruction. Materialize the chunks which
	// are either interrupting the sequence or outside of the sequence with a
	// MOVK instruction.
	if (BitSize == 64 && trySequenceOfOnes(UImm, MI, MBB, MBBI, TII))
	return true;

	// Use a MOVZ or MOVN instruction to set the high bits, followed by one or
	// more MOVK instructions to insert additional 16-bit portions into the
	// lower bits.
	bool isNeg = false;

	// Use MOVN to materialize the high bits if we have more all one chunks
	// than all zero chunks.
	// Imm is inverted here so that the chunk scan below can treat the MOVN case
	// like the MOVZ case; it is flipped back before the MOVKs are built.
	if (OneChunks > ZeroChunks) {
	isNeg = true;
	Imm = ~Imm;
	}

	unsigned FirstOpc;
	if (BitSize == 32) {
	// Drop the upper 32 bits so they do not influence the leading-zero count.
	Imm &= (1LL << 32) - 1;
	FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
	} else {
	FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
	}
	unsigned Shift = 0; // LSL amount for high bits with MOVZ/MOVN
	unsigned LastShift = 0; // LSL amount for last MOVK
	if (Imm != 0) {
	// Shift and LastShift are the bit offsets of the lowest and highest 16-bit
	// chunks that contain a set bit of (the possibly inverted) Imm.
	unsigned LZ = countLeadingZeros(Imm);
	unsigned TZ = countTrailingZeros(Imm);
	Shift = (TZ / 16) * 16;
	LastShift = ((63 - LZ) / 16) * 16;
	}
	unsigned Imm16 = (Imm >> Shift) & Mask;
	bool DstIsDead = MI.getOperand(0).isDead();
	// The dead flag is only placed on the final instruction of the sequence.
	MachineInstrBuilder MIB1 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(FirstOpc))
	.addReg(DstReg, RegState::Define \|
	getDeadRegState(DstIsDead && Shift == LastShift))
	.addImm(Imm16)
	.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));

	// If a MOVN was used for the high bits of a negative value, flip the rest
	// of the bits back for use with MOVK.
	if (isNeg)
	Imm = ~Imm;

	// A single MOVZ/MOVN was sufficient.
	if (Shift == LastShift) {
	transferImpOps(MI, MIB1, MIB1);
	MI.eraseFromParent();
	return true;
	}

	// Walk the remaining chunks upwards and emit a MOVK for each chunk that the
	// MOVZ/MOVN did not already leave with the right value.
	MachineInstrBuilder MIB2;
	unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
	while (Shift < LastShift) {
	Shift += 16;
	Imm16 = (Imm >> Shift) & Mask;
	if (Imm16 == (isNeg ? Mask : 0))
	continue; // This 16-bit portion is already set correctly.
	MIB2 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
	.addReg(DstReg,
	RegState::Define \|
	getDeadRegState(DstIsDead && Shift == LastShift))
	.addReg(DstReg)
	.addImm(Imm16)
	.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
	}

	// MIB2 now refers to the last MOVK that was built.
	transferImpOps(MI, MIB1, MIB2);
	MI.eraseFromParent();
	return true;
	}

	/// \brief Expand a CMP_SWAP_{8,16,32,64} pseudo into a load-exclusive /
	/// compare / store-exclusive loop spread over three new basic blocks.
	///
	/// Operands read from the pseudo: 0 = destination, 1 = status register,
	/// 2 = address, 3 = desired value, 4 = new value.
	///
	/// \param MBB the block containing the pseudo; everything from the pseudo to
	/// the end of the block is moved into the new "done" block.
	/// \param MBBI iterator to the pseudo.
	/// \param LdarOp opcode used for the exclusive load.
	/// \param StlrOp opcode used for the exclusive store.
	/// \param CmpOp opcode used for the comparison.
	/// \param ExtendImm trailing immediate operand of the comparison.
	/// \param ZeroReg register used as the (discarded) comparison result.
	/// \param NextMBBI set to MBB.end() so the caller stops iterating this block.
	/// \returns true (the pseudo is always expanded).
	bool AArch64ExpandPseudo::expandCMP_SWAP(
	MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp,
	unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg,
	MachineBasicBlock::iterator &NextMBBI) {
	MachineInstr &MI = *MBBI;
	DebugLoc DL = MI.getDebugLoc();
	const MachineOperand &Dest = MI.getOperand(0);
	unsigned StatusReg = MI.getOperand(1).getReg();
	bool StatusDead = MI.getOperand(1).isDead();
	// Duplicating undef operands into 2 instructions does not guarantee the same
	// value on both; However undef should be replaced by xzr anyway.
	assert(!MI.getOperand(2).isUndef() && "cannot handle undef");
	unsigned AddrReg = MI.getOperand(2).getReg();
	unsigned DesiredReg = MI.getOperand(3).getReg();
	unsigned NewReg = MI.getOperand(4).getReg();

	// Create the three new blocks and place them directly after MBB, in the
	// order LoadCmpBB, StoreBB, DoneBB.
	MachineFunction *MF = MBB.getParent();
	auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
	auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
	auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());

	MF->insert(++MBB.getIterator(), LoadCmpBB);
	MF->insert(++LoadCmpBB->getIterator(), StoreBB);
	MF->insert(++StoreBB->getIterator(), DoneBB);

	// .Lloadcmp:
	// mov wStatus, 0
	// ldaxr xDest, [xAddr]
	// cmp xDest, xDesired
	// b.ne .Ldone
	// The status register is only zeroed when its value is actually used.
	if (!StatusDead)
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg)
	.addImm(0).addImm(0);
	BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg())
	.addReg(AddrReg);
	BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg)
	.addReg(Dest.getReg(), getKillRegState(Dest.isDead()))
	.addReg(DesiredReg)
	.addImm(ExtendImm);
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc))
	.addImm(AArch64CC::NE)
	.addMBB(DoneBB)
	.addReg(AArch64::NZCV, RegState::Implicit \| RegState::Kill);
	LoadCmpBB->addSuccessor(DoneBB);
	LoadCmpBB->addSuccessor(StoreBB);

	// .Lstore:
	// stlxr wStatus, xNew, [xAddr]
	// cbnz wStatus, .Lloadcmp
	BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg)
	.addReg(NewReg)
	.addReg(AddrReg);
	BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
	.addReg(StatusReg, getKillRegState(StatusDead))
	.addMBB(LoadCmpBB);
	StoreBB->addSuccessor(LoadCmpBB);
	StoreBB->addSuccessor(DoneBB);

	// Move the pseudo and everything after it into DoneBB, and let DoneBB take
	// over MBB's successors; MBB then only leads into the loop.
	DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
	DoneBB->transferSuccessors(&MBB);

	MBB.addSuccessor(LoadCmpBB);

	// MBB has no instructions left to visit; the pseudo (moved by the splice
	// above) is now removed.
	NextMBBI = MBB.end();
	MI.eraseFromParent();

	// Recompute livein lists.
	const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	LivePhysRegs LiveRegs;
	computeLiveIns(LiveRegs, MRI, *DoneBB);
	computeLiveIns(LiveRegs, MRI, *StoreBB);
	computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
	// Do an extra pass around the loop to get loop carried registers right.
	StoreBB->clearLiveIns();
	computeLiveIns(LiveRegs, MRI, *StoreBB);
	LoadCmpBB->clearLiveIns();
	computeLiveIns(LiveRegs, MRI, *LoadCmpBB);

	return true;
	}

	/// \brief Expand a CMP_SWAP_128 pseudo into a load-exclusive-pair / compare /
	/// store-exclusive-pair loop spread over three new basic blocks.
	///
	/// Operands read from the pseudo: 0 = destination low, 1 = destination high,
	/// 2 = status register, 3 = address, 4/5 = desired low/high,
	/// 6/7 = new low/high.
	///
	/// \param MBB the block containing the pseudo; everything from the pseudo to
	/// the end of the block is moved into the new "done" block.
	/// \param MBBI iterator to the pseudo.
	/// \param NextMBBI set to MBB.end() so the caller stops iterating this block.
	/// \returns true (the pseudo is always expanded).
	bool AArch64ExpandPseudo::expandCMP_SWAP_128(
	MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	MachineBasicBlock::iterator &NextMBBI) {

	MachineInstr &MI = *MBBI;
	DebugLoc DL = MI.getDebugLoc();
	MachineOperand &DestLo = MI.getOperand(0);
	MachineOperand &DestHi = MI.getOperand(1);
	unsigned StatusReg = MI.getOperand(2).getReg();
	bool StatusDead = MI.getOperand(2).isDead();
	// Duplicating undef operands into 2 instructions does not guarantee the same
	// value on both; However undef should be replaced by xzr anyway.
	assert(!MI.getOperand(3).isUndef() && "cannot handle undef");
	unsigned AddrReg = MI.getOperand(3).getReg();
	unsigned DesiredLoReg = MI.getOperand(4).getReg();
	unsigned DesiredHiReg = MI.getOperand(5).getReg();
	unsigned NewLoReg = MI.getOperand(6).getReg();
	unsigned NewHiReg = MI.getOperand(7).getReg();

	// Create the three new blocks and place them directly after MBB, in the
	// order LoadCmpBB, StoreBB, DoneBB.
	MachineFunction *MF = MBB.getParent();
	auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
	auto StoreBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
	auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());

	MF->insert(++MBB.getIterator(), LoadCmpBB);
	MF->insert(++LoadCmpBB->getIterator(), StoreBB);
	MF->insert(++StoreBB->getIterator(), DoneBB);

	// .Lloadcmp:
	// ldaxp xDestLo, xDestHi, [xAddr]
	// cmp xDestLo, xDesiredLo
	// csinc wStatus, wzr, wzr, eq
	// cmp xDestHi, xDesiredHi
	// csinc wStatus, wStatus, wStatus, eq
	// cbnz wStatus, .Ldone
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX))
	.addReg(DestLo.getReg(), RegState::Define)
	.addReg(DestHi.getReg(), RegState::Define)
	.addReg(AddrReg);
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
	.addReg(DestLo.getReg(), getKillRegState(DestLo.isDead()))
	.addReg(DesiredLoReg)
	.addImm(0);
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
	.addUse(AArch64::WZR)
	.addUse(AArch64::WZR)
	.addImm(AArch64CC::EQ);
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR)
	.addReg(DestHi.getReg(), getKillRegState(DestHi.isDead()))
	.addReg(DesiredHiReg)
	.addImm(0);
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg)
	.addUse(StatusReg, RegState::Kill)
	.addUse(StatusReg, RegState::Kill)
	.addImm(AArch64CC::EQ);
	BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW))
	.addUse(StatusReg, getKillRegState(StatusDead))
	.addMBB(DoneBB);
	LoadCmpBB->addSuccessor(DoneBB);
	LoadCmpBB->addSuccessor(StoreBB);

	// .Lstore:
	// stlxp wStatus, xNewLo, xNewHi, [xAddr]
	// cbnz wStatus, .Lloadcmp
	BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg)
	.addReg(NewLoReg)
	.addReg(NewHiReg)
	.addReg(AddrReg);
	BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW))
	.addReg(StatusReg, getKillRegState(StatusDead))
	.addMBB(LoadCmpBB);
	StoreBB->addSuccessor(LoadCmpBB);
	StoreBB->addSuccessor(DoneBB);

	// Move the pseudo and everything after it into DoneBB, and let DoneBB take
	// over MBB's successors; MBB then only leads into the loop.
	DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end());
	DoneBB->transferSuccessors(&MBB);

	MBB.addSuccessor(LoadCmpBB);

	// MBB has no instructions left to visit; the pseudo (moved by the splice
	// above) is now removed.
	NextMBBI = MBB.end();
	MI.eraseFromParent();

	// Recompute liveness bottom up.
	const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
	LivePhysRegs LiveRegs;
	computeLiveIns(LiveRegs, MRI, *DoneBB);
	computeLiveIns(LiveRegs, MRI, *StoreBB);
	computeLiveIns(LiveRegs, MRI, *LoadCmpBB);
	// Do an extra pass in the loop to get the loop carried dependencies right.
	StoreBB->clearLiveIns();
	computeLiveIns(LiveRegs, MRI, *StoreBB);
	LoadCmpBB->clearLiveIns();
	computeLiveIns(LiveRegs, MRI, *LoadCmpBB);

	return true;
	}

	/// \brief If MBBI references a pseudo instruction that should be expanded here,
	/// do the expansion and return true. Otherwise return false.
	///
	/// \param MBB the block containing the instruction.
	/// \param MBBI iterator to the instruction to inspect.
	/// \param NextMBBI forwarded to the CMP_SWAP expansions, which overwrite it
	/// with MBB.end(); the other cases leave it untouched.
	bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
	MachineBasicBlock::iterator MBBI,
	MachineBasicBlock::iterator &NextMBBI) {
	MachineInstr &MI = *MBBI;
	unsigned Opcode = MI.getOpcode();
	switch (Opcode) {
	default:
	break;

	// Register-register ALU forms are rewritten to the corresponding
	// shifted-register form with a shift of LSL #0.
	case AArch64::ADDWrr:
	case AArch64::SUBWrr:
	case AArch64::ADDXrr:
	case AArch64::SUBXrr:
	case AArch64::ADDSWrr:
	case AArch64::SUBSWrr:
	case AArch64::ADDSXrr:
	case AArch64::SUBSXrr:
	case AArch64::ANDWrr:
	case AArch64::ANDXrr:
	case AArch64::BICWrr:
	case AArch64::BICXrr:
	case AArch64::ANDSWrr:
	case AArch64::ANDSXrr:
	case AArch64::BICSWrr:
	case AArch64::BICSXrr:
	case AArch64::EONWrr:
	case AArch64::EONXrr:
	case AArch64::EORWrr:
	case AArch64::EORXrr:
	case AArch64::ORNWrr:
	case AArch64::ORNXrr:
	case AArch64::ORRWrr:
	case AArch64::ORRXrr: {
	// Note: this local intentionally shadows the outer Opcode and holds the
	// replacement opcode.
	unsigned Opcode;
	switch (MI.getOpcode()) {
	default:
	// Not reachable from the case labels above, which mirror this list.
	return false;
	case AArch64::ADDWrr: Opcode = AArch64::ADDWrs; break;
	case AArch64::SUBWrr: Opcode = AArch64::SUBWrs; break;
	case AArch64::ADDXrr: Opcode = AArch64::ADDXrs; break;
	case AArch64::SUBXrr: Opcode = AArch64::SUBXrs; break;
	case AArch64::ADDSWrr: Opcode = AArch64::ADDSWrs; break;
	case AArch64::SUBSWrr: Opcode = AArch64::SUBSWrs; break;
	case AArch64::ADDSXrr: Opcode = AArch64::ADDSXrs; break;
	case AArch64::SUBSXrr: Opcode = AArch64::SUBSXrs; break;
	case AArch64::ANDWrr: Opcode = AArch64::ANDWrs; break;
	case AArch64::ANDXrr: Opcode = AArch64::ANDXrs; break;
	case AArch64::BICWrr: Opcode = AArch64::BICWrs; break;
	case AArch64::BICXrr: Opcode = AArch64::BICXrs; break;
	case AArch64::ANDSWrr: Opcode = AArch64::ANDSWrs; break;
	case AArch64::ANDSXrr: Opcode = AArch64::ANDSXrs; break;
	case AArch64::BICSWrr: Opcode = AArch64::BICSWrs; break;
	case AArch64::BICSXrr: Opcode = AArch64::BICSXrs; break;
	case AArch64::EONWrr: Opcode = AArch64::EONWrs; break;
	case AArch64::EONXrr: Opcode = AArch64::EONXrs; break;
	case AArch64::EORWrr: Opcode = AArch64::EORWrs; break;
	case AArch64::EORXrr: Opcode = AArch64::EORXrs; break;
	case AArch64::ORNWrr: Opcode = AArch64::ORNWrs; break;
	case AArch64::ORNXrr: Opcode = AArch64::ORNXrs; break;
	case AArch64::ORRWrr: Opcode = AArch64::ORRWrs; break;
	case AArch64::ORRXrr: Opcode = AArch64::ORRXrs; break;
	}
	MachineInstrBuilder MIB1 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
	MI.getOperand(0).getReg())
	.add(MI.getOperand(1))
	.add(MI.getOperand(2))
	.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
	transferImpOps(MI, MIB1, MIB1);
	MI.eraseFromParent();
	return true;
	}

	case AArch64::LOADgot: {
	// Expand into ADRP + LDR.
	unsigned DstReg = MI.getOperand(0).getReg();
	const MachineOperand &MO1 = MI.getOperand(1);
	unsigned Flags = MO1.getTargetFlags();
	MachineInstrBuilder MIB1 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
	MachineInstrBuilder MIB2 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
	.add(MI.getOperand(0))
	.addReg(DstReg);

	// Attach the symbol operand to both instructions: MO_PAGE on the ADRP,
	// MO_PAGEOFF \| MO_NC on the LDR, in addition to the original target flags.
	if (MO1.isGlobal()) {
	MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags \| AArch64II::MO_PAGE);
	MIB2.addGlobalAddress(MO1.getGlobal(), 0,
	Flags \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);
	} else if (MO1.isSymbol()) {
	MIB1.addExternalSymbol(MO1.getSymbolName(), Flags \| AArch64II::MO_PAGE);
	MIB2.addExternalSymbol(MO1.getSymbolName(),
	Flags \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);
	} else {
	assert(MO1.isCPI() &&
	"Only expect globals, externalsymbols, or constant pools");
	MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
	Flags \| AArch64II::MO_PAGE);
	MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
	Flags \| AArch64II::MO_PAGEOFF \|
	AArch64II::MO_NC);
	}

	transferImpOps(MI, MIB1, MIB2);
	MI.eraseFromParent();
	return true;
	}

	case AArch64::MOVaddr:
	case AArch64::MOVaddrJT:
	case AArch64::MOVaddrCP:
	case AArch64::MOVaddrBA:
	case AArch64::MOVaddrTLS:
	case AArch64::MOVaddrEXT: {
	// Expand into ADRP + ADD.
	// Operand 1 of the pseudo goes on the ADRP and operand 2 on the ADD.
	unsigned DstReg = MI.getOperand(0).getReg();
	MachineInstrBuilder MIB1 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
	.add(MI.getOperand(1));

	MachineInstrBuilder MIB2 =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
	.add(MI.getOperand(0))
	.addReg(DstReg)
	.add(MI.getOperand(2))
	.addImm(0);

	transferImpOps(MI, MIB1, MIB2);
	MI.eraseFromParent();
	return true;
	}
	case AArch64::MOVbaseTLS: {
	// Expand into an MRS of the thread pointer register: TPIDR_EL0 by default,
	// TPIDR_EL1 for Fuchsia with the kernel code model.
	unsigned DstReg = MI.getOperand(0).getReg();
	auto SysReg = AArch64SysReg::TPIDR_EL0;
	MachineFunction *MF = MBB.getParent();
	if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
	MF->getTarget().getCodeModel() == CodeModel::Kernel)
	SysReg = AArch64SysReg::TPIDR_EL1;
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
	.addImm(SysReg);
	MI.eraseFromParent();
	return true;
	}

	case AArch64::MOVi32imm:
	return expandMOVImm(MBB, MBBI, 32);
	case AArch64::MOVi64imm:
	return expandMOVImm(MBB, MBBI, 64);
	case AArch64::RET_ReallyLR: {
	// Hiding the LR use with RET_ReallyLR may lead to extra kills in the
	// function and missing live-ins. We are fine in practice because callee
	// saved register handling ensures the register value is restored before
	// RET, but we need the undef flag here to appease the MachineVerifier
	// liveness checks.
	MachineInstrBuilder MIB =
	BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
	.addReg(AArch64::LR, RegState::Undef);
	transferImpOps(MI, MIB, MIB);
	MI.eraseFromParent();
	return true;
	}
	// The sized compare-and-swap pseudos share expandCMP_SWAP(); they differ
	// only in the load/store/compare opcodes, the compare immediate and the
	// zero register passed in.
	case AArch64::CMP_SWAP_8:
	return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRB, AArch64::STLXRB,
	AArch64::SUBSWrx,
	AArch64_AM::getArithExtendImm(AArch64_AM::UXTB, 0),
	AArch64::WZR, NextMBBI);
	case AArch64::CMP_SWAP_16:
	return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRH, AArch64::STLXRH,
	AArch64::SUBSWrx,
	AArch64_AM::getArithExtendImm(AArch64_AM::UXTH, 0),
	AArch64::WZR, NextMBBI);
	case AArch64::CMP_SWAP_32:
	return expandCMP_SWAP(MBB, MBBI, AArch64::LDAXRW, AArch64::STLXRW,
	AArch64::SUBSWrs,
	AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
	AArch64::WZR, NextMBBI);
	case AArch64::CMP_SWAP_64:
	return expandCMP_SWAP(MBB, MBBI,
	AArch64::LDAXRX, AArch64::STLXRX, AArch64::SUBSXrs,
	AArch64_AM::getShifterImm(AArch64_AM::LSL, 0),
	AArch64::XZR, NextMBBI);
	case AArch64::CMP_SWAP_128:
	return expandCMP_SWAP_128(MBB, MBBI, NextMBBI);

	// The tied AES pseudos are replaced by AESMCrr / AESIMCrr built from the
	// pseudo's first two operands.
	+ case AArch64::AESMCrrTied:
	+ case AArch64::AESIMCrrTied: {
	+ MachineInstrBuilder MIB =
	+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
	+ TII->get(Opcode == AArch64::AESMCrrTied ? AArch64::AESMCrr :
	+ AArch64::AESIMCrr))
	+ .add(MI.getOperand(0))
	+ .add(MI.getOperand(1));
	+ transferImpOps(MI, MIB, MIB);
	+ MI.eraseFromParent();
	+ return true;
	+ }
	}
	// Not a pseudo handled by this pass.
	return false;
	}

	/// \brief Walk every instruction of \p MBB and hand it to expandMI().
	/// \returns true if at least one instruction was expanded.
	bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
	  bool Changed = false;

	  MachineBasicBlock::iterator End = MBB.end();
	  for (MachineBasicBlock::iterator Cur = MBB.begin(); Cur != End;) {
	    // The successor is computed before the expansion; expandMI() receives it
	    // by reference and may redirect it.
	    MachineBasicBlock::iterator Next = std::next(Cur);
	    if (expandMI(MBB, Cur, Next))
	      Changed = true;
	    Cur = Next;
	  }

	  return Changed;
	}

	/// \brief Pass entry point: cache the AArch64 instruction info for \p MF and
	/// expand the pseudo instructions of every basic block.
	/// \returns true if any block was modified.
	bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
	  const auto *InstrInfo = MF.getSubtarget().getInstrInfo();
	  TII = static_cast<const AArch64InstrInfo *>(InstrInfo);

	  bool Changed = false;
	  for (auto &Block : MF) {
	    if (expandMBB(Block))
	      Changed = true;
	  }
	  return Changed;
	}

	/// \brief Factory for the pseudo instruction expansion pass; the returned
	/// object is freshly heap-allocated.
	FunctionPass *llvm::createAArch64ExpandPseudoPass() {
	  FunctionPass *Pass = new AArch64ExpandPseudo();
	  return Pass;
	}
	Index: head/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp (revision 322320)
	@@ -1,1263 +1,1271 @@
	//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------- C++ --====//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file contains the AArch64 implementation of TargetFrameLowering class.
	//
	// On AArch64, stack frames are structured as follows:
	//
	// The stack grows downward.
	//
	// All of the individual frame areas on the frame below are optional, i.e. it's
	// possible to create a function so that the particular area isn't present
	// in the frame.
	//
	// At function entry, the "frame" looks as follows:
	//
	// \| \| Higher address
	// \|-----------------------------------\|
	// \| \|
	// \| arguments passed on the stack \|
	// \| \|
	// \|-----------------------------------\| <- sp
	// \| \| Lower address
	//
	//
	// After the prologue has run, the frame has the following general structure.
	// Note that this doesn't depict the case where a red-zone is used. Also,
	// technically the last frame area (VLAs) doesn't get created until in the
	// main function body, after the prologue is run. However, it's depicted here
	// for completeness.
	//
	// \| \| Higher address
	// \|-----------------------------------\|
	// \| \|
	// \| arguments passed on the stack \|
	// \| \|
	// \|-----------------------------------\|
	// \| \|
	// \| (Win64 only) varargs from reg \|
	// \| \|
	// \|-----------------------------------\|
	// \| \|
	// \| prev_fp, prev_lr \|
	// \| (a.k.a. "frame record") \|
	// \|-----------------------------------\| <- fp(=x29)
	// \| \|
	// \| other callee-saved registers \|
	// \| \|
	// \|-----------------------------------\|
	// \|.empty.space.to.make.part.below....\|
	// \|.aligned.in.case.it.needs.more.than\| (size of this area is unknown at
	// \|.the.standard.16-byte.alignment....\| compile time; if present)
	// \|-----------------------------------\|
	// \| \|
	// \| local variables of fixed size \|
	// \| including spill slots \|
	// \|-----------------------------------\| <- bp(not defined by ABI,
	// \|.variable-sized.local.variables....\| LLVM chooses X19)
	// \|.(VLAs)............................\| (size of this area is unknown at
	// \|...................................\| compile time)
	// \|-----------------------------------\| <- sp
	// \| \| Lower address
	//
	//
	// To access the data in a frame, a constant offset from one of the pointers
	// (fp, bp, sp) must be computable at compile time. The size
	// of the areas with a dotted background cannot be computed at compile-time
	// if they are present, making it required to have all three of fp, bp and
	// sp to be set up to be able to access all contents in the frame areas,
	// assuming all of the frame areas are non-empty.
	//
	// For most functions, some of the frame areas are empty. For those functions,
	// it may not be necessary to set up fp or bp:
	// * A base pointer is definitely needed when there are both VLAs and local
	// variables with more-than-default alignment requirements.
	// * A frame pointer is definitely needed when there are local variables with
	// more-than-default alignment requirements.
	//
	// In some cases when a base pointer is not strictly needed, it is generated
	// anyway when offsets from the frame pointer to access local variables become
	// so large that the offset can't be encoded in the immediate fields of loads
	// or stores.
	//
	// FIXME: also explain the redzone concept.
	// FIXME: also explain the concept of reserved call frames.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64FrameLowering.h"
	#include "AArch64InstrInfo.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64Subtarget.h"
	#include "AArch64TargetMachine.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/CodeGen/LivePhysRegs.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/RegisterScavenging.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/Function.h"
	#include "llvm/MC/MCDwarf.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include "llvm/Target/TargetRegisterInfo.h"
	#include "llvm/Target/TargetSubtargetInfo.h"
	#include <cassert>
	#include <cstdint>
	#include <iterator>
	#include <vector>

	using namespace llvm;

	#define DEBUG_TYPE "frame-info"

	// Hidden command-line option "aarch64-redzone"; it defaults to false and is
	// consulted by canUseRedZone().
	static cl::opt<bool> EnableRedZone("aarch64-redzone",
	cl::desc("enable use of redzone on AArch64"),
	cl::init(false), cl::Hidden);

	// Statistic counter for functions that end up using the red zone.
	STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

	/// Inspect every instruction in \p MF that refers to a stack frame index and
	/// report the stack size limit beyond which some of them will need a scratch
	/// register when they are expanded later.
	///
	/// \returns 0 as soon as isAArch64FrameOffsetLegal() answers
	/// AArch64FrameOffsetCannotUpdate for such an instruction, 255 otherwise.
	static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
	  // FIXME: For now this is only a conservative estimate based on the unscaled
	  // indexing range. It will often cause an unnecessary spill slot to be
	  // allocated, which is an acceptable cost at this stage.
	  for (MachineBasicBlock &Block : MF) {
	    for (MachineInstr &Inst : Block) {
	      // Debug values, pseudos and ADDXri / ADDSXri are not examined.
	      if (Inst.isDebugValue())
	        continue;
	      if (Inst.isPseudo())
	        continue;
	      if (Inst.getOpcode() == AArch64::ADDXri)
	        continue;
	      if (Inst.getOpcode() == AArch64::ADDSXri)
	        continue;

	      const unsigned NumOps = Inst.getNumOperands();
	      for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
	        if (Inst.getOperand(OpIdx).isFI()) {
	          int Offset = 0;
	          const int Status =
	              isAArch64FrameOffsetLegal(Inst, Offset, nullptr, nullptr, nullptr);
	          if (Status == AArch64FrameOffsetCannotUpdate)
	            return 0;
	        }
	      }
	    }
	  }
	  return 255;
	}

	bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
	if (!EnableRedZone)
	return false;
	// Don't use the red zone if the function explicitly asks us not to.
	// This is typically used for kernel code.
	if (MF.getFunction()->hasFnAttribute(Attribute::NoRedZone))
	return false;

	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	unsigned NumBytes = AFI->getLocalStackSize();

	return !(MFI.hasCalls() \|\| hasFP(MF) \|\| NumBytes > 128);
	}

	/// hasFP - Return true if the specified function should have a dedicated frame
	/// pointer register.
	///
	/// A frame pointer is used when the function makes calls and frame-pointer
	/// elimination is disabled for it, or when it has variable-sized objects, has
	/// its frame address taken, contains a stack map or patch point, or needs
	/// stack realignment.
	bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
	  // Retain behavior of always omitting the FP for leaf functions when possible.
	  return (MFI.hasCalls() &&
	          MF.getTarget().Options.DisableFramePointerElim(MF)) ||
	         MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
	         MFI.hasStackMap() || MFI.hasPatchPoint() ||
	         RegInfo->needsStackRealignment(MF);
	}

	/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
	/// not required, we reserve argument space for call sites in the function
	/// immediately on entry to the current function. This eliminates the need for
	/// add/sub sp brackets around call sites. Returns true if the call frame is
	/// included as part of the stack frame.
	bool
	AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
	return !MF.getFrameInfo().hasVarSizedObjects();
	}

	/// Lower the call-frame setup/destroy pseudo instruction at \p I and erase it.
	///
	/// When the call frame is not reserved, SP is adjusted in place by the
	/// stack-aligned frame size (negated for a setup pseudo), but only if the
	/// callee-pop amount is zero. When the call frame is reserved and the callee
	/// pops a non-zero amount, SP is moved back by that amount.
	///
	/// \returns the result of MBB.erase(I).
	MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
	    MachineFunction &MF, MachineBasicBlock &MBB,
	    MachineBasicBlock::iterator I) const {
	  const AArch64InstrInfo *TII =
	      static_cast<const AArch64InstrInfo *>(MF.getSubtarget().getInstrInfo());
	  DebugLoc DL = I->getDebugLoc();
	  unsigned Opc = I->getOpcode();
	  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
	  // Only the destroy pseudo carries a callee-pop amount (operand 1).
	  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

	  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
	  if (!TFI->hasReservedCallFrame(MF)) {
	    unsigned Align = getStackAlignment();

	    int64_t Amount = I->getOperand(0).getImm();
	    Amount = alignTo(Amount, Align);
	    if (!IsDestroy)
	      Amount = -Amount;

	    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
	    // doesn't have to pop anything), then the first operand will be zero too so
	    // this adjustment is a no-op.
	    if (CalleePopAmount == 0) {
	      // FIXME: in-function stack adjustment for calls is limited to 24-bits
	      // because there's no guaranteed temporary register available.
	      //
	      // ADD/SUB (immediate) has only LSL #0 and LSL #12 available.
	      // 1) For offset <= 12-bit, we use LSL #0
	      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
	      // LSL #0, and the other uses LSL #12.
	      //
	      // Most call frames will be allocated at the start of a function so
	      // this is OK, but it is a limitation that needs dealing with.
	      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
	      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
	    }
	  } else if (CalleePopAmount != 0) {
	    // If the calling convention demands that the callee pops arguments from the
	    // stack, we want to add it back if we have a reserved call frame.
	    assert(CalleePopAmount < 0xffffff && "call frame too large");
	    // Convert to a signed type before negating: unary minus on the unsigned
	    // value would wrap and only become negative through implicit narrowing.
	    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP,
	                    -static_cast<int64_t>(CalleePopAmount), TII);
	  }
	  return MBB.erase(I);
	}

	/// Insert one CFI_INSTRUCTION before \p MBBI for every callee-saved register
	/// recorded in the function's frame info. Each instruction describes the
	/// register's save slot as its frame-object offset minus the offset of the
	/// local area, and is flagged FrameSetup. Nothing is emitted when no
	/// callee-saved registers are recorded.
	void AArch64FrameLowering::emitCalleeSavedFrameMoves(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
	  MachineFunction &MF = *MBB.getParent();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  const TargetSubtargetInfo &STI = MF.getSubtarget();
	  const MCRegisterInfo *MRI = STI.getRegisterInfo();
	  const TargetInstrInfo *TII = STI.getInstrInfo();
	  DebugLoc DL = MBB.findDebugLoc(MBBI);

	  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
	  if (CSI.empty())
	    return;

	  for (const auto &SavedReg : CSI) {
	    // Translate the physical register into its DWARF number.
	    unsigned DwarfReg = MRI->getDwarfRegNum(SavedReg.getReg(), true);

	    // Location of the save slot, relative to the local area.
	    int64_t SlotOffset =
	        MFI.getObjectOffset(SavedReg.getFrameIdx()) - getOffsetOfLocalArea();

	    unsigned CFIIndex = MF.addFrameInst(
	        MCCFIInstruction::createOffset(nullptr, DwarfReg, SlotOffset));
	    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	        .addCFIIndex(CFIIndex)
	        .setMIFlags(MachineInstr::FrameSetup);
	  }
	}

	// Pick a scratch register usable at the start of the prologue for
	// re-aligning the stack pointer. Callee-save registers are never chosen:
	// they can look free when this runs from canUseAsPrologue (during shrink
	// wrapping) and then turn out not to be free when it runs again from
	// emitPrologue.
	//
	// FIXME: This is a bit conservative. In the case above a callee-save register
	// could serve as the scratch temp, but we would then have to guarantee that
	// at least one callee-save register is actually saved in the prologue, and
	// that extra complexity doesn't seem worth the benefit.
	//
	// Returns AArch64::NoRegister when no suitable register is available.
	static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
	  MachineFunction *MF = MBB->getParent();

	  // The entry block always uses X9.
	  if (MBB == &MF->front())
	    return AArch64::X9;

	  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
	  const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();

	  // Start from the block's live-ins, then add every callee-saved register so
	  // that none of them can be selected below.
	  LivePhysRegs LiveRegs(TRI);
	  LiveRegs.addLiveIns(*MBB);
	  for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(MF); *CSR; ++CSR)
	    LiveRegs.addReg(*CSR);

	  // X9 is tried first since it was historically the prologue scratch reg.
	  const MachineRegisterInfo &MRI = MF->getRegInfo();
	  if (LiveRegs.available(MRI, AArch64::X9))
	    return AArch64::X9;

	  // Otherwise take the first available 64-bit GPR.
	  for (unsigned Candidate : AArch64::GPR64RegClass)
	    if (LiveRegs.available(MRI, Candidate))
	      return Candidate;

	  return AArch64::NoRegister;
	}

	bool AArch64FrameLowering::canUseAsPrologue(
	const MachineBasicBlock &MBB) const {
	const MachineFunction *MF = MBB.getParent();
	MachineBasicBlock TmpMBB = const_cast<MachineBasicBlock >(&MBB);
	const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

	// Don't need a scratch register if we're not going to re-align the stack.
	if (!RegInfo->needsStackRealignment(*MF))
	return true;
	// Otherwise, we can use any block as long as it has a scratch register
	// available.
	return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
	}

	/// Decide whether the callee-save area and the local area can be allocated
	/// with one combined SP adjustment of \p StackBumpBytes bytes.
	bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
	    MachineFunction &MF, unsigned StackBumpBytes) const {
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  const MachineFrameInfo &MFI = MF.getFrameInfo();
	  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();

	  // With no locals there is nothing to combine. The bump must also stay
	  // below 512, the maximum immediate for the stp/ldp instructions used for
	  // the callee-save save/restores.
	  if (AFI->getLocalStackSize() == 0 || StackBumpBytes >= 512)
	    return false;

	  // Variable-sized objects and stack realignment both rule out combining.
	  if (MFI.hasVarSizedObjects() || RegInfo->needsStackRealignment(MF))
	    return false;

	  // Not combining for red-zone functions isn't strictly necessary, but it
	  // keeps things simpler: the current RedZone handling code assumes the SP
	  // is adjusted by the callee-save save/restore code.
	  return !canUseRedZone(MF);
	}

	// Replace the callee-save save/restore instruction at MBBI with its
	// pre-indexed (stores) or post-indexed (loads) form, so that the same
	// instruction also moves SP by CSStackSizeInc bytes and thereby
	// allocates/deallocates the callee-save stack area. Returns an iterator to
	// the newly built instruction.
	static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
	    const DebugLoc &DL, const TargetInstrInfo *TII, int CSStackSizeInc) {
	  // Select the writeback opcode. The single-register forms take the
	  // increment as-is; the pair forms take it divided by 8.
	  unsigned WritebackOpc;
	  bool UnscaledImm = false;
	  switch (MBBI->getOpcode()) {
	  case AArch64::STPXi:
	    WritebackOpc = AArch64::STPXpre;
	    break;
	  case AArch64::STPDi:
	    WritebackOpc = AArch64::STPDpre;
	    break;
	  case AArch64::LDPXi:
	    WritebackOpc = AArch64::LDPXpost;
	    break;
	  case AArch64::LDPDi:
	    WritebackOpc = AArch64::LDPDpost;
	    break;
	  case AArch64::STRXui:
	    WritebackOpc = AArch64::STRXpre;
	    UnscaledImm = true;
	    break;
	  case AArch64::STRDui:
	    WritebackOpc = AArch64::STRDpre;
	    UnscaledImm = true;
	    break;
	  case AArch64::LDRXui:
	    WritebackOpc = AArch64::LDRXpost;
	    UnscaledImm = true;
	    break;
	  case AArch64::LDRDui:
	    WritebackOpc = AArch64::LDRDpost;
	    UnscaledImm = true;
	    break;
	  default:
	    llvm_unreachable("Unexpected callee-save save/restore opcode!");
	  }

	  // The new instruction additionally defines SP.
	  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, TII->get(WritebackOpc));
	  NewMI.addReg(AArch64::SP, RegState::Define);

	  // Carry over every operand except the trailing immediate offset.
	  const unsigned ImmIdx = MBBI->getNumOperands() - 1;
	  for (unsigned Idx = 0; Idx < ImmIdx; ++Idx)
	    NewMI.add(MBBI->getOperand(Idx));

	  assert(MBBI->getOperand(ImmIdx).getImm() == 0 &&
	         "Unexpected immediate offset in first/last callee-save save/restore "
	         "instruction!");
	  assert(MBBI->getOperand(ImmIdx - 1).getReg() == AArch64::SP &&
	         "Unexpected base register in callee-save save/restore instruction!");

	  // The trailing immediate becomes the SP increment.
	  assert(CSStackSizeInc % 8 == 0);
	  const int64_t WritebackImm =
	      UnscaledImm ? CSStackSizeInc : CSStackSizeInc / 8;
	  NewMI.addImm(WritebackImm);

	  // Preserve the flags and memory operands of the replaced instruction.
	  NewMI.setMIFlags(MBBI->getFlags());
	  NewMI.setMemRefs(MBBI->memoperands_begin(), MBBI->memoperands_end());

	  return std::prev(MBB.erase(MBBI));
	}

	// Fixup callee-save register save/restore instructions to take into account
	// combined SP bump by adding the local stack size to the stack offsets.
	//
	// MI must be one of the scaled-offset callee-save opcodes checked below, with
	// SP as its base register. LocalStackSize is in bytes and must be a multiple
	// of 8; it is added to the immediate in units of 8 bytes.
	static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
	                                              unsigned LocalStackSize) {
	  unsigned Opc = MI.getOpcode();
	  (void)Opc; // Only read by the assert below.
	  assert((Opc == AArch64::STPXi || Opc == AArch64::STPDi ||
	          Opc == AArch64::STRXui || Opc == AArch64::STRDui ||
	          Opc == AArch64::LDPXi || Opc == AArch64::LDPDi ||
	          Opc == AArch64::LDRXui || Opc == AArch64::LDRDui) &&
	         "Unexpected callee-save save/restore opcode!");

	  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
	  assert(MI.getOperand(OffsetIdx - 1).getReg() == AArch64::SP &&
	         "Unexpected base register in callee-save save/restore instruction!");
	  // Last operand is immediate offset that needs fixing.
	  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
	  // All generated opcodes have scaled offsets.
	  assert(LocalStackSize % 8 == 0);
	  OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
	}

	/// Emit the prologue for \p MF into \p MBB.
	///
	/// Nothing is emitted for the GHC calling convention. A function without a
	/// stack frame either leaves its locals in the red zone or gets a single SP
	/// decrement plus a CFA-offset CFI instruction. Otherwise the prologue save
	/// area (callee saves plus, for Win64 calling conventions, the varargs GPR
	/// save area rounded up to 16 bytes) and the locals are allocated, FP and the
	/// base pointer are set up when required, SP is realigned when needed, and
	/// CFI instructions are emitted when frame moves are needed.
	void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
	MachineBasicBlock &MBB) const {
	MachineBasicBlock::iterator MBBI = MBB.begin();
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const Function *Fn = MF.getFunction();
	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineModuleInfo &MMI = MF.getMMI();
	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
	bool HasFP = hasFP(MF);

	// Debug location must be unknown since the first debug location is used
	// to determine the end of the prologue.
	DebugLoc DL;

	// All calls are tail calls in GHC calling conv, and functions have no
	// prologue/epilogue.
	if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
	return;

	int NumBytes = (int)MFI.getStackSize();
	if (!AFI->hasStackFrame()) {
	assert(!HasFP && "unexpected function without stack frame but with FP");

	// All of the stack allocation is for locals.
	AFI->setLocalStackSize(NumBytes);

	if (!NumBytes)
	return;
	// REDZONE: If the stack size is less than 128 bytes, we don't need
	// to actually allocate.
	if (canUseRedZone(MF))
	++NumRedZoneFunctions;
	else {
	emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
	MachineInstr::FrameSetup);

	// Label used to tie together the PROLOG_LABEL and the MachineMoves.
	MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
	// Encode the stack size of the leaf function.
	unsigned CFIIndex = MF.addFrameInst(
	MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	}
	return;
	}

	- auto CSStackSize = AFI->getCalleeSavedStackSize();
	+ bool IsWin64 =
	+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
	+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
	+
	+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
	// All of the remaining stack allocations are for locals.
	- AFI->setLocalStackSize(NumBytes - CSStackSize);
	+ AFI->setLocalStackSize(NumBytes - PrologueSaveSize);

	bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
	if (CombineSPBump) {
	emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
	MachineInstr::FrameSetup);
	NumBytes = 0;
	- } else if (CSStackSize != 0) {
	+ } else if (PrologueSaveSize != 0) {
	MBBI = convertCalleeSaveRestoreToSPPrePostIncDec(MBB, MBBI, DL, TII,
	- -CSStackSize);
	- NumBytes -= CSStackSize;
	+ -PrologueSaveSize);
	+ NumBytes -= PrologueSaveSize;
	}
	assert(NumBytes >= 0 && "Negative stack allocation size!?");

	// Move past the saves of the callee-saved registers, fixing up the offsets
	// and pre-inc if we decided to combine the callee-save and local stack
	// pointer bump above.
	MachineBasicBlock::iterator End = MBB.end();
	while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
	if (CombineSPBump)
	fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize());
	++MBBI;
	}
	if (HasFP) {
	- // Only set up FP if we actually need to. Frame pointer is fp = sp - 16.
	- int FPOffset = CSStackSize - 16;
	+ // Only set up FP if we actually need to. Frame pointer is fp =
	+ // sp - fixedobject - 16.
	+ int FPOffset = AFI->getCalleeSavedStackSize() - 16;
	if (CombineSPBump)
	FPOffset += AFI->getLocalStackSize();

	// Issue sub fp, sp, FPOffset or
	// mov fp,sp when FPOffset is zero.
	// Note: All stores of callee-saved registers are marked as "FrameSetup".
	// This code marks the instruction(s) that set the FP also.
	emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
	MachineInstr::FrameSetup);
	}

	// Allocate space for the rest of the frame.
	if (NumBytes) {
	const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
	unsigned scratchSPReg = AArch64::SP;

	if (NeedsRealignment) {
	scratchSPReg = findScratchNonCalleeSaveRegister(&MBB);
	assert(scratchSPReg != AArch64::NoRegister);
	}

	// If we're a leaf function, try using the red zone.
	if (!canUseRedZone(MF))
	// FIXME: in the case of dynamic re-alignment, NumBytes doesn't have
	// the correct value here, as NumBytes also includes padding bytes,
	// which shouldn't be counted here.
	emitFrameOffset(MBB, MBBI, DL, scratchSPReg, AArch64::SP, -NumBytes, TII,
	MachineInstr::FrameSetup);

	if (NeedsRealignment) {
	const unsigned Alignment = MFI.getMaxAlignment();
	const unsigned NrBitsToZero = countTrailingZeros(Alignment);
	assert(NrBitsToZero > 1);
	assert(scratchSPReg != AArch64::SP);

	// SUB X9, SP, NumBytes
	//   -- X9 is temporary register, so shouldn't contain any live data here,
	//   -- free to use. This is already produced by emitFrameOffset above.
	// AND SP, X9, 0b11111...0000
	// The logical immediates have a non-trivial encoding. The following
	// formula computes the encoded immediate with all ones but
	// NrBitsToZero zero bits as least significant bits.
	uint32_t andMaskEncoded = (1 << 12) // = N
	| ((64 - NrBitsToZero) << 6) // immr
	| ((64 - NrBitsToZero - 1) << 0); // imms

	BuildMI(MBB, MBBI, DL, TII->get(AArch64::ANDXri), AArch64::SP)
	.addReg(scratchSPReg, RegState::Kill)
	.addImm(andMaskEncoded);
	AFI->setStackRealigned(true);
	}
	}

	// If we need a base pointer, set it up here. It's whatever the value of the
	// stack pointer is at this point. Any variable size objects will be allocated
	// after this, so we can still use the base pointer to reference locals.
	//
	// FIXME: Clarify FrameSetup flags here.
	// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
	// needed.
	if (RegInfo->hasBasePointer(MF)) {
	TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
	false);
	}

	if (needsFrameMoves) {
	const DataLayout &TD = MF.getDataLayout();
	const int StackGrowth = -TD.getPointerSize(0);
	unsigned FramePtr = RegInfo->getFrameRegister(MF);
	// An example of the prologue:
	//
	//     .globl __foo
	//     .align 2
	//  __foo:
	// Ltmp0:
	//     .cfi_startproc
	//     .cfi_personality 155, ___gxx_personality_v0
	// Leh_func_begin:
	//     .cfi_lsda 16, Lexception33
	//
	//     stp  xa,bx, [sp, -#offset]!
	//     ...
	//     stp  x28, x27, [sp, #offset-32]
	//     stp  fp, lr, [sp, #offset-16]
	//     add  fp, sp, #offset - 16
	//     sub  sp, sp, #1360
	//
	// The Stack:
	//       +-------------------------------------------+
	// 10000 | ........ | ........ | ........ | ........ |
	// 10004 | ........ | ........ | ........ | ........ |
	//       +-------------------------------------------+
	// 10008 | ........ | ........ | ........ | ........ |
	// 1000c | ........ | ........ | ........ | ........ |
	//       +===========================================+
	// 10010 |                X28 Register               |
	// 10014 |                X28 Register               |
	//       +-------------------------------------------+
	// 10018 |                X27 Register               |
	// 1001c |                X27 Register               |
	//       +===========================================+
	// 10020 |                Frame Pointer              |
	// 10024 |                Frame Pointer              |
	//       +-------------------------------------------+
	// 10028 |                Link Register              |
	// 1002c |                Link Register              |
	//       +===========================================+
	// 10030 | ........ | ........ | ........ | ........ |
	// 10034 | ........ | ........ | ........ | ........ |
	//       +-------------------------------------------+
	// 10038 | ........ | ........ | ........ | ........ |
	// 1003c | ........ | ........ | ........ | ........ |
	//       +-------------------------------------------+
	//
	// [sp] = 10030        ::    >>initial value<<
	// sp = 10020          ::  stp fp, lr, [sp, #-16]!
	// fp = sp == 10020    ::  mov fp, sp
	// [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
	// sp == 10010         ::    >>final value<<
	//
	// The frame pointer (w29) points to address 10020. If we use an offset of
	// '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
	// for w27, and -32 for w28:
	//
	// Ltmp1:
	//     .cfi_def_cfa w29, 16
	// Ltmp2:
	//     .cfi_offset w30, -8
	// Ltmp3:
	//     .cfi_offset w29, -16
	// Ltmp4:
	//     .cfi_offset w27, -24
	// Ltmp5:
	//     .cfi_offset w28, -32

	if (HasFP) {
	// Define the current CFA rule to use the provided FP.
	unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
	- unsigned CFIIndex = MF.addFrameInst(
	- MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
	+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
	+ nullptr, Reg, 2 * StackGrowth - FixedObject));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	} else {
	// Encode the stack size of the leaf function.
	unsigned CFIIndex = MF.addFrameInst(
	MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
	BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
	.addCFIIndex(CFIIndex)
	.setMIFlags(MachineInstr::FrameSetup);
	}

	// Now emit the moves for whatever callee saved regs we have (including FP,
	// LR if those are saved).
	emitCalleeSavedFrameMoves(MBB, MBBI);
	}
	}

	/// Emit the epilogue for \p MF into \p MBB.
	///
	/// Nothing is emitted for the GHC calling convention. The amount of argument
	/// stack to pop comes from operand 1 of a TCRETURNdi/TCRETURNri terminator,
	/// or otherwise from the function info's argument-stack-to-restore value.
	/// When the callee-save and local SP bumps are combined, a single SP
	/// adjustment is inserted before the first terminator. Otherwise the last
	/// callee-save restore is converted to a post-indexed form, SP is restored
	/// (from FP when there are variable-sized objects or the stack was
	/// realigned), and the argument area is popped last.
	void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
	MachineBasicBlock &MBB) const {
	MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	DebugLoc DL;
	bool IsTailCallReturn = false;
	if (MBB.end() != MBBI) {
	DL = MBBI->getDebugLoc();
	unsigned RetOpcode = MBBI->getOpcode();
	IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
	RetOpcode == AArch64::TCRETURNri;
	}
	int NumBytes = MFI.getStackSize();
	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();

	// All calls are tail calls in GHC calling conv, and functions have no
	// prologue/epilogue.
	if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
	return;

	// Initial and residual are named for consistency with the prologue. Note that
	// in the epilogue, the residual adjustment is executed first.
	uint64_t ArgumentPopSize = 0;
	if (IsTailCallReturn) {
	MachineOperand &StackAdjust = MBBI->getOperand(1);

	// For a tail-call in a callee-pops-arguments environment, some or all of
	// the stack may actually be in use for the call's arguments, this is
	// calculated during LowerCall and consumed here...
	ArgumentPopSize = StackAdjust.getImm();
	} else {
	// ... otherwise the amount to pop is all of the argument space,
	// conveniently stored in the MachineFunctionInfo by
	// LowerFormalArguments. This will, of course, be zero for the C calling
	// convention.
	ArgumentPopSize = AFI->getArgumentStackToRestore();
	}

	// The stack frame should be like below,
	//
	//      ----------------------                     ---
	//      |                    |                      |
	//      | BytesInStackArgArea|              CalleeArgStackSize
	//      | (NumReusableBytes) |                (of tail call)
	//      |                    |                     ---
	//      |                    |                      |
	//      ---------------------|        ---           |
	//      |                    |         |            |
	//      |   CalleeSavedReg   |         |            |
	//      | (CalleeSavedStackSize)|      |            |
	//      |                    |         |            |
	//      ---------------------|         |         NumBytes
	//      |                    |     StackSize  (StackAdjustUp)
	//      |   LocalStackSize   |         |            |
	//      | (covering callee   |         |            |
	//      |       args)        |         |            |
	//      |                    |         |            |
	//      ----------------------        ---          ---
	//
	// So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
	// = StackSize + ArgumentPopSize
	//
	// AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
	// it as the 2nd argument of AArch64ISD::TC_RETURN.

	- auto CSStackSize = AFI->getCalleeSavedStackSize();
	+ bool IsWin64 =
	+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
	+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
	+
	+ auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
	bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);

	- if (!CombineSPBump && CSStackSize != 0)
	+ if (!CombineSPBump && PrologueSaveSize != 0)
	convertCalleeSaveRestoreToSPPrePostIncDec(
	- MBB, std::prev(MBB.getFirstTerminator()), DL, TII, CSStackSize);
	+ MBB, std::prev(MBB.getFirstTerminator()), DL, TII, PrologueSaveSize);

	// Move past the restores of the callee-saved registers.
	MachineBasicBlock::iterator LastPopI = MBB.getFirstTerminator();
	MachineBasicBlock::iterator Begin = MBB.begin();
	while (LastPopI != Begin) {
	--LastPopI;
	if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
	++LastPopI;
	break;
	} else if (CombineSPBump)
	fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize());
	}

	// If there is a single SP update, insert it before the ret and we're done.
	if (CombineSPBump) {
	emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
	NumBytes + ArgumentPopSize, TII,
	MachineInstr::FrameDestroy);
	return;
	}

	- NumBytes -= CSStackSize;
	+ NumBytes -= PrologueSaveSize;
	assert(NumBytes >= 0 && "Negative stack allocation size!?");

	if (!hasFP(MF)) {
	bool RedZone = canUseRedZone(MF);
	// If this was a redzone leaf function, we don't need to restore the
	// stack pointer (but we may need to pop stack args for fastcc).
	if (RedZone && ArgumentPopSize == 0)
	return;

	- bool NoCalleeSaveRestore = CSStackSize == 0;
	+ bool NoCalleeSaveRestore = PrologueSaveSize == 0;
	int StackRestoreBytes = RedZone ? 0 : NumBytes;
	if (NoCalleeSaveRestore)
	StackRestoreBytes += ArgumentPopSize;
	emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
	StackRestoreBytes, TII, MachineInstr::FrameDestroy);
	// If we were able to combine the local stack pop with the argument pop,
	// then we're done.
	if (NoCalleeSaveRestore || ArgumentPopSize == 0)
	return;
	NumBytes = 0;
	}

	// Restore the original stack pointer.
	// FIXME: Rather than doing the math here, we should instead just use
	// non-post-indexed loads for the restores if we aren't actually going to
	// be able to save any instructions.
	if (MFI.hasVarSizedObjects() || AFI->isStackRealigned())
	emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
	- -CSStackSize + 16, TII, MachineInstr::FrameDestroy);
	+ -AFI->getCalleeSavedStackSize() + 16, TII,
	+ MachineInstr::FrameDestroy);
	else if (NumBytes)
	emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes, TII,
	MachineInstr::FrameDestroy);

	// This must be placed after the callee-save restore code because that code
	// assumes the SP is at the same location as it was after the callee-save save
	// code in the prologue.
	if (ArgumentPopSize)
	emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
	ArgumentPopSize, TII, MachineInstr::FrameDestroy);
	}

	/// getFrameIndexReference - Produce the base register and offset that debug
	/// info should use for frame index \p FI. For now this simply forwards to
	/// resolveFrameIndexReference, the same routine used for code-gen references.
	/// FIXME: This can go wrong when references are SP-relative and simple call
	/// frames aren't used.
	int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
	                                                 int FI,
	                                                 unsigned &FrameReg) const {
	  const int Offset = resolveFrameIndexReference(MF, FI, FrameReg);
	  return Offset;
	}

	/// Choose the base register and offset used to address frame index \p FI.
	///
	/// \p FrameReg is set to the frame register, the base register, or SP, and
	/// the offset relative to that register is returned. Fixed objects use the
	/// frame pointer whenever the function has one. Other objects use it only
	/// when there is no base pointer and no stack realignment, and when
	/// \p PreferFP is set, the function has variable-sized objects, or the FP
	/// offset is the better fit. For red-zone functions addressed through SP the
	/// local stack size is subtracted from the offset.
	int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
	int FI, unsigned &FrameReg,
	bool PreferFP) const {
	const MachineFrameInfo &MFI = MF.getFrameInfo();
	const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
	MF.getSubtarget().getRegisterInfo());
	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	- int FPOffset = MFI.getObjectOffset(FI) + 16;
	+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	+ bool IsWin64 =
	+ Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
	+ unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
	+ int FPOffset = MFI.getObjectOffset(FI) + FixedObject + 16;
	int Offset = MFI.getObjectOffset(FI) + MFI.getStackSize();
	bool isFixed = MFI.isFixedObjectIndex(FI);

	// Use frame pointer to reference fixed objects. Use it for locals if
	// there are VLAs or a dynamically realigned SP (and thus the SP isn't
	// reliable as a base). Make sure useFPForScavengingIndex() does the
	// right thing for the emergency spill slot.
	bool UseFP = false;
	if (AFI->hasStackFrame()) {
	// Note: Keeping the following as multiple 'if' statements rather than
	// merging to a single expression for readability.
	//
	// Argument access should always use the FP.
	if (isFixed) {
	UseFP = hasFP(MF);
	} else if (hasFP(MF) && !RegInfo->hasBasePointer(MF) &&
	!RegInfo->needsStackRealignment(MF)) {
	// Use SP or FP, whichever gives us the best chance of the offset
	// being in range for direct access. If the FPOffset is positive,
	// that'll always be best, as the SP will be even further away.
	// If the FPOffset is negative, we have to keep in mind that the
	// available offset range for negative offsets is smaller than for
	// positive ones. If we have variable sized objects, we're stuck with
	// using the FP regardless, though, as the SP offset is unknown
	// and we don't have a base pointer available. If an offset is
	// available via the FP and the SP, use whichever is closest.
	if (PreferFP || MFI.hasVarSizedObjects() || FPOffset >= 0 ||
	(FPOffset >= -256 && Offset > -FPOffset))
	UseFP = true;
	}
	}

	assert((isFixed || !RegInfo->needsStackRealignment(MF) || !UseFP) &&
	"In the presence of dynamic stack pointer realignment, "
	"non-argument objects cannot be accessed through the frame pointer");

	if (UseFP) {
	FrameReg = RegInfo->getFrameRegister(MF);
	return FPOffset;
	}

	// Use the base pointer if we have one.
	if (RegInfo->hasBasePointer(MF))
	FrameReg = RegInfo->getBaseRegister();
	else {
	FrameReg = AArch64::SP;
	// If we're using the red zone for this function, the SP won't actually
	// be adjusted, so the offsets will be negative. They're also all
	// within range of the signed 9-bit immediate instructions.
	if (canUseRedZone(MF))
	Offset -= AFI->getLocalStackSize();
	}

	return Offset;
	}

	// Compute the kill-state flags for Reg when it is spilled in the prologue.
	static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
	  // A register that is also marked live-in must not get a kill flag. This
	  // happens with the @llvm-returnaddress intrinsic and with arguments passed
	  // in callee saved registers. Leaving the kill flag off is conservatively
	  // correct even if the live-in turns out not to be used after all.
	  if (MF.getRegInfo().isLiveIn(Reg))
	    return getKillRegState(false);
	  return getKillRegState(true);
	}

	// Return true when MF targets MachO, unless the target lowering supports
	// swifterror and the SwiftError attribute appears somewhere in the
	// function's attribute list.
	static bool produceCompactUnwindFrame(MachineFunction &MF) {
	  const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	  AttributeList Attrs = MF.getFunction()->getAttributes();

	  if (!Subtarget.isTargetMachO())
	    return false;

	  const bool UsesSwiftError =
	      Subtarget.getTargetLowering()->supportSwiftError() &&
	      Attrs.hasAttrSomewhere(Attribute::SwiftError);
	  return !UsesSwiftError;
	}

	namespace {

	// One callee-save spill/restore unit: a single register, or two registers
	// that are saved and restored together.
	struct RegPairInfo {
	  // First register of the unit.
	  unsigned Reg1 = AArch64::NoRegister;
	  // Second register; stays NoRegister when the unit is a single register.
	  unsigned Reg2 = AArch64::NoRegister;
	  // Frame index of Reg1's save slot.
	  int FrameIdx;
	  // Immediate offset of the unit, in 8-byte units.
	  int Offset;
	  // True when Reg1 is in GPR64RegClass.
	  bool IsGPR;

	  RegPairInfo() = default;

	  // True when a second register has been assigned.
	  bool isPaired() const { return AArch64::NoRegister != Reg2; }
	};

	} // end anonymous namespace

	// Group the callee-saved registers in CSI into RegPairInfo entries and append
	// them to RegPairs.
	//
	// A register is paired with the next one in CSI when both are in the same
	// register class (GPR64 or FPR64). Each entry records the frame index of its
	// first register and an offset in 8-byte units, counted down from the
	// callee-saved stack size. Does nothing when CSI is empty.
	static void computeCalleeSaveRegisterPairs(
	MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
	const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs) {

	if (CSI.empty())
	return;

	AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	CallingConv::ID CC = MF.getFunction()->getCallingConv();
	unsigned Count = CSI.size();
	(void)CC;
	// MachO's compact unwind format relies on all registers being stored in
	// pairs.
	assert((!produceCompactUnwindFrame(MF) ||
	CC == CallingConv::PreserveMost ||
	(Count & 1) == 0) &&
	"Odd number of callee-saved regs to spill!");
	int Offset = AFI->getCalleeSavedStackSize();
	-
	- unsigned GPRSaveSize = AFI->getVarArgsGPRSize();
	- const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
	- bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv());
	- if (IsWin64)
	- Offset -= alignTo(GPRSaveSize, 16);

	for (unsigned i = 0; i < Count; ++i) {
	RegPairInfo RPI;
	RPI.Reg1 = CSI[i].getReg();

	assert(AArch64::GPR64RegClass.contains(RPI.Reg1) ||
	AArch64::FPR64RegClass.contains(RPI.Reg1));
	RPI.IsGPR = AArch64::GPR64RegClass.contains(RPI.Reg1);

	// Add the next reg to the pair if it is in the same register class.
	if (i + 1 < Count) {
	unsigned NextReg = CSI[i + 1].getReg();
	if ((RPI.IsGPR && AArch64::GPR64RegClass.contains(NextReg)) ||
	(!RPI.IsGPR && AArch64::FPR64RegClass.contains(NextReg)))
	RPI.Reg2 = NextReg;
	}

	// GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
	// list to come in sorted by frame index so that we can issue the store
	// pair instructions directly. Assert if we see anything otherwise.
	//
	// The order of the registers in the list is controlled by
	// getCalleeSavedRegs(), so they will always be in-order, as well.
	assert((!RPI.isPaired() ||
	(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx())) &&
	"Out of order callee saved regs!");

	// MachO's compact unwind format relies on all registers being stored in
	// adjacent register pairs.
	assert((!produceCompactUnwindFrame(MF) ||
	CC == CallingConv::PreserveMost ||
	(RPI.isPaired() &&
	((RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
	RPI.Reg1 + 1 == RPI.Reg2))) &&
	"Callee-save registers not saved as adjacent register pair!");

	RPI.FrameIdx = CSI[i].getFrameIdx();

	if (Count * 8 != AFI->getCalleeSavedStackSize() && !RPI.isPaired()) {
	// Round up size of non-pair to pair size if we need to pad the
	// callee-save area to ensure 16-byte alignment.
	Offset -= 16;
	assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
	MFI.setObjectAlignment(RPI.FrameIdx, 16);
	AFI->setCalleeSaveStackHasFreeSpace(true);
	} else
	Offset -= RPI.isPaired() ? 16 : 8;
	assert(Offset % 8 == 0);
	RPI.Offset = Offset / 8;
	assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
	"Offset out of bounds for LDP/STP immediate");

	RegPairs.push_back(RPI);
	// A pair consumed two CSI entries, so skip the second one.
	if (RPI.isPaired())
	++i;
	}
	}

	/// Emit the prologue stores that save the callee-saved registers in \p CSI.
	///
	/// The registers are grouped by computeCalleeSaveRegisterPairs() and the
	/// groups are stored walking that list from back to front. Every store
	/// addresses SP plus an immediate offset, for example:
	///    stp x22, x21, [sp, #0]    // addImm(+0)
	///    stp x20, x19, [sp, #16]   // addImm(+2)
	///    stp fp, lr, [sp, #32]     // addImm(+4)
	/// Rationale: compared to a run of writeback spills such as
	/// stp xi,xj,[sp,#-16]! this sequence saves uop updates. emitPrologue may
	/// later convert the first spill into a pre-decrement store when the
	/// callee-save area allocation cannot be combined with the local stack area
	/// allocation. The epilogue restores mirror this sequence.
	///
	/// \param MBB Block receiving the stores; spilled non-reserved registers are
	///            added to its live-ins.
	/// \param MI  Insertion point for the new instructions.
	/// \param CSI Callee-saved register descriptions to spill.
	/// \param TRI Register info, forwarded to the pairing helper and used for
	///            register names in debug output.
	/// \returns true always.
	bool AArch64FrameLowering::spillCalleeSavedRegisters(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	    const std::vector<CalleeSavedInfo> &CSI,
	    const TargetRegisterInfo *TRI) const {
	  MachineFunction &MF = *MBB.getParent();
	  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
	  DebugLoc DL;

	  SmallVector<RegPairInfo, 8> RegPairs;
	  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);
	  const MachineRegisterInfo &MRI = MF.getRegInfo();

	  // Memory operand for a store (size 8, alignment 8) to a fixed stack slot.
	  auto StoreToSlot = [&MF](int SlotIdx) {
	    return MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, SlotIdx),
	        MachineMemOperand::MOStore, 8, 8);
	  };

	  // Visit the pairs last-to-first.
	  for (unsigned Remaining = RegPairs.size(); Remaining != 0; --Remaining) {
	    RegPairInfo Pair = RegPairs[Remaining - 1];
	    const unsigned FirstReg = Pair.Reg1;
	    const unsigned SecondReg = Pair.Reg2;
	    const bool HasPair = Pair.isPaired();

	    // Pick the store-pair or single-store opcode for the register class.
	    unsigned StoreOpcode;
	    if (HasPair)
	      StoreOpcode = Pair.IsGPR ? AArch64::STPXi : AArch64::STPDi;
	    else
	      StoreOpcode = Pair.IsGPR ? AArch64::STRXui : AArch64::STRDui;

	    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(FirstReg);
	          if (HasPair)
	            dbgs() << ", " << TRI->getName(SecondReg);
	          dbgs() << ") -> fi#(" << Pair.FrameIdx;
	          if (HasPair)
	            dbgs() << ", " << Pair.FrameIdx+1;
	          dbgs() << ")\n");

	    MachineInstrBuilder Store = BuildMI(MBB, MI, DL, TII.get(StoreOpcode));

	    if (!MRI.isReserved(FirstReg))
	      MBB.addLiveIn(FirstReg);

	    // For a pair, the second register and its slot (FrameIdx + 1) are added
	    // ahead of the first register's operands.
	    if (HasPair) {
	      if (!MRI.isReserved(SecondReg))
	        MBB.addLiveIn(SecondReg);
	      Store.addReg(SecondReg, getPrologueDeath(MF, SecondReg));
	      Store.addMemOperand(StoreToSlot(Pair.FrameIdx + 1));
	    }

	    Store.addReg(FirstReg, getPrologueDeath(MF, FirstReg));
	    Store.addReg(AArch64::SP);
	    // [sp, #offset*8]: the immediate is in 8-byte units, the scaling by 8 is
	    // implicit.
	    Store.addImm(Pair.Offset);
	    Store.setMIFlag(MachineInstr::FrameSetup);
	    Store.addMemOperand(StoreToSlot(Pair.FrameIdx));
	  }

	  return true;
	}

	/// Emit the epilogue loads that restore the callee-saved registers in
	/// \p CSI.
	///
	/// The registers are grouped by computeCalleeSaveRegisterPairs() and the
	/// groups are reloaded in list order. Every load addresses SP plus an
	/// immediate offset, for example:
	///    ldp fp, lr, [sp, #32]     // addImm(+4)
	///    ldp x20, x19, [sp, #16]   // addImm(+2)
	///    ldp x22, x21, [sp, #0]    // addImm(+0)
	/// emitEpilogue may later convert the last restore into a post-increment
	/// load when the callee-save area allocation cannot be combined with the
	/// local stack area allocation. See spillCalleeSavedRegisters() for the
	/// rationale behind this instruction sequence.
	///
	/// \param MBB Block receiving the loads.
	/// \param MI  Insertion point; when it is not MBB.end() its debug location
	///            is reused for the new instructions.
	/// \param CSI Callee-saved register descriptions to restore.
	/// \param TRI Register info, forwarded to the pairing helper and used for
	///            register names in debug output.
	/// \returns true always.
	bool AArch64FrameLowering::restoreCalleeSavedRegisters(
	    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
	    const std::vector<CalleeSavedInfo> &CSI,
	    const TargetRegisterInfo *TRI) const {
	  MachineFunction &MF = *MBB.getParent();
	  const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();

	  DebugLoc DL;
	  if (MI != MBB.end())
	    DL = MI->getDebugLoc();

	  SmallVector<RegPairInfo, 8> RegPairs;
	  computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs);

	  // Memory operand for a load (size 8, alignment 8) from a fixed stack slot.
	  auto LoadFromSlot = [&MF](int SlotIdx) {
	    return MF.getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(MF, SlotIdx),
	        MachineMemOperand::MOLoad, 8, 8);
	  };

	  for (RegPairInfo Pair : RegPairs) {
	    const unsigned FirstReg = Pair.Reg1;
	    const unsigned SecondReg = Pair.Reg2;
	    const bool HasPair = Pair.isPaired();

	    // Pick the load-pair or single-load opcode for the register class.
	    unsigned LoadOpcode;
	    if (HasPair)
	      LoadOpcode = Pair.IsGPR ? AArch64::LDPXi : AArch64::LDPDi;
	    else
	      LoadOpcode = Pair.IsGPR ? AArch64::LDRXui : AArch64::LDRDui;

	    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(FirstReg);
	          if (HasPair)
	            dbgs() << ", " << TRI->getName(SecondReg);
	          dbgs() << ") -> fi#(" << Pair.FrameIdx;
	          if (HasPair)
	            dbgs() << ", " << Pair.FrameIdx+1;
	          dbgs() << ")\n");

	    MachineInstrBuilder Load = BuildMI(MBB, MI, DL, TII.get(LoadOpcode));

	    // For a pair, the second register and its slot (FrameIdx + 1) are added
	    // ahead of the first register's operands.
	    if (HasPair) {
	      Load.addReg(SecondReg, getDefRegState(true));
	      Load.addMemOperand(LoadFromSlot(Pair.FrameIdx + 1));
	    }

	    Load.addReg(FirstReg, getDefRegState(true));
	    Load.addReg(AArch64::SP);
	    // [sp, #offset*8]: the immediate is in 8-byte units, the scaling by 8 is
	    // implicit.
	    Load.addImm(Pair.Offset);
	    Load.setMIFlag(MachineInstr::FrameDestroy);
	    Load.addMemOperand(LoadFromSlot(Pair.FrameIdx));
	  }

	  return true;
	}

	/// Decide which callee-saved registers \p MF has to save and restore.
	///
	/// Starting from the answer of TargetFrameLowering::determineCalleeSaves(),
	/// this:
	///  - forces FP and LR into the set when hasFP(MF) holds;
	///  - adds the base pointer register when it is in the callee-saved list;
	///  - when produceCompactUnwindFrame(MF) holds, also saves the list
	///    neighbour (CSRegs[i ^ 1]) of every saved register;
	///  - when the estimated frame size exceeds estimateRSStackSizeLimit(MF),
	///    either spills one more GPR64 or creates an emergency spill slot for
	///    the register scavenger.
	/// Finally the callee-save area size (8 bytes per saved register, rounded up
	/// to a multiple of 16) is recorded in the function's AArch64FunctionInfo.
	///
	/// Functions using the GHC calling convention are left untouched.
	///
	/// \param MF        Function being lowered.
	/// \param SavedRegs In/out set of registers to save.
	/// \param RS        Register scavenger; dereferenced only when an emergency
	///                  spill slot is created, so it must be non-null on that
	///                  path.
	void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
	                                                BitVector &SavedRegs,
	                                                RegScavenger *RS) const {
	  // All calls are tail calls in GHC calling conv, and functions have no
	  // prologue/epilogue.
	  if (MF.getFunction()->getCallingConv() == CallingConv::GHC)
	    return;

	  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
	  // RegInfo is a pointer: every use below goes through operator->.
	  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
	      MF.getSubtarget().getRegisterInfo());
	  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	  // Last unsaved, non-reserved GPR64 seen in the callee-saved list, and its
	  // list neighbour. Candidates for an extra spill if the frame is big.
	  unsigned UnspilledCSGPR = AArch64::NoRegister;
	  unsigned UnspilledCSGPRPaired = AArch64::NoRegister;

	  // The frame record needs to be created by saving the appropriate registers
	  if (hasFP(MF)) {
	    SavedRegs.set(AArch64::FP);
	    SavedRegs.set(AArch64::LR);
	  }

	  unsigned BasePointerReg = AArch64::NoRegister;
	  if (RegInfo->hasBasePointer(MF))
	    BasePointerReg = RegInfo->getBaseRegister();

	  unsigned ExtraCSSpill = 0;
	  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
	  // Figure out which callee-saved registers to save/restore.
	  for (unsigned i = 0; CSRegs[i]; ++i) {
	    const unsigned Reg = CSRegs[i];

	    // Add the base pointer register to SavedRegs if it is callee-save.
	    if (Reg == BasePointerReg)
	      SavedRegs.set(Reg);

	    bool RegUsed = SavedRegs.test(Reg);
	    // Neighbour of Reg in the callee-saved list (indices 2k and 2k+1).
	    unsigned PairedReg = CSRegs[i ^ 1];
	    if (!RegUsed) {
	      if (AArch64::GPR64RegClass.contains(Reg) &&
	          !RegInfo->isReservedReg(MF, Reg)) {
	        UnspilledCSGPR = Reg;
	        UnspilledCSGPRPaired = PairedReg;
	      }
	      continue;
	    }

	    // MachO's compact unwind format relies on all registers being stored in
	    // pairs.
	    // FIXME: the usual format is actually better if unwinding isn't needed.
	    if (produceCompactUnwindFrame(MF) && !SavedRegs.test(PairedReg)) {
	      SavedRegs.set(PairedReg);
	      if (AArch64::GPR64RegClass.contains(PairedReg) &&
	          !RegInfo->isReservedReg(MF, PairedReg))
	        ExtraCSSpill = PairedReg;
	    }
	  }

	  DEBUG(dbgs() << "*** determineCalleeSaves\nUsed CSRs:";
	        for (unsigned Reg : SavedRegs.set_bits())
	          dbgs() << ' ' << PrintReg(Reg, RegInfo);
	        dbgs() << "\n";);

	  // If any callee-saved registers are used, the frame cannot be eliminated.
	  unsigned NumRegsSpilled = SavedRegs.count();
	  bool CanEliminateFrame = NumRegsSpilled == 0;

	  // The CSR spill slots have not been allocated yet, so estimateStackSize
	  // won't include them.
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
	  DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
	  unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
	  bool BigStack = (CFSize > EstimatedStackSizeLimit);
	  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
	    AFI->setHasStackFrame(true);

	  // Estimate if we might need to scavenge a register at some point in order
	  // to materialize a stack offset. If so, either spill one additional
	  // callee-saved register or reserve a special spill slot to facilitate
	  // register scavenging. If we already spilled an extra callee-saved register
	  // above to keep the number of spills even, we don't need to do anything else
	  // here.
	  if (BigStack) {
	    if (!ExtraCSSpill && UnspilledCSGPR != AArch64::NoRegister) {
	      DEBUG(dbgs() << "Spilling " << PrintReg(UnspilledCSGPR, RegInfo)
	                   << " to get a scratch register.\n");
	      SavedRegs.set(UnspilledCSGPR);
	      // MachO's compact unwind format relies on all registers being stored in
	      // pairs, so if we need to spill one extra for BigStack, then we need to
	      // store the pair.
	      if (produceCompactUnwindFrame(MF))
	        SavedRegs.set(UnspilledCSGPRPaired);
	      // NOTE(review): the paired register is recorded as the extra spill even
	      // when it was not added to SavedRegs above (non-compact-unwind case),
	      // while UnspilledCSGPR is the register that is always spilled here.
	      // Confirm this is intended.
	      ExtraCSSpill = UnspilledCSGPRPaired;
	      NumRegsSpilled = SavedRegs.count();
	    }

	    // If we didn't find an extra callee-saved register to spill, create
	    // an emergency spill slot.
	    if (!ExtraCSSpill || MF.getRegInfo().isPhysRegUsed(ExtraCSSpill)) {
	      const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
	      const TargetRegisterClass &RC = AArch64::GPR64RegClass;
	      unsigned Size = TRI->getSpillSize(RC);
	      unsigned Align = TRI->getSpillAlignment(RC);
	      int FI = MFI.CreateStackObject(Size, Align, false);
	      RS->addScavengingFrameIndex(FI);
	      DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
	                   << " as the emergency spill slot.\n");
	    }
	  }

	  // Round up to register pair alignment to avoid additional SP adjustment
	  // instructions.
	  AFI->setCalleeSavedStackSize(alignTo(8 * NumRegsSpilled, 16));
	}

	bool AArch64FrameLowering::enableStackSlotScavenging(
	const MachineFunction &MF) const {
	const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
	return AFI->hasCalleeSaveStackFreeSpace();
	}
	Index: head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (revision 322320)
	@@ -1,10835 +1,10835 @@
	//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation ----===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the AArch64TargetLowering class.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64ISelLowering.h"
	#include "AArch64CallingConvention.h"
	#include "AArch64MachineFunctionInfo.h"
	#include "AArch64PerfectShuffle.h"
	#include "AArch64RegisterInfo.h"
	#include "AArch64Subtarget.h"
	#include "MCTargetDesc/AArch64AddressingModes.h"
	#include "Utils/AArch64BaseInfo.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/APInt.h"
	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/VectorUtils.h"
	#include "llvm/CodeGen/CallingConvLower.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstr.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineMemOperand.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/MachineValueType.h"
	#include "llvm/CodeGen/RuntimeLibcalls.h"
	#include "llvm/CodeGen/SelectionDAG.h"
	#include "llvm/CodeGen/SelectionDAGNodes.h"
	#include "llvm/CodeGen/ValueTypes.h"
	#include "llvm/IR/Attributes.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugLoc.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GetElementPtrTypeIterator.h"
	#include "llvm/IR/GlobalValue.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/Instruction.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/OperandTraits.h"
	#include "llvm/IR/Type.h"
	#include "llvm/IR/Use.h"
	#include "llvm/IR/Value.h"
	#include "llvm/MC/MCRegisterInfo.h"
	#include "llvm/Support/Casting.h"
	#include "llvm/Support/CodeGen.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Compiler.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Target/TargetCallingConv.h"
	#include "llvm/Target/TargetInstrInfo.h"
	#include "llvm/Target/TargetMachine.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cassert>
	#include <cctype>
	#include <cstdint>
	#include <cstdlib>
	#include <iterator>
	#include <limits>
	#include <tuple>
	#include <utility>
	#include <vector>

	using namespace llvm;

	// Name under which this file's debug output and statistics are grouped.
	#define DEBUG_TYPE "aarch64-lower"

	// Statistics counters; the string is each counter's description.
	STATISTIC(NumTailCalls, "Number of tail calls");
	STATISTIC(NumShiftInserts, "Number of vector shift inserts");
	STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");

	// Hidden flag "aarch64-shift-insert-generation": allows SLI/SRI formation.
	// Off by default.
	static cl::opt<bool>
	EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
	cl::desc("Allow AArch64 SLI/SRI formation"),
	cl::init(false));

	// FIXME: The necessary dtprel relocations don't seem to be supported
	// well in the GNU bfd and gold linkers at the moment. Therefore, by
	// default, for now, fall back to GeneralDynamic code generation.
	// Hidden flag "aarch64-elf-ldtls-generation". Off by default.
	// NOTE(review): unlike the neighbouring options this one is not declared
	// static; confirm whether external linkage is required.
	cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
	"aarch64-elf-ldtls-generation", cl::Hidden,
	cl::desc("Allow AArch64 Local Dynamic TLS code generation"),
	cl::init(false));

	// Hidden flag "aarch64-enable-logical-imm": enables the logical immediate
	// instruction optimization. On by default.
	static cl::opt<bool>
	EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
	cl::desc("Enable AArch64 logical imm instruction "
	"optimization"),
	cl::init(true));

	/// Value type used for condition codes.
	static const MVT MVT_CC = MVT::i32;

	AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
	const AArch64Subtarget &STI)
	: TargetLowering(TM), Subtarget(&STI) {
	// AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
	// we have to make something up. Arbitrarily, choose ZeroOrOne.
	setBooleanContents(ZeroOrOneBooleanContent);
	// When comparing vectors the result sets the different elements in the
	// vector to all-one or all-zero.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// Set up the register classes.
	addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
	addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);

	if (Subtarget->hasFPARMv8()) {
	addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
	addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
	addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
	addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
	}

	if (Subtarget->hasNEON()) {
	addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
	addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
	// Someone set us up the NEON.
	addDRTypeForNEON(MVT::v2f32);
	addDRTypeForNEON(MVT::v8i8);
	addDRTypeForNEON(MVT::v4i16);
	addDRTypeForNEON(MVT::v2i32);
	addDRTypeForNEON(MVT::v1i64);
	addDRTypeForNEON(MVT::v1f64);
	addDRTypeForNEON(MVT::v4f16);

	addQRTypeForNEON(MVT::v4f32);
	addQRTypeForNEON(MVT::v2f64);
	addQRTypeForNEON(MVT::v16i8);
	addQRTypeForNEON(MVT::v8i16);
	addQRTypeForNEON(MVT::v4i32);
	addQRTypeForNEON(MVT::v2i64);
	addQRTypeForNEON(MVT::v8f16);
	}

	// Compute derived properties from the register classes
	computeRegisterProperties(Subtarget->getRegisterInfo());

	// Provide all sorts of operation actions
	setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
	setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::i32, Custom);
	setOperationAction(ISD::SETCC, MVT::i64, Custom);
	setOperationAction(ISD::SETCC, MVT::f32, Custom);
	setOperationAction(ISD::SETCC, MVT::f64, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
	setOperationAction(ISD::BRCOND, MVT::Other, Expand);
	setOperationAction(ISD::BR_CC, MVT::i32, Custom);
	setOperationAction(ISD::BR_CC, MVT::i64, Custom);
	setOperationAction(ISD::BR_CC, MVT::f32, Custom);
	setOperationAction(ISD::BR_CC, MVT::f64, Custom);
	setOperationAction(ISD::SELECT, MVT::i32, Custom);
	setOperationAction(ISD::SELECT, MVT::i64, Custom);
	setOperationAction(ISD::SELECT, MVT::f32, Custom);
	setOperationAction(ISD::SELECT, MVT::f64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
	setOperationAction(ISD::BR_JT, MVT::Other, Expand);
	setOperationAction(ISD::JumpTable, MVT::i64, Custom);

	setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
	setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);

	setOperationAction(ISD::FREM, MVT::f32, Expand);
	setOperationAction(ISD::FREM, MVT::f64, Expand);
	setOperationAction(ISD::FREM, MVT::f80, Expand);

	// Custom lowering hooks are needed for XOR
	// to fold it into CSINC/CSINV.
	setOperationAction(ISD::XOR, MVT::i32, Custom);
	setOperationAction(ISD::XOR, MVT::i64, Custom);

	// Virtually no operation on f128 is legal, but LLVM can't expand them when
	// there's a valid register class, so we need custom operations in most cases.
	setOperationAction(ISD::FABS, MVT::f128, Expand);
	setOperationAction(ISD::FADD, MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
	setOperationAction(ISD::FCOS, MVT::f128, Expand);
	setOperationAction(ISD::FDIV, MVT::f128, Custom);
	setOperationAction(ISD::FMA, MVT::f128, Expand);
	setOperationAction(ISD::FMUL, MVT::f128, Custom);
	setOperationAction(ISD::FNEG, MVT::f128, Expand);
	setOperationAction(ISD::FPOW, MVT::f128, Expand);
	setOperationAction(ISD::FREM, MVT::f128, Expand);
	setOperationAction(ISD::FRINT, MVT::f128, Expand);
	setOperationAction(ISD::FSIN, MVT::f128, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
	setOperationAction(ISD::FSQRT, MVT::f128, Expand);
	setOperationAction(ISD::FSUB, MVT::f128, Custom);
	setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
	setOperationAction(ISD::SETCC, MVT::f128, Custom);
	setOperationAction(ISD::BR_CC, MVT::f128, Custom);
	setOperationAction(ISD::SELECT, MVT::f128, Custom);
	setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
	setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);

	// Lowering for many of the conversions is actually specified by the non-f128
	// type. The LowerXXX function will be trivial when f128 isn't involved.
	setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);

	// Variable arguments.
	setOperationAction(ISD::VASTART, MVT::Other, Custom);
	setOperationAction(ISD::VAARG, MVT::Other, Custom);
	setOperationAction(ISD::VACOPY, MVT::Other, Custom);
	setOperationAction(ISD::VAEND, MVT::Other, Expand);

	// Variable-sized objects.
	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
	setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);

	// Constant pool entries
	setOperationAction(ISD::ConstantPool, MVT::i64, Custom);

	// BlockAddress
	setOperationAction(ISD::BlockAddress, MVT::i64, Custom);

	// Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
	setOperationAction(ISD::ADDC, MVT::i32, Custom);
	setOperationAction(ISD::ADDE, MVT::i32, Custom);
	setOperationAction(ISD::SUBC, MVT::i32, Custom);
	setOperationAction(ISD::SUBE, MVT::i32, Custom);
	setOperationAction(ISD::ADDC, MVT::i64, Custom);
	setOperationAction(ISD::ADDE, MVT::i64, Custom);
	setOperationAction(ISD::SUBC, MVT::i64, Custom);
	setOperationAction(ISD::SUBE, MVT::i64, Custom);

	// AArch64 lacks both left-rotate and popcount instructions.
	setOperationAction(ISD::ROTL, MVT::i32, Expand);
	setOperationAction(ISD::ROTL, MVT::i64, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	}

	// AArch64 doesn't have {U\|S}MUL_LOHI.
	setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
	setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);

	setOperationAction(ISD::CTPOP, MVT::i32, Custom);
	setOperationAction(ISD::CTPOP, MVT::i64, Custom);

	setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	}
	setOperationAction(ISD::SREM, MVT::i32, Expand);
	setOperationAction(ISD::SREM, MVT::i64, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
	setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
	setOperationAction(ISD::UREM, MVT::i32, Expand);
	setOperationAction(ISD::UREM, MVT::i64, Expand);

	// Custom lower Add/Sub/Mul with overflow.
	setOperationAction(ISD::SADDO, MVT::i32, Custom);
	setOperationAction(ISD::SADDO, MVT::i64, Custom);
	setOperationAction(ISD::UADDO, MVT::i32, Custom);
	setOperationAction(ISD::UADDO, MVT::i64, Custom);
	setOperationAction(ISD::SSUBO, MVT::i32, Custom);
	setOperationAction(ISD::SSUBO, MVT::i64, Custom);
	setOperationAction(ISD::USUBO, MVT::i32, Custom);
	setOperationAction(ISD::USUBO, MVT::i64, Custom);
	setOperationAction(ISD::SMULO, MVT::i32, Custom);
	setOperationAction(ISD::SMULO, MVT::i64, Custom);
	setOperationAction(ISD::UMULO, MVT::i32, Custom);
	setOperationAction(ISD::UMULO, MVT::i64, Custom);

	setOperationAction(ISD::FSIN, MVT::f32, Expand);
	setOperationAction(ISD::FSIN, MVT::f64, Expand);
	setOperationAction(ISD::FCOS, MVT::f32, Expand);
	setOperationAction(ISD::FCOS, MVT::f64, Expand);
	setOperationAction(ISD::FPOW, MVT::f32, Expand);
	setOperationAction(ISD::FPOW, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// f16 is a storage-only type, always promote it to f32.
	setOperationAction(ISD::SETCC, MVT::f16, Promote);
	setOperationAction(ISD::BR_CC, MVT::f16, Promote);
	setOperationAction(ISD::SELECT_CC, MVT::f16, Promote);
	setOperationAction(ISD::SELECT, MVT::f16, Promote);
	setOperationAction(ISD::FADD, MVT::f16, Promote);
	setOperationAction(ISD::FSUB, MVT::f16, Promote);
	setOperationAction(ISD::FMUL, MVT::f16, Promote);
	setOperationAction(ISD::FDIV, MVT::f16, Promote);
	setOperationAction(ISD::FREM, MVT::f16, Promote);
	setOperationAction(ISD::FMA, MVT::f16, Promote);
	setOperationAction(ISD::FNEG, MVT::f16, Promote);
	setOperationAction(ISD::FABS, MVT::f16, Promote);
	setOperationAction(ISD::FCEIL, MVT::f16, Promote);
	setOperationAction(ISD::FCOPYSIGN, MVT::f16, Promote);
	setOperationAction(ISD::FCOS, MVT::f16, Promote);
	setOperationAction(ISD::FFLOOR, MVT::f16, Promote);
	setOperationAction(ISD::FNEARBYINT, MVT::f16, Promote);
	setOperationAction(ISD::FPOW, MVT::f16, Promote);
	setOperationAction(ISD::FPOWI, MVT::f16, Promote);
	setOperationAction(ISD::FRINT, MVT::f16, Promote);
	setOperationAction(ISD::FSIN, MVT::f16, Promote);
	setOperationAction(ISD::FSINCOS, MVT::f16, Promote);
	setOperationAction(ISD::FSQRT, MVT::f16, Promote);
	setOperationAction(ISD::FEXP, MVT::f16, Promote);
	setOperationAction(ISD::FEXP2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG, MVT::f16, Promote);
	setOperationAction(ISD::FLOG2, MVT::f16, Promote);
	setOperationAction(ISD::FLOG10, MVT::f16, Promote);
	setOperationAction(ISD::FROUND, MVT::f16, Promote);
	setOperationAction(ISD::FTRUNC, MVT::f16, Promote);
	setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
	setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
	setOperationAction(ISD::FMAXNAN, MVT::f16, Promote);

	// v4f16 is also a storage-only type, so promote it to v4f32 when that is
	// known to be safe.
	setOperationAction(ISD::FADD, MVT::v4f16, Promote);
	setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
	setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
	setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
	setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
	setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
	AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
	AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);

	// Expand all other v4f16 operations.
	// FIXME: We could generate better code by promoting some operations to
	// a pair of v4f32s
	setOperationAction(ISD::FABS, MVT::v4f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f16, Expand);
	setOperationAction(ISD::FCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v4f16, Expand);
	setOperationAction(ISD::FMA, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
	setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
	setOperationAction(ISD::FREM, MVT::v4f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
	setOperationAction(ISD::FSIN, MVT::v4f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v4f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v4f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v4f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v4f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v4f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP, MVT::v4f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::v4f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::v4f16, Expand);


	// v8f16 is also a storage-only type, so expand it.
	setOperationAction(ISD::FABS, MVT::v8f16, Expand);
	setOperationAction(ISD::FADD, MVT::v8f16, Expand);
	setOperationAction(ISD::FCEIL, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v8f16, Expand);
	setOperationAction(ISD::FCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FDIV, MVT::v8f16, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v8f16, Expand);
	setOperationAction(ISD::FMA, MVT::v8f16, Expand);
	setOperationAction(ISD::FMUL, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
	setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
	setOperationAction(ISD::FREM, MVT::v8f16, Expand);
	setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
	setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSIN, MVT::v8f16, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v8f16, Expand);
	setOperationAction(ISD::FSQRT, MVT::v8f16, Expand);
	setOperationAction(ISD::FSUB, MVT::v8f16, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v8f16, Expand);
	setOperationAction(ISD::SETCC, MVT::v8f16, Expand);
	setOperationAction(ISD::BR_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT, MVT::v8f16, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v8f16, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP, MVT::v8f16, Expand);
	setOperationAction(ISD::FEXP2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG2, MVT::v8f16, Expand);
	setOperationAction(ISD::FLOG10, MVT::v8f16, Expand);

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::f32, MVT::f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	setOperationAction(ISD::FMINNUM, Ty, Legal);
	setOperationAction(ISD::FMAXNUM, Ty, Legal);
	setOperationAction(ISD::FMINNAN, Ty, Legal);
	setOperationAction(ISD::FMAXNAN, Ty, Legal);
	}

	setOperationAction(ISD::PREFETCH, MVT::Other, Custom);

	setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);

	// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
	// This requires the Performance Monitors extension.
	if (Subtarget->hasPerfMon())
	setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);

	if (Subtarget->isTargetMachO()) {
	// For iOS, we don't want the normal expansion of a libcall to
	// sincos. We want to issue a libcall to __sincos_stret to avoid memory
	// traffic.
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	} else {
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
	}

	// Make floating-point constants legal for the large code model, so they don't
	// become loads from the constant pool.
	if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
	setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
	setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
	}

	// AArch64 does not have floating-point extending loads, i1 sign-extending
	// load, floating-point truncating stores, or v2i32->v2i16 truncating store.
	for (MVT VT : MVT::fp_valuetypes()) {
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand);
	}
	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Expand);

	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f32, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f128, MVT::f80, Expand);
	setTruncStoreAction(MVT::f128, MVT::f64, Expand);
	setTruncStoreAction(MVT::f128, MVT::f32, Expand);
	setTruncStoreAction(MVT::f128, MVT::f16, Expand);

	setOperationAction(ISD::BITCAST, MVT::i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::f16, Custom);

	// Indexed loads and stores are supported.
	for (unsigned im = (unsigned)ISD::PRE_INC;
	im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	setIndexedLoadAction(im, MVT::i8, Legal);
	setIndexedLoadAction(im, MVT::i16, Legal);
	setIndexedLoadAction(im, MVT::i32, Legal);
	setIndexedLoadAction(im, MVT::i64, Legal);
	setIndexedLoadAction(im, MVT::f64, Legal);
	setIndexedLoadAction(im, MVT::f32, Legal);
	setIndexedLoadAction(im, MVT::f16, Legal);
	setIndexedStoreAction(im, MVT::i8, Legal);
	setIndexedStoreAction(im, MVT::i16, Legal);
	setIndexedStoreAction(im, MVT::i32, Legal);
	setIndexedStoreAction(im, MVT::i64, Legal);
	setIndexedStoreAction(im, MVT::f64, Legal);
	setIndexedStoreAction(im, MVT::f32, Legal);
	setIndexedStoreAction(im, MVT::f16, Legal);
	}

	// Trap.
	setOperationAction(ISD::TRAP, MVT::Other, Legal);

	// We combine OR nodes for bitfield operations.
	setTargetDAGCombine(ISD::OR);

	// Vector add and sub nodes may conceal a high-half opportunity.
	// Also, try to fold ADD into CSINC/CSINV.
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);

	setTargetDAGCombine(ISD::FP_TO_SINT);
	setTargetDAGCombine(ISD::FP_TO_UINT);
	setTargetDAGCombine(ISD::FDIV);

	setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);

	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::CONCAT_VECTORS);
	setTargetDAGCombine(ISD::STORE);
	if (Subtarget->supportsAddressTopByteIgnored())
	setTargetDAGCombine(ISD::LOAD);

	setTargetDAGCombine(ISD::MUL);

	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::VSELECT);

	setTargetDAGCombine(ISD::INTRINSIC_VOID);
	setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
	setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

	MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;

	setStackPointerRegisterToSaveRestore(AArch64::SP);

	setSchedulingPreference(Sched::Hybrid);

	EnableExtLdPromotion = true;

	// Set required alignment.
	setMinFunctionAlignment(2);
	// Set preferred alignments.
	setPrefFunctionAlignment(STI.getPrefFunctionAlignment());
	setPrefLoopAlignment(STI.getPrefLoopAlignment());

	// Only change the limit for entries in a jump table if specified by
	// the subtarget, but not at the command line.
	unsigned MaxJT = STI.getMaximumJumpTableSize();
	if (MaxJT && getMaximumJumpTableSize() == 0)
	setMaximumJumpTableSize(MaxJT);

	setHasExtractBitsInsn(true);

	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);

	if (Subtarget->hasNEON()) {
	// FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
	// silliness like this:
	setOperationAction(ISD::FABS, MVT::v1f64, Expand);
	setOperationAction(ISD::FADD, MVT::v1f64, Expand);
	setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
	setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
	setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
	setOperationAction(ISD::FMA, MVT::v1f64, Expand);
	setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
	setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
	setOperationAction(ISD::FREM, MVT::v1f64, Expand);
	setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
	setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
	setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
	setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
	setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
	setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
	setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
	setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
	setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);

	setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
	setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
	setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);

	setOperationAction(ISD::MUL, MVT::v1i64, Expand);

	// AArch64 doesn't have a direct vector ->f32 conversion instructions for
	// elements smaller than i32, so promote the input to i32 first.
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
	// i8 and i16 vector elements also need promotion to i32 for v8i8 or v8i16
	// -> v8f16 conversions.
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i8, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Promote);
	// Similarly, there is no direct i32 -> f64 vector conversion instruction.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
	// Or, direct i32 -> f16 vector conversion. Set it so custom, so the
	// conversion happens in two steps: v4i32 -> v4f32 -> v4f16
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);

	setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);

	setOperationAction(ISD::CTTZ, MVT::v2i8, Expand);
	setOperationAction(ISD::CTTZ, MVT::v4i16, Expand);
	setOperationAction(ISD::CTTZ, MVT::v2i32, Expand);
	setOperationAction(ISD::CTTZ, MVT::v1i64, Expand);
	setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
	setOperationAction(ISD::CTTZ, MVT::v8i16, Expand);
	setOperationAction(ISD::CTTZ, MVT::v4i32, Expand);
	setOperationAction(ISD::CTTZ, MVT::v2i64, Expand);

	// AArch64 doesn't have MUL.2d:
	setOperationAction(ISD::MUL, MVT::v2i64, Expand);
	// Custom handling for some quad-vector types to detect MULL.
	setOperationAction(ISD::MUL, MVT::v8i16, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);

	// Vector reductions
	for (MVT VT : MVT::integer_valuetypes()) {
	setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
	setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
	setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
	}
	for (MVT VT : MVT::fp_valuetypes()) {
	setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
	setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
	}

	setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
	// Likewise, narrowing and extending vector loads/stores aren't handled
	// directly.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);

	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);

	setOperationAction(ISD::BSWAP, VT, Expand);

	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(VT, InnerVT, Expand);
	setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Expand);
	setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
	}
	}

	// AArch64 has implementations of a lot of rounding-like FP operations.
	for (MVT Ty : {MVT::v2f32, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, Ty, Legal);
	setOperationAction(ISD::FNEARBYINT, Ty, Legal);
	setOperationAction(ISD::FCEIL, Ty, Legal);
	setOperationAction(ISD::FRINT, Ty, Legal);
	setOperationAction(ISD::FTRUNC, Ty, Legal);
	setOperationAction(ISD::FROUND, Ty, Legal);
	}
	}

	PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
	}

	/// Set the operation actions shared by every NEON vector type \p VT.
	///
	/// Floating-point vector loads and stores are promoted to an integer vector
	/// type (v2i32 for v2f32/v4f16, v2i64 for v2f64/v4f32/v8f16). Element and
	/// vector manipulation, shifts, AND/OR and SETCC are custom lowered. Selects,
	/// integer division/remainder and FREM are expanded. Indexed loads/stores
	/// are made legal only on little-endian subtargets.
	///
	/// The calls are order-sensitive: a later setOperationAction for the same
	/// (opcode, type) pair replaces an earlier one.
	///
	/// \param VT                 The NEON vector type being configured.
	/// \param PromotedBitwiseVT  Not read by this function; kept so existing
	///                           callers continue to compile.
	void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
	  if (VT == MVT::v2f32 || VT == MVT::v4f16) {
	    setOperationAction(ISD::LOAD, VT, Promote);
	    AddPromotedToType(ISD::LOAD, VT, MVT::v2i32);

	    setOperationAction(ISD::STORE, VT, Promote);
	    AddPromotedToType(ISD::STORE, VT, MVT::v2i32);
	  } else if (VT == MVT::v2f64 || VT == MVT::v4f32 || VT == MVT::v8f16) {
	    setOperationAction(ISD::LOAD, VT, Promote);
	    AddPromotedToType(ISD::LOAD, VT, MVT::v2i64);

	    setOperationAction(ISD::STORE, VT, Promote);
	    AddPromotedToType(ISD::STORE, VT, MVT::v2i64);
	  }

	  // Mark vector float intrinsics as expand.
	  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
	    setOperationAction(ISD::FSIN, VT, Expand);
	    setOperationAction(ISD::FCOS, VT, Expand);
	    setOperationAction(ISD::FPOW, VT, Expand);
	    setOperationAction(ISD::FLOG, VT, Expand);
	    setOperationAction(ISD::FLOG2, VT, Expand);
	    setOperationAction(ISD::FLOG10, VT, Expand);
	    setOperationAction(ISD::FEXP, VT, Expand);
	    setOperationAction(ISD::FEXP2, VT, Expand);

	    // But we do support custom-lowering for FCOPYSIGN.
	    setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	  }

	  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	  setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	  setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	  setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	  setOperationAction(ISD::SRA, VT, Custom);
	  setOperationAction(ISD::SRL, VT, Custom);
	  setOperationAction(ISD::SHL, VT, Custom);
	  setOperationAction(ISD::AND, VT, Custom);
	  setOperationAction(ISD::OR, VT, Custom);
	  setOperationAction(ISD::SETCC, VT, Custom);
	  setOperationAction(ISD::CONCAT_VECTORS, VT, Legal);

	  setOperationAction(ISD::SELECT, VT, Expand);
	  setOperationAction(ISD::SELECT_CC, VT, Expand);
	  setOperationAction(ISD::VSELECT, VT, Expand);
	  for (MVT InnerVT : MVT::all_valuetypes())
	    setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	  // CNT supports only B element sizes.
	  if (VT != MVT::v8i8 && VT != MVT::v16i8)
	    setOperationAction(ISD::CTPOP, VT, Expand);

	  setOperationAction(ISD::UDIV, VT, Expand);
	  setOperationAction(ISD::SDIV, VT, Expand);
	  setOperationAction(ISD::UREM, VT, Expand);
	  setOperationAction(ISD::SREM, VT, Expand);
	  setOperationAction(ISD::FREM, VT, Expand);

	  setOperationAction(ISD::FP_TO_SINT, VT, Custom);
	  setOperationAction(ISD::FP_TO_UINT, VT, Custom);

	  if (!VT.isFloatingPoint())
	    setOperationAction(ISD::ABS, VT, Legal);

	  // [SU][MIN|MAX] are available for all NEON types apart from i64.
	  if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
	    for (unsigned Opcode : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
	      setOperationAction(Opcode, VT, Legal);

	  // F[MIN|MAX][NUM|NAN] are available for all FP NEON types (not f16 though!).
	  if (VT.isFloatingPoint() && VT.getVectorElementType() != MVT::f16)
	    for (unsigned Opcode : {ISD::FMINNAN, ISD::FMAXNAN,
	                            ISD::FMINNUM, ISD::FMAXNUM})
	      setOperationAction(Opcode, VT, Legal);

	  // Indexed (pre/post increment/decrement) vector accesses are only enabled
	  // on little-endian subtargets.
	  if (Subtarget->isLittleEndian()) {
	    for (unsigned im = (unsigned)ISD::PRE_INC;
	         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
	      setIndexedLoadAction(im, VT, Legal);
	      setIndexedStoreAction(im, VT, Legal);
	    }
	  }
	}

	/// Register \p VT in AArch64::FPR64RegClass and apply the common NEON
	/// configuration from addTypeForNEON, passing MVT::v2i32 as the
	/// promoted-bitwise type.
	void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
	addRegisterClass(VT, &AArch64::FPR64RegClass);
	addTypeForNEON(VT, MVT::v2i32);
	}

	/// Register \p VT in AArch64::FPR128RegClass and apply the common NEON
	/// configuration from addTypeForNEON, passing MVT::v4i32 as the
	/// promoted-bitwise type.
	void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
	addRegisterClass(VT, &AArch64::FPR128RegClass);
	addTypeForNEON(VT, MVT::v4i32);
	}

	/// Report the result type of a SETCC whose operands have type \p VT.
	///
	/// Vector comparisons yield the type returned by
	/// VT.changeVectorElementTypeToInteger(); every non-vector comparison
	/// yields MVT::i32. The DataLayout and LLVMContext arguments are not used.
	EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
	                                              EVT VT) const {
	  if (VT.isVector())
	    return VT.changeVectorElementTypeToInteger();

	  return MVT::i32;
	}

	/// Try to replace the immediate of the logical operation \p Op with one that
	/// is cheaper to materialise, using the freedom in the non-demanded bits.
	///
	/// \param Op        The AND/OR/XOR node whose second operand is \p Imm.
	/// \param Size      Bit width of the operation; the caller asserts 32 or 64.
	/// \param Imm       The current immediate value.
	/// \param Demanded  Bits of the result that users actually read.
	/// \param TLO       Combiner state used to build and commit the replacement.
	/// \param NewOpc    Machine opcode to use when the new immediate is a
	///                  logical immediate that is neither all-zeros nor all-ones.
	/// \returns false if \p Imm is already zero, all-ones or a logical immediate,
	///          or if no suitable replacement is found; otherwise the result of
	///          TLO.CombineTo for the rewritten node.
	static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
	                               const APInt &Demanded,
	                               TargetLowering::TargetLoweringOpt &TLO,
	                               unsigned NewOpc) {
	  uint64_t OldImm = Imm, NewImm, Enc;
	  uint64_t Mask = ((uint64_t)(-1LL) >> (64 - Size)), OrigMask = Mask;

	  // Return if the immediate is already all zeros, all ones, a bimm32 or a
	  // bimm64.
	  if (Imm == 0 || Imm == Mask ||
	      AArch64_AM::isLogicalImmediate(Imm & Mask, Size))
	    return false;

	  unsigned EltSize = Size;
	  uint64_t DemandedBits = Demanded.getZExtValue();

	  // Clear bits that are not demanded.
	  Imm &= DemandedBits;

	  while (true) {
	    // The goal here is to set the non-demanded bits in a way that minimizes
	    // the number of switching between 0 and 1. In order to achieve this goal,
	    // we set the non-demanded bits to the value of the preceding demanded bits.
	    // For example, if we have an immediate 0bx10xx0x1 ('x' indicates a
	    // non-demanded bit), we copy bit0 (1) to the least significant 'x',
	    // bit2 (0) to 'xx', and bit6 (1) to the most significant 'x'.
	    // The final result is 0b11000011.
	    uint64_t NonDemandedBits = ~DemandedBits;
	    uint64_t InvertedImm = ~Imm & DemandedBits;
	    uint64_t RotatedImm =
	        ((InvertedImm << 1) | (InvertedImm >> (EltSize - 1) & 1)) &
	        NonDemandedBits;
	    uint64_t Sum = RotatedImm + NonDemandedBits;
	    bool Carry = NonDemandedBits & ~Sum & (1ULL << (EltSize - 1));
	    uint64_t Ones = (Sum + Carry) & NonDemandedBits;
	    NewImm = (Imm | Ones) & Mask;

	    // If NewImm or its bitwise NOT is a shifted mask, it is a bitmask immediate
	    // or all-ones or all-zeros, in which case we can stop searching. Otherwise,
	    // we halve the element size and continue the search.
	    if (isShiftedMask_64(NewImm) || isShiftedMask_64(~(NewImm | ~Mask)))
	      break;

	    // We cannot shrink the element size any further if it is 2-bits.
	    if (EltSize == 2)
	      return false;

	    EltSize /= 2;
	    Mask >>= EltSize;
	    uint64_t Hi = Imm >> EltSize, DemandedBitsHi = DemandedBits >> EltSize;

	    // Return if there is mismatch in any of the demanded bits of Imm and Hi.
	    if (((Imm ^ Hi) & (DemandedBits & DemandedBitsHi) & Mask) != 0)
	      return false;

	    // Merge the upper and lower halves of Imm and DemandedBits.
	    Imm |= Hi;
	    DemandedBits |= DemandedBitsHi;
	  }

	  ++NumOptimizedImms;

	  // Replicate the element across the register width.
	  while (EltSize < Size) {
	    NewImm |= NewImm << EltSize;
	    EltSize *= 2;
	  }

	  // OldImm is only read by the asserts below; the cast keeps release builds
	  // free of unused-variable warnings.
	  (void)OldImm;
	  assert(((OldImm ^ NewImm) & Demanded.getZExtValue()) == 0 &&
	         "demanded bits should never be altered");
	  assert(OldImm != NewImm && "the new imm shouldn't be equal to the old imm");

	  // Create the new constant immediate node.
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue New;

	  // If the new constant immediate is all-zeros or all-ones, let the target
	  // independent DAG combine optimize this node.
	  if (NewImm == 0 || NewImm == OrigMask) {
	    New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
	                          TLO.DAG.getConstant(NewImm, DL, VT));
	  // Otherwise, create a machine node so that target independent DAG combine
	  // doesn't undo this optimization.
	  } else {
	    Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size);
	    SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT);
	    New = SDValue(
	        TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0);
	  }

	  return TLO.CombineTo(Op, New);
	}

	/// Target hook that tries to rewrite the constant operand of a scalar
	/// AND/OR/XOR so that it becomes a cheaper immediate, given that only the
	/// bits in \p Demanded are observed.
	///
	/// The rewrite is attempted only after operation legalization
	/// (TLO.LegalOps), only when EnableOptimizeLogicalImm is set, only for
	/// non-vector types, and only when some bit is not demanded.
	///
	/// \returns true if the node was replaced (see optimizeLogicalImm), false
	///          otherwise.
	bool AArch64TargetLowering::targetShrinkDemandedConstant(
	    SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
	  // Delay this optimization to as late as possible.
	  if (!TLO.LegalOps)
	    return false;

	  if (!EnableOptimizeLogicalImm)
	    return false;

	  EVT VT = Op.getValueType();
	  if (VT.isVector())
	    return false;

	  unsigned Size = VT.getSizeInBits();
	  assert((Size == 32 || Size == 64) &&
	         "i32 or i64 is expected after legalization.");

	  // Exit early if we demand all bits.
	  if (Demanded.countPopulation() == Size)
	    return false;

	  // Pick the immediate-form machine opcode matching the operation and width.
	  unsigned NewOpc;
	  switch (Op.getOpcode()) {
	  default:
	    return false;
	  case ISD::AND:
	    NewOpc = Size == 32 ? AArch64::ANDWri : AArch64::ANDXri;
	    break;
	  case ISD::OR:
	    NewOpc = Size == 32 ? AArch64::ORRWri : AArch64::ORRXri;
	    break;
	  case ISD::XOR:
	    NewOpc = Size == 32 ? AArch64::EORWri : AArch64::EORXri;
	    break;
	  }

	  // Only a constant second operand can be rewritten.
	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!C)
	    return false;
	  uint64_t Imm = C->getZExtValue();
	  return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
	}

	/// computeKnownBitsForTargetNode - Determine which of the bits specified in
	/// Mask are known to be either zero or one and return them Known.
	void AArch64TargetLowering::computeKnownBitsForTargetNode(
	const SDValue Op, KnownBits &Known,
	const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
	switch (Op.getOpcode()) {
	default:
	break;
	case AArch64ISD::CSEL: {
	KnownBits Known2;
	DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1);
	DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1);
	Known.Zero &= Known2.Zero;
	Known.One &= Known2.One;
	break;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
	Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
	switch (IntID) {
	default: return;
	case Intrinsic::aarch64_ldaxr:
	case Intrinsic::aarch64_ldxr: {
	unsigned BitWidth = Known.getBitWidth();
	EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
	unsigned MemBits = VT.getScalarSizeInBits();
	Known.Zero \|= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
	return;
	}
	}
	break;
	}
	case ISD::INTRINSIC_WO_CHAIN:
	case ISD::INTRINSIC_VOID: {
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	switch (IntNo) {
	default:
	break;
	case Intrinsic::aarch64_neon_umaxv:
	case Intrinsic::aarch64_neon_uminv: {
	// Figure out the datatype of the vector operand. The UMINV instruction
	// will zero extend the result, so we can mark as known zero all the
	// bits larger than the element datatype. 32-bit or larget doesn't need
	// this as those are legal types and will be handled by isel directly.
	MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
	unsigned BitWidth = Known.getBitWidth();
	if (VT == MVT::v8i8 \|\| VT == MVT::v16i8) {
	assert(BitWidth >= 8 && "Unexpected width!");
	APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
	Known.Zero \|= Mask;
	} else if (VT == MVT::v4i16 \|\| VT == MVT::v8i16) {
	assert(BitWidth >= 16 && "Unexpected width!");
	APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
	Known.Zero \|= Mask;
	}
	break;
	} break;
	}
	}
	}
	}

	/// Return the type used for scalar shift amounts. This implementation
	/// always returns MVT::i64; neither \p DL nor the unnamed EVT argument is
	/// consulted.
	MVT AArch64TargetLowering::getScalarShiftAmountTy(const DataLayout &DL,
	EVT) const {
	return MVT::i64;
	}

	/// Report whether a misaligned access of type \p VT is permitted.
	///
	/// \param VT         Type of the memory access.
	/// \param AddrSpace  Not consulted by this implementation.
	/// \param Align      Alignment the access is known to have.
	/// \param Fast       Optional out-parameter; when non-null it is set to
	///                   whether the misaligned access is also considered fast.
	/// \returns false if the subtarget requires strict alignment (in which case
	///          \p Fast is left untouched), true otherwise.
	bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                           unsigned AddrSpace,
	                                                           unsigned Align,
	                                                           bool *Fast) const {
	  if (Subtarget->requiresStrictAlign())
	    return false;

	  if (Fast) {
	    // Some CPUs are fine with unaligned stores except for 128-bit ones.
	    *Fast = !Subtarget->isMisaligned128StoreSlow() || VT.getStoreSize() != 16 ||
	            // See comments in performSTORECombine() for more details about
	            // these conditions.

	            // Code that uses clang vector extensions can mark that it
	            // wants unaligned accesses to be treated as fast by
	            // underspecifying alignment to be 1 or 2.
	            Align <= 2 ||

	            // Disregard v2i64. Memcpy lowering produces those and splitting
	            // them regresses performance on micro-benchmarks and olden/bh.
	            VT == MVT::v2i64;
	  }
	  return true;
	}

	/// Create the fast instruction selector for this target by forwarding both
	/// arguments unchanged to AArch64::createFastISel.
	FastISel *
	AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const {
	return AArch64::createFastISel(funcInfo, libInfo);
	}

	/// Map an AArch64ISD opcode to its printable name.
	///
	/// Each listed enumerator returns its own qualified spelling, e.g.
	/// "AArch64ISD::CALL". FIRST_NUMBER and any value not listed yield nullptr.
	const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
	// Stringizing the enumerator keeps each name and its case label in sync.
	#define MAKE_CASE(V)                                                           \
	  case V:                                                                      \
	    return #V;
	  switch ((AArch64ISD::NodeType)Opcode) {
	  case AArch64ISD::FIRST_NUMBER:
	    break;
	    MAKE_CASE(AArch64ISD::CALL)
	    MAKE_CASE(AArch64ISD::ADRP)
	    MAKE_CASE(AArch64ISD::ADDlow)
	    MAKE_CASE(AArch64ISD::LOADgot)
	    MAKE_CASE(AArch64ISD::RET_FLAG)
	    MAKE_CASE(AArch64ISD::BRCOND)
	    MAKE_CASE(AArch64ISD::CSEL)
	    MAKE_CASE(AArch64ISD::FCSEL)
	    MAKE_CASE(AArch64ISD::CSINV)
	    MAKE_CASE(AArch64ISD::CSNEG)
	    MAKE_CASE(AArch64ISD::CSINC)
	    MAKE_CASE(AArch64ISD::THREAD_POINTER)
	    MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
	    MAKE_CASE(AArch64ISD::ADC)
	    MAKE_CASE(AArch64ISD::SBC)
	    MAKE_CASE(AArch64ISD::ADDS)
	    MAKE_CASE(AArch64ISD::SUBS)
	    MAKE_CASE(AArch64ISD::ADCS)
	    MAKE_CASE(AArch64ISD::SBCS)
	    MAKE_CASE(AArch64ISD::ANDS)
	    MAKE_CASE(AArch64ISD::CCMP)
	    MAKE_CASE(AArch64ISD::CCMN)
	    MAKE_CASE(AArch64ISD::FCCMP)
	    MAKE_CASE(AArch64ISD::FCMP)
	    MAKE_CASE(AArch64ISD::DUP)
	    MAKE_CASE(AArch64ISD::DUPLANE8)
	    MAKE_CASE(AArch64ISD::DUPLANE16)
	    MAKE_CASE(AArch64ISD::DUPLANE32)
	    MAKE_CASE(AArch64ISD::DUPLANE64)
	    MAKE_CASE(AArch64ISD::MOVI)
	    MAKE_CASE(AArch64ISD::MOVIshift)
	    MAKE_CASE(AArch64ISD::MOVIedit)
	    MAKE_CASE(AArch64ISD::MOVImsl)
	    MAKE_CASE(AArch64ISD::FMOV)
	    MAKE_CASE(AArch64ISD::MVNIshift)
	    MAKE_CASE(AArch64ISD::MVNImsl)
	    MAKE_CASE(AArch64ISD::BICi)
	    MAKE_CASE(AArch64ISD::ORRi)
	    MAKE_CASE(AArch64ISD::BSL)
	    MAKE_CASE(AArch64ISD::NEG)
	    MAKE_CASE(AArch64ISD::EXTR)
	    MAKE_CASE(AArch64ISD::ZIP1)
	    MAKE_CASE(AArch64ISD::ZIP2)
	    MAKE_CASE(AArch64ISD::UZP1)
	    MAKE_CASE(AArch64ISD::UZP2)
	    MAKE_CASE(AArch64ISD::TRN1)
	    MAKE_CASE(AArch64ISD::TRN2)
	    MAKE_CASE(AArch64ISD::REV16)
	    MAKE_CASE(AArch64ISD::REV32)
	    MAKE_CASE(AArch64ISD::REV64)
	    MAKE_CASE(AArch64ISD::EXT)
	    MAKE_CASE(AArch64ISD::VSHL)
	    MAKE_CASE(AArch64ISD::VLSHR)
	    MAKE_CASE(AArch64ISD::VASHR)
	    MAKE_CASE(AArch64ISD::CMEQ)
	    MAKE_CASE(AArch64ISD::CMGE)
	    MAKE_CASE(AArch64ISD::CMGT)
	    MAKE_CASE(AArch64ISD::CMHI)
	    MAKE_CASE(AArch64ISD::CMHS)
	    MAKE_CASE(AArch64ISD::FCMEQ)
	    MAKE_CASE(AArch64ISD::FCMGE)
	    MAKE_CASE(AArch64ISD::FCMGT)
	    MAKE_CASE(AArch64ISD::CMEQz)
	    MAKE_CASE(AArch64ISD::CMGEz)
	    MAKE_CASE(AArch64ISD::CMGTz)
	    MAKE_CASE(AArch64ISD::CMLEz)
	    MAKE_CASE(AArch64ISD::CMLTz)
	    MAKE_CASE(AArch64ISD::FCMEQz)
	    MAKE_CASE(AArch64ISD::FCMGEz)
	    MAKE_CASE(AArch64ISD::FCMGTz)
	    MAKE_CASE(AArch64ISD::FCMLEz)
	    MAKE_CASE(AArch64ISD::FCMLTz)
	    MAKE_CASE(AArch64ISD::SADDV)
	    MAKE_CASE(AArch64ISD::UADDV)
	    MAKE_CASE(AArch64ISD::SMINV)
	    MAKE_CASE(AArch64ISD::UMINV)
	    MAKE_CASE(AArch64ISD::SMAXV)
	    MAKE_CASE(AArch64ISD::UMAXV)
	    MAKE_CASE(AArch64ISD::NOT)
	    MAKE_CASE(AArch64ISD::BIT)
	    MAKE_CASE(AArch64ISD::CBZ)
	    MAKE_CASE(AArch64ISD::CBNZ)
	    MAKE_CASE(AArch64ISD::TBZ)
	    MAKE_CASE(AArch64ISD::TBNZ)
	    MAKE_CASE(AArch64ISD::TC_RETURN)
	    MAKE_CASE(AArch64ISD::PREFETCH)
	    MAKE_CASE(AArch64ISD::SITOF)
	    MAKE_CASE(AArch64ISD::UITOF)
	    MAKE_CASE(AArch64ISD::NVCAST)
	    MAKE_CASE(AArch64ISD::SQSHL_I)
	    MAKE_CASE(AArch64ISD::UQSHL_I)
	    MAKE_CASE(AArch64ISD::SRSHR_I)
	    MAKE_CASE(AArch64ISD::URSHR_I)
	    MAKE_CASE(AArch64ISD::SQSHLU_I)
	    MAKE_CASE(AArch64ISD::WrapperLarge)
	    MAKE_CASE(AArch64ISD::LD2post)
	    MAKE_CASE(AArch64ISD::LD3post)
	    MAKE_CASE(AArch64ISD::LD4post)
	    MAKE_CASE(AArch64ISD::ST2post)
	    MAKE_CASE(AArch64ISD::ST3post)
	    MAKE_CASE(AArch64ISD::ST4post)
	    MAKE_CASE(AArch64ISD::LD1x2post)
	    MAKE_CASE(AArch64ISD::LD1x3post)
	    MAKE_CASE(AArch64ISD::LD1x4post)
	    MAKE_CASE(AArch64ISD::ST1x2post)
	    MAKE_CASE(AArch64ISD::ST1x3post)
	    MAKE_CASE(AArch64ISD::ST1x4post)
	    MAKE_CASE(AArch64ISD::LD1DUPpost)
	    MAKE_CASE(AArch64ISD::LD2DUPpost)
	    MAKE_CASE(AArch64ISD::LD3DUPpost)
	    MAKE_CASE(AArch64ISD::LD4DUPpost)
	    MAKE_CASE(AArch64ISD::LD1LANEpost)
	    MAKE_CASE(AArch64ISD::LD2LANEpost)
	    MAKE_CASE(AArch64ISD::LD3LANEpost)
	    MAKE_CASE(AArch64ISD::LD4LANEpost)
	    MAKE_CASE(AArch64ISD::ST2LANEpost)
	    MAKE_CASE(AArch64ISD::ST3LANEpost)
	    MAKE_CASE(AArch64ISD::ST4LANEpost)
	    MAKE_CASE(AArch64ISD::SMULL)
	    MAKE_CASE(AArch64ISD::UMULL)
	    MAKE_CASE(AArch64ISD::FRECPE)
	    MAKE_CASE(AArch64ISD::FRECPS)
	    MAKE_CASE(AArch64ISD::FRSQRTE)
	    MAKE_CASE(AArch64ISD::FRSQRTS)
	  }
	#undef MAKE_CASE
	  return nullptr;
	}

	/// Expand the F128CSEL pseudo-instruction \p MI, located in \p MBB, into a
	/// conditional branch, a fall-through block and a PHI.
	///
	/// Two new blocks (TrueBB, EndBB) are inserted right after \p MBB. The
	/// instructions following \p MI and all successors of \p MBB are moved to
	/// EndBB, and \p MI is erased.
	///
	/// \returns EndBB, the block that now holds the rest of the original block.
	MachineBasicBlock *
	AArch64TargetLowering::EmitF128CSEL(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// We materialise the F128CSEL pseudo-instruction as some control flow and a
	// phi node:

	// OrigBB:
	// [... previous instrs leading to comparison ...]
	// b.ne TrueBB
	// b EndBB
	// TrueBB:
	// ; Fallthrough
	// EndBB:
	// Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]

	MachineFunction *MF = MBB->getParent();
	const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	DebugLoc DL = MI.getDebugLoc();
	// Insertion point: the block that currently follows MBB in the function.
	MachineFunction::iterator It = ++MBB->getIterator();

	// Operand layout read from the pseudo: 0 = destination, 1 = value if true,
	// 2 = value if false, 3 = condition code immediate.
	unsigned DestReg = MI.getOperand(0).getReg();
	unsigned IfTrueReg = MI.getOperand(1).getReg();
	unsigned IfFalseReg = MI.getOperand(2).getReg();
	unsigned CondCode = MI.getOperand(3).getImm();
	// NOTE(review): operand 4 is presumably the NZCV use - confirm against the
	// F128CSEL pseudo definition.
	bool NZCVKilled = MI.getOperand(4).isKill();

	MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
	MF->insert(It, TrueBB);
	MF->insert(It, EndBB);

	// Transfer rest of current basic-block to EndBB
	EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
	MBB->end());
	EndBB->transferSuccessorsAndUpdatePHIs(MBB);

	// Terminate the original block: conditional branch to TrueBB, otherwise
	// branch straight to EndBB.
	BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
	BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
	MBB->addSuccessor(TrueBB);
	MBB->addSuccessor(EndBB);

	// TrueBB falls through to the end.
	TrueBB->addSuccessor(EndBB);

	// If the pseudo did not kill NZCV, mark it live into both new blocks.
	if (!NZCVKilled) {
	TrueBB->addLiveIn(AArch64::NZCV);
	EndBB->addLiveIn(AArch64::NZCV);
	}

	// Select the result according to which predecessor control arrived from.
	BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
	.addReg(IfTrueReg)
	.addMBB(TrueBB)
	.addReg(IfFalseReg)
	.addMBB(MBB);

	MI.eraseFromParent();
	return EndBB;
	}

	/// Dispatch a pseudo-instruction that requested custom insertion to its
	/// expansion routine.
	///
	/// F128CSEL is handled by EmitF128CSEL; STACKMAP and PATCHPOINT are handled
	/// by emitPatchPoint. Any other opcode is a programming error: the
	/// instruction is dumped in asserts-enabled builds and llvm_unreachable
	/// fires.
	MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *BB) const {
	  switch (MI.getOpcode()) {
	  case AArch64::F128CSEL:
	    return EmitF128CSEL(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  default:
	    break;
	  }

	  // Only reached for opcodes with no custom inserter.
	#ifndef NDEBUG
	  MI.dump();
	#endif
	  llvm_unreachable("Unexpected instruction for custom inserter!");
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Lowering private implementation.
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Lowering Code
	//===----------------------------------------------------------------------===//

	/// Translate an integer ISD::CondCode into the AArch64 condition code that
	/// tests the same relation. Codes not listed below hit llvm_unreachable.
	static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
	  switch (CC) {
	  // Equality.
	  case ISD::SETNE:  return AArch64CC::NE;
	  case ISD::SETEQ:  return AArch64CC::EQ;
	  // Signed orderings.
	  case ISD::SETGT:  return AArch64CC::GT;
	  case ISD::SETGE:  return AArch64CC::GE;
	  case ISD::SETLT:  return AArch64CC::LT;
	  case ISD::SETLE:  return AArch64CC::LE;
	  // Unsigned orderings.
	  case ISD::SETUGT: return AArch64CC::HI;
	  case ISD::SETUGE: return AArch64CC::HS;
	  case ISD::SETULT: return AArch64CC::LO;
	  case ISD::SETULE: return AArch64CC::LS;
	  default:
	    llvm_unreachable("Unknown condition code!");
	  }
	}

	/// Translate a floating-point ISD::CondCode into AArch64 condition codes.
	///
	/// \p CondCode always receives a condition. \p CondCode2 is AArch64CC::AL
	/// unless the predicate needs a second condition (SETONE and SETUEQ), in
	/// which case it receives that second condition. Unlisted codes hit
	/// llvm_unreachable.
	static void changeFPCCToAArch64CC(ISD::CondCode CC,
	                                  AArch64CC::CondCode &CondCode,
	                                  AArch64CC::CondCode &CondCode2) {
	  // Start with no second condition; only two predicates override this.
	  CondCode2 = AArch64CC::AL;

	  switch (CC) {
	  case ISD::SETEQ:
	  case ISD::SETOEQ: CondCode = AArch64CC::EQ; break;
	  case ISD::SETGT:
	  case ISD::SETOGT: CondCode = AArch64CC::GT; break;
	  case ISD::SETGE:
	  case ISD::SETOGE: CondCode = AArch64CC::GE; break;
	  case ISD::SETOLT: CondCode = AArch64CC::MI; break;
	  case ISD::SETOLE: CondCode = AArch64CC::LS; break;
	  case ISD::SETO:   CondCode = AArch64CC::VC; break;
	  case ISD::SETUO:  CondCode = AArch64CC::VS; break;
	  case ISD::SETUGT: CondCode = AArch64CC::HI; break;
	  case ISD::SETUGE: CondCode = AArch64CC::PL; break;
	  case ISD::SETLT:
	  case ISD::SETULT: CondCode = AArch64CC::LT; break;
	  case ISD::SETLE:
	  case ISD::SETULE: CondCode = AArch64CC::LE; break;
	  case ISD::SETNE:
	  case ISD::SETUNE: CondCode = AArch64CC::NE; break;

	  // The two predicates that need a pair of conditions.
	  case ISD::SETONE:
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GT;
	    break;
	  case ISD::SETUEQ:
	    CondCode = AArch64CC::EQ;
	    CondCode2 = AArch64CC::VS;
	    break;

	  default:
	    llvm_unreachable("Unknown FP condition!");
	  }
	}

	/// Translate a floating-point ISD::CondCode into AArch64 condition codes
	/// that are meant to be AND'ed together, unlike changeFPCCToAArch64CC, whose
	/// pair is meant to be OR'ed.
	///
	/// Only SETONE and SETUEQ need a pair; every other predicate is delegated to
	/// changeFPCCToAArch64CC and must come back with \p CondCode2 == AL.
	static void changeFPCCToANDAArch64CC(ISD::CondCode CC,
	                                     AArch64CC::CondCode &CondCode,
	                                     AArch64CC::CondCode &CondCode2) {
	  CondCode2 = AArch64CC::AL;

	  if (CC == ISD::SETONE) {
	    // (a one b)
	    //   == ((a olt b) || (a ogt b))
	    //   == ((a ord b) && (a une b))
	    CondCode = AArch64CC::VC;
	    CondCode2 = AArch64CC::NE;
	  } else if (CC == ISD::SETUEQ) {
	    // (a ueq b)
	    //   == ((a uno b) || (a oeq b))
	    //   == ((a ule b) && (a uge b))
	    CondCode = AArch64CC::PL;
	    CondCode2 = AArch64CC::LE;
	  } else {
	    // Single-condition predicates map the same way in both schemes.
	    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	    assert(CondCode2 == AArch64CC::AL);
	  }
	}

	/// Translate a floating-point ISD::CondCode into AArch64 condition codes
	/// usable with the vector compare instructions.
	///
	/// Fewer operations are available without a real NZCV register, so some
	/// predicates are expressed as the inverse of another one. \p Invert is set
	/// to true when the caller must invert the result of the returned
	/// condition(s).
	static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
	                                        AArch64CC::CondCode &CondCode,
	                                        AArch64CC::CondCode &CondCode2,
	                                        bool &Invert) {
	  Invert = false;

	  switch (CC) {
	  case ISD::SETO:
	  case ISD::SETUO:
	    // Both use the MI/GE pair; SETUO is the inverted form of SETO.
	    Invert = (CC == ISD::SETUO);
	    CondCode = AArch64CC::MI;
	    CondCode2 = AArch64CC::GE;
	    break;

	  case ISD::SETUEQ:
	  case ISD::SETULT:
	  case ISD::SETULE:
	  case ISD::SETUGT:
	  case ISD::SETUGE:
	    // All of the compare-mask comparisons are ordered, but we can switch
	    // between the two by a double inversion. E.g. ULE == !OGT.
	    Invert = true;
	    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
	    break;

	  default:
	    // Mostly the scalar mappings work fine.
	    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
	    break;
	  }
	}

	/// Return true if \p C is accepted as an arithmetic immediate: either no bit
	/// at or above bit 12 is set, or the low 12 bits are all zero and no bit at
	/// or above bit 24 is set.
	static bool isLegalArithImmed(uint64_t C) {
	  // Matches AArch64DAGToDAGISel::SelectArithImmed().
	  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
	}

	/// Build the node that compares \p LHS with \p RHS and return its flags
	/// result.
	///
	/// Floating-point operands produce an AArch64ISD::FCMP (f16 inputs are first
	/// extended to f32; f128 is not accepted). Integer operands produce SUBS by
	/// default, ADDS when RHS is (sub 0, x) under SETEQ/SETNE, or ANDS when LHS
	/// is an AND compared against zero under a non-unsigned condition.
	///
	/// \param CC  Condition the flags will be tested with; only used to decide
	///            whether the ADDS/ANDS forms are safe.
	static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                              const SDLoc &dl, SelectionDAG &DAG) {
	  EVT VT = LHS.getValueType();

	  if (VT.isFloatingPoint()) {
	    assert(VT != MVT::f128);
	    if (VT == MVT::f16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	      VT = MVT::f32;
	    }
	    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
	  }

	  // The CMP instruction is just an alias for SUBS, and representing it as
	  // SUBS means that it's possible to get CSE with subtract operations.
	  // A later phase can perform the optimization of setting the destination
	  // register to WZR/XZR if it ends up being unused.
	  unsigned Opcode = AArch64ISD::SUBS;

	  if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
	      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
	    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
	    // can be set differently by this operation. It comes down to whether
	    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
	    // everything is fine. If not then the optimization is wrong. Thus general
	    // comparisons are only valid if op2 != 0.

	    // So, finally, the only LLVM-native comparisons that don't mention C and V
	    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
	    // the absence of information about op2.
	    Opcode = AArch64ISD::ADDS;
	    RHS = RHS.getOperand(1);
	  } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
	             !isUnsignedIntSetCC(CC)) {
	    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
	    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
	    // of the signed comparisons.
	    Opcode = AArch64ISD::ANDS;
	    RHS = LHS.getOperand(1);
	    LHS = LHS.getOperand(0);
	  }

	  // Result 0 is the arithmetic value; result 1 carries the flags.
	  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
	      .getValue(1);
	}

	/// \defgroup AArch64CCMP CMP;CCMP matching
	///
	/// These functions deal with the formation of CMP;CCMP;... sequences.
	/// The CCMP/CCMN/FCCMP/FCCMPE instructions allow the conditional execution of
	/// a comparison. They set the NZCV flags to a predefined value if their
	/// predicate is false. This makes it possible to express arbitrary
	/// conjunctions, for example "cmp 0 (and (setCA (cmp A)) (setCB (cmp B)))"
	/// expressed as:
	/// cmp A
	/// ccmp B, inv(CB), CA
	/// check for CB flags
	///
	/// In general we can create code for arbitrary "... (and (and A B) C)"
	/// sequences. We can also implement some "or" expressions, because "(or A B)"
	/// is equivalent to "not (and (not A) (not B))" and we can implement some
	/// negation operations:
	/// We can negate the results of a single comparison by inverting the flags
	/// used when the predicate fails and inverting the flags tested in the next
	/// instruction; We can also negate the results of the whole previous
	/// conditional compare sequence by inverting the flags tested in the next
	/// instruction. However there is no way to negate the result of a partial
	/// sequence.
	///
	/// Therefore on encountering an "or" expression we can negate the subtree on
	/// one side and have to be able to push the negate to the leafs of the subtree
	/// on the other side (see also the comments in code). As complete example:
	/// "or (or (setCA (cmp A)) (setCB (cmp B)))
	/// (and (setCC (cmp C)) (setCD (cmp D)))"
	/// is transformed to
	/// "not (and (not (and (setCC (cmp C)) (setCD (cmp D))))
	/// (and (not (setCA (cmp A))) (not (setCB (cmp B)))))"
	/// and implemented as:
	/// cmp C
	/// ccmp D, inv(CD), CC
	/// ccmp A, CA, inv(CD)
	/// ccmp B, CB, inv(CA)
	/// check for CB flags
	/// A counterexample is "or (and A B) (and C D)" which cannot be implemented
	/// by conditional compare sequences.
	/// @{

	/// Create a conditional comparison; Use CCMP, CCMN or FCCMP as appropriate.
	///
	/// @p CCOp is the flags-producing node the new comparison is conditional on
	/// and @p Predicate the condition tested against it. The NZCV immediate
	/// operand is obtained from getNZCVToSatisfyCondCode() for the inverse of
	/// @p OutCC. FCCMP is used for floating-point operands (f16 is extended to
	/// f32 first, f128 is asserted against), CCMN when @p RHS is (sub 0, x) and
	/// @p CC is SETEQ/SETNE, and CCMP otherwise.
	static SDValue emitConditionalComparison(SDValue LHS, SDValue RHS,
	                                         ISD::CondCode CC, SDValue CCOp,
	                                         AArch64CC::CondCode Predicate,
	                                         AArch64CC::CondCode OutCC,
	                                         const SDLoc &DL, SelectionDAG &DAG) {
	  unsigned Opcode = 0;
	  if (LHS.getValueType().isFloatingPoint()) {
	    assert(LHS.getValueType() != MVT::f128);
	    if (LHS.getValueType() == MVT::f16) {
	      LHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, LHS);
	      RHS = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, RHS);
	    }
	    Opcode = AArch64ISD::FCCMP;
	  } else if (RHS.getOpcode() == ISD::SUB) {
	    SDValue SubOp0 = RHS.getOperand(0);
	    if (isNullConstant(SubOp0) && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
	      // See emitComparison() on why we can only do this for SETEQ and SETNE.
	      Opcode = AArch64ISD::CCMN;
	      RHS = RHS.getOperand(1);
	    }
	  }
	  // Neither special form matched: plain conditional compare.
	  if (Opcode == 0)
	    Opcode = AArch64ISD::CCMP;

	  SDValue Condition = DAG.getConstant(Predicate, DL, MVT_CC);
	  AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
	  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
	  SDValue NZCVOp = DAG.getConstant(NZCV, DL, MVT::i32);
	  return DAG.getNode(Opcode, DL, MVT_CC, LHS, RHS, NZCVOp, Condition, CCOp);
	}

	/// Returns true if @p Val is a tree of single-use AND/OR/SETCC operations
	/// that satisfies the checks below.
	/// @p CanNegate is set to true if we can push a negate operation through
	/// the tree in a way that we are left with AND operations and negate
	/// operations at the leafs only. i.e. "not (or (or x y) z)" can be changed to
	/// "and (and (not x) (not y)) (not z)"; "not (or (and x y) z)" cannot be
	/// brought into such a form. @p CanNegate is only written when the function
	/// returns true.
	/// @p Depth is the current recursion depth; AND/OR nodes deeper than 6 levels
	/// are rejected. SETCC leafs comparing f128 values are rejected as well.
	static bool isConjunctionDisjunctionTree(const SDValue Val, bool &CanNegate,
	                                         unsigned Depth = 0) {
	  if (!Val.hasOneUse())
	    return false;
	  unsigned Opcode = Val->getOpcode();
	  if (Opcode == ISD::SETCC) {
	    if (Val->getOperand(0).getValueType() == MVT::f128)
	      return false;
	    CanNegate = true;
	    return true;
	  }
	  // Protect against exponential runtime and stack overflow.
	  if (Depth > 6)
	    return false;
	  if (Opcode == ISD::AND || Opcode == ISD::OR) {
	    SDValue O0 = Val->getOperand(0);
	    SDValue O1 = Val->getOperand(1);
	    bool CanNegateL;
	    if (!isConjunctionDisjunctionTree(O0, CanNegateL, Depth+1))
	      return false;
	    bool CanNegateR;
	    if (!isConjunctionDisjunctionTree(O1, CanNegateR, Depth+1))
	      return false;

	    if (Opcode == ISD::OR) {
	      // For an OR expression we need to be able to negate at least one side or
	      // we cannot do the transformation at all.
	      if (!CanNegateL && !CanNegateR)
	        return false;
	      // We can however change a (not (or x y)) to (and (not x) (not y)) if we
	      // can negate the x and y subtrees.
	      CanNegate = CanNegateL && CanNegateR;
	    } else {
	      // If the operands are OR expressions then we finally need to negate their
	      // outputs, we can only do that for the operand that is emitted last by
	      // negating OutCC, not for both operands.
	      bool NeedsNegOutL = O0->getOpcode() == ISD::OR;
	      bool NeedsNegOutR = O1->getOpcode() == ISD::OR;
	      if (NeedsNegOutL && NeedsNegOutR)
	        return false;
	      // We cannot negate an AND operation (it would become an OR),
	      CanNegate = false;
	    }
	    return true;
	  }
	  return false;
	}

	/// Emit conjunction or disjunction tree with the CMP/FCMP followed by a chain
	/// of CCMP/FCCMP ops. See @ref AArch64CCMP.
	/// Transforms the given i1 producing node @p Val to a series of compare
	/// and conditional compare operations. @returns an NZCV flags producing node
	/// and sets @p OutCC to the flags that should be tested.
	/// @p Val is expected to have been accepted by
	/// isConjunctionDisjunctionTree(); this is only asserted here.
	/// On recursive invocations @p Negate may be set to true to have negation
	/// effects pushed to the tree leafs; @p CCOp is the flags node produced by
	/// the comparisons emitted so far (an empty SDValue when this subtree starts
	/// the chain); @p Predicate is an NZCV flag predicate for the comparisons in
	/// the current subtree.
	static SDValue emitConjunctionDisjunctionTreeRec(SelectionDAG &DAG, SDValue Val,
	    AArch64CC::CondCode &OutCC, bool Negate, SDValue CCOp,
	    AArch64CC::CondCode Predicate) {
	  // We're at a tree leaf, produce a conditional comparison operation.
	  unsigned Opcode = Val->getOpcode();
	  if (Opcode == ISD::SETCC) {
	    SDValue LHS = Val->getOperand(0);
	    SDValue RHS = Val->getOperand(1);
	    ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
	    bool isInteger = LHS.getValueType().isInteger();
	    if (Negate)
	      CC = getSetCCInverse(CC, isInteger);
	    SDLoc DL(Val);
	    // Determine OutCC and handle FP special case.
	    if (isInteger) {
	      OutCC = changeIntCCToAArch64CC(CC);
	    } else {
	      assert(LHS.getValueType().isFloatingPoint());
	      AArch64CC::CondCode ExtraCC;
	      changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
	      // Some floating point conditions can't be tested with a single condition
	      // code. Construct an additional comparison in this case.
	      if (ExtraCC != AArch64CC::AL) {
	        SDValue ExtraCmp;
	        if (!CCOp.getNode())
	          ExtraCmp = emitComparison(LHS, RHS, CC, DL, DAG);
	        else
	          ExtraCmp = emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate,
	                                               ExtraCC, DL, DAG);
	        // The comparison emitted below is chained onto the extra one.
	        CCOp = ExtraCmp;
	        Predicate = ExtraCC;
	      }
	    }

	    // Produce a normal comparison if we are first in the chain
	    if (!CCOp)
	      return emitComparison(LHS, RHS, CC, DL, DAG);
	    // Otherwise produce a ccmp.
	    return emitConditionalComparison(LHS, RHS, CC, CCOp, Predicate, OutCC, DL,
	                                     DAG);
	  }
	  assert((Opcode == ISD::AND || (Opcode == ISD::OR && Val->hasOneUse())) &&
	         "Valid conjunction/disjunction tree");

	  // Check if both sides can be transformed.
	  SDValue LHS = Val->getOperand(0);
	  SDValue RHS = Val->getOperand(1);

	  // In case of an OR we need to negate our operands and the result.
	  // (A or B) <=> not(not(A) and not(B))
	  bool NegateOpsAndResult = Opcode == ISD::OR;
	  // We can negate the results of all previous operations by inverting the
	  // predicate flags giving us a free negation for one side. The other side
	  // must be negatable by itself.
	  if (NegateOpsAndResult) {
	    // See which side we can negate.
	    bool CanNegateL;
	    bool isValidL = isConjunctionDisjunctionTree(LHS, CanNegateL);
	    assert(isValidL && "Valid conjunction/disjunction tree");
	    (void)isValidL;

	#ifndef NDEBUG
	    bool CanNegateR;
	    bool isValidR = isConjunctionDisjunctionTree(RHS, CanNegateR);
	    assert(isValidR && "Valid conjunction/disjunction tree");
	    assert((CanNegateL || CanNegateR) && "Valid conjunction/disjunction tree");
	#endif

	    // Order the side which we cannot negate to RHS so we can emit it first.
	    if (!CanNegateL)
	      std::swap(LHS, RHS);
	  } else {
	    bool NeedsNegOutL = LHS->getOpcode() == ISD::OR;
	    assert((!NeedsNegOutL || RHS->getOpcode() != ISD::OR) &&
	           "Valid conjunction/disjunction tree");
	    // Order the side where we need to negate the output flags to RHS so it
	    // gets emitted first.
	    if (NeedsNegOutL)
	      std::swap(LHS, RHS);
	  }

	  // Emit RHS. If we want to negate the tree we only need to push a negate
	  // through if we are already in a Negate case, otherwise we can negate
	  // the "flags to test" afterwards.
	  AArch64CC::CondCode RHSCC;
	  SDValue CmpR = emitConjunctionDisjunctionTreeRec(DAG, RHS, RHSCC, Negate,
	                                                   CCOp, Predicate);
	  if (NegateOpsAndResult && !Negate)
	    RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
	  // Emit LHS. We may need to negate it.
	  SDValue CmpL = emitConjunctionDisjunctionTreeRec(DAG, LHS, OutCC,
	                                                   NegateOpsAndResult, CmpR,
	                                                   RHSCC);
	  // If we transformed an OR to an AND then we have to negate the result
	  // (or absorb the Negate parameter).
	  if (NegateOpsAndResult && !Negate)
	    OutCC = AArch64CC::getInvertedCondCode(OutCC);
	  return CmpL;
	}

	/// Entry point for turning an AND/OR/SETCC tree into a CMP/FCMP followed by
	/// a chain of CCMP/FCCMP ops. See @ref AArch64CCMP.
	/// Returns an empty SDValue when isConjunctionDisjunctionTree() rejects
	/// @p Val; otherwise forwards to emitConjunctionDisjunctionTreeRec() with no
	/// pending negation, no previous flags node and the AL predicate.
	/// \see emitConjunctionDisjunctionTreeRec().
	static SDValue emitConjunctionDisjunctionTree(SelectionDAG &DAG, SDValue Val,
	                                              AArch64CC::CondCode &OutCC) {
	  // Only the yes/no answer matters here; the negatability result is unused.
	  bool IgnoredCanNegate;
	  const bool IsSupportedTree =
	      isConjunctionDisjunctionTree(Val, IgnoredCanNegate);
	  if (IsSupportedTree)
	    return emitConjunctionDisjunctionTreeRec(DAG, Val, OutCC, /*Negate=*/false,
	                                             /*CCOp=*/SDValue(),
	                                             /*Predicate=*/AArch64CC::AL);
	  return SDValue();
	}

	/// @}

	/// Emit the integer comparison of @p LHS with @p RHS under @p CC.
	///
	/// Returns the flags-producing node and stores in @p AArch64cc the constant
	/// holding the AArch64 condition code that must be tested on it. Before
	/// emitting, three rewrites are attempted:
	///  - a constant @p RHS that fails isLegalArithImmed() is moved by one
	///    (with @p CC adjusted to match) when the neighbouring value passes;
	///  - an equality compare of a zero-extending i16 load against a constant
	///    whose 16-bit value is negative is redone on the sign-extended value;
	///  - an equality compare of an AND/OR/SETCC tree against 0 or 1 is emitted
	///    through emitConjunctionDisjunctionTree().
	static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
	                             SDValue &AArch64cc, SelectionDAG &DAG,
	                             const SDLoc &dl) {
	  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
	    EVT VT = RHS.getValueType();
	    uint64_t C = RHSC->getZExtValue();
	    if (!isLegalArithImmed(C)) {
	      // Constant does not fit, try adjusting it by one?
	      // Each case excludes the value at which the +/-1 step would wrap.
	      switch (CC) {
	      default:
	        break;
	      case ISD::SETLT:
	      case ISD::SETGE:
	        // The i64 guard names the i64 signed minimum. isLegalArithImmed()
	        // rejects C - 1 for that value anyway, so this only states the
	        // intent of the wrap-around guard explicitly.
	        if ((VT == MVT::i32 && C != 0x80000000 &&
	             isLegalArithImmed((uint32_t)(C - 1))) ||
	            (VT == MVT::i64 && C != 0x8000000000000000ULL &&
	             isLegalArithImmed(C - 1ULL))) {
	          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
	          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETULT:
	      case ISD::SETUGE:
	        if ((VT == MVT::i32 && C != 0 &&
	             isLegalArithImmed((uint32_t)(C - 1))) ||
	            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
	          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
	          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETLE:
	      case ISD::SETGT:
	        if ((VT == MVT::i32 && C != INT32_MAX &&
	             isLegalArithImmed((uint32_t)(C + 1))) ||
	            (VT == MVT::i64 && C != INT64_MAX &&
	             isLegalArithImmed(C + 1ULL))) {
	          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
	          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      case ISD::SETULE:
	      case ISD::SETUGT:
	        if ((VT == MVT::i32 && C != UINT32_MAX &&
	             isLegalArithImmed((uint32_t)(C + 1))) ||
	            (VT == MVT::i64 && C != UINT64_MAX &&
	             isLegalArithImmed(C + 1ULL))) {
	          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
	          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
	          RHS = DAG.getConstant(C, dl, VT);
	        }
	        break;
	      }
	    }
	  }
	  SDValue Cmp;
	  AArch64CC::CondCode AArch64CC;
	  if ((CC == ISD::SETEQ || CC == ISD::SETNE) && isa<ConstantSDNode>(RHS)) {
	    const ConstantSDNode *RHSC = cast<ConstantSDNode>(RHS);

	    // The imm operand of ADDS is an unsigned immediate, in the range 0 to 4095.
	    // For the i8 operand, the largest immediate is 255, so this can be easily
	    // encoded in the compare instruction. For the i16 operand, however, the
	    // largest immediate cannot be encoded in the compare.
	    // Therefore, use a sign extending load and cmn to avoid materializing the
	    // -1 constant. For example,
	    // movz w1, #65535
	    // ldrh w0, [x0, #0]
	    // cmp w0, w1
	    // >
	    // ldrsh w0, [x0, #0]
	    // cmn w0, #1
	    // Fundamentally, we're relying on the property that (zext LHS) == (zext RHS)
	    // if and only if (sext LHS) == (sext RHS). The checks are in place to
	    // ensure both the LHS and RHS are truly zero extended and to make sure the
	    // transformation is profitable.
	    if ((RHSC->getZExtValue() >> 16 == 0) && isa<LoadSDNode>(LHS) &&
	        cast<LoadSDNode>(LHS)->getExtensionType() == ISD::ZEXTLOAD &&
	        cast<LoadSDNode>(LHS)->getMemoryVT() == MVT::i16 &&
	        LHS.getNode()->hasNUsesOfValue(1, 0)) {
	      // Reinterpret the low 16 bits of the constant as a signed value.
	      int16_t ValueofRHS = cast<ConstantSDNode>(RHS)->getZExtValue();
	      if (ValueofRHS < 0 && isLegalArithImmed(-ValueofRHS)) {
	        SDValue SExt =
	            DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, LHS.getValueType(), LHS,
	                        DAG.getValueType(MVT::i16));
	        Cmp = emitComparison(SExt, DAG.getConstant(ValueofRHS, dl,
	                                                   RHS.getValueType()),
	                             CC, dl, DAG);
	        AArch64CC = changeIntCCToAArch64CC(CC);
	      }
	    }

	    if (!Cmp && (RHSC->isNullValue() || RHSC->isOne())) {
	      if ((Cmp = emitConjunctionDisjunctionTree(DAG, LHS, AArch64CC))) {
	        // "tree != 0" and "tree == 1" test the tree's own condition; the
	        // other two combinations test its inverse.
	        if ((CC == ISD::SETNE) ^ RHSC->isNullValue())
	          AArch64CC = AArch64CC::getInvertedCondCode(AArch64CC);
	      }
	    }
	  }

	  // No special form applied: emit the plain comparison.
	  if (!Cmp) {
	    Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	    AArch64CC = changeIntCCToAArch64CC(CC);
	  }
	  AArch64cc = DAG.getConstant(AArch64CC, dl, MVT_CC);
	  return Cmp;
	}

	/// Build the nodes for an arithmetic-with-overflow operation @p Op
	/// (SADDO/UADDO/SSUBO/USUBO/SMULO/UMULO on i32 or i64).
	///
	/// Returns the pair (arithmetic result, flags-producing value) and sets
	/// @p CC to the condition code that must be tested on the flags to detect
	/// the overflow. Add and subtract are emitted as a single ADDS/SUBS node;
	/// multiplies compute the product first and then emit a separate SUBS that
	/// compares the high part of the product with what it would be without
	/// overflow.
	static std::pair<SDValue, SDValue>
	getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
	         "Unsupported value type");
	  SDValue Value, Overflow;
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  // Stays 0 for the multiply cases, which build Value/Overflow themselves.
	  unsigned Opc = 0;
	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown overflow instruction!");
	  case ISD::SADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::UADDO:
	    Opc = AArch64ISD::ADDS;
	    CC = AArch64CC::HS;
	    break;
	  case ISD::SSUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::VS;
	    break;
	  case ISD::USUBO:
	    Opc = AArch64ISD::SUBS;
	    CC = AArch64CC::LO;
	    break;
	  // Multiply needs a little bit extra work.
	  case ISD::SMULO:
	  case ISD::UMULO: {
	    CC = AArch64CC::NE;
	    bool IsSigned = Op.getOpcode() == ISD::SMULO;
	    if (Op.getValueType() == MVT::i32) {
	      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	      // For a 32 bit multiply with overflow check we want the instruction
	      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
	      // need to generate the following pattern:
	      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
	      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
	      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
	      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
	                                DAG.getConstant(0, DL, MVT::i64));
	      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
	      // operation. We need to clear out the upper 32 bits, because we used a
	      // widening multiply that wrote all 64 bits. In the end this should be a
	      // noop.
	      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
	      if (IsSigned) {
	        // The signed overflow check requires more than just a simple check for
	        // any bit set in the upper 32 bits of the result. These bits could be
	        // just the sign bits of a negative number. To perform the overflow
	        // check we have to arithmetic shift right the 32nd bit of the result by
	        // 31 bits. Then we compare the result to the upper 32 bits.
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
	        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
	                                        DAG.getConstant(31, DL, MVT::i64));
	        // It is important that LowerBits is last, otherwise the arithmetic
	        // shift will not be folded into the compare (SUBS).
	        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
	        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                       .getValue(1);
	      } else {
	        // The overflow check for unsigned multiply is easy. We only need to
	        // check if any of the upper 32 bits are set. This can be done with a
	        // CMP (shifted register). For that we need to generate the following
	        // pattern:
	        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
	        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
	                                        DAG.getConstant(32, DL, MVT::i64));
	        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	        Overflow =
	            DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                        DAG.getConstant(0, DL, MVT::i64),
	                        UpperBits).getValue(1);
	      }
	      break;
	    }
	    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
	    // For the 64 bit multiply
	    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
	    if (IsSigned) {
	      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
	      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
	                                      DAG.getConstant(63, DL, MVT::i64));
	      // It is important that LowerBits is last, otherwise the arithmetic
	      // shift will not be folded into the compare (SUBS).
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
	                     .getValue(1);
	    } else {
	      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
	      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
	      Overflow =
	          DAG.getNode(AArch64ISD::SUBS, DL, VTs,
	                      DAG.getConstant(0, DL, MVT::i64),
	                      UpperBits).getValue(1);
	    }
	    break;
	  }
	  } // switch (...)

	  if (Opc) {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);

	    // Emit the AArch64 operation with overflow check.
	    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
	    Overflow = Value.getValue(1);
	  }
	  return std::make_pair(Value, Overflow);
	}

	/// Lower @p Op to the runtime library call @p Call, passing all operands of
	/// @p Op as arguments and using f128 as the return type. Only the first
	/// element of the pair returned by makeLibCall() is handed back.
	SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
	                                             RTLIB::Libcall Call) const {
	  SDLoc DL(Op);
	  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
	  std::pair<SDValue, SDValue> CallResult =
	      makeLibCall(DAG, Call, MVT::f128, CallArgs, /*isSigned=*/false, DL);
	  return CallResult.first;
	}

	/// Try to fold (xor x, (select_cc a, b, cc, 0, -1)) into a CSEL between x and
	/// (xor x, -1). Returns @p Op unchanged when the pattern does not match:
	/// neither operand is a SELECT_CC, the compared values are not i32/i64, or
	/// the select arms are not the constants 0 and -1 (in either order).
	static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
	  SDValue Sel = Op.getOperand(0);
	  SDValue Other = Op.getOperand(1);

	  // Look for the SELECT_CC in either operand position. If neither operand
	  // is a SELECT_CC, give up.
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    std::swap(Sel, Other);
	  if (Sel.getOpcode() != ISD::SELECT_CC)
	    return Op;

	  // The folding we want to perform is:
	  // (xor x, (select_cc a, b, cc, 0, -1) )
	  // -->
	  // (csel x, (xor x, -1), cc ...)
	  //
	  // The latter will get matched to a CSINV instruction.

	  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
	  SDValue LHS = Sel.getOperand(0);
	  SDValue RHS = Sel.getOperand(1);
	  SDValue TVal = Sel.getOperand(2);
	  SDValue FVal = Sel.getOperand(3);
	  SDLoc dl(Sel);

	  // FIXME: This could be generalized to non-integer comparisons.
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return Op;

	  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	  // The values aren't constants, this isn't the pattern we're looking for.
	  if (!CFVal || !CTVal)
	    return Op;

	  // We can commute the SELECT_CC by inverting the condition. This
	  // might be needed to make this fit into a CSINV pattern.
	  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	    std::swap(TVal, FVal);
	    std::swap(CTVal, CFVal);
	    CC = ISD::getSetCCInverse(CC, true);
	  }

	  // If the constants line up, perform the transform!
	  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

	    FVal = Other;
	    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
	                       DAG.getConstant(-1ULL, dl, Other.getValueType()));

	    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
	                       CCVal, Cmp);
	  }

	  return Op;
	}

	/// Lower ADDC/SUBC/ADDE/SUBE to AArch64ISD::ADDS/SUBS/ADCS/SBCS.
	///
	/// The new node produces (value type of @p Op, i32). ADDE and SUBE forward
	/// their third operand as well. Returns an empty SDValue when the value type
	/// of @p Op is not legal.
	static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
	  EVT ResultVT = Op.getValueType();

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(ResultVT))
	    return SDValue();

	  SDVTList ResultVTs = DAG.getVTList(ResultVT, MVT::i32);

	  const unsigned NodeOpc = Op.getOpcode();
	  unsigned FlagSettingOpc;
	  bool TakesCarryIn = false;
	  if (NodeOpc == ISD::ADDC) {
	    FlagSettingOpc = AArch64ISD::ADDS;
	  } else if (NodeOpc == ISD::SUBC) {
	    FlagSettingOpc = AArch64ISD::SUBS;
	  } else if (NodeOpc == ISD::ADDE) {
	    FlagSettingOpc = AArch64ISD::ADCS;
	    TakesCarryIn = true;
	  } else if (NodeOpc == ISD::SUBE) {
	    FlagSettingOpc = AArch64ISD::SBCS;
	    TakesCarryIn = true;
	  } else {
	    llvm_unreachable("Invalid code");
	  }

	  SDLoc DL(Op);
	  if (TakesCarryIn)
	    return DAG.getNode(FlagSettingOpc, DL, ResultVTs, Op.getOperand(0),
	                       Op.getOperand(1), Op.getOperand(2));
	  return DAG.getNode(FlagSettingOpc, DL, ResultVTs, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Lower an arithmetic-with-overflow node to its AArch64 form.
	///
	/// The arithmetic value and the flags come from getAArch64XALUOOp(); the
	/// flags are turned into an i32 0/1 overflow value with a CSEL and both are
	/// returned as a MERGE_VALUES node. Returns an empty SDValue when the value
	/// type of @p Op is not legal.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  // Let legalize expand this if it isn't a legal type yet.
	  const bool IsLegalType =
	      DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType());
	  if (!IsLegalType)
	    return SDValue();

	  SDLoc dl(Op);
	  AArch64CC::CondCode CC;
	  // The actual operation that sets the overflow or carry flag.
	  std::pair<SDValue, SDValue> ValueAndFlags = getAArch64XALUOOp(CC, Op, DAG);

	  // 1 stands for "overflowed", 0 for "did not overflow".
	  SDValue One = DAG.getConstant(1, dl, MVT::i32);
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);

	  // The select operands are swapped and the condition inverted to match, so
	  // that this can be selected to a single instruction:
	  // CSINC Wd, WZR, WZR, invert(cond).
	  SDValue InvertedCC = DAG.getConstant(getInvertedCondCode(CC), dl, MVT::i32);
	  SDValue OverflowBit = DAG.getNode(AArch64ISD::CSEL, dl, MVT::i32, Zero, One,
	                                    InvertedCC, ValueAndFlags.second);

	  SDVTList ResultVTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, ResultVTs, ValueAndFlags.first,
	                     OverflowBit);
	}

	// Lower a prefetch node to AArch64ISD::PREFETCH with a packed operation
	// immediate. Operand 0 is forwarded unchanged as the first operand of the
	// new node; the remaining prefetch operands are:
	// 1: Address to prefetch
	// 2: bool isWrite
	// 3: int locality (0 = no locality ... 3 = extreme locality)
	// 4: bool isDataCache
	static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	  unsigned IsData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();

	  // Locality 0 selects the streaming form and leaves the level bits at 0.
	  bool IsStream = !Locality;
	  // When the locality number is set
	  if (Locality) {
	    // The front-end should have filtered out the out-of-range values
	    assert(Locality <= 3 && "Prefetch locality out-of-range");
	    // The locality degree is the opposite of the cache speed.
	    // Put the number the other way around.
	    // The encoding starts at 0 for level 1
	    Locality = 3 - Locality;
	  }

	  // Build the mask value encoding the expected behavior.
	  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
	                   (!IsData << 3) |     // IsDataCache bit
	                   (Locality << 1) |    // Cache level bits
	                   (unsigned)IsStream;  // Stream bit
	  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
	                     DAG.getConstant(PrfOp, DL, MVT::i32), Op.getOperand(1));
	}

	/// Lower an FP_EXTEND whose result type is f128 (asserted) to the FPEXT
	/// library call selected by the source and destination types.
	SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");

	  EVT SrcVT = Op.getOperand(0).getValueType();
	  EVT DstVT = Op.getValueType();
	  return LowerF128Call(Op, DAG, RTLIB::getFPEXT(SrcVT, DstVT));
	}

	/// Lower an FP_ROUND node. Rounding from anything other than f128 is
	/// returned unchanged; rounding from f128 becomes the FPROUND library call
	/// selected by the source and destination types, with only the source value
	/// passed as argument.
	SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  if (Op.getOperand(0).getValueType() != MVT::f128) {
	    // It's legal except when f128 is involved
	    return Op;
	  }

	  RTLIB::Libcall LC;
	  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());

	  // FP_ROUND node has a second operand indicating whether it is known to be
	  // precise. That doesn't take part in the LibCall so we can't directly use
	  // LowerF128Call.
	  SDValue SrcVal = Op.getOperand(0);
	  return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, /*isSigned*/ false,
	                     SDLoc(Op)).first;
	}

	/// Lower a vector FP-to-integer conversion.
	///
	/// f16 element vectors are first extended to f32. When the result is
	/// narrower than the input, the conversion is done at the input width and
	/// then truncated; when it is wider, the input is FP-extended to the result
	/// element width first. Same-size conversions are returned unchanged.
	static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  EVT InVT = Src.getValueType();
	  EVT VT = Op.getValueType();
	  const unsigned ConvOpc = Op.getOpcode();
	  unsigned NumElts = InVT.getVectorNumElements();

	  // f16 vectors are promoted to f32 before a conversion.
	  if (InVT.getVectorElementType() == MVT::f16) {
	    MVT PromotedVT = MVT::getVectorVT(MVT::f32, NumElts);
	    SDValue Promoted = DAG.getNode(ISD::FP_EXTEND, dl, PromotedVT, Src);
	    return DAG.getNode(ConvOpc, dl, VT, Promoted);
	  }

	  const unsigned OutBits = VT.getSizeInBits();
	  const unsigned InBits = InVT.getSizeInBits();

	  // Narrowing: convert at the input width, then truncate.
	  if (OutBits < InBits) {
	    EVT SameWidthIntVT = InVT.changeVectorElementTypeToInteger();
	    SDValue Converted = DAG.getNode(ConvOpc, dl, SameWidthIntVT, Src);
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Converted);
	  }

	  // Widening: extend the FP input to the result element width, then convert.
	  if (OutBits > InBits) {
	    MVT WideFPEltVT = MVT::getFloatingPointVT(VT.getScalarSizeInBits());
	    MVT WideFPVT = MVT::getVectorVT(WideFPEltVT, VT.getVectorNumElements());
	    SDValue Extended = DAG.getNode(ISD::FP_EXTEND, dl, WideFPVT, Src);
	    return DAG.getNode(ConvOpc, dl, VT, Extended);
	  }

	  // Type changing conversions are illegal.
	  return Op;
	}

	/// Lower a scalar or vector FP-to-integer conversion.
	///
	/// Vector sources go to LowerVectorFP_TO_INT(), f16 sources are extended to
	/// f32 and converted again, f128 sources become a library call, and every
	/// other source type is returned unchanged.
	SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  EVT SrcVT = Op.getOperand(0).getValueType();

	  if (SrcVT.isVector())
	    return LowerVectorFP_TO_INT(Op, DAG);

	  // f16 conversions are promoted to f32.
	  if (SrcVT == MVT::f16) {
	    SDLoc dl(Op);
	    SDValue Promoted =
	        DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0));
	    return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
	  }

	  // It's legal except when f128 is involved
	  if (SrcVT != MVT::f128)
	    return Op;

	  const bool IsSignedResult = Op.getOpcode() == ISD::FP_TO_SINT;
	  RTLIB::Libcall LC = IsSignedResult
	                          ? RTLIB::getFPTOSINT(SrcVT, Op.getValueType())
	                          : RTLIB::getFPTOUINT(SrcVT, Op.getValueType());

	  SmallVector<SDValue, 2> CallArgs(Op->op_begin(), Op->op_end());
	  return makeLibCall(DAG, LC, Op.getValueType(), CallArgs, false, SDLoc(Op))
	      .first;
	}

	/// Lower a vector integer-to-FP conversion.
	///
	/// When the result is narrower than the input, the conversion is done at the
	/// input element width and then FP_ROUNDed; when it is wider, the integer
	/// input is sign- or zero-extended (per the opcode) to the result width
	/// first. Same-size conversions are returned unchanged.
	static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
	  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
	  // Any additional optimization in this function should be recorded
	  // in the cost tables.
	  SDLoc dl(Op);
	  const unsigned ConvOpc = Op.getOpcode();
	  EVT VT = Op.getValueType();
	  SDValue In = Op.getOperand(0);
	  EVT InVT = In.getValueType();

	  const unsigned OutBits = VT.getSizeInBits();
	  const unsigned InBits = InVT.getSizeInBits();

	  // Widening: extend the integer input to the result width, then convert.
	  if (OutBits > InBits) {
	    unsigned ExtendOpc = ISD::ZERO_EXTEND;
	    if (ConvOpc == ISD::SINT_TO_FP)
	      ExtendOpc = ISD::SIGN_EXTEND;
	    EVT WideIntVT = VT.changeVectorElementTypeToInteger();
	    SDValue Extended = DAG.getNode(ExtendOpc, dl, WideIntVT, In);
	    return DAG.getNode(ConvOpc, dl, VT, Extended);
	  }

	  // Narrowing: convert at the input element width, then round.
	  if (OutBits < InBits) {
	    MVT WideFPEltVT = MVT::getFloatingPointVT(InVT.getScalarSizeInBits());
	    MVT WideFPVT = MVT::getVectorVT(WideFPEltVT, InVT.getVectorNumElements());
	    SDValue Converted = DAG.getNode(ConvOpc, dl, WideFPVT, In);
	    return DAG.getNode(ISD::FP_ROUND, dl, VT, Converted,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  return Op;
	}

	/// Lower a scalar or vector integer-to-FP conversion.
	///
	/// Vector results go to LowerVectorINT_TO_FP(), f16 results are produced by
	/// converting to f32 and rounding, i128 sources return an empty SDValue,
	/// f128 results become a library call, and everything else is returned
	/// unchanged.
	SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  EVT DstVT = Op.getValueType();

	  if (DstVT.isVector())
	    return LowerVectorINT_TO_FP(Op, DAG);

	  // f16 conversions are promoted to f32.
	  if (DstVT == MVT::f16) {
	    SDLoc dl(Op);
	    SDValue AsF32 =
	        DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0));
	    return DAG.getNode(ISD::FP_ROUND, dl, MVT::f16, AsF32,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  EVT SrcVT = Op.getOperand(0).getValueType();

	  // i128 conversions are libcalls.
	  if (SrcVT == MVT::i128)
	    return SDValue();

	  // Other conversions are legal, unless it's to the completely software-based
	  // fp128.
	  if (DstVT != MVT::f128)
	    return Op;

	  const bool IsSignedSource = Op.getOpcode() == ISD::SINT_TO_FP;
	  RTLIB::Libcall LC = IsSignedSource ? RTLIB::getSINTTOFP(SrcVT, DstVT)
	                                     : RTLIB::getUINTTOFP(SrcVT, DstVT);

	  return LowerF128Call(Op, DAG, LC);
	}

	/// Lower FSINCOS to a call to __sincos_stret (f64 argument) or
	/// __sincosf_stret (any other argument type).
	///
	/// The call takes the single operand of @p Op, uses the Fast calling
	/// convention and returns a struct of two values of the argument type. The
	/// first element of the pair returned by LowerCallTo() is handed back.
	SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  // For iOS, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values in two S / D registers.
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  ArgListTy Args;
	  ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  const char *LibcallName =
	      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout()));

	  // The callee returns both results together as { ArgTy, ArgTy }.
	  StructType *RetTy = StructType::get(ArgTy, ArgTy);
	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
	  return CallResult.first;
	}

	/// Lower a BITCAST whose result type is f16; any other result type yields an
	/// empty SDValue. The i16 source (asserted) is any-extended to i32, bitcast
	/// to f32, and the hsub subregister of that value is extracted as the f16
	/// result.
	static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
	  if (Op.getValueType() != MVT::f16)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  assert(Src.getValueType() == MVT::i16);
	  SDLoc DL(Op);

	  SDValue Widened = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
	  SDValue AsF32 = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Widened);
	  SDValue SubRegIdx = DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32);
	  SDNode *Extract = DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
	                                       MVT::f16, AsF32, SubRegIdx);
	  return SDValue(Extract, 0);
	}

	/// Return the 64-bit vector type that @p OrigVT should be extended to.
	/// Types that are already 64 bits or larger are returned as they are; of
	/// the smaller ones only v2i8, v2i16 and v4i8 are expected.
	static EVT getExtensionTo64Bits(const EVT &OrigVT) {
	  const bool AlreadyWideEnough = OrigVT.getSizeInBits() >= 64;
	  if (AlreadyWideEnough)
	    return OrigVT;

	  assert(OrigVT.isSimple() && "Expecting a simple value type");

	  switch (OrigVT.getSimpleVT().SimpleTy) {
	  case MVT::v4i8:
	    return MVT::v4i16;
	  case MVT::v2i8:
	  case MVT::v2i16:
	    return MVT::v2i32;
	  default:
	    break;
	  }
	  llvm_unreachable("Unexpected Vector Type");
	}

	/// Make @p N at least 64 bits wide so it can be used as a VMULL operand.
	///
	/// @p N originally had type @p OrigTy and was then extended to @p ExtTy,
	/// which is expected to be a 128-bit vector. If @p OrigTy is already 64 bits
	/// or more, @p N is returned as it is; otherwise a new @p ExtOpcode node to
	/// the type chosen by getExtensionTo64Bits() is created.
	static SDValue addRequiredExtensionForVectorMULL(SDValue N, SelectionDAG &DAG,
	                                                 const EVT &OrigTy,
	                                                 const EVT &ExtTy,
	                                                 unsigned ExtOpcode) {
	  assert(ExtTy.is128BitVector() && "Unexpected extension size");

	  const bool NeedsWidening = OrigTy.getSizeInBits() < 64;
	  if (!NeedsWidening)
	    return N;

	  // Must extend size to at least 64 bits to be used as an operand for VMULL.
	  EVT WidenedVT = getExtensionTo64Bits(OrigTy);
	  SDLoc DL(N);
	  return DAG.getNode(ExtOpcode, DL, WidenedVT, N);
	}

	/// Returns true if @p N is a BUILD_VECTOR whose operands are all constants
	/// that fit in half of the vector's element width, interpreted as signed
	/// values when @p isSigned is true and as unsigned values otherwise.
	static bool isExtendedBUILD_VECTOR(SDNode *N, SelectionDAG &DAG,
	                                   bool isSigned) {
	  if (N->getOpcode() != ISD::BUILD_VECTOR)
	    return false;

	  const unsigned HalfSize = N->getValueType(0).getScalarSizeInBits() / 2;

	  for (const SDValue &Elt : N->op_values()) {
	    // Any non-constant element disqualifies the vector.
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt);
	    if (!C)
	      return false;

	    const bool FitsInHalf = isSigned ? isIntN(HalfSize, C->getSExtValue())
	                                     : isUIntN(HalfSize, C->getZExtValue());
	    if (!FitsInHalf)
	      return false;
	  }

	  return true;
	}

	/// Produce the narrow (pre-extension) operand for a widening multiply.
	///
	/// For a SIGN_EXTEND / ZERO_EXTEND node this returns the node's operand,
	/// widened to 64 bits if needed via addRequiredExtensionForVectorMULL.
	/// Otherwise \p N must be a BUILD_VECTOR of constants; a new BUILD_VECTOR is
	/// returned whose element type is half as wide as the original's.
	static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
	  if (N->getOpcode() == ISD::SIGN_EXTEND || N->getOpcode() == ISD::ZERO_EXTEND)
	    return addRequiredExtensionForVectorMULL(N->getOperand(0), DAG,
	                                             N->getOperand(0)->getValueType(0),
	                                             N->getValueType(0),
	                                             N->getOpcode());

	  assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
	  EVT VT = N->getValueType(0);
	  SDLoc dl(N);
	  unsigned EltSize = VT.getScalarSizeInBits() / 2;
	  unsigned NumElts = VT.getVectorNumElements();
	  MVT TruncVT = MVT::getIntegerVT(EltSize);
	  SmallVector<SDValue, 8> Ops;
	  for (unsigned i = 0; i != NumElts; ++i) {
	    ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(i));
	    const APInt &CInt = C->getAPIntValue();
	    // Element types smaller than 32 bits are not legal, so use i32 elements.
	    // The values are implicitly truncated so sext vs. zext doesn't matter.
	    Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
	  }
	  return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
	}

	/// Return true if \p N is a SIGN_EXTEND node or a BUILD_VECTOR of constants
	/// that each fit in half the element width as signed values.
	static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
	  return N->getOpcode() == ISD::SIGN_EXTEND ||
	         isExtendedBUILD_VECTOR(N, DAG, true);
	}

	/// Return true if \p N is a ZERO_EXTEND node or a BUILD_VECTOR of constants
	/// that each fit in half the element width as unsigned values.
	static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
	  return N->getOpcode() == ISD::ZERO_EXTEND ||
	         isExtendedBUILD_VECTOR(N, DAG, false);
	}

	/// Return true if \p N is an ADD or SUB whose two operands each have a
	/// single use and are both sign-extended (see isSignExtended).
	static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
	    SDNode *N0 = N->getOperand(0).getNode();
	    SDNode *N1 = N->getOperand(1).getNode();
	    return N0->hasOneUse() && N1->hasOneUse() &&
	           isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
	  }
	  return false;
	}

	/// Return true if \p N is an ADD or SUB whose two operands each have a
	/// single use and are both zero-extended (see isZeroExtended).
	static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
	  unsigned Opcode = N->getOpcode();
	  if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
	    SDNode *N0 = N->getOperand(0).getNode();
	    SDNode *N1 = N->getOperand(1).getNode();
	    return N0->hasOneUse() && N1->hasOneUse() &&
	           isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
	  }
	  return false;
	}

	/// Custom-lower a 128-bit integer vector ISD::MUL.
	///
	/// If both operands are sign-extended (resp. zero-extended) the multiply is
	/// turned into a single AArch64ISD::SMULL (resp. UMULL) on the narrow
	/// operands. If one operand is an add/sub of two extended values and the
	/// other is extended, the multiply is distributed into two S/UMULL nodes
	/// combined with the original add/sub opcode. When no pattern matches,
	/// returns an empty SDValue for v2i64 and \p Op unchanged for any other type.
	static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
	  // Multiplications are only custom-lowered for 128-bit vectors so that
	  // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
	  EVT VT = Op.getValueType();
	  assert(VT.is128BitVector() && VT.isInteger() &&
	         "unexpected type for custom-lowering ISD::MUL");
	  SDNode *N0 = Op.getOperand(0).getNode();
	  SDNode *N1 = Op.getOperand(1).getNode();
	  unsigned NewOpc = 0;
	  bool isMLA = false;
	  bool isN0SExt = isSignExtended(N0, DAG);
	  bool isN1SExt = isSignExtended(N1, DAG);
	  if (isN0SExt && isN1SExt)
	    NewOpc = AArch64ISD::SMULL;
	  else {
	    bool isN0ZExt = isZeroExtended(N0, DAG);
	    bool isN1ZExt = isZeroExtended(N1, DAG);
	    if (isN0ZExt && isN1ZExt)
	      NewOpc = AArch64ISD::UMULL;
	    else if (isN1SExt || isN1ZExt) {
	      // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
	      // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
	      if (isN1SExt && isAddSubSExt(N0, DAG)) {
	        NewOpc = AArch64ISD::SMULL;
	        isMLA = true;
	      } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
	        // Put the add/sub in N0 so the code below can treat both
	        // orderings uniformly.
	        std::swap(N0, N1);
	        NewOpc = AArch64ISD::UMULL;
	        isMLA = true;
	      }
	    }

	    if (!NewOpc) {
	      if (VT == MVT::v2i64)
	        // Fall through to expand this. It is not legal.
	        return SDValue();
	      else
	        // Other vector multiplications are legal.
	        return Op;
	    }
	  }

	  // Legalize to a S/UMULL instruction
	  SDLoc DL(Op);
	  SDValue Op0;
	  SDValue Op1 = skipExtensionForVectorMULL(N1, DAG);
	  if (!isMLA) {
	    Op0 = skipExtensionForVectorMULL(N0, DAG);
	    assert(Op0.getValueType().is64BitVector() &&
	           Op1.getValueType().is64BitVector() &&
	           "unexpected types for extended operands to VMULL");
	    return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
	  }
	  // Optimizing (zext A + zext B) * C, to (S/UMULL A, C) + (S/UMULL B, C) during
	  // isel lowering to take advantage of no-stall back to back s/umul + s/umla.
	  // This is true for CPUs with accumulate forwarding such as Cortex-A53/A57
	  SDValue N00 = skipExtensionForVectorMULL(N0->getOperand(0).getNode(), DAG);
	  SDValue N01 = skipExtensionForVectorMULL(N0->getOperand(1).getNode(), DAG);
	  EVT Op1VT = Op1.getValueType();
	  return DAG.getNode(N0->getOpcode(), DL, VT,
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
	                     DAG.getNode(NewOpc, DL, VT,
	                                 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
	}

	/// Custom-lower the handful of chain-less intrinsics that map directly onto
	/// DAG nodes: thread_pointer, and the NEON abs / smax / umax / smin / umin
	/// intrinsics. Every other intrinsic yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);

	  // Opcode of the two-operand min/max node to build after the switch.
	  unsigned MinMaxOpc;
	  switch (IntNo) {
	  case Intrinsic::thread_pointer: {
	    EVT PtrVT = getPointerTy(DAG.getDataLayout());
	    return DAG.getNode(AArch64ISD::THREAD_POINTER, dl, PtrVT);
	  }
	  case Intrinsic::aarch64_neon_abs:
	    return DAG.getNode(ISD::ABS, dl, Op.getValueType(),
	                       Op.getOperand(1));
	  case Intrinsic::aarch64_neon_smax:
	    MinMaxOpc = ISD::SMAX;
	    break;
	  case Intrinsic::aarch64_neon_umax:
	    MinMaxOpc = ISD::UMAX;
	    break;
	  case Intrinsic::aarch64_neon_smin:
	    MinMaxOpc = ISD::SMIN;
	    break;
	  case Intrinsic::aarch64_neon_umin:
	    MinMaxOpc = ISD::UMIN;
	    break;
	  default:
	    return SDValue(); // Don't custom lower most intrinsics.
	  }

	  return DAG.getNode(MinMaxOpc, dl, Op.getValueType(),
	                     Op.getOperand(1), Op.getOperand(2));
	}

	/// Dispatch a custom-lowered operation to its dedicated Lower* routine based
	/// on the node's opcode. Opcodes without an entry here are a programming
	/// error and hit llvm_unreachable.
	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  // Type conversion and address materialisation.
	  case ISD::BITCAST:            return LowerBITCAST(Op, DAG);
	  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
	  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);

	  // Comparisons, branches and selects.
	  case ISD::SETCC:              return LowerSETCC(Op, DAG);
	  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
	  case ISD::SELECT:             return LowerSELECT(Op, DAG);
	  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);

	  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
	  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
	  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);

	  // Variadic argument handling.
	  case ISD::VASTART:            return LowerVASTART(Op, DAG);
	  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
	  case ISD::VAARG:              return LowerVAARG(Op, DAG);

	  // Carry / overflow arithmetic.
	  case ISD::ADDC:
	  case ISD::ADDE:
	  case ISD::SUBC:
	  case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
	  case ISD::SADDO:
	  case ISD::UADDO:
	  case ISD::SSUBO:
	  case ISD::USUBO:
	  case ISD::SMULO:
	  case ISD::UMULO:              return LowerXALUO(Op, DAG);

	  // Floating-point arithmetic routed through the f128 libcall helper.
	  case ISD::FADD:               return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
	  case ISD::FSUB:               return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
	  case ISD::FMUL:               return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
	  case ISD::FDIV:               return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
	  case ISD::FP_ROUND:           return LowerFP_ROUND(Op, DAG);
	  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);

	  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
	  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);

	  // Vector construction and element access.
	  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
	  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
	  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
	  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op, DAG);

	  // Shifts.
	  case ISD::SRA:
	  case ISD::SRL:
	  case ISD::SHL:                return LowerVectorSRA_SRL_SHL(Op, DAG);
	  case ISD::SHL_PARTS:          return LowerShiftLeftParts(Op, DAG);
	  case ISD::SRL_PARTS:
	  case ISD::SRA_PARTS:          return LowerShiftRightParts(Op, DAG);

	  case ISD::CTPOP:              return LowerCTPOP(Op, DAG);
	  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);

	  // Bitwise logic.
	  case ISD::AND:                return LowerVectorAND(Op, DAG);
	  case ISD::OR:                 return LowerVectorOR(Op, DAG);
	  case ISD::XOR:                return LowerXOR(Op, DAG);

	  case ISD::PREFETCH:           return LowerPREFETCH(Op, DAG);

	  // Integer <-> floating-point conversions.
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:         return LowerINT_TO_FP(Op, DAG);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);

	  case ISD::FSINCOS:            return LowerFSINCOS(Op, DAG);
	  case ISD::MUL:                return LowerMUL(Op, DAG);
	  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);

	  // Vector reductions.
	  case ISD::VECREDUCE_ADD:
	  case ISD::VECREDUCE_SMAX:
	  case ISD::VECREDUCE_SMIN:
	  case ISD::VECREDUCE_UMAX:
	  case ISD::VECREDUCE_UMIN:
	  case ISD::VECREDUCE_FMAX:
	  case ISD::VECREDUCE_FMIN:     return LowerVECREDUCE(Op, DAG);

	  default:
	    llvm_unreachable("unimplemented operand");
	    return SDValue();
	  }
	}

	//===----------------------------------------------------------------------===//
	// Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "AArch64GenCallingConv.inc"

	/// Pick the argument-assignment function for calling convention \p CC.
	///
	/// WebKit_JS and GHC have dedicated assigners. Win64 uses the Win64 vararg
	/// assigner for variadic calls and AAPCS otherwise. The remaining supported
	/// conventions (C, Fast, PreserveMost, CXX_FAST_TLS, Swift) depend on the
	/// subtarget: variadic calls on Windows use the Win64 vararg assigner,
	/// Darwin uses the DarwinPCS assigners, and everything else uses AAPCS.
	/// Any other convention hits llvm_unreachable.
	CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
	                                                     bool IsVarArg) const {
	  switch (CC) {
	  case CallingConv::WebKit_JS:
	    return CC_AArch64_WebKit_JS;
	  case CallingConv::GHC:
	    return CC_AArch64_GHC;
	  case CallingConv::Win64:
	    return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
	  case CallingConv::C:
	  case CallingConv::Fast:
	  case CallingConv::PreserveMost:
	  case CallingConv::CXX_FAST_TLS:
	  case CallingConv::Swift:
	    // Resolved from the subtarget below.
	    break;
	  default:
	    llvm_unreachable("Unsupported calling convention.");
	  }

	  if (Subtarget->isTargetWindows() && IsVarArg)
	    return CC_AArch64_Win64_VarArg;
	  if (Subtarget->isTargetDarwin())
	    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
	  return CC_AArch64_AAPCS;
	}

	/// Pick the return-value assignment function for calling convention \p CC:
	/// the WebKit_JS variant for WebKit_JS, the AAPCS one for everything else.
	CCAssignFn *
	AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
	  if (CC == CallingConv::WebKit_JS)
	    return RetCC_AArch64_WebKit_JS;
	  return RetCC_AArch64_AAPCS;
	}

	/// Lower the incoming (formal) arguments of the current function.
	///
	/// Assigns a location to every entry of \p Ins, then appends one SDValue per
	/// argument to \p InVals: a frame index for byval arguments, a copy from a
	/// live-in virtual register for register arguments, or a load from a fixed
	/// stack object for memory arguments. For variadic functions it also saves
	/// the unallocated argument registers (non-Darwin or Win64) and records the
	/// start of the stack varargs area. Finally it records the size of the
	/// stack argument area in the function info.
	///
	/// \returns the incoming \p Chain.
	SDValue AArch64TargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv());

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
	                 *DAG.getContext());

	  // At this point, Ins[].VT may already be promoted to i32. To correctly
	  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
	  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
	  // LocVT.
	  unsigned NumArgs = Ins.size();
	  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
	  unsigned CurArgIdx = 0;
	  for (unsigned i = 0; i != NumArgs; ++i) {
	    MVT ValVT = Ins[i].VT;
	    if (Ins[i].isOrigArg()) {
	      std::advance(CurOrigArg, Ins[i].getOrigArgIndex() - CurArgIdx);
	      CurArgIdx = Ins[i].getOrigArgIndex();

	      // Get type of the original argument.
	      EVT ActualVT = getValueType(DAG.getDataLayout(), CurOrigArg->getType(),
	                                  /*AllowUnknown*/ true);
	      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
	      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
	        ValVT = MVT::i8;
	      else if (ActualMVT == MVT::i16)
	        ValVT = MVT::i16;
	    }
	    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
	    bool Res =
	        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
	    assert(!Res && "Call operand has unhandled type");
	    (void)Res;
	  }
	  assert(ArgLocs.size() == Ins.size());
	  SmallVector<SDValue, 16> ArgValues;
	  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	    CCValAssign &VA = ArgLocs[i];

	    if (Ins[i].Flags.isByVal()) {
	      // Byval is used for HFAs in the PCS, but the system should work in a
	      // non-compliant manner for larger structs.
	      EVT PtrVT = getPointerTy(DAG.getDataLayout());
	      int Size = Ins[i].Flags.getByValSize();
	      unsigned NumRegs = (Size + 7) / 8;

	      // FIXME: This works on big-endian for composite byvals, which are the common
	      // case. It should also work for fundamental types too.
	      unsigned FrameIdx =
	          MFI.CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
	      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrVT);
	      InVals.push_back(FrameIdxN);

	      continue;
	    }

	    if (VA.isRegLoc()) {
	      // Arguments stored in registers.
	      EVT RegVT = VA.getLocVT();

	      SDValue ArgValue;
	      const TargetRegisterClass *RC;

	      if (RegVT == MVT::i32)
	        RC = &AArch64::GPR32RegClass;
	      else if (RegVT == MVT::i64)
	        RC = &AArch64::GPR64RegClass;
	      else if (RegVT == MVT::f16)
	        RC = &AArch64::FPR16RegClass;
	      else if (RegVT == MVT::f32)
	        RC = &AArch64::FPR32RegClass;
	      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
	        RC = &AArch64::FPR64RegClass;
	      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
	        RC = &AArch64::FPR128RegClass;
	      else
	        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");

	      // Transform the arguments in physical registers into virtual ones.
	      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);

	      // If this is an 8, 16 or 32-bit value, it is really passed promoted
	      // to 64 bits. Insert an assert[sz]ext to capture this, then
	      // truncate to the right size.
	      switch (VA.getLocInfo()) {
	      default:
	        llvm_unreachable("Unknown loc info!");
	      case CCValAssign::Full:
	        break;
	      case CCValAssign::BCvt:
	        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
	        break;
	      case CCValAssign::AExt:
	      case CCValAssign::SExt:
	      case CCValAssign::ZExt:
	        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
	        // nodes after our lowering.
	        assert(RegVT == Ins[i].VT && "incorrect register location selected");
	        break;
	      }

	      InVals.push_back(ArgValue);

	    } else { // VA.isRegLoc()
	      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
	      unsigned ArgOffset = VA.getLocMemOffset();
	      unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;

	      // On big-endian targets a sub-8-byte argument is offset within its
	      // 8-byte slot, unless it is part of a consecutive-register block.
	      uint32_t BEAlign = 0;
	      if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
	          !Ins[i].Flags.isInConsecutiveRegs())
	        BEAlign = 8 - ArgSize;

	      int FI = MFI.CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);

	      // Create load nodes to retrieve arguments from the stack.
	      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
	      SDValue ArgValue;

	      // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
	      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
	      MVT MemVT = VA.getValVT();

	      switch (VA.getLocInfo()) {
	      default:
	        break;
	      case CCValAssign::BCvt:
	        MemVT = VA.getLocVT();
	        break;
	      case CCValAssign::SExt:
	        ExtType = ISD::SEXTLOAD;
	        break;
	      case CCValAssign::ZExt:
	        ExtType = ISD::ZEXTLOAD;
	        break;
	      case CCValAssign::AExt:
	        ExtType = ISD::EXTLOAD;
	        break;
	      }

	      ArgValue = DAG.getExtLoad(
	          ExtType, DL, VA.getLocVT(), Chain, FIN,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI),
	          MemVT);

	      InVals.push_back(ArgValue);
	    }
	  }

	  // varargs
	  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	  if (isVarArg) {
	    if (!Subtarget->isTargetDarwin() || IsWin64) {
	      // The AAPCS variadic function ABI is identical to the non-variadic
	      // one. As a result there may be more arguments in registers and we should
	      // save them for future reference.
	      // Win64 variadic functions also pass arguments in registers, but all float
	      // arguments are passed in integer registers.
	      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
	    }

	    // This will point to the next argument passed via stack.
	    unsigned StackOffset = CCInfo.getNextStackOffset();
	    // We currently pass all varargs at 8-byte alignment.
	    StackOffset = ((StackOffset + 7) & ~7);
	    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
	  }

	  unsigned StackArgSize = CCInfo.getNextStackOffset();
	  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
	    // This is a non-standard ABI so by fiat I say we're allowed to make full
	    // use of the stack area to be popped, which must be aligned to 16 bytes in
	    // any case:
	    StackArgSize = alignTo(StackArgSize, 16);

	    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
	    // a multiple of 16.
	    FuncInfo->setArgumentStackToRestore(StackArgSize);

	    // This realignment carries over to the available bytes below. Our own
	    // callers will guarantee the space is free by giving an aligned value to
	    // CALLSEQ_START.
	  }
	  // Even if we're not expected to free up the space, it's useful to know how
	  // much is there while considering tail calls (because we can reuse it).
	  FuncInfo->setBytesInStackArgArea(StackArgSize);

	  return Chain;
	}

	/// Spill the argument registers not claimed by fixed arguments so that a
	/// variadic function can later read its register-passed varargs from memory.
	///
	/// Stores each unallocated X0-X7 register to a GPR save area and, when the
	/// subtarget has FP support and the convention is not Win64, each
	/// unallocated Q0-Q7 register to an FPR save area. The frame indices and
	/// sizes of both areas are recorded in the AArch64FunctionInfo. If any store
	/// was emitted, \p Chain is replaced by a TokenFactor of all the stores.
	void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
	SelectionDAG &DAG,
	const SDLoc &DL,
	SDValue &Chain) const {
	MachineFunction &MF = DAG.getMachineFunction();
	MachineFrameInfo &MFI = MF.getFrameInfo();
	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv());

	// Chains of all register-save stores, merged into one TokenFactor at the end.
	SmallVector<SDValue, 8> MemOps;

	static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
	AArch64::X3, AArch64::X4, AArch64::X5,
	AArch64::X6, AArch64::X7 };
	static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
	// Index of the first GPR argument register not assigned to a fixed argument.
	unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(GPRArgRegs);

	// 8 bytes per remaining GPR.
	unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
	int GPRIdx = 0;
	if (GPRSaveSize != 0) {
	if (IsWin64) {
	// Win64: the save area is a fixed object at a negative offset of
	// GPRSaveSize bytes.
	GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false);
	if (GPRSaveSize & 15)
	// The extra size here, if triggered, will always be 8.
	MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
	} else
	GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);

	SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);

	// Store each remaining GPR at consecutive 8-byte slots starting at FIN.
	for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
	unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
	SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
	SDValue Store = DAG.getStore(
	Val.getValue(1), DL, Val, FIN,
	IsWin64
	? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
	GPRIdx,
	(i - FirstVariadicGPR) * 8)
	: MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8));
	MemOps.push_back(Store);
	// Advance the store address to the next 8-byte slot.
	FIN =
	DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT));
	}
	}
	FuncInfo->setVarArgsGPRIndex(GPRIdx);
	FuncInfo->setVarArgsGPRSize(GPRSaveSize);

	// FP/SIMD registers are only saved when FP is available and the
	// convention is not Win64.
	if (Subtarget->hasFPARMv8() && !IsWin64) {
	static const MCPhysReg FPRArgRegs[] = {
	AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
	AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
	static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
	unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(FPRArgRegs);

	// 16 bytes per remaining Q register.
	unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
	int FPRIdx = 0;
	if (FPRSaveSize != 0) {
	FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);

	SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);

	// Store each remaining Q register at consecutive 16-byte slots.
	for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
	unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
	SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);

	SDValue Store = DAG.getStore(
	Val.getValue(1), DL, Val, FIN,
	MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 16));
	MemOps.push_back(Store);
	FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
	DAG.getConstant(16, DL, PtrVT));
	}
	}
	FuncInfo->setVarArgsFPRIndex(FPRIdx);
	FuncInfo->setVarArgsFPRSize(FPRSaveSize);
	}

	// Tie all the stores together so later users of Chain depend on them.
	if (!MemOps.empty()) {
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}
	}

	/// LowerCallResult - Copy each value returned by a call out of the physical
	/// register the return convention assigned to it, appending the results to
	/// \p InVals and threading the chain and glue through the copies.
	///
	/// When \p isThisReturn is set, the first result is taken directly from
	/// \p ThisVal instead of being copied out of a register.
	///
	/// \returns the chain after the last register copy.
	SDValue AArch64TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
	    SDValue ThisVal) const {
	  CCAssignFn *RetCC = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    RetCC = RetCC_AArch64_WebKit_JS;

	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC);

	  // Copy all of the result registers out of their specified physreg.
	  for (unsigned Idx = 0; Idx != RVLocs.size(); ++Idx) {
	    CCValAssign Loc = RVLocs[Idx];

	    // Pass 'this' value directly from the argument to return value, to avoid
	    // reg unit interference
	    if (Idx == 0 && isThisReturn) {
	      assert(!Loc.needsCustom() && Loc.getLocVT() == MVT::i64 &&
	             "unexpected return calling convention register assignment");
	      InVals.push_back(ThisVal);
	      continue;
	    }

	    SDValue Copied =
	        DAG.getCopyFromReg(Chain, DL, Loc.getLocReg(), Loc.getLocVT(), InFlag);
	    // Result 1 is the new chain, result 2 the new glue.
	    Chain = Copied.getValue(1);
	    InFlag = Copied.getValue(2);

	    switch (Loc.getLocInfo()) {
	    case CCValAssign::Full:
	      break;
	    case CCValAssign::BCvt:
	      Copied = DAG.getNode(ISD::BITCAST, DL, Loc.getValVT(), Copied);
	      break;
	    default:
	      llvm_unreachable("Unknown loc info!");
	    }

	    InVals.push_back(Copied);
	  }

	  return Chain;
	}

	/// Return true if tail-call optimisation can be guaranteed for calling
	/// convention \p CC. Only CallingConv::Fast qualifies.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  const bool IsFastCC = (CC == CallingConv::Fast);
	  return IsFastCC;
	}

	/// Return true if a call using convention \p CC could ever be emitted as a
	/// tail call: C, PreserveMost and Swift always may, and so may any
	/// convention for which TCO can be guaranteed.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  if (CC == CallingConv::C || CC == CallingConv::PreserveMost ||
	      CC == CallingConv::Swift)
	    return true;
	  return canGuaranteeTCO(CC);
	}

	/// Decide whether a call may be emitted as a tail call.
	///
	/// Rejects the call when the callee's convention never permits tail calls,
	/// when the caller has any byval parameter, when the callee is an
	/// external-weak global on the targets selected below, when a variadic call
	/// would need memory operands, when the caller and callee return results
	/// differently, when the callee does not preserve every register the caller
	/// must preserve, when the outgoing stack arguments do not fit in the
	/// caller's own incoming stack argument area, or when parametersInCSRMatch
	/// fails. With GuaranteedTailCallOpt the answer is simply whether TCO can be
	/// guaranteed for the callee's convention and it matches the caller's.
	bool AArch64TargetLowering::isEligibleForTailCallOptimization(
	    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs,
	    const SmallVectorImpl<SDValue> &OutVals,
	    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	  if (!mayTailCallThisCC(CalleeCC))
	    return false;

	  MachineFunction &MF = DAG.getMachineFunction();
	  const Function *CallerF = MF.getFunction();
	  CallingConv::ID CallerCC = CallerF->getCallingConv();
	  bool CCMatch = CallerCC == CalleeCC;

	  // Byval parameters hand the function a pointer directly into the stack area
	  // we want to reuse during a tail call. Working around this is possible (see
	  // X86) but less efficient and uglier in LowerCall.
	  for (Function::const_arg_iterator i = CallerF->arg_begin(),
	                                    e = CallerF->arg_end();
	       i != e; ++i)
	    if (i->hasByValAttr())
	      return false;

	  if (getTargetMachine().Options.GuaranteedTailCallOpt)
	    return canGuaranteeTCO(CalleeCC) && CCMatch;

	  // Externally-defined functions with weak linkage should not be
	  // tail-called on AArch64 when the OS does not support dynamic
	  // pre-emption of symbols, as the AAELF spec requires normal calls
	  // to undefined weak functions to be replaced with a NOP or jump to the
	  // next instruction. The behaviour of branch instructions in this
	  // situation (as used for tail calls) is implementation-defined, so we
	  // cannot rely on the linker replacing the tail call with a return.
	  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	    const GlobalValue *GV = G->getGlobal();
	    const Triple &TT = getTargetMachine().getTargetTriple();
	    if (GV->hasExternalWeakLinkage() &&
	        (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
	      return false;
	  }

	  // Now we search for cases where we can use a tail call without changing the
	  // ABI. Sibcall is used in some places (particularly gcc) to refer to this
	  // concept.

	  // I want anyone implementing a new calling convention to think long and hard
	  // about this assert.
	  assert((!isVarArg || CalleeCC == CallingConv::C) &&
	         "Unexpected variadic calling convention");

	  LLVMContext &C = *DAG.getContext();
	  if (isVarArg && !Outs.empty()) {
	    // At least two cases here: if caller is fastcc then we can't have any
	    // memory arguments (we'd be expected to clean up the stack afterwards). If
	    // caller is C then we could potentially use its argument area.

	    // FIXME: for now we take the most conservative of these in both cases:
	    // disallow all variadic memory operands.
	    SmallVector<CCValAssign, 16> ArgLocs;
	    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
	    for (const CCValAssign &ArgLoc : ArgLocs)
	      if (!ArgLoc.isRegLoc())
	        return false;
	  }

	  // Check that the call results are passed in the same way.
	  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	                                  CCAssignFnForCall(CalleeCC, isVarArg),
	                                  CCAssignFnForCall(CallerCC, isVarArg)))
	    return false;
	  // The callee has to preserve all registers the caller needs to preserve.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	  if (!CCMatch) {
	    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	      return false;
	  }

	  // Nothing more to check if the callee is taking no arguments
	  if (Outs.empty())
	    return true;

	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));

	  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();

	  // If the stack arguments for this call do not fit into our own save area then
	  // the call cannot be made tail.
	  if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
	    return false;

	  const MachineRegisterInfo &MRI = MF.getRegInfo();
	  if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	    return false;

	  return true;
	}

	/// Build a TokenFactor that orders \p Chain after every incoming-argument
	/// load that overlaps the stack object \p ClobberedFI.
	///
	/// The users of the DAG entry node are scanned for loads whose base pointer
	/// is a frame index with a negative index; each such load whose byte range
	/// intersects that of \p ClobberedFI contributes its chain result to the
	/// returned TokenFactor. \p Chain is always the first operand.
	SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
	                                                   SelectionDAG &DAG,
	                                                   MachineFrameInfo &MFI,
	                                                   int ClobberedFI) const {
	  SmallVector<SDValue, 8> ArgChains;
	  // Inclusive byte range [FirstByte, LastByte] of the object being clobbered.
	  int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
	  int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;

	  // Include the original chain at the beginning of the list. When this is
	  // used by target LowerCall hooks, this helps legalize find the
	  // CALLSEQ_BEGIN node.
	  ArgChains.push_back(Chain);

	  // Add a chain value for each stack argument corresponding
	  for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
	                            UE = DAG.getEntryNode().getNode()->use_end();
	       U != UE; ++U)
	    if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
	      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
	        if (FI->getIndex() < 0) {
	          int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
	          int64_t InLastByte = InFirstByte;
	          InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;

	          // The two inclusive ranges overlap if either one's first byte
	          // falls inside the other.
	          if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
	              (FirstByte <= InFirstByte && InFirstByte <= LastByte))
	            ArgChains.push_back(SDValue(L, 1));
	        }

	  // Build a tokenfactor for all the chains.
	  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
	}

	/// Return true if the callee pops its own stack arguments, which is the
	/// case only for CallingConv::Fast with guaranteed tail-call optimisation
	/// (\p TailCallOpt) enabled.
	bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
	                                                   bool TailCallOpt) const {
	  if (CallCC != CallingConv::Fast)
	    return false;
	  return TailCallOpt;
	}

	/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
	/// and add input and output parameter nodes.
	SDValue
	AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &DL = CLI.DL;
	SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
	SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
	SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	bool &IsTailCall = CLI.IsTailCall;
	CallingConv::ID CallConv = CLI.CallConv;
	bool IsVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool IsThisReturn = false;

	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
	bool IsSibCall = false;

	if (IsTailCall) {
	// Check if it's really possible to do a tail call.
	IsTailCall = isEligibleForTailCallOptimization(
	Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
	if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
	report_fatal_error("failed to perform tail call elimination on a call "
	"site marked musttail");

	// A sibling call is one where we're under the usual C ABI and not planning
	// to change that but can still do a tail call:
	if (!TailCallOpt && IsTailCall)
	IsSibCall = true;

	if (IsTailCall)
	++NumTailCalls;
	}

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
	*DAG.getContext());

	if (IsVarArg) {
	// Handle fixed and variable vector arguments differently.
	// Variable vector arguments always go into memory.
	unsigned NumArgs = Outs.size();

	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ArgVT = Outs[i].VT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
	/IsVarArg=/ !Outs[i].IsFixed);
	bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	} else {
	// At this point, Outs[].VT may already be promoted to i32. To correctly
	// handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
	// i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
	// Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
	// we use a special version of AnalyzeCallOperands to pass in ValVT and
	// LocVT.
	unsigned NumArgs = Outs.size();
	for (unsigned i = 0; i != NumArgs; ++i) {
	MVT ValVT = Outs[i].VT;
	// Get type of the original argument.
	EVT ActualVT = getValueType(DAG.getDataLayout(),
	CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
	/AllowUnknown/ true);
	MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
	ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
	// If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
	if (ActualMVT == MVT::i1 \|\| ActualMVT == MVT::i8)
	ValVT = MVT::i8;
	else if (ActualMVT == MVT::i16)
	ValVT = MVT::i16;

	CCAssignFn AssignFn = CCAssignFnForCall(CallConv, /IsVarArg=*/false);
	bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
	assert(!Res && "Call operand has unhandled type");
	(void)Res;
	}
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getNextStackOffset();

	if (IsSibCall) {
	// Since we're not changing the ABI to make this a tail call, the memory
	// operands are already available in the caller's incoming argument space.
	NumBytes = 0;
	}

	// FPDiff is the byte offset of the call's argument area from the callee's.
	// Stores to callee stack arguments will be placed in FixedStackSlots offset
	// by this amount for a tail call. In a sibling call it must be 0 because the
	// caller will deallocate the entire stack and the callee still expects its
	// arguments to begin at SP+0. Completely unused for non-tail calls.
	int FPDiff = 0;

	if (IsTailCall && !IsSibCall) {
	unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();

	// Since callee will pop argument stack as a tail call, we must keep the
	// popped size 16-byte aligned.
	NumBytes = alignTo(NumBytes, 16);

	// FPDiff will be negative if this tail call requires more space than we
	// would automatically have in our incoming argument space. Positive if we
	// can actually shrink the stack.
	FPDiff = NumReusableBytes - NumBytes;

	// The stack pointer must be 16-byte aligned at all times it's used for a
	// memory operation, which in practice means at all times and in
	// particular across call boundaries. Therefore our own arguments started at
	// a 16-byte aligned SP and the delta applied for the tail call should
	// satisfy the same constraint.
	assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
	}

	// Adjust the stack pointer for the new arguments...
	// These operations are automatically eliminated by the prolog/epilog pass
	if (!IsSibCall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL);

	SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP,
	getPointerTy(DAG.getDataLayout()));

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	auto PtrVT = getPointerTy(DAG.getDataLayout());

	// Walk the register/memloc assignments, inserting copies/loads.
	for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
	++i, ++realArgIdx) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[realArgIdx];
	ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default:
	llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full:
	break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::AExt:
	if (Outs[realArgIdx].ArgVT == MVT::i1) {
	// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
	Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
	Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
	}
	Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
	break;
	case CCValAssign::FPExt:
	Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
	break;
	}

	if (VA.isRegLoc()) {
	if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
	Outs[0].VT == MVT::i64) {
	assert(VA.getLocVT() == MVT::i64 &&
	"unexpected calling convention register assignment");
	assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
	"unexpected use of 'returned'");
	IsThisReturn = true;
	}
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	} else {
	assert(VA.isMemLoc());

	SDValue DstAddr;
	MachinePointerInfo DstInfo;

	// FIXME: This works on big-endian for composite byvals, which are the
	// common case. It should also work for fundamental types too.
	uint32_t BEAlign = 0;
	unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
	: VA.getValVT().getSizeInBits();
	OpSize = (OpSize + 7) / 8;
	if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
	!Flags.isInConsecutiveRegs()) {
	if (OpSize < 8)
	BEAlign = 8 - OpSize;
	}
	unsigned LocMemOffset = VA.getLocMemOffset();
	int32_t Offset = LocMemOffset + BEAlign;
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);
	PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);

	if (IsTailCall) {
	Offset = Offset + FPDiff;
	int FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);

	DstAddr = DAG.getFrameIndex(FI, PtrVT);
	DstInfo =
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);

	// Make sure any stack arguments overlapping with where we're storing
	// are loaded before this eventual operation. Otherwise they'll be
	// clobbered.
	Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
	} else {
	SDValue PtrOff = DAG.getIntPtrConstant(Offset, DL);

	DstAddr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
	DstInfo = MachinePointerInfo::getStack(DAG.getMachineFunction(),
	LocMemOffset);
	}

	if (Outs[i].Flags.isByVal()) {
	SDValue SizeNode =
	DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
	SDValue Cpy = DAG.getMemcpy(
	Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
	/isVol = / false, /AlwaysInline = / false,
	/isTailCall = / false,
	DstInfo, MachinePointerInfo());

	MemOpChains.push_back(Cpy);
	} else {
	// Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
	// promoted to a legal register type i32, we should truncate Arg back to
	// i1/i8/i16.
	if (VA.getValVT() == MVT::i1 \|\| VA.getValVT() == MVT::i8 \|\|
	VA.getValVT() == MVT::i16)
	Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);

	SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo);
	MemOpChains.push_back(Store);
	}
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into the appropriate regs.
	SDValue InFlag;
	for (auto &RegToPass : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
	RegToPass.second, InFlag);
	InFlag = Chain.getValue(1);
	}

	// If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
	// direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
	// node so that legalize doesn't hack it.
	if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
	auto GV = G->getGlobal();
	if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
	AArch64II::MO_GOT) {
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	} else {
	const GlobalValue *GV = G->getGlobal();
	Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
	}
	} else if (auto *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	if (getTargetMachine().getCodeModel() == CodeModel::Large &&
	Subtarget->isTargetMachO()) {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, AArch64II::MO_GOT);
	Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
	} else {
	const char *Sym = S->getSymbol();
	Callee = DAG.getTargetExternalSymbol(Sym, PtrVT, 0);
	}
	}

	// We don't usually want to end the call-sequence here because we would tidy
	// the frame up after the call, however in the ABI-changing tail-call case
	// we've carefully laid out the parameters so that when sp is reset they'll be
	// in the correct location.
	if (IsTailCall && !IsSibCall) {
	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(0, DL, true), InFlag, DL);
	InFlag = Chain.getValue(1);
	}

	std::vector<SDValue> Ops;
	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (IsTailCall) {
	// Each tail call may have to adjust the stack by a different amount, so
	// this information must travel along with the operation for eventual
	// consumption by emitEpilogue.
	Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
	}

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (auto &RegToPass : RegsToPass)
	Ops.push_back(DAG.getRegister(RegToPass.first,
	RegToPass.second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	const uint32_t *Mask;
	const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	if (IsThisReturn) {
	// For 'this' returns, use the X0-preserving mask if applicable
	Mask = TRI->getThisReturnPreservedMask(MF, CallConv);
	if (!Mask) {
	IsThisReturn = false;
	Mask = TRI->getCallPreservedMask(MF, CallConv);
	}
	} else
	Mask = TRI->getCallPreservedMask(MF, CallConv);

	assert(Mask && "Missing call preserved mask for calling convention");
	Ops.push_back(DAG.getRegisterMask(Mask));

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	// If we're doing a tail call, use a TC_RETURN here rather than an
	// actual call instruction.
	if (IsTailCall) {
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
	}

	// Returns a chain and a flag for retval copy to use.
	Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	uint64_t CalleePopBytes =
	DoesCalleeRestoreStack(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : 0;

	Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, DL, true),
	DAG.getIntPtrConstant(CalleePopBytes, DL, true),
	InFlag, DL);
	if (!Ins.empty())
	InFlag = Chain.getValue(1);

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
	InVals, IsThisReturn,
	IsThisReturn ? OutVals[0] : SDValue());
	}

	/// Report whether the values described by \p Outs can be returned under
	/// \p CallConv. The answer comes from CCState::CheckReturn, run with the
	/// return-value assignment function chosen for that calling convention.
	bool AArch64TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  // WebKit_JS has a dedicated return convention; everything else uses AAPCS.
	  CCAssignFn *AssignRet = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignRet = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> Locs;
	  CCState State(CallConv, isVarArg, MF, Locs, Context);
	  return State.CheckReturn(Outs, AssignRet);
	}

	/// Lower a function return. Each outgoing value is assigned a return
	/// register, copied into it through a glued CopyToReg, and the resulting
	/// chain, registers and glue become the operands of a RET_FLAG node.
	SDValue
	AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	                                   bool isVarArg,
	                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
	                                   const SmallVectorImpl<SDValue> &OutVals,
	                                   const SDLoc &DL, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();

	  // WebKit_JS has a dedicated return convention; everything else uses AAPCS.
	  CCAssignFn *AssignRet = RetCC_AArch64_AAPCS;
	  if (CallConv == CallingConv::WebKit_JS)
	    AssignRet = RetCC_AArch64_WebKit_JS;

	  SmallVector<CCValAssign, 16> Locs;
	  CCState State(CallConv, isVarArg, MF, Locs, *DAG.getContext());
	  State.AnalyzeReturn(Outs, AssignRet);

	  // Operand 0 holds the chain; it is overwritten below once every copy has
	  // been threaded onto it.
	  SmallVector<SDValue, 4> RetOps(1, Chain);
	  SDValue Glue;

	  // Copy the result values into the output registers.
	  for (unsigned Idx = 0; Idx != Locs.size(); ++Idx) {
	    CCValAssign &Loc = Locs[Idx];
	    assert(Loc.isRegLoc() && "Can only return in registers!");
	    SDValue Val = OutVals[Idx];

	    const CCValAssign::LocInfo Info = Loc.getLocInfo();
	    if (Info == CCValAssign::Full) {
	      if (Outs[Idx].ArgVT == MVT::i1) {
	        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
	        // value. This is strictly redundant on Darwin (which uses "zeroext
	        // i1"), but will be optimised out before ISel.
	        Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Val);
	        Val = DAG.getNode(ISD::ZERO_EXTEND, DL, Loc.getLocVT(), Val);
	      }
	    } else if (Info == CCValAssign::BCvt) {
	      Val = DAG.getNode(ISD::BITCAST, DL, Loc.getLocVT(), Val);
	    } else {
	      llvm_unreachable("Unknown loc info!");
	    }

	    Chain = DAG.getCopyToReg(Chain, DL, Loc.getLocReg(), Val, Glue);
	    Glue = Chain.getValue(1);
	    RetOps.push_back(DAG.getRegister(Loc.getLocReg(), Loc.getLocVT()));
	  }

	  // Append every register reported by getCalleeSavedRegsViaCopy (a
	  // zero-terminated list, possibly null) as an extra return operand.
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  if (const MCPhysReg *CSR = TRI->getCalleeSavedRegsViaCopy(&MF)) {
	    while (*CSR) {
	      if (AArch64::GPR64RegClass.contains(*CSR))
	        RetOps.push_back(DAG.getRegister(*CSR, MVT::i64));
	      else if (AArch64::FPR64RegClass.contains(*CSR))
	        RetOps.push_back(
	            DAG.getRegister(*CSR, MVT::getFloatingPointVT(64)));
	      else
	        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	      ++CSR;
	    }
	  }

	  RetOps[0] = Chain; // Update chain.

	  // Add the glue if any copy produced one.
	  if (Glue.getNode())
	    RetOps.push_back(Glue);

	  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Code
	//===----------------------------------------------------------------------===//

	/// Build the target global-address node for \p N with value type \p Ty,
	/// a zero offset and target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const GlobalValue *Global = N->getGlobal();
	  SDLoc Loc(N);
	  return DAG.getTargetGlobalAddress(Global, Loc, Ty, 0, Flag);
	}

	/// Build the target jump-table node for the table index held by \p N, with
	/// value type \p Ty and target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const auto TableIdx = N->getIndex();
	  return DAG.getTargetJumpTable(TableIdx, Ty, Flag);
	}

	/// Build the target constant-pool node for \p N, forwarding its constant,
	/// alignment and offset, with value type \p Ty and target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const auto *PoolConst = N->getConstVal();
	  const auto PoolAlign = N->getAlignment();
	  const auto PoolOffset = N->getOffset();
	  return DAG.getTargetConstantPool(PoolConst, Ty, PoolAlign, PoolOffset,
	                                   Flag);
	}

	/// Build the target block-address node for \p N with value type \p Ty,
	/// a zero offset and target flags \p Flag.
	SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
	                                             SelectionDAG &DAG,
	                                             unsigned Flag) const {
	  const BlockAddress *Addr = N->getBlockAddress();
	  return DAG.getTargetBlockAddress(Addr, Ty, 0, Flag);
	}

	// (loadGOT sym)
	//
	// Wrap the MO_GOT-flagged target node for N in a LOADgot node of pointer
	// type.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG) const {
	  DEBUG(dbgs() << "AArch64TargetLowering::getGOT\n");
	  const EVT PtrTy = getPointerTy(DAG.getDataLayout());
	  SDValue GOTSym = getTargetNode(N, PtrTy, DAG, AArch64II::MO_GOT);
	  // FIXME: Once remat is capable of dealing with instructions with register
	  // operands, expand this into two nodes instead of using a wrapper node.
	  return DAG.getNode(AArch64ISD::LOADgot, SDLoc(N), PtrTy, GOTSym);
	}

	// (wrapper %highest(sym), %higher(sym), %hi(sym), %lo(sym))
	//
	// Builds a WrapperLarge node of pointer type whose four operands are target
	// nodes for the same symbol N, flagged MO_G3, MO_G2, MO_G1 and MO_G0 in that
	// order. Every operand except the MO_G3 one also carries MO_NC.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddrLarge(NodeTy *N, SelectionDAG &DAG)
	const {
	DEBUG(dbgs() << "AArch64TargetLowering::getAddrLarge\n");
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());
	// Local shorthand so the operand list below stays readable.
	const unsigned char MO_NC = AArch64II::MO_NC;
	return DAG.getNode(
	AArch64ISD::WrapperLarge, DL, Ty,
	getTargetNode(N, Ty, DAG, AArch64II::MO_G3),
	getTargetNode(N, Ty, DAG, AArch64II::MO_G2 \| MO_NC),
	getTargetNode(N, Ty, DAG, AArch64II::MO_G1 \| MO_NC),
	getTargetNode(N, Ty, DAG, AArch64II::MO_G0 \| MO_NC));
	}

	// (addlow (adrp %hi(sym)) %lo(sym))
	//
	// Builds two target nodes for symbol N: one flagged MO_PAGE and one flagged
	// MO_PAGEOFF | MO_NC. The MO_PAGE node is wrapped in ADRP, and the result is
	// combined with the page-offset node through ADDlow.
	template <class NodeTy>
	SDValue AArch64TargetLowering::getAddr(NodeTy *N, SelectionDAG &DAG) const {
	DEBUG(dbgs() << "AArch64TargetLowering::getAddr\n");
	SDLoc DL(N);
	EVT Ty = getPointerTy(DAG.getDataLayout());
	// Hi/Lo refer to the same symbol and differ only in their target flags.
	SDValue Hi = getTargetNode(N, Ty, DAG, AArch64II::MO_PAGE);
	SDValue Lo = getTargetNode(N, Ty, DAG,
	AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);
	SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, Ty, Hi);
	return DAG.getNode(AArch64ISD::ADDlow, DL, Ty, ADRP, Lo);
	}

	/// Lower a GlobalAddress node. References classified with MO_GOT go
	/// through getGOT; otherwise the code model selects between getAddrLarge
	/// (CodeModel::Large) and getAddr.
	SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  GlobalAddressSDNode *GlobalNode = cast<GlobalAddressSDNode>(Op);
	  const unsigned char RefFlags = Subtarget->ClassifyGlobalReference(
	      GlobalNode->getGlobal(), getTargetMachine());

	  assert(GlobalNode->getOffset() == 0 && "unexpected offset in global node");

	  // This also catches the large code model case for Darwin.
	  if ((RefFlags & AArch64II::MO_GOT) != 0)
	    return getGOT(GlobalNode, DAG);

	  const bool UseLargeModel =
	      getTargetMachine().getCodeModel() == CodeModel::Large;
	  return UseLargeModel ? getAddrLarge(GlobalNode, DAG)
	                       : getAddr(GlobalNode, DAG);
	}

	/// \brief Convert a TLS address reference into the correct sequence of loads
	/// and calls to compute the variable's address (for Darwin, currently) and
	/// return an SDValue containing the final node.
	///
	/// Darwin only has one TLS scheme which must be capable of dealing with the
	/// fully general situation, in the worst case. This means:
	/// + "extern __thread" declaration.
	/// + Defined in a possibly unknown dynamic library.
	///
	/// The general system is that each __thread variable has a [3 x i64] descriptor
	/// which contains information used by the runtime to calculate the address. The
	/// only part of this the compiler needs to know about is the first xword, which
	/// contains a function pointer that must be called with the address of the
	/// entire descriptor in "x0".
	///
	/// Since this descriptor may be in a different unit, in general even the
	/// descriptor must be accessed via an indirect load. The "ideal" code sequence
	/// is:
	/// adrp x0, _var@TLVPPAGE
	/// ldr x0, [x0, _var@TLVPPAGEOFF] ; x0 now contains address of descriptor
	/// ldr x1, [x0] ; x1 contains 1st entry of descriptor,
	/// ; the function pointer
	/// blr x1 ; Uses descriptor address in x0
	/// ; Address of _var is now in x0.
	///
	/// If the address of _var's descriptor is known to the linker, then it can
	/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
	/// a slight efficiency gain.
	SDValue
	AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");

	SDLoc DL(Op);
	MVT PtrVT = getPointerTy(DAG.getDataLayout());
	const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();

	// Address of the variable's descriptor: an MO_TLS-flagged target global
	// address wrapped in LOADgot.
	SDValue TLVPAddr =
	DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);

	// The first entry in the descriptor is a function pointer that we must call
	// to obtain the address of the variable.
	// The load is chained to the entry node and marked non-temporal, invariant
	// and dereferenceable.
	SDValue Chain = DAG.getEntryNode();
	SDValue FuncTLVGet = DAG.getLoad(
	MVT::i64, DL, Chain, DescAddr,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()),
	/* Alignment = */ 8,
	MachineMemOperand::MONonTemporal \| MachineMemOperand::MOInvariant \|
	MachineMemOperand::MODereferenceable);
	Chain = FuncTLVGet.getValue(1);

	// Record on the frame info that this function adjusts the stack.
	// NOTE(review): presumably required because the call below is emitted
	// without a CALLSEQ_START/CALLSEQ_END pair -- confirm.
	MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	MFI.setAdjustsStack(true);

	// TLS calls preserve all registers except those that absolutely must be
	// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
	// silly).
	const uint32_t *Mask =
	Subtarget->getRegisterInfo()->getTLSCallPreservedMask();

	// Finally, we can make the call. This is just a degenerate version of a
	// normal AArch64 call node: x0 takes the address of the descriptor, and
	// returns the address of the variable in this thread.
	// The three nodes are linked through glue: the CopyToReg's glue result feeds
	// the CALL, and the CALL's glue result feeds the CopyFromReg.
	Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
	Chain =
	DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
	Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
	DAG.getRegisterMask(Mask), Chain.getValue(1));
	return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
	}

	/// Emit the "TLS-descriptor" call used by the general-dynamic and
	/// local-dynamic TLS models. The variable has a descriptor that is reachable
	/// through a PC-relative ADRP and whose first entry is a function pointer
	/// performing the resolution.
	///
	/// The sequence is:
	/// adrp x0, :tlsdesc:var
	/// ldr x1, [x0, #:tlsdesc_lo12:var]
	/// add x0, x0, #:tlsdesc_lo12:var
	/// .tlsdesccall var
	/// blr x1
	/// (TPIDR_EL0 offset now in x0)
	///
	/// The linker can only optimize/relax this sequence if it is emitted
	/// unscheduled. It is therefore represented by a single pseudo-instruction
	/// (TLSDESC_CALLSEQ) that is expanded very late in the compilation flow, so
	/// the instructions come out exactly as listed above.
	///
	/// Returns a CopyFromReg of X0, glued to the TLSDESC_CALLSEQ node.
	SDValue AArch64TargetLowering::LowerELFTLSDescCallSeq(SDValue SymAddr,
	                                                      const SDLoc &DL,
	                                                      SelectionDAG &DAG) const {
	  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // The pseudo is chained to the entry node and yields a chain plus glue.
	  SDValue SeqOps[] = {DAG.getEntryNode(), SymAddr};
	  SDVTList SeqTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue CallSeq =
	      DAG.getNode(AArch64ISD::TLSDESC_CALLSEQ, DL, SeqTys, SeqOps);

	  return DAG.getCopyFromReg(CallSeq, DL, AArch64::X0, PtrVT,
	                            CallSeq.getValue(1));
	}

	/// Lower a thread-local GlobalAddress node for ELF targets.
	///
	/// The TLS model reported by the target machine selects the sequence:
	///  - LocalExec: THREAD_POINTER plus the variable's MO_HI12 and
	///    MO_PAGEOFF | MO_NC parts, added with two ADDXri machine nodes. This
	///    result is returned directly.
	///  - InitialExec: the offset is a LOADgot of the MO_TLS-flagged address.
	///  - LocalDynamic: a TLS-descriptor call against _TLS_MODULE_BASE_, followed
	///    by two ADDXri machine nodes adding the variable's MO_HI12 and
	///    MO_PAGEOFF | MO_NC parts.
	///  - GeneralDynamic: a TLS-descriptor call against the variable itself.
	/// For the last three, the result is THREAD_POINTER + offset.
	///
	/// If the EmulatedTLS target option is set, lowering is delegated to
	/// LowerToTLSEmulatedModel instead.
	SDValue
	AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Subtarget->isTargetELF() && "This function expects an ELF target");
	assert(Subtarget->useSmallAddressing() &&
	"ELF TLS only supported in small memory model");
	// Different choices can be made for the maximum size of the TLS area for a
	// module. For the small address model, the default TLS size is 16MiB and the
	// maximum TLS size is 4GiB.
	// FIXME: add -mtls-size command line option and make it control the 16MiB
	// vs. 4GiB code sequence generation.
	const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());

	if (DAG.getTarget().Options.EmulatedTLS)
	return LowerToTLSEmulatedModel(GA, DAG);

	// Unless local-dynamic generation is enabled, treat local-dynamic accesses
	// as general-dynamic ones.
	if (!EnableAArch64ELFLocalDynamicTLSGeneration) {
	if (Model == TLSModel::LocalDynamic)
	Model = TLSModel::GeneralDynamic;
	}

	// TPOff is filled in by every model except LocalExec, which returns early.
	SDValue TPOff;
	EVT PtrVT = getPointerTy(DAG.getDataLayout());
	SDLoc DL(Op);
	const GlobalValue *GV = GA->getGlobal();

	SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);

	if (Model == TLSModel::LocalExec) {
	// Two target addresses for the same variable, differing only in flags.
	SDValue HiVar = DAG.getTargetGlobalAddress(
	GV, DL, PtrVT, 0, AArch64II::MO_TLS \| AArch64II::MO_HI12);
	SDValue LoVar = DAG.getTargetGlobalAddress(
	GV, DL, PtrVT, 0,
	AArch64II::MO_TLS \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);

	// Add the HI12 part to the thread pointer first, then the low part. The
	// third ADDXri operand is a zero target constant in both nodes.
	SDValue TPWithOff_lo =
	SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
	HiVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	SDValue TPWithOff =
	SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
	LoVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	return TPWithOff;
	} else if (Model == TLSModel::InitialExec) {
	// The thread-pointer offset is loaded through LOADgot.
	TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
	TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
	} else if (Model == TLSModel::LocalDynamic) {
	// Local-dynamic accesses proceed in two phases. A general-dynamic TLS
	// descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
	// the beginning of the module's TLS region, followed by a DTPREL offset
	// calculation.

	// These accesses will need deduplicating if there's more than one.
	AArch64FunctionInfo *MFI =
	DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
	MFI->incNumLocalDynamicTLSAccesses();

	// The call needs a relocation too for linker relaxation. It doesn't make
	// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	// the address.
	SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
	AArch64II::MO_TLS);

	// Now we can calculate the offset from TPIDR_EL0 to this module's
	// thread-local area.
	TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);

	// Now use :dtprel_whatever: operations to calculate this variable's offset
	// in its thread-storage area.
	SDValue HiVar = DAG.getTargetGlobalAddress(
	GV, DL, MVT::i64, 0, AArch64II::MO_TLS \| AArch64II::MO_HI12);
	SDValue LoVar = DAG.getTargetGlobalAddress(
	GV, DL, MVT::i64, 0,
	AArch64II::MO_TLS \| AArch64II::MO_PAGEOFF \| AArch64II::MO_NC);

	// Accumulate the HI12 part and then the low part onto the module base.
	TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, HiVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	TPOff = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPOff, LoVar,
	DAG.getTargetConstant(0, DL, MVT::i32)),
	0);
	} else if (Model == TLSModel::GeneralDynamic) {
	// The call needs a relocation too for linker relaxation. It doesn't make
	// sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
	// the address.
	SDValue SymAddr =
	DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);

	// Finally we can make a call to calculate the offset from tpidr_el0.
	TPOff = LowerELFTLSDescCallSeq(SymAddr, DL, DAG);
	} else
	llvm_unreachable("Unsupported ELF TLS access model");

	// All non-LocalExec models end here: thread pointer plus computed offset.
	return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
	}

	/// Dispatch TLS address lowering to the Darwin or ELF implementation
	/// according to the subtarget. Any other platform is unreachable.
	SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const bool IsDarwin = Subtarget->isTargetDarwin();
	  if (!IsDarwin && !Subtarget->isTargetELF())
	    llvm_unreachable("Unexpected platform trying to use TLS");

	  return IsDarwin ? LowerDarwinGlobalTLSAddress(Op, DAG)
	                  : LowerELFGlobalTLSAddress(Op, DAG);
	}

	/// Lower a BR_CC node. The operands read are: 0 = chain, 1 = condition code,
	/// 2 = LHS, 3 = RHS, 4 = branch destination.
	///
	/// In order, the lowering tries:
	///  - f128 comparisons: softened first via softenSetCCOperands.
	///  - the overflow result of an {s|u}{add|sub|mul}.with.overflow compared
	///    against one: a BRCOND on the overflow flags. If the arithmetic type is
	///    not legal, an empty SDValue is returned.
	///  - integer compares against 0 or -1: folded to CBZ/CBNZ/TBZ/TBNZ where
	///    possible, otherwise a compare plus BRCOND.
	///  - f32/f64 compares: one BRCOND, or two chained BRCONDs when the condition
	///    maps to two AArch64 condition codes.
	SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
	SDValue Chain = Op.getOperand(0);
	ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
	SDValue LHS = Op.getOperand(2);
	SDValue RHS = Op.getOperand(3);
	SDValue Dest = Op.getOperand(4);
	SDLoc dl(Op);

	// Handle f128 first, since lowering it will result in comparing the return
	// value of a libcall against zero, which is just what the rest of LowerBR_CC
	// is expecting to deal with.
	if (LHS.getValueType() == MVT::f128) {
	softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	// If softenSetCCOperands returned a scalar, we need to compare the result
	// against zero to select between true and false values.
	if (!RHS.getNode()) {
	RHS = DAG.getConstant(0, dl, LHS.getValueType());
	CC = ISD::SETNE;
	}
	}

	// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
	// instruction.
	// This only matches when LHS is result #1 of the overflow node and RHS is
	// the constant one.
	unsigned Opc = LHS.getOpcode();
	if (LHS.getResNo() == 1 && isOneConstant(RHS) &&
	(Opc == ISD::SADDO \|\| Opc == ISD::UADDO \|\| Opc == ISD::SSUBO \|\|
	Opc == ISD::USUBO \|\| Opc == ISD::SMULO \|\| Opc == ISD::UMULO)) {
	assert((CC == ISD::SETEQ \|\| CC == ISD::SETNE) &&
	"Unexpected condition code.");
	// Only lower legal XALUO ops.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
	return SDValue();

	// The actual operation with overflow check.
	AArch64CC::CondCode OFCC;
	SDValue Value, Overflow;
	std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);

	// OFCC is the condition under which overflow occurred; for "overflow != 1"
	// branch on its inverse.
	if (CC == ISD::SETNE)
	OFCC = getInvertedCondCode(OFCC);
	SDValue CCVal = DAG.getConstant(OFCC, dl, MVT::i32);

	return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	Overflow);
	}

	if (LHS.getValueType().isInteger()) {
	assert((LHS.getValueType() == RHS.getValueType()) &&
	(LHS.getValueType() == MVT::i32 \|\| LHS.getValueType() == MVT::i64));

	// If the RHS of the comparison is zero, we can potentially fold this
	// to a specialized branch.
	const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
	if (RHSC && RHSC->getZExtValue() == 0) {
	if (CC == ISD::SETEQ) {
	// See if we can use a TBZ to fold in an AND as well.
	// TBZ has a smaller branch displacement than CBZ. If the offset is
	// out of bounds, a late MI-layer pass rewrites branches.
	// 403.gcc is an example that hits this case.
	if (LHS.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(LHS.getOperand(1)) &&
	isPowerOf2_64(LHS.getConstantOperandVal(1))) {
	// (x & (1 << n)) == 0  -->  TBZ x, n
	SDValue Test = LHS.getOperand(0);
	uint64_t Mask = LHS.getConstantOperandVal(1);
	return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
	DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
	Dest);
	}

	return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
	} else if (CC == ISD::SETNE) {
	// See if we can use a TBNZ to fold in an AND as well.
	// TBNZ has a smaller branch displacement than CBNZ. If the offset is
	// out of bounds, a late MI-layer pass rewrites branches.
	// 403.gcc is an example that hits this case.
	if (LHS.getOpcode() == ISD::AND &&
	isa<ConstantSDNode>(LHS.getOperand(1)) &&
	isPowerOf2_64(LHS.getConstantOperandVal(1))) {
	// (x & (1 << n)) != 0  -->  TBNZ x, n
	SDValue Test = LHS.getOperand(0);
	uint64_t Mask = LHS.getConstantOperandVal(1);
	return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
	DAG.getConstant(Log2_64(Mask), dl, MVT::i64),
	Dest);
	}

	return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
	} else if (CC == ISD::SETLT && LHS.getOpcode() != ISD::AND) {
	// Don't combine AND since emitComparison converts the AND to an ANDS
	// (a.k.a. TST) and the test in the test bit and branch instruction
	// becomes redundant. This would also increase register pressure.
	// x < 0 is a test of the top bit; despite its name, Mask holds that bit's
	// index, not a bit mask.
	uint64_t Mask = LHS.getValueSizeInBits() - 1;
	return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, LHS,
	DAG.getConstant(Mask, dl, MVT::i64), Dest);
	}
	}
	if (RHSC && RHSC->getSExtValue() == -1 && CC == ISD::SETGT &&
	LHS.getOpcode() != ISD::AND) {
	// Don't combine AND since emitComparison converts the AND to an ANDS
	// (a.k.a. TST) and the test in the test bit and branch instruction
	// becomes redundant. This would also increase register pressure.
	// x > -1 branches when the top bit is clear; Mask is again a bit index.
	uint64_t Mask = LHS.getValueSizeInBits() - 1;
	return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, LHS,
	DAG.getConstant(Mask, dl, MVT::i64), Dest);
	}

	// General integer case: emit the compare and branch on its condition code.
	SDValue CCVal;
	SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
	return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
	Cmp);
	}

	assert(LHS.getValueType() == MVT::f32 \|\| LHS.getValueType() == MVT::f64);

	// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	// clean. Some of them require two branches to implement.
	SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
	AArch64CC::CondCode CC1, CC2;
	changeFPCCToAArch64CC(CC, CC1, CC2);
	SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	SDValue BR1 =
	DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
	// CC2 == AL means a single condition suffices. Otherwise emit a second
	// branch to the same destination, chained after the first, on the same
	// compare.
	if (CC2 != AArch64CC::AL) {
	SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
	Cmp);
	}

	return BR1;
	}

	/// Lower FCOPYSIGN for f32, v2f32, v4f32, f64 and v2f64 result types.
	///
	/// Operand 1 is first converted to the result type if its width differs.
	/// Both operands are then placed in an integer vector (scalars via a
	/// subregister insert into an undef vector, vectors via bitcast), a
	/// sign-bit mask vector is built, and the three are combined with an
	/// AArch64ISD::BIT node. The result is converted back with a subregister
	/// extract (scalars) or a bitcast (vectors).
	SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
	SelectionDAG &DAG) const {
	EVT VT = Op.getValueType();
	SDLoc DL(Op);

	SDValue In1 = Op.getOperand(0);
	SDValue In2 = Op.getOperand(1);
	EVT SrcVT = In2.getValueType();

	// Bring In2 to the result type so both inputs have the same width.
	if (SrcVT.bitsLT(VT))
	In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
	else if (SrcVT.bitsGT(VT))
	In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0, DL));

	// NOTE(review): EltVT is assigned in both branches below but is not read
	// again within this function.
	EVT VecVT;
	EVT EltVT;
	uint64_t EltMask;
	SDValue VecVal1, VecVal2;
	if (VT == MVT::f32 \|\| VT == MVT::v2f32 \|\| VT == MVT::v4f32) {
	EltVT = MVT::i32;
	// Scalar f32 uses the v4i32 container, same as v4f32.
	VecVT = (VT == MVT::v2f32 ? MVT::v2i32 : MVT::v4i32);
	// Sign bit of a 32-bit element.
	EltMask = 0x80000000ULL;

	if (!VT.isVector()) {
	VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
	DAG.getUNDEF(VecVT), In1);
	VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
	DAG.getUNDEF(VecVT), In2);
	} else {
	VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
	VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
	}
	} else if (VT == MVT::f64 \|\| VT == MVT::v2f64) {
	EltVT = MVT::i64;
	VecVT = MVT::v2i64;

	// We want to materialize a mask with the high bit set, but the AdvSIMD
	// immediate moves cannot materialize that in a single instruction for
	// 64-bit elements. Instead, materialize zero and then negate it.
	EltMask = 0;

	if (!VT.isVector()) {
	VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
	DAG.getUNDEF(VecVT), In1);
	VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
	DAG.getUNDEF(VecVT), In2);
	} else {
	VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
	VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
	}
	} else {
	llvm_unreachable("Invalid type for copysign!");
	}

	SDValue BuildVec = DAG.getConstant(EltMask, DL, VecVT);

	// If we couldn't materialize the mask above, then the mask vector will be
	// the zero vector, and we need to negate it here.
	// The FNEG is applied to the zero vector reinterpreted as v2f64, then the
	// result is cast back to v2i64.
	if (VT == MVT::f64 \|\| VT == MVT::v2f64) {
	BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
	BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
	BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
	}

	// NOTE(review): presumably BIT inserts the bits of VecVal2 selected by the
	// mask into VecVal1 -- confirm against the AArch64ISD::BIT definition.
	SDValue Sel =
	DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);

	// Undo the container conversion that was applied to the inputs.
	if (VT == MVT::f32)
	return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
	else if (VT == MVT::f64)
	return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
	else
	return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
	}

	/// Lower a scalar CTPOP through the AdvSIMD unit. Returns an empty SDValue
	/// when the function carries the NoImplicitFloat attribute or the subtarget
	/// has no NEON.
	SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
	  const auto *Fn = DAG.getMachineFunction().getFunction();
	  if (Fn->hasFnAttribute(Attribute::NoImplicitFloat))
	    return SDValue();
	  if (!Subtarget->hasNEON())
	    return SDValue();

	  // While there is no integer popcount instruction, it can
	  // be more efficiently lowered to the following sequence that uses
	  // AdvSIMD registers/instructions as long as the copies to/from
	  // the AdvSIMD registers are cheap.
	  // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd
	  // CNT V0.8B, V0.8B // 8xbyte pop-counts
	  // ADDV B0, V0.8B // sum 8xbyte pop-counts
	  // UMOV X0, V0.B[0] // copy byte result back to integer reg
	  SDLoc DL(Op);
	  const EVT ResVT = Op.getValueType();

	  // Widen an i32 input to i64 so it can be viewed as eight bytes.
	  SDValue Src = Op.getOperand(0);
	  if (ResVT == MVT::i32)
	    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Src);
	  SDValue Bytes = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Src);

	  // Per-byte population counts, summed across lanes by the uaddlv intrinsic.
	  SDValue ByteCounts = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, Bytes);
	  SDValue IntrinsicID =
	      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32);
	  SDValue Sum = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
	                            IntrinsicID, ByteCounts);

	  // The intrinsic yields i32; widen again for an i64 result.
	  if (ResVT == MVT::i64)
	    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Sum);
	  return Sum;
	}

	/// Lower ISD::SETCC.
	///
	/// Vector results are forwarded to LowerVSETCC. For scalars the result is
	/// built as a CSEL between the constants 1 and 0:
	///  - f128 operands are first softened with softenSetCCOperands; if that
	///    leaves a single value (RHS is null) it is returned as is.
	///  - Integer compares are emitted through getAArch64Cmp on the inverted
	///    condition with the CSEL operands swapped.
	///  - f32/f64 compares use emitComparison and one or two CSELs, depending on
	///    whether changeFPCCToAArch64CC needs a second condition code.
	SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

	  if (Op.getValueType().isVector())
	    return LowerVSETCC(Op, DAG);

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDLoc dl(Op);

	  // We chose ZeroOrOneBooleanContents, so use zero and one.
	  EVT VT = Op.getValueType();
	  SDValue TVal = DAG.getConstant(1, dl, VT);
	  SDValue FVal = DAG.getConstant(0, dl, VT);

	  // Handle f128 first, since one possible outcome is a normal integer
	  // comparison which gets picked up by the next if statement.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	    // If softenSetCCOperands returned a scalar, use it.
	    if (!RHS.getNode()) {
	      assert(LHS.getValueType() == Op.getValueType() &&
	             "Unexpected setcc expansion!");
	      return LHS;
	    }
	  }

	  if (LHS.getValueType().isInteger()) {
	    SDValue CCVal;
	    SDValue Cmp =
	        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);

	    // Note that we inverted the condition above, so we reverse the order of
	    // the true and false operands here. This will allow the setcc to be
	    // matched to a single CSINC instruction.
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
	  }

	  // Now we know we're dealing with FP values.
	  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);

	  // If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
	  // and do the comparison.
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);
	  if (CC2 == AArch64CC::AL) {
	    // A single AArch64 condition suffices: recompute it for the inverted
	    // predicate so the CSEL below can use swapped operands.
	    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);

	    // Note that we inverted the condition above, so we reverse the order of
	    // the true and false operands here. This will allow the setcc to be
	    // matched to a single CSINC instruction.
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
	  } else {
	    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
	    // totally clean. Some of them require two CSELs to implement. As is in
	    // this case, we emit the first CSEL and then emit a second using the output
	    // of the first as the RHS. We're effectively OR'ing the two CC's together.

	    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
	    SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	    SDValue CS1 =
	        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }
	}

	/// Lower "LHS <CC> RHS ? TVal : FVal" to an AArch64 conditional select.
	///
	/// f128 comparisons are softened first and f16 comparisons are performed in
	/// f32. For integer comparisons the function tries to pick CSINV, CSNEG or
	/// CSINC instead of CSEL (swapping the arms and inverting CC where needed)
	/// and to reuse LHS in place of a constant equal to RHS. For f32/f64
	/// comparisons it emits one CSEL, or two when changeFPCCToAArch64CC reports
	/// a second condition code.
	///
	/// \param CC   Comparison predicate; may be rewritten locally.
	/// \param LHS  Left comparison operand.
	/// \param RHS  Right comparison operand.
	/// \param TVal Value selected when the comparison holds.
	/// \param FVal Value selected otherwise.
	/// \param dl   Debug location for the created nodes.
	/// \param DAG  DAG in which the nodes are created.
	/// \returns the conditional-select node.
	SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
	                                              SDValue RHS, SDValue TVal,
	                                              SDValue FVal, const SDLoc &dl,
	                                              SelectionDAG &DAG) const {
	  // Handle f128 first, because it will result in a comparison of some RTLIB
	  // call result against zero.
	  if (LHS.getValueType() == MVT::f128) {
	    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);

	    // If softenSetCCOperands returned a scalar, we need to compare the result
	    // against zero to select between true and false values.
	    if (!RHS.getNode()) {
	      RHS = DAG.getConstant(0, dl, LHS.getValueType());
	      CC = ISD::SETNE;
	    }
	  }

	  // Also handle f16, for which we need to do a f32 comparison.
	  if (LHS.getValueType() == MVT::f16) {
	    LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, LHS);
	    RHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, RHS);
	  }

	  // Next, handle integers.
	  if (LHS.getValueType().isInteger()) {
	    assert((LHS.getValueType() == RHS.getValueType()) &&
	           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));

	    unsigned Opcode = AArch64ISD::CSEL;

	    // If both the TVal and the FVal are constants, see if we can swap them in
	    // order to form a CSINV or CSINC out of them.
	    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
	    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);

	    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
	      std::swap(TVal, FVal);
	      std::swap(CTVal, CFVal);
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
	      std::swap(TVal, FVal);
	      std::swap(CTVal, CFVal);
	      CC = ISD::getSetCCInverse(CC, true);
	    } else if (TVal.getOpcode() == ISD::XOR) {
	      // If TVal is a NOT we want to swap TVal and FVal so that we can match
	      // with a CSINV rather than a CSEL.
	      if (isAllOnesConstant(TVal.getOperand(1))) {
	        std::swap(TVal, FVal);
	        std::swap(CTVal, CFVal);
	        CC = ISD::getSetCCInverse(CC, true);
	      }
	    } else if (TVal.getOpcode() == ISD::SUB) {
	      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
	      // that we can match with a CSNEG rather than a CSEL.
	      if (isNullConstant(TVal.getOperand(0))) {
	        std::swap(TVal, FVal);
	        std::swap(CTVal, CFVal);
	        CC = ISD::getSetCCInverse(CC, true);
	      }
	    } else if (CTVal && CFVal) {
	      const int64_t TrueVal = CTVal->getSExtValue();
	      const int64_t FalseVal = CFVal->getSExtValue();
	      // Unsigned copies for the negation/increment checks below: negating
	      // INT64_MIN or incrementing INT64_MAX as int64_t is undefined behavior,
	      // whereas unsigned arithmetic wraps with a well-defined result.
	      const uint64_t TrueVal64 = TrueVal;
	      const uint64_t FalseVal64 = FalseVal;
	      bool Swap = false;

	      // If both TVal and FVal are constants, see if FVal is the
	      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
	      // instead of a CSEL in that case.
	      if (TrueVal == ~FalseVal) {
	        Opcode = AArch64ISD::CSINV;
	      } else if (TrueVal64 == 0 - FalseVal64) {
	        Opcode = AArch64ISD::CSNEG;
	      } else if (TVal.getValueType() == MVT::i32) {
	        // If our operands are only 32-bit wide, make sure we use 32-bit
	        // arithmetic for the check whether we can use CSINC. This ensures that
	        // the addition in the check will wrap around properly in case there is
	        // an overflow (which would not be the case if we do the check with
	        // 64-bit arithmetic).
	        const uint32_t TrueVal32 = CTVal->getZExtValue();
	        const uint32_t FalseVal32 = CFVal->getZExtValue();

	        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
	          Opcode = AArch64ISD::CSINC;

	          if (TrueVal32 > FalseVal32) {
	            Swap = true;
	          }
	        }
	        // 64-bit check whether we can use CSINC.
	      } else if ((TrueVal64 == FalseVal64 + 1) ||
	                 (TrueVal64 + 1 == FalseVal64)) {
	        Opcode = AArch64ISD::CSINC;

	        if (TrueVal > FalseVal) {
	          Swap = true;
	        }
	      }

	      // Swap TVal and FVal if necessary.
	      if (Swap) {
	        std::swap(TVal, FVal);
	        std::swap(CTVal, CFVal);
	        CC = ISD::getSetCCInverse(CC, true);
	      }

	      if (Opcode != AArch64ISD::CSEL) {
	        // Drop FVal since we can get its value by simply inverting/negating
	        // TVal.
	        FVal = TVal;
	      }
	    }

	    // Avoid materializing a constant when possible by reusing a known value in
	    // a register. However, don't perform this optimization if the known value
	    // is one, zero or negative one in the case of a CSEL. We can always
	    // materialize these values using CSINC, CSEL and CSINV with wzr/xzr as the
	    // FVal, respectively.
	    ConstantSDNode *RHSVal = dyn_cast<ConstantSDNode>(RHS);
	    if (Opcode == AArch64ISD::CSEL && RHSVal && !RHSVal->isOne() &&
	        !RHSVal->isNullValue() && !RHSVal->isAllOnesValue()) {
	      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	      // Transform "a == C ? C : x" to "a == C ? a : x" and "a != C ? x : C" to
	      // "a != C ? x : a" to avoid materializing C.
	      if (CTVal && CTVal == RHSVal && AArch64CC == AArch64CC::EQ)
	        TVal = LHS;
	      else if (CFVal && CFVal == RHSVal && AArch64CC == AArch64CC::NE)
	        FVal = LHS;
	    } else if (Opcode == AArch64ISD::CSNEG && RHSVal && RHSVal->isOne()) {
	      assert (CTVal && CFVal && "Expected constant operands for CSNEG.");
	      // Use a CSINV to transform "a == C ? 1 : -1" to "a == C ? a : -1" to
	      // avoid materializing C.
	      AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	      if (CTVal == RHSVal && AArch64CC == AArch64CC::EQ) {
	        Opcode = AArch64ISD::CSINV;
	        TVal = LHS;
	        FVal = DAG.getConstant(0, dl, FVal.getValueType());
	      }
	    }

	    SDValue CCVal;
	    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);

	    EVT VT = TVal.getValueType();
	    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
	  }

	  // Now we know we're dealing with FP values.
	  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
	  assert(LHS.getValueType() == RHS.getValueType());
	  EVT VT = TVal.getValueType();
	  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two CSELs to implement.
	  AArch64CC::CondCode CC1, CC2;
	  changeFPCCToAArch64CC(CC, CC1, CC2);

	  if (DAG.getTarget().Options.UnsafeFPMath) {
	    // Transform "a == 0.0 ? 0.0 : x" to "a == 0.0 ? a : x" and
	    // "a != 0.0 ? x : 0.0" to "a != 0.0 ? x : a" to avoid materializing 0.0.
	    ConstantFPSDNode *RHSVal = dyn_cast<ConstantFPSDNode>(RHS);
	    if (RHSVal && RHSVal->isZero()) {
	      ConstantFPSDNode *CFVal = dyn_cast<ConstantFPSDNode>(FVal);
	      ConstantFPSDNode *CTVal = dyn_cast<ConstantFPSDNode>(TVal);

	      if ((CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ) &&
	          CTVal && CTVal->isZero() && TVal.getValueType() == LHS.getValueType())
	        TVal = LHS;
	      else if ((CC == ISD::SETNE || CC == ISD::SETONE || CC == ISD::SETUNE) &&
	               CFVal && CFVal->isZero() &&
	               FVal.getValueType() == LHS.getValueType())
	        FVal = LHS;
	    }
	  }

	  // Emit first, and possibly only, CSEL.
	  SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
	  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);

	  // If we need a second CSEL, emit it, using the output of the first as the
	  // RHS. We're effectively OR'ing the two CC's together.
	  if (CC2 != AArch64CC::AL) {
	    SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
	    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
	  }

	  // Otherwise, return the output of the first CSEL.
	  return CS1;
	}

	/// Lower an ISD::SELECT_CC node by unpacking its operands and delegating to
	/// the operand-wise LowerSELECT_CC overload.
	///
	/// Operand layout read here: 0 = LHS, 1 = RHS, 2 = true value,
	/// 3 = false value, 4 = condition code.
	SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  const ISD::CondCode Pred = cast<CondCodeSDNode>(Op.getOperand(4))->get();
	  return LowerSELECT_CC(Pred, /*LHS=*/Op.getOperand(0),
	                        /*RHS=*/Op.getOperand(1), /*TVal=*/Op.getOperand(2),
	                        /*FVal=*/Op.getOperand(3), DL, DAG);
	}

	/// Lower ISD::SELECT (operands: condition, true value, false value).
	///
	/// When the condition is the overflow result (result number 1) of a
	/// {s|u}{add|sub|mul}.with.overflow node of legal type, a CSEL on the flags
	/// produced by getAArch64XALUOOp is emitted directly; an illegal type yields
	/// an empty SDValue. Otherwise the select is lowered like a SELECT_CC: a
	/// SETCC condition is unpacked, any other condition is compared against
	/// zero with SETNE.
	SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue CCVal = Op->getOperand(0);
	  SDValue TVal = Op->getOperand(1);
	  SDValue FVal = Op->getOperand(2);
	  SDLoc DL(Op);

	  unsigned Opc = CCVal.getOpcode();
	  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
	  // instruction.
	  if (CCVal.getResNo() == 1 &&
	      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
	       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
	    // Only lower legal XALUO ops.
	    if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
	      return SDValue();

	    AArch64CC::CondCode OFCC;
	    SDValue Value, Overflow;
	    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CCVal.getValue(0), DAG);
	    // Named distinctly from the outer CCVal (the select condition) so the
	    // condition-code constant does not shadow it.
	    SDValue OFCCVal = DAG.getConstant(OFCC, DL, MVT::i32);

	    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
	                       OFCCVal, Overflow);
	  }

	  // Lower it the same way as we would lower a SELECT_CC node.
	  ISD::CondCode CC;
	  SDValue LHS, RHS;
	  if (CCVal.getOpcode() == ISD::SETCC) {
	    LHS = CCVal.getOperand(0);
	    RHS = CCVal.getOperand(1);
	    CC = cast<CondCodeSDNode>(CCVal->getOperand(2))->get();
	  } else {
	    // Arbitrary condition value: select on "CCVal != 0".
	    LHS = CCVal;
	    RHS = DAG.getConstant(0, DL, CCVal.getValueType());
	    CC = ISD::SETNE;
	  }
	  return LowerSELECT_CC(CC, LHS, RHS, TVal, FVal, DL, DAG);
	}

	/// Lower a JumpTable node to the address of the jump table.
	///
	/// Uses getAddrLarge for the large code model on non-MachO targets and
	/// getAddr in every other case.
	SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  // Jump table entries as PC relative offsets. No additional tweaking
	  // is necessary here. Just get the address of the jump table.
	  JumpTableSDNode *Table = cast<JumpTableSDNode>(Op);

	  const bool UseLargeAddr =
	      getTargetMachine().getCodeModel() == CodeModel::Large &&
	      !Subtarget->isTargetMachO();
	  if (UseLargeAddr)
	    return getAddrLarge(Table, DAG);
	  return getAddr(Table, DAG);
	}

	/// Lower a ConstantPool node to the address of the pool entry.
	///
	/// Outside the large code model getAddr is used. Under the large code model
	/// MachO targets go through the GOT (getGOT) and all others use
	/// getAddrLarge.
	SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  ConstantPoolSDNode *Pool = cast<ConstantPoolSDNode>(Op);

	  if (getTargetMachine().getCodeModel() != CodeModel::Large)
	    return getAddr(Pool, DAG);

	  // Use the GOT for the large code model on iOS.
	  if (Subtarget->isTargetMachO())
	    return getGOT(Pool, DAG);

	  return getAddrLarge(Pool, DAG);
	}

	/// Lower a BlockAddress node to the address of the basic block.
	///
	/// getAddrLarge is used for the large code model on non-MachO targets;
	/// everything else uses getAddr.
	SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  BlockAddressSDNode *Block = cast<BlockAddressSDNode>(Op);

	  const bool UseLargeAddr =
	      getTargetMachine().getCodeModel() == CodeModel::Large &&
	      !Subtarget->isTargetMachO();
	  if (UseLargeAddr)
	    return getAddrLarge(Block, DAG);
	  return getAddr(Block, DAG);
	}

	/// Lower VASTART for Darwin: store the frame index recorded as the varargs
	/// stack index to the address given by operand 1.
	///
	/// Operand 0 is the chain, operand 1 the destination address and operand 2
	/// the source value used for the store's MachinePointerInfo.
	SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
	                                                   SelectionDAG &DAG) const {
	  AArch64FunctionInfo *FuncInfo =
	      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

	  SDLoc DL(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue VarArgsAddr =
	      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);

	  SDValue Chain = Op.getOperand(0);
	  SDValue Dest = Op.getOperand(1);
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  return DAG.getStore(Chain, DL, VarArgsAddr, Dest,
	                      MachinePointerInfo(SrcVal));
	}

	/// Lower VASTART for the Win64 calling convention: store a single frame
	/// address to the address given by operand 1.
	///
	/// The stored frame index is the varargs GPR index when the recorded GPR
	/// save size is positive, and the varargs stack index otherwise.
	SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  AArch64FunctionInfo *FuncInfo =
	      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();

	  SDLoc DL(Op);
	  const bool HasGPRSaveArea = FuncInfo->getVarArgsGPRSize() > 0;
	  const auto VarArgsFI = HasGPRSaveArea ? FuncInfo->getVarArgsGPRIndex()
	                                        : FuncInfo->getVarArgsStackIndex();
	  SDValue VarArgsAddr =
	      DAG.getFrameIndex(VarArgsFI, getPointerTy(DAG.getDataLayout()));

	  SDValue Chain = Op.getOperand(0);
	  SDValue Dest = Op.getOperand(1);
	  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  return DAG.getStore(Chain, DL, VarArgsAddr, Dest,
	                      MachinePointerInfo(SrcVal));
	}

	/// Lower VASTART for the AAPCS va_list by initialising its fields.
	///
	/// One store is emitted per field (__stack, __gr_top, __vr_top, __gr_offs,
	/// __vr_offs). Every store uses the incoming chain, and the stores are
	/// joined by a TokenFactor, which is the returned value. __gr_top and
	/// __vr_top are written only when the GPR / FPR varargs size recorded in
	/// AArch64FunctionInfo is positive.
	SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
	SelectionDAG &DAG) const {
	// The layout of the va_list struct is specified in the AArch64 Procedure Call
	// Standard, section B.3.
	MachineFunction &MF = DAG.getMachineFunction();
	AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
	auto PtrVT = getPointerTy(DAG.getDataLayout());
	SDLoc DL(Op);

	// Operand 0 is the chain, operand 1 the address of the va_list object and
	// operand 2 the source value used for the stores' MachinePointerInfo.
	SDValue Chain = Op.getOperand(0);
	SDValue VAList = Op.getOperand(1);
	const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	SmallVector<SDValue, 4> MemOps;

	// void *__stack at offset 0
	SDValue Stack = DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), PtrVT);
	MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
	MachinePointerInfo(SV), /* Alignment = */ 8));

	// void *__gr_top at offset 8
	int GPRSize = FuncInfo->getVarArgsGPRSize();
	if (GPRSize > 0) {
	SDValue GRTop, GRTopAddr;

	GRTopAddr =
	DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(8, DL, PtrVT));

	// The stored value is the GPR varargs frame address plus GPRSize.
	GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), PtrVT);
	GRTop = DAG.getNode(ISD::ADD, DL, PtrVT, GRTop,
	DAG.getConstant(GPRSize, DL, PtrVT));

	MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
	MachinePointerInfo(SV, 8),
	/* Alignment = */ 8));
	}

	// void *__vr_top at offset 16
	int FPRSize = FuncInfo->getVarArgsFPRSize();
	if (FPRSize > 0) {
	SDValue VRTop, VRTopAddr;
	VRTopAddr = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	DAG.getConstant(16, DL, PtrVT));

	// The stored value is the FPR varargs frame address plus FPRSize.
	VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), PtrVT);
	VRTop = DAG.getNode(ISD::ADD, DL, PtrVT, VRTop,
	DAG.getConstant(FPRSize, DL, PtrVT));

	MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
	MachinePointerInfo(SV, 16),
	/* Alignment = */ 8));
	}

	// int __gr_offs at offset 24
	// Stored as the negated GPR size (an i32), unconditionally.
	SDValue GROffsAddr =
	DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(24, DL, PtrVT));
	MemOps.push_back(DAG.getStore(
	Chain, DL, DAG.getConstant(-GPRSize, DL, MVT::i32), GROffsAddr,
	MachinePointerInfo(SV, 24), /* Alignment = */ 4));

	// int __vr_offs at offset 28
	// Stored as the negated FPR size (an i32), unconditionally.
	SDValue VROffsAddr =
	DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getConstant(28, DL, PtrVT));
	MemOps.push_back(DAG.getStore(
	Chain, DL, DAG.getConstant(-FPRSize, DL, MVT::i32), VROffsAddr,
	MachinePointerInfo(SV, 28), /* Alignment = */ 4));

	// Merge the independent stores into a single output chain.
	return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Dispatch VASTART lowering to the ABI-specific implementation.
	///
	/// The Win64 calling convention is checked first, then Darwin; everything
	/// else uses the AAPCS lowering.
	SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const auto CallConv = MF.getFunction()->getCallingConv();

	  if (Subtarget->isCallingConvWin64(CallConv))
	    return LowerWin64_VASTART(Op, DAG);

	  if (Subtarget->isTargetDarwin())
	    return LowerDarwin_VASTART(Op, DAG);

	  return LowerAAPCS_VASTART(Op, DAG);
	}

	/// Lower VACOPY to a memcpy of the whole va_list object.
	///
	/// Operand 0 is the chain, operands 1 and 2 are the destination and source
	/// addresses, and operands 3 and 4 carry the destination and source values
	/// used for the MachinePointerInfo of the copy. The copy is 8 bytes on
	/// Darwin and Windows targets and 32 bytes otherwise, with alignment 8.
	SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  // AAPCS has three pointers and two ints (= 32 bytes), Darwin and Windows
	  // have a single pointer.
	  SDLoc DL(Op);
	  unsigned VaListSize =
	      (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) ? 8 : 32;
	  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

	  return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1),
	                       Op.getOperand(2),
	                       DAG.getConstant(VaListSize, DL, MVT::i32),
	                       8, false, false, false, MachinePointerInfo(DestSV),
	                       MachinePointerInfo(SrcSV));
	}

	/// Lower VAARG for Darwin (asserted), where the va_list is a single pointer.
	///
	/// Loads the current argument pointer from operand 1, rounds it up when the
	/// alignment in operand 3 exceeds 8, stores back the pointer advanced by the
	/// argument's slot size, and finally loads the argument itself. Scalar
	/// integers use an 8-byte slot; scalar floating-point types other than f64
	/// also use an 8-byte slot, are loaded as f64 and rounded to the requested
	/// type.
	///
	/// \returns the loaded value; on the FP-rounding path, the rounded value
	/// merged with the load's chain result.
	SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget->isTargetDarwin() &&
	         "automatic va_arg instruction only works on Darwin");

	  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  SDValue Chain = Op.getOperand(0);
	  SDValue Addr = Op.getOperand(1);
	  unsigned Align = Op.getConstantOperandVal(3);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  SDValue VAList = DAG.getLoad(PtrVT, DL, Chain, Addr, MachinePointerInfo(V));
	  Chain = VAList.getValue(1);

	  if (Align > 8) {
	    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
	    // Round the pointer up: (VAList + Align - 1) & -Align.
	    VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                         DAG.getConstant(Align - 1, DL, PtrVT));
	    VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
	                         DAG.getConstant(-(int64_t)Align, DL, PtrVT));
	  }

	  // getTypeForEVT takes the LLVMContext by reference and returns a Type
	  // pointer, which is what getTypeAllocSize expects.
	  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
	  uint64_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);

	  // Scalar integer and FP values smaller than 64 bits are implicitly extended
	  // up to 64 bits. At the very least, we have to increase the striding of the
	  // vaargs list to match this, and for FP values we need to introduce
	  // FP_ROUND nodes as well.
	  if (VT.isInteger() && !VT.isVector())
	    ArgSize = 8;
	  bool NeedFPTrunc = false;
	  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
	    ArgSize = 8;
	    NeedFPTrunc = true;
	  }

	  // Increment the pointer, VAList, to the next vaarg
	  SDValue VANext = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
	                               DAG.getConstant(ArgSize, DL, PtrVT));
	  // Store the incremented VAList to the legalized pointer
	  SDValue APStore =
	      DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V));

	  // Load the actual argument out of the pointer VAList
	  if (NeedFPTrunc) {
	    // Load the value as an f64.
	    SDValue WideFP =
	        DAG.getLoad(MVT::f64, DL, APStore, VAList, MachinePointerInfo());
	    // Round the value down to the requested narrower FP type.
	    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
	                                   DAG.getIntPtrConstant(1, DL));
	    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
	    // Merge the rounded value with the chain output of the load.
	    return DAG.getMergeValues(Ops, DL);
	  }

	  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo());
	}

	/// Lower FRAMEADDR: return the frame address Depth frames up.
	///
	/// Marks the frame address as taken, starts from a copy of the FP register
	/// and performs one load through the current frame address per level of
	/// depth given by the constant operand 0.
	SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setFrameAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
	  // Each iteration loads the next frame address from the current one.
	  for (unsigned Level = 0; Level != Depth; ++Level)
	    Frame = DAG.getLoad(VT, DL, DAG.getEntryNode(), Frame,
	                        MachinePointerInfo());
	  return Frame;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	/// Map a register name to its register number.
	///
	/// Only "sp", "x18" and "w18" are recognised, and x18/w18 are accepted only
	/// when the subtarget reports X18 as reserved. Any other name ends in
	/// report_fatal_error, so the function only returns for a valid register.
	unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                                  SelectionDAG &DAG) const {
	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                     .Case("sp", AArch64::SP)
	                     .Case("x18", AArch64::X18)
	                     .Case("w18", AArch64::W18)
	                     .Default(0);
	  // X18/W18 may only be named when the subtarget reserves X18.
	  if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
	      !Subtarget->isX18Reserved())
	    Reg = 0;
	  if (Reg)
	    return Reg;
	  report_fatal_error(Twine("Invalid register name \""
	                           + StringRef(RegName) + "\"."));
	}

	/// Lower RETURNADDR: produce the return address Depth frames up.
	///
	/// Marks the return address as taken. For depth 0 the value is a copy of LR,
	/// which is added as a live-in. For a non-zero depth the frame address is
	/// obtained via LowerFRAMEADDR and the return address is loaded from that
	/// address plus 8.
	SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MF.getFrameInfo().setReturnAddressIsTaken(true);

	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  const unsigned Depth =
	      cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();

	  if (Depth == 0) {
	    // Return LR, which contains the return address. Mark it an implicit
	    // live-in.
	    unsigned LRReg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
	    return DAG.getCopyFromReg(DAG.getEntryNode(), DL, LRReg, VT);
	  }

	  SDValue Frame = LowerFRAMEADDR(Op, DAG);
	  SDValue SlotOffset =
	      DAG.getConstant(8, DL, getPointerTy(DAG.getDataLayout()));
	  SDValue SlotAddr = DAG.getNode(ISD::ADD, DL, VT, Frame, SlotOffset);
	  return DAG.getLoad(VT, DL, DAG.getEntryNode(), SlotAddr,
	                     MachinePointerInfo());
	}

	/// LowerShiftRightParts - Lower SRA_PARTS and SRL_PARTS, which return two
	/// i64 values and take a 2 x i64 value to shift plus a shift amount.
	///
	/// Operands are (low half, high half, shift amount). Both result halves are
	/// selected with CSEL between a "normal" form (shift amount below the
	/// register width) and a "big" form (shift amount at or above it); the
	/// returned value merges {Lo, Hi}.
	SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  EVT VT = Op.getValueType();
	  unsigned VTBits = VT.getSizeInBits();
	  SDLoc dl(Op);
	  SDValue ShOpLo = Op.getOperand(0);
	  SDValue ShOpHi = Op.getOperand(1);
	  SDValue ShAmt = Op.getOperand(2);
	  // Arithmetic shift for SRA_PARTS, logical shift for SRL_PARTS.
	  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;

	  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);

	  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	                                 DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	  SDValue HiBitsForLo = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);

	  // Unfortunately, if ShAmt == 0, we just calculated "(SHL ShOpHi, 64)" which
	  // is "undef". We wanted 0, so CSEL it directly.
	  SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	                               ISD::SETEQ, dl, DAG);
	  SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	  HiBitsForLo =
	      DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	                  HiBitsForLo, CCVal, Cmp);

	  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	                                   DAG.getConstant(VTBits, dl, MVT::i64));

	  SDValue LoBitsForLo = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
	  SDValue LoForNormalShift =
	      DAG.getNode(ISD::OR, dl, VT, LoBitsForLo, HiBitsForLo);

	  // ExtraShAmt >= 0 means the shift amount is at least the register width;
	  // Cmp/CCVal are reused for both the Lo and the Hi select below.
	  Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	                       dl, DAG);
	  CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	  SDValue LoForBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
	  SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	                           LoForNormalShift, CCVal, Cmp);

	  // AArch64 shifts larger than the register width are wrapped rather than
	  // clamped, so we can't just emit "hi >> x".
	  SDValue HiForNormalShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
	  SDValue HiForBigShift =
	      Opc == ISD::SRA
	          ? DAG.getNode(Opc, dl, VT, ShOpHi,
	                        DAG.getConstant(VTBits - 1, dl, MVT::i64))
	          : DAG.getConstant(0, dl, VT);
	  SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	                           HiForNormalShift, CCVal, Cmp);

	  SDValue Ops[2] = { Lo, Hi };
	  return DAG.getMergeValues(Ops, dl);
	}

	/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
	/// i64 values and takes a 2 x i64 value to shift plus a shift amount.
	///
	/// Operands are (low half, high half, shift amount). Both result halves are
	/// selected with CSEL between a "normal" form and a "big" form, keyed on
	/// whether (ShAmt - VTBits) >= 0; the returned value merges {Lo, Hi}.
	SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
	SelectionDAG &DAG) const {
	assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	EVT VT = Op.getValueType();
	unsigned VTBits = VT.getSizeInBits();
	SDLoc dl(Op);
	SDValue ShOpLo = Op.getOperand(0);
	SDValue ShOpHi = Op.getOperand(1);
	SDValue ShAmt = Op.getOperand(2);

	assert(Op.getOpcode() == ISD::SHL_PARTS);
	// RevShAmt = VTBits - ShAmt: how far the low half must be shifted right to
	// obtain the bits that move into the high half.
	SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
	DAG.getConstant(VTBits, dl, MVT::i64), ShAmt);
	SDValue LoBitsForHi = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);

	// Unfortunately, if ShAmt == 0, we just calculated "(SRL ShOpLo, 64)" which
	// is "undef". We wanted 0, so CSEL it directly.
	SDValue Cmp = emitComparison(ShAmt, DAG.getConstant(0, dl, MVT::i64),
	ISD::SETEQ, dl, DAG);
	SDValue CCVal = DAG.getConstant(AArch64CC::EQ, dl, MVT::i32);
	LoBitsForHi =
	DAG.getNode(AArch64ISD::CSEL, dl, VT, DAG.getConstant(0, dl, MVT::i64),
	LoBitsForHi, CCVal, Cmp);

	// ExtraShAmt = ShAmt - VTBits: the shift applied to the low half when the
	// shift amount is at least the register width.
	SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
	DAG.getConstant(VTBits, dl, MVT::i64));
	SDValue HiBitsForHi = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
	SDValue HiForNormalShift =
	DAG.getNode(ISD::OR, dl, VT, LoBitsForHi, HiBitsForHi);

	SDValue HiForBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);

	// Cmp/CCVal are overwritten here and reused for both the Hi and the Lo
	// select below.
	Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, dl, MVT::i64), ISD::SETGE,
	dl, DAG);
	CCVal = DAG.getConstant(AArch64CC::GE, dl, MVT::i32);
	SDValue Hi = DAG.getNode(AArch64ISD::CSEL, dl, VT, HiForBigShift,
	HiForNormalShift, CCVal, Cmp);

	// AArch64 shifts of larger than register sizes are wrapped rather than
	// clamped, so we can't just emit "lo << a" if a is too big.
	SDValue LoForBigShift = DAG.getConstant(0, dl, VT);
	SDValue LoForNormalShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
	SDValue Lo = DAG.getNode(AArch64ISD::CSEL, dl, VT, LoForBigShift,
	LoForNormalShift, CCVal, Cmp);

	SDValue Ops[2] = { Lo, Hi };
	return DAG.getMergeValues(Ops, dl);
	}

	/// Report whether an offset may be folded into the given global address.
	/// Always returns false; the GA argument is not inspected.
	bool AArch64TargetLowering::isOffsetFoldingLegal(
	const GlobalAddressSDNode *GA) const {
	// The AArch64 target doesn't support folding offsets into global addresses.
	return false;
	}

	/// Return true if \p Imm is a legal floating-point immediate of type \p VT.
	///
	/// Positive zero is accepted for f32 and f64. Otherwise an f64 / f32 value
	/// is legal exactly when AArch64_AM::getFP64Imm / getFP32Imm does not return
	/// -1. Every other type is rejected.
	bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
	  // FIXME: We should be able to handle f128 as well with a clever lowering.
	  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
	    return true;

	  if (VT == MVT::f64)
	    return AArch64_AM::getFP64Imm(Imm) != -1;
	  else if (VT == MVT::f32)
	    return AArch64_AM::getFP32Imm(Imm) != -1;
	  return false;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Build the initial estimate node \p Opcode for \p Operand.
	///
	/// Returns an empty SDValue unless the subtarget has NEON and the operand
	/// type is one of f32, v1f32, v2f32, v4f32, f64, v1f64 or v2f64. When
	/// \p ExtraSteps is ReciprocalEstimate::Unspecified it is set to the default
	/// number of refinement steps: 3 for double-precision elements and 2 for
	/// single-precision elements.
	static SDValue getEstimate(const AArch64Subtarget *ST, unsigned Opcode,
	                           SDValue Operand, SelectionDAG &DAG,
	                           int &ExtraSteps) {
	  EVT VT = Operand.getValueType();
	  if (ST->hasNEON() &&
	      (VT == MVT::f64 || VT == MVT::v1f64 || VT == MVT::v2f64 ||
	       VT == MVT::f32 || VT == MVT::v1f32 ||
	       VT == MVT::v2f32 || VT == MVT::v4f32)) {
	    if (ExtraSteps == TargetLoweringBase::ReciprocalEstimate::Unspecified)
	      // For the reciprocal estimates, convergence is quadratic, so the number
	      // of digits is doubled after each iteration. In ARMv8, the accuracy of
	      // the initial estimate is 2^-8. Thus the number of extra steps to refine
	      // the result for float (23 mantissa bits) is 2 and for double (52
	      // mantissa bits) is 3.
	      // Compare the element type so that vectors of doubles (v1f64, v2f64)
	      // get the same three steps as scalar f64.
	      ExtraSteps = VT.getScalarType() == MVT::f64 ? 3 : 2;

	    return DAG.getNode(Opcode, SDLoc(Operand), VT, Operand);
	  }

	  return SDValue();
	}

	/// Build an estimate of sqrt(Operand), or of 1/sqrt(Operand) when
	/// \p Reciprocal is set, from FRSQRTE plus refinement steps.
	///
	/// The estimate is produced only when \p Enabled is
	/// ReciprocalEstimate::Enabled, or it is Unspecified and the subtarget
	/// reports useRSqrt(), and getEstimate accepts the operand type; otherwise
	/// an empty SDValue is returned and \p ExtraSteps is left as getEstimate set
	/// it. On success all refinement steps are emitted here and \p ExtraSteps is
	/// reset to 0. \p UseOneConst is not touched by this implementation.
	SDValue AArch64TargetLowering::getSqrtEstimate(SDValue Operand,
	                                               SelectionDAG &DAG, int Enabled,
	                                               int &ExtraSteps,
	                                               bool &UseOneConst,
	                                               bool Reciprocal) const {
	  if (Enabled == ReciprocalEstimate::Enabled ||
	      (Enabled == ReciprocalEstimate::Unspecified && Subtarget->useRSqrt())) {
	    if (SDValue Estimate = getEstimate(Subtarget, AArch64ISD::FRSQRTE, Operand,
	                                       DAG, ExtraSteps)) {
	      SDLoc DL(Operand);
	      EVT VT = Operand.getValueType();

	      SDNodeFlags Flags;
	      Flags.setUnsafeAlgebra(true);

	      // Newton reciprocal square root iteration: E * 0.5 * (3 - X * E^2)
	      // AArch64 reciprocal square root iteration instruction: 0.5 * (3 - M * N)
	      for (int i = ExtraSteps; i > 0; --i) {
	        SDValue Step = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Estimate,
	                                   Flags);
	        Step = DAG.getNode(AArch64ISD::FRSQRTS, DL, VT, Operand, Step, Flags);
	        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Estimate, Step, Flags);
	      }

	      if (!Reciprocal) {
	        EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                      VT);
	        SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
	        SDValue Eq = DAG.getSetCC(DL, CCVT, Operand, FPZero, ISD::SETEQ);

	        // sqrt(X) is obtained as X * (1 / sqrt(X)).
	        Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate, Flags);
	        // Correct the result if the operand is 0.0.
	        Estimate = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL,
	                               VT, Eq, Operand, Estimate);
	      }

	      // All refinement steps have been emitted above.
	      ExtraSteps = 0;
	      return Estimate;
	    }
	  }

	  return SDValue();
	}

	/// Build an estimate of 1/Operand from FRECPE plus refinement steps.
	///
	/// Produces a value only when \p Enabled is ReciprocalEstimate::Enabled and
	/// getEstimate accepts the operand type; otherwise returns an empty SDValue.
	/// On success the refinement steps are emitted here and \p ExtraSteps is
	/// reset to 0.
	SDValue AArch64TargetLowering::getRecipEstimate(SDValue Operand,
	                                                SelectionDAG &DAG, int Enabled,
	                                                int &ExtraSteps) const {
	  if (Enabled != ReciprocalEstimate::Enabled)
	    return SDValue();

	  SDValue Approx =
	      getEstimate(Subtarget, AArch64ISD::FRECPE, Operand, DAG, ExtraSteps);
	  if (!Approx)
	    return SDValue();

	  SDLoc DL(Operand);
	  EVT VT = Operand.getValueType();

	  SDNodeFlags Flags;
	  Flags.setUnsafeAlgebra(true);

	  // Newton reciprocal iteration: E * (2 - X * E)
	  // AArch64 reciprocal iteration instruction: (2 - M * N)
	  int Remaining = ExtraSteps;
	  while (Remaining > 0) {
	    SDValue Correction =
	        DAG.getNode(AArch64ISD::FRECPS, DL, VT, Operand, Approx, Flags);
	    Approx = DAG.getNode(ISD::FMUL, DL, VT, Approx, Correction, Flags);
	    --Remaining;
	  }

	  // Every requested refinement step has been emitted.
	  ExtraSteps = 0;
	  return Approx;
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Table of Constraints
	// TODO: This is the current set of constraints supported by ARM for the
	// compiler, not all of them may make sense, e.g. S may be difficult to support.
	//
	// r - A general register
	// w - An FP/SIMD register of some size in the range v0-v31
	// x - An FP/SIMD register of some size in the range v0-v15
	// I - Constant that can be used with an ADD instruction
	// J - Constant that can be used with a SUB instruction
	// K - Constant that can be used with a 32-bit logical instruction
	// L - Constant that can be used with a 64-bit logical instruction
	// M - Constant that can be used as a 32-bit MOV immediate
	// N - Constant that can be used as a 64-bit MOV immediate
	// Q - A memory reference with base register and no offset
	// S - A symbolic address
	// Y - Floating point constant zero
	// Z - Integer constant zero
	//
	// Note that general register operands will be output using their 64-bit x
	// register name, whatever the size of the variable, unless the asm operand
	// is prefixed by the %w modifier. Floating-point and SIMD register operands
	// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
	// %q modifier.
	/// Pick a concrete constraint for an "X" inline-asm operand.
	///
	/// Returns "w" when the subtarget has FP/ARMv8 and the operand type is
	/// floating point or a 64- or 128-bit vector; returns "r" in every other
	/// case, including whenever the subtarget lacks FP/ARMv8.
	const char *AArch64TargetLowering::LowerXConstraint(EVT ConstraintVT) const {
	  // At this point, we have to lower this constraint to something else, so we
	  // lower it to an "r" or "w". However, by doing this we will force the result
	  // to be in register, while the X constraint is much more permissive.
	  //
	  // Although we are correct (we are free to emit anything, without
	  // constraints), we might break use cases that would expect us to be more
	  // efficient and emit something else.
	  if (!Subtarget->hasFPARMv8())
	    return "r";

	  if (ConstraintVT.isFloatingPoint())
	    return "w";

	  if (ConstraintVT.isVector() &&
	      (ConstraintVT.getSizeInBits() == 64 ||
	       ConstraintVT.getSizeInBits() == 128))
	    return "w";

	  return "r";
	}

	/// getConstraintType - Classify an inline-asm constraint for this target.
	///
	/// Single-letter constraints handled here: 'z' is C_Other, 'x' and 'w' are
	/// C_RegisterClass and 'Q' is C_Memory. Anything else is deferred to
	/// TargetLowering::getConstraintType.
	AArch64TargetLowering::ConstraintType
	AArch64TargetLowering::getConstraintType(StringRef Constraint) const {
	  if (Constraint.size() != 1)
	    return TargetLowering::getConstraintType(Constraint);

	  const char Letter = Constraint[0];
	  if (Letter == 'z')
	    return C_Other;
	  if (Letter == 'x' || Letter == 'w')
	    return C_RegisterClass;
	  // An address with a single base register. Due to the way we
	  // currently handle addresses it is the same as 'r'.
	  if (Letter == 'Q')
	    return C_Memory;

	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// Returns CW_Default when the operand has no value. For 'x' and 'w' the
	/// weight is CW_Register if the operand type is floating point or a vector
	/// and CW_Invalid otherwise; 'z' yields CW_Constant; every other constraint
	/// is deferred to TargetLowering::getSingleConstraintMatchWeight.
	TargetLowering::ConstraintWeight
	AArch64TargetLowering::getSingleConstraintMatchWeight(
	    AsmOperandInfo &info, const char *constraint) const {
	  ConstraintWeight weight = CW_Invalid;
	  Value *CallOperandVal = info.CallOperandVal;
	  // If we don't have a value, we can't do a match,
	  // but allow it at the lowest weight.
	  if (!CallOperandVal)
	    return CW_Default;
	  Type *type = CallOperandVal->getType();
	  // Look at the constraint type.
	  switch (*constraint) {
	  default:
	    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	    break;
	  case 'x':
	  case 'w':
	    if (type->isFloatingPointTy() || type->isVectorTy())
	      weight = CW_Register;
	    break;
	  case 'z':
	    weight = CW_Constant;
	    break;
	  }
	  return weight;
	}

	/// Map an inline-asm register constraint to a (register, register class)
	/// pair.
	///
	/// Handled here, in order:
	///  - 'r': GPR64common for 64-bit values, GPR32common otherwise.
	///  - 'w': FPR16/FPR32/FPR64/FPR128 selected by the value size in bits.
	///  - 'x': FPR128_lo, for 128-bit values only.
	///  - "{cc}" (case-insensitive): the NZCV register in CCR.
	///  - everything else: the generic TargetLowering implementation, followed
	///    by a fallback that accepts "{vN}" with N in [0, 31].
	///
	/// A first member of 0 means "any register of the returned class".
	std::pair<unsigned, const TargetRegisterClass *>
	AArch64TargetLowering::getRegForInlineAsmConstraint(
	    const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
	  if (Constraint.size() == 1) {
	    switch (Constraint[0]) {
	    case 'r':
	      if (VT.getSizeInBits() == 64)
	        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
	      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
	    case 'w':
	      if (VT.getSizeInBits() == 16)
	        return std::make_pair(0U, &AArch64::FPR16RegClass);
	      if (VT.getSizeInBits() == 32)
	        return std::make_pair(0U, &AArch64::FPR32RegClass);
	      if (VT.getSizeInBits() == 64)
	        return std::make_pair(0U, &AArch64::FPR64RegClass);
	      if (VT.getSizeInBits() == 128)
	        return std::make_pair(0U, &AArch64::FPR128RegClass);
	      break;
	    // The instructions that this constraint is designed for can
	    // only take 128-bit registers so just use that regclass.
	    case 'x':
	      if (VT.getSizeInBits() == 128)
	        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
	      break;
	    }
	  }
	  if (StringRef("{cc}").equals_lower(Constraint))
	    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);

	  // Use the default implementation in TargetLowering to convert the register
	  // constraint into a member of a register class.
	  std::pair<unsigned, const TargetRegisterClass *> Res;
	  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	  // Not found as a standard register?
	  if (!Res.second) {
	    unsigned Size = Constraint.size();
	    // Accept "{vN}" / "{vNN}" (either case of 'v'). The character is cast to
	    // unsigned char because passing a negative char to tolower is undefined.
	    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
	        tolower(static_cast<unsigned char>(Constraint[1])) == 'v' &&
	        Constraint[Size - 1] == '}') {
	      int RegNo;
	      // getAsInteger returns true on failure.
	      bool Failed = Constraint.slice(2, Size - 1).getAsInteger(10, RegNo);
	      if (!Failed && RegNo >= 0 && RegNo <= 31) {
	        // v0 - v31 are aliases of q0 - q31 or d0 - d31 depending on size.
	        // By default we'll emit v0-v31 for this unless there's a modifier where
	        // we'll emit the correct register as well.
	        if (VT != MVT::Other && VT.getSizeInBits() == 64) {
	          Res.first = AArch64::FPR64RegClass.getRegister(RegNo);
	          Res.second = &AArch64::FPR64RegClass;
	        } else {
	          Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
	          Res.second = &AArch64::FPR128RegClass;
	        }
	      }
	    }
	  }

	  return Res;
	}

	/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
	/// vector. If it is invalid, don't add anything to Ops.
	///
	/// Only single-letter constraints are handled. 'z' requires a null constant
	/// and becomes XZR/WZR; 'I', 'J', 'K', 'L', 'M' and 'N' require a
	/// ConstantSDNode whose value passes the per-letter range check and become a
	/// 64-bit target constant. Any other letter is forwarded to the generic
	/// TargetLowering implementation.
	void AArch64TargetLowering::LowerAsmOperandForConstraint(
	    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
	    SelectionDAG &DAG) const {
	  SDValue Result;

	  // Currently only support length 1 constraints.
	  if (Constraint.length() != 1)
	    return;

	  char ConstraintLetter = Constraint[0];
	  switch (ConstraintLetter) {
	  default:
	    break;

	  // This set of constraints deal with valid constants for various instructions.
	  // Validate and return a target constant for them if we can.
	  case 'z': {
	    // 'z' maps to xzr or wzr so it needs an input of 0.
	    if (!isNullConstant(Op))
	      return;

	    if (Op.getValueType() == MVT::i64)
	      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
	    else
	      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
	    break;
	  }

	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'M':
	  case 'N':
	    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	    if (!C)
	      return;

	    // Grab the value and do some validation.
	    uint64_t CVal = C->getZExtValue();
	    switch (ConstraintLetter) {
	    // The I constraint applies only to simple ADD or SUB immediate operands:
	    // i.e. 0 to 4095 with optional shift by 12
	    // The J constraint applies only to ADD or SUB immediates that would be
	    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
	    // instruction [or vice versa], in other words -1 to -4095 with optional
	    // left shift by 12.
	    case 'I':
	      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
	        break;
	      return;
	    case 'J': {
	      // Negate in unsigned arithmetic: negating INT64_MIN as a signed value
	      // would be undefined behaviour, while the unsigned result is identical
	      // for every other input.
	      uint64_t NVal = -static_cast<uint64_t>(C->getSExtValue());
	      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal)) {
	        CVal = C->getSExtValue();
	        break;
	      }
	      return;
	    }
	    // The K and L constraints apply only to logical immediates, including
	    // what used to be the MOVI alias for ORR (though the MOVI alias has now
	    // been removed and MOV should be used). So these constraints have to
	    // distinguish between bit patterns that are valid 32-bit or 64-bit
	    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
	    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
	    // versa.
	    case 'K':
	      if (AArch64_AM::isLogicalImmediate(CVal, 32))
	        break;
	      return;
	    case 'L':
	      if (AArch64_AM::isLogicalImmediate(CVal, 64))
	        break;
	      return;
	    // The M and N constraints are a superset of K and L respectively, for use
	    // with the MOV (immediate) alias. As well as the logical immediates they
	    // also match 32 or 64-bit immediates that can be loaded either using a
	    // single MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
	    // (M) or 64-bit 0x1234000000000000 (N) etc.
	    // As a note some of this code is liberally stolen from the asm parser.
	    case 'M': {
	      if (!isUInt<32>(CVal))
	        return;
	      if (AArch64_AM::isLogicalImmediate(CVal, 32))
	        break;
	      // Value fits entirely in one 16-bit chunk.
	      if ((CVal & 0xFFFF) == CVal)
	        break;
	      if ((CVal & 0xFFFF0000ULL) == CVal)
	        break;
	      // Same test on the 32-bit complement.
	      uint64_t NCVal = ~(uint32_t)CVal;
	      if ((NCVal & 0xFFFFULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF0000ULL) == NCVal)
	        break;
	      return;
	    }
	    case 'N': {
	      if (AArch64_AM::isLogicalImmediate(CVal, 64))
	        break;
	      // Value fits entirely in one 16-bit chunk.
	      if ((CVal & 0xFFFFULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF0000ULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF00000000ULL) == CVal)
	        break;
	      if ((CVal & 0xFFFF000000000000ULL) == CVal)
	        break;
	      // Same test on the 64-bit complement.
	      uint64_t NCVal = ~CVal;
	      if ((NCVal & 0xFFFFULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF0000ULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
	        break;
	      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
	        break;
	      return;
	    }
	    default:
	      return;
	    }

	    // All assembler immediates are 64-bit integers.
	    Result = DAG.getTargetConstant(CVal, SDLoc(Op), MVT::i64);
	    break;
	  }

	  if (Result.getNode()) {
	    Ops.push_back(Result);
	    return;
	  }

	  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	//===----------------------------------------------------------------------===//
	// AArch64 Advanced SIMD Support
	//===----------------------------------------------------------------------===//

	/// WidenVector - Given a value in the V64 register class, produce the
	/// equivalent value in the V128 register class. The result has the same
	/// element type and twice as many elements; the input is inserted at index 0
	/// of an UNDEF vector of the wide type.
	static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
	  SDLoc DL(V64Reg);
	  const EVT NarrowVT = V64Reg.getValueType();
	  const MVT ScalarTy = NarrowVT.getVectorElementType().getSimpleVT();
	  const MVT WideVT =
	      MVT::getVectorVT(ScalarTy, 2 * NarrowVT.getVectorNumElements());

	  SDValue Undef = DAG.getUNDEF(WideVT);
	  SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef, V64Reg,
	                     ZeroIdx);
	}

	/// getExtFactor - Determine the adjustment factor for the position when
	/// generating an "extract from vector registers" instruction. The factor is
	/// the element size of \p V's vector type in bytes.
	static unsigned getExtFactor(SDValue &V) {
	  const unsigned EltBits =
	      V.getValueType().getVectorElementType().getSizeInBits();
	  return EltBits / 8;
	}

	/// NarrowVector - Given a value in the V128 register class, produce the
	/// equivalent value in the V64 register class. The result has the same
	/// element type and half as many elements, obtained as the dsub subregister.
	static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
	  SDLoc DL(V128Reg);
	  const EVT WideVT = V128Reg.getValueType();
	  const MVT ScalarTy = WideVT.getVectorElementType().getSimpleVT();
	  const MVT HalfVT =
	      MVT::getVectorVT(ScalarTy, WideVT.getVectorNumElements() / 2);

	  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, HalfVT, V128Reg);
	}

	// Gather data to see if the operation can be modelled as a
	// shuffle in combination with VEXTs.
	//
	// Op must be a BUILD_VECTOR. Every non-undef operand has to be an
	// EXTRACT_VECTOR_ELT with a constant index, and at most two distinct source
	// vectors may be involved. On success the result is a VECTOR_SHUFFLE of the
	// (possibly padded, extracted or bitcast) sources, bitcast back to Op's
	// type; otherwise an empty SDValue is returned.
	SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  unsigned NumElts = VT.getVectorNumElements();

	  struct ShuffleSourceInfo {
	    SDValue Vec;
	    unsigned MinElt;
	    unsigned MaxElt;

	    // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
	    // be compatible with the shuffle we intend to construct. As a result
	    // ShuffleVec will be some sliding window into the original Vec.
	    SDValue ShuffleVec;

	    // Code should guarantee that element i in Vec starts at element "WindowBase
	    // + i * WindowScale in ShuffleVec".
	    int WindowBase;
	    int WindowScale;

	    ShuffleSourceInfo(SDValue Vec)
	        : Vec(Vec), MinElt(std::numeric_limits<unsigned>::max()), MaxElt(0),
	          ShuffleVec(Vec), WindowBase(0), WindowScale(1) {}

	    // Lets find() locate an entry by its original source vector.
	    bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
	  };

	  // First gather all vectors used as an immediate source for this BUILD_VECTOR
	  // node.
	  SmallVector<ShuffleSourceInfo, 2> Sources;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    SDValue V = Op.getOperand(i);
	    if (V.isUndef())
	      continue;
	    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	             !isa<ConstantSDNode>(V.getOperand(1))) {
	      // A shuffle can only come from building a vector from various
	      // elements of other vectors, provided their indices are constant.
	      return SDValue();
	    }

	    // Add this element source to the list if it's not already there.
	    SDValue SourceVec = V.getOperand(0);
	    auto Source = find(Sources, SourceVec);
	    if (Source == Sources.end())
	      Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));

	    // Update the minimum and maximum lane number seen.
	    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
	    Source->MinElt = std::min(Source->MinElt, EltNo);
	    Source->MaxElt = std::max(Source->MaxElt, EltNo);
	  }

	  // Currently only do something sane when at most two source vectors
	  // are involved.
	  if (Sources.size() > 2)
	    return SDValue();

	  // Find out the smallest element size among result and two sources, and use
	  // it as element size to build the shuffle_vector.
	  EVT SmallestEltTy = VT.getVectorElementType();
	  for (auto &Source : Sources) {
	    EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
	    if (SrcEltTy.bitsLT(SmallestEltTy)) {
	      SmallestEltTy = SrcEltTy;
	    }
	  }
	  // Number of shuffle lanes occupied by one element of the result.
	  unsigned ResMultiplier =
	      VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
	  // From here on NumElts counts lanes of the shuffle type, not of VT.
	  NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
	  EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);

	  // If the source vector is too wide or too narrow, we may nevertheless be able
	  // to construct a compatible shuffle either by concatenating it with UNDEF or
	  // extracting a suitable range of elements.
	  for (auto &Src : Sources) {
	    EVT SrcVT = Src.ShuffleVec.getValueType();

	    if (SrcVT.getSizeInBits() == VT.getSizeInBits())
	      continue;

	    // This stage of the search produces a source with the same element type as
	    // the original, but with a total width matching the BUILD_VECTOR output.
	    EVT EltVT = SrcVT.getVectorElementType();
	    unsigned NumSrcElts = VT.getSizeInBits() / EltVT.getSizeInBits();
	    EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);

	    if (SrcVT.getSizeInBits() < VT.getSizeInBits()) {
	      assert(2 * SrcVT.getSizeInBits() == VT.getSizeInBits());
	      // We can pad out the smaller vector for free, so if it's part of a
	      // shuffle...
	      Src.ShuffleVec =
	          DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
	                      DAG.getUNDEF(Src.ShuffleVec.getValueType()));
	      continue;
	    }

	    assert(SrcVT.getSizeInBits() == 2 * VT.getSizeInBits());

	    if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
	      // Span too large for a VEXT to cope
	      return SDValue();
	    }

	    if (Src.MinElt >= NumSrcElts) {
	      // The extraction can just take the second half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
	      Src.WindowBase = -NumSrcElts;
	    } else if (Src.MaxElt < NumSrcElts) {
	      // The extraction can just take the first half
	      Src.ShuffleVec =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i64));
	    } else {
	      // An actual VEXT is needed
	      SDValue VEXTSrc1 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(0, dl, MVT::i64));
	      SDValue VEXTSrc2 =
	          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
	                      DAG.getConstant(NumSrcElts, dl, MVT::i64));
	      // The EXT immediate is the element offset scaled by getExtFactor.
	      unsigned Imm = Src.MinElt * getExtFactor(VEXTSrc1);

	      Src.ShuffleVec = DAG.getNode(AArch64ISD::EXT, dl, DestVT, VEXTSrc1,
	                                   VEXTSrc2,
	                                   DAG.getConstant(Imm, dl, MVT::i32));
	      Src.WindowBase = -Src.MinElt;
	    }
	  }

	  // Another possible incompatibility occurs from the vector element types. We
	  // can fix this by bitcasting the source vectors to the same type we intend
	  // for the shuffle.
	  for (auto &Src : Sources) {
	    EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
	    if (SrcEltTy == SmallestEltTy)
	      continue;
	    assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
	    Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
	    Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
	    Src.WindowBase *= Src.WindowScale;
	  }

	  // Final sanity check before we try to actually produce a shuffle.
	  DEBUG(
	    for (auto Src : Sources)
	      assert(Src.ShuffleVec.getValueType() == ShuffleVT);
	  );

	  // The stars all align, our next step is to produce the mask for the shuffle.
	  SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
	  int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
	  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
	    SDValue Entry = Op.getOperand(i);
	    if (Entry.isUndef())
	      continue;

	    auto Src = find(Sources, Entry.getOperand(0));
	    int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();

	    // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
	    // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
	    // segment.
	    EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
	    int BitsDefined =
	        std::min(OrigEltTy.getSizeInBits(), VT.getScalarSizeInBits());
	    int LanesDefined = BitsDefined / BitsPerShuffleLane;

	    // This source is expected to fill ResMultiplier lanes of the final shuffle,
	    // starting at the appropriate offset.
	    int *LaneMask = &Mask[i * ResMultiplier];

	    int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
	    // Lanes taken from the second source are numbered after the first
	    // source's NumElts lanes.
	    ExtractBase += NumElts * (Src - Sources.begin());
	    for (int j = 0; j < LanesDefined; ++j)
	      LaneMask[j] = ExtractBase + j;
	  }

	  // Final check before we try to produce nonsense...
	  if (!isShuffleMaskLegal(Mask, ShuffleVT))
	    return SDValue();

	  // Unused source slots stay UNDEF.
	  SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
	  for (unsigned i = 0; i < Sources.size(); ++i)
	    ShuffleOps[i] = Sources[i].ShuffleVec;

	  SDValue Shuffle = DAG.getVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
	                                         ShuffleOps[1], Mask);
	  return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
	}

	// Check whether an EXT instruction can implement shuffle mask M when both
	// shuffle sources are the same vector. On success Imm holds the index of
	// the first element (M[0]). Imm is also written when a later lane fails
	// the check; only an UNDEF first lane leaves it untouched.
	static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
	  const unsigned LaneCount = VT.getVectorNumElements();

	  // The first shuffle index anchors the rotation, so it must be defined.
	  if (M[0] < 0)
	    return false;

	  Imm = M[0];

	  // For a VEXT shuffle every following index is the successor of the
	  // previous one, wrapping back to zero at the end of the vector.
	  unsigned Want = Imm;
	  for (unsigned Pos = 1; Pos < LaneCount; ++Pos) {
	    Want = (Want + 1 == LaneCount) ? 0 : Want + 1;

	    // UNDEF indices match anything.
	    if (M[Pos] >= 0 && static_cast<unsigned>(M[Pos]) != Want)
	      return false;
	  }

	  return true;
	}

	// Check if an EXT instruction can handle the shuffle mask when the
	// vector sources of the shuffle are different.
	//
	// On success Imm receives the EXT element index (already reduced into
	// [0, NumElts)) and ReverseEXT is set to true when the two inputs must be
	// swapped. ReverseEXT is never cleared here, so the caller must initialise
	// it to false.
	static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
	                      unsigned &Imm) {
	  // Look for the first non-undef element.
	  const int *FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });

	  // A mask without any defined element gives nothing to anchor the sequence
	  // on; bail out instead of dereferencing the end iterator below.
	  if (FirstRealElt == M.end())
	    return false;

	  // Benefit from APInt to handle overflow when calculating expected element.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
	  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
	  // The following shuffle indices must be the successive elements after the
	  // first real element.
	  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
	      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
	  if (FirstWrongElt != M.end())
	    return false;

	  // The index of an EXT is the first element if it is not UNDEF.
	  // Watch out for the beginning UNDEFs. The EXT index should be the expected
	  // value of the first element. E.g.
	  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
	  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
	  // ExpectedElt is the last mask index plus 1.
	  Imm = ExpectedElt.getZExtValue();

	  // There are two different cases requiring to reverse input vectors.
	  // For example, for vector <4 x i32> we have the following cases,
	  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
	  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
	  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
	  // to reverse two input vectors.
	  if (Imm < NumElts)
	    ReverseEXT = true;
	  else
	    Imm -= NumElts;

	  return true;
	}

	/// isREVMask - Check if a vector shuffle corresponds to a REV
	/// instruction with the specified blocksize. (The order of the elements
	/// within each block of the vector is reversed.)
	///
	/// \param M          Shuffle mask; negative entries are UNDEF and match
	///                   anything.
	/// \param VT         Vector type of the shuffle.
	/// \param BlockSize  Block width in bits; must be 16, 32 or 64.
	static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
	  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
	         "Only possible block sizes for REV are: 16, 32, 64");

	  unsigned EltSz = VT.getScalarSizeInBits();
	  if (EltSz == 64)
	    return false;

	  unsigned NumElts = VT.getVectorNumElements();
	  // In a reversed block the first index is the last lane of that block, so
	  // M[0] + 1 is the number of elements per block.
	  unsigned BlockElts = M[0] + 1;
	  // If the first shuffle index is UNDEF, be optimistic.
	  if (M[0] < 0)
	    BlockElts = BlockSize / EltSz;

	  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
	    return false;

	  for (unsigned i = 0; i < NumElts; ++i) {
	    if (M[i] < 0)
	      continue; // ignore UNDEF indices
	    // Expected index: start of the block plus the mirrored offset in it.
	    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
	      return false;
	  }

	  return true;
	}

	/// Check whether mask \p M interleaves the two shuffle inputs the way a
	/// ZIP1/ZIP2 pair does, e.g. <0, 4, 1, 5> for four lanes.
	///
	/// \p WhichResult is set to 0 when M[0] is 0 and to 1 otherwise; callers in
	/// this file use it to pick between ZIP1 and ZIP2. Negative (UNDEF) entries
	/// match anything.
	static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  // Lanes are consumed in pairs below. With an odd lane count the loop would
	  // read M[i + 1] past the end and `i != NumElts` would never become false.
	  if (NumElts % 2 != 0)
	    return false;
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// Check whether mask \p M selects every second element of the concatenated
	/// inputs, i.e. lane i reads element 2 * i + WhichResult. \p WhichResult is
	/// set to 0 when M[0] is 0 and to 1 otherwise. Negative (UNDEF) entries
	/// match anything.
	static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned LaneCount = VT.getVectorNumElements();
	  WhichResult = (M[0] != 0) ? 1 : 0;

	  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
	    const int Sel = M[Lane];
	    // UNDEF indices are acceptable anywhere.
	    if (Sel >= 0 && static_cast<unsigned>(Sel) != 2 * Lane + WhichResult)
	      return false;
	  }

	  return true;
	}

	/// Check whether mask \p M matches a TRN1/TRN2 transpose of the two shuffle
	/// inputs: for every even i, lane i reads element i + WhichResult and lane
	/// i + 1 reads element i + NumElts + WhichResult.
	///
	/// \p WhichResult is set to 0 when M[0] is 0 and to 1 otherwise. Negative
	/// (UNDEF) entries match anything.
	static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  // Lanes are consumed in pairs below; an odd lane count would make the last
	  // iteration read M[i + 1] past the end of the mask.
	  if (NumElts % 2 != 0)
	    return false;
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
	///
	/// \p WhichResult is set to 0 when M[0] is 0 and to 1 otherwise. Negative
	/// (UNDEF) entries match anything.
	static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  // Lanes are consumed in pairs below. With an odd lane count the loop would
	  // read M[i + 1] past the end and `i != NumElts` would never become false.
	  if (NumElts % 2 != 0)
	    return false;
	  unsigned Idx = WhichResult * NumElts / 2;
	  for (unsigned i = 0; i != NumElts; i += 2) {
	    // Both lanes of a pair read the same element of the single input.
	    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
	      return false;
	    Idx += 1;
	  }

	  return true;
	}

	/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>.
	/// Both halves of the mask must repeat the same sequence WhichResult,
	/// WhichResult + 2, ...; negative (UNDEF) entries match anything.
	static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  const unsigned HalfLanes = VT.getVectorNumElements() / 2;
	  WhichResult = (M[0] != 0) ? 1 : 0;

	  // Walk both halves in one pass; the expected index restarts at
	  // WhichResult at the beginning of the second half.
	  for (unsigned Pos = 0; Pos != 2 * HalfLanes; ++Pos) {
	    const unsigned Want = WhichResult + 2 * (Pos % HalfLanes);
	    const int Sel = M[Pos];
	    if (Sel >= 0 && static_cast<unsigned>(Sel) != Want)
	      return false;
	  }

	  return true;
	}

	/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
	/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
	/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
	///
	/// \p WhichResult is set to 0 when M[0] is 0 and to 1 otherwise. Negative
	/// (UNDEF) entries match anything.
	static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
	  unsigned NumElts = VT.getVectorNumElements();
	  WhichResult = (M[0] == 0 ? 0 : 1);
	  // Lanes are consumed in pairs below; an odd lane count would make the last
	  // iteration read M[i + 1] past the end of the mask.
	  if (NumElts % 2 != 0)
	    return false;
	  for (unsigned i = 0; i < NumElts; i += 2) {
	    // Both lanes of a pair read the same element of the single input.
	    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
	        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
	      return false;
	  }
	  return true;
	}

	/// Check whether mask \p M is the identity on one of the two inputs in all
	/// lanes but exactly one.
	///
	/// The left input is tested first. On success \p DstIsLeft tells which
	/// input is the mostly-unchanged one and \p Anomaly is the index of the
	/// last lane that did not match it. Lanes equal to -1 count as matching
	/// both inputs. Returns false when the mask length differs from
	/// \p NumInputElements.
	static bool isINSMask(ArrayRef<int> M, int NumInputElements,
	                      bool &DstIsLeft, int &Anomaly) {
	  if (static_cast<size_t>(NumInputElements) != M.size())
	    return false;

	  int LeftHits = 0, RightHits = 0;
	  int LeftOddLane = -1, RightOddLane = -1;

	  for (int Lane = 0; Lane < NumInputElements; ++Lane) {
	    const int Sel = M[Lane];

	    // An undefined lane is compatible with either input.
	    if (Sel == -1) {
	      ++LeftHits;
	      ++RightHits;
	      continue;
	    }

	    if (Sel == Lane)
	      ++LeftHits;
	    else
	      LeftOddLane = Lane;

	    if (Sel == Lane + NumInputElements)
	      ++RightHits;
	    else
	      RightOddLane = Lane;
	  }

	  // Exactly one lane may deviate from the chosen destination vector.
	  const int Needed = NumInputElements - 1;

	  if (LeftHits == Needed) {
	    DstIsLeft = true;
	    Anomaly = LeftOddLane;
	    return true;
	  }

	  if (RightHits == Needed) {
	    DstIsLeft = false;
	    Anomaly = RightOddLane;
	    return true;
	  }

	  return false;
	}

	/// Check whether \p Mask describes a concatenation for a 128-bit result
	/// type \p VT: the low half must be the identity, and lane I of the high
	/// half must equal I, shifted up by another half-vector when \p SplitLHS is
	/// set. UNDEF entries are not accepted.
	static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
	  if (VT.getSizeInBits() != 128)
	    return false;

	  const int LaneCount = VT.getVectorNumElements();
	  const int HalfLanes = LaneCount / 2;
	  // Extra offset applied to the high half of the mask.
	  const int HighShift = SplitLHS ? HalfLanes : 0;

	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    const int Want = (Lane < HalfLanes) ? Lane : Lane + HighShift;
	    if (Mask[Lane] != Want)
	      return false;
	  }

	  return true;
	}

	/// Try to rewrite the VECTOR_SHUFFLE \p Op as a CONCAT_VECTORS of (the low
	/// halves of) its two operands.
	///
	/// Returns an empty SDValue when the element types of the operands differ
	/// from the result's or when the mask is not accepted by isConcatMask.
	/// Operands that are 128 bits wide are first reduced to their low half with
	/// EXTRACT_SUBVECTOR at index 0.
	static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  EVT VT = Op.getValueType();
	  SDValue V0 = Op.getOperand(0);
	  SDValue V1 = Op.getOperand(1);
	  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();

	  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
	      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
	    return SDValue();

	  // A 128-bit first operand contributes only its low half, which shifts the
	  // indices expected in the high half of the mask.
	  bool SplitV0 = V0.getValueSizeInBits() == 128;

	  if (!isConcatMask(Mask, VT, SplitV0))
	    return SDValue();

	  // Type of one half of the result.
	  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);
	  if (SplitV0) {
	    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  if (V1.getValueSizeInBits() == 128) {
	    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
	                     DAG.getConstant(0, DL, MVT::i64));
	  }
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
	}

	/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
	/// the specified operations to build the shuffle.
	///
	/// \param PFEntry  Table entry: bits [29:26] hold the operation, bits
	///                 [25:13] the table index of the left operand and bits
	///                 [12:0] the table index of the right operand.
	/// \param LHS,RHS  The two original shuffle inputs, returned directly by
	///                 OP_COPY entries.
	/// \returns The node computing the shuffle described by \p PFEntry; both
	///          operand entries are expanded recursively through
	///          PerfectShuffleTable.
	static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
	                                      SDValue RHS, SelectionDAG &DAG,
	                                      const SDLoc &dl) {
	  unsigned OpNum = (PFEntry >> 26) & 0x0F;
	  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
	  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);

	  enum {
	    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
	    OP_VREV,
	    OP_VDUP0,
	    OP_VDUP1,
	    OP_VDUP2,
	    OP_VDUP3,
	    OP_VEXT1,
	    OP_VEXT2,
	    OP_VEXT3,
	    OP_VUZPL, // VUZP, left result
	    OP_VUZPR, // VUZP, right result
	    OP_VZIPL, // VZIP, left result
	    OP_VZIPR, // VZIP, right result
	    OP_VTRNL, // VTRN, left result
	    OP_VTRNR // VTRN, right result
	  };

	  if (OpNum == OP_COPY) {
	    // The IDs below spell <0,1,2,3> and <4,5,6,7> as base-9 digit strings.
	    if (LHSID == (1 * 9 + 2) * 9 + 3)
	      return LHS;
	    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
	    return RHS;
	  }

	  SDValue OpLHS, OpRHS;
	  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
	  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
	  EVT VT = OpLHS.getValueType();

	  switch (OpNum) {
	  default:
	    llvm_unreachable("Unknown shuffle opcode!");
	  case OP_VREV:
	    // VREV divides the vector in half and swaps within the half.
	    if (VT.getVectorElementType() == MVT::i32 ||
	        VT.getVectorElementType() == MVT::f32)
	      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
	    // vrev <4 x i16> -> REV32
	    if (VT.getVectorElementType() == MVT::i16 ||
	        VT.getVectorElementType() == MVT::f16)
	      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
	    // vrev <4 x i8> -> REV16
	    assert(VT.getVectorElementType() == MVT::i8);
	    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
	  case OP_VDUP0:
	  case OP_VDUP1:
	  case OP_VDUP2:
	  case OP_VDUP3: {
	    EVT EltTy = VT.getVectorElementType();
	    unsigned Opcode;
	    if (EltTy == MVT::i8)
	      Opcode = AArch64ISD::DUPLANE8;
	    else if (EltTy == MVT::i16 || EltTy == MVT::f16)
	      Opcode = AArch64ISD::DUPLANE16;
	    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
	      Opcode = AArch64ISD::DUPLANE32;
	    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
	      Opcode = AArch64ISD::DUPLANE64;
	    else
	      llvm_unreachable("Invalid vector element type?");

	    // A 64-bit source is widened before the DUPLANE node is built.
	    if (VT.getSizeInBits() == 64)
	      OpLHS = WidenVector(OpLHS, DAG);
	    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, dl, MVT::i64);
	    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
	  }
	  case OP_VEXT1:
	  case OP_VEXT2:
	  case OP_VEXT3: {
	    // Element offset (1..3) scaled by getExtFactor.
	    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
	    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
	                       DAG.getConstant(Imm, dl, MVT::i32));
	  }
	  case OP_VUZPL:
	    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VUZPR:
	    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPL:
	    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VZIPR:
	    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNL:
	    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  case OP_VTRNR:
	    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
	                       OpRHS);
	  }
	}

	/// Lower the shuffle \p Op with mask \p ShuffleMask to a call of the
	/// aarch64_neon_tbl1 or aarch64_neon_tbl2 intrinsic.
	///
	/// The element mask is expanded into one index per byte, both inputs are
	/// bitcast to a byte vector (v8i8, or v16i8 for a 128-bit result) and the
	/// intrinsic result is bitcast back to the type of \p Op.
	static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
	                           SelectionDAG &DAG) {
	  // Check to see if we can use the TBL instruction.
	  SDValue Src0 = Op.getOperand(0);
	  SDValue Src1 = Op.getOperand(1);
	  SDLoc DL(Op);

	  const EVT ScalarVT = Op.getValueType().getVectorElementType();
	  const unsigned EltBytes = ScalarVT.getSizeInBits() / 8;

	  // Expand every mask element into the indices of the bytes it covers.
	  SmallVector<SDValue, 8> ByteIndices;
	  for (int Lane : ShuffleMask) {
	    for (unsigned B = 0; B < EltBytes; ++B) {
	      unsigned ByteIdx = B + Lane * EltBytes;
	      ByteIndices.push_back(DAG.getConstant(ByteIdx, DL, MVT::i32));
	    }
	  }

	  const bool Is128Bit = Op.getValueSizeInBits() == 128;
	  const MVT IndexVT = Is128Bit ? MVT::v16i8 : MVT::v8i8;
	  const unsigned IndexLen = Is128Bit ? 16 : 8;

	  SDValue Table0 = DAG.getNode(ISD::BITCAST, DL, IndexVT, Src0);
	  SDValue Table1 = DAG.getNode(ISD::BITCAST, DL, IndexVT, Src1);

	  const bool SecondIsUndef = Src1.getNode()->isUndef();

	  SDValue Lookup;
	  if (SecondIsUndef || IndexLen == 8) {
	    // A single table register is enough. For 64-bit inputs the table is
	    // the concatenation of both inputs, or of the first input with itself
	    // when the second one is undef.
	    if (IndexLen == 8)
	      Table0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Table0,
	                           SecondIsUndef ? Table0 : Table1);
	    Lookup = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, DL, MVT::i32), Table0,
	        DAG.getBuildVector(IndexVT, DL,
	                           makeArrayRef(ByteIndices.data(), IndexLen)));
	  } else {
	    // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
	    // cannot currently represent the register constraints on the input
	    // table registers, so the tbl2 intrinsic is used instead.
	    Lookup = DAG.getNode(
	        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
	        DAG.getConstant(Intrinsic::aarch64_neon_tbl2, DL, MVT::i32), Table0,
	        Table1, DAG.getBuildVector(IndexVT, DL,
	                                   makeArrayRef(ByteIndices.data(), IndexLen)));
	  }
	  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Lookup);
	}

	/// Return the AArch64ISD::DUPLANE* opcode matching the vector element type
	/// \p EltType: DUPLANE8 for i8, DUPLANE16 for i16/f16, DUPLANE32 for i32/f32
	/// and DUPLANE64 for i64/f64. Any other type is treated as unreachable.
	static unsigned getDUPLANEOp(EVT EltType) {
	  if (EltType == MVT::i8)
	    return AArch64ISD::DUPLANE8;
	  if (EltType == MVT::i16 || EltType == MVT::f16)
	    return AArch64ISD::DUPLANE16;
	  if (EltType == MVT::i32 || EltType == MVT::f32)
	    return AArch64ISD::DUPLANE32;
	  if (EltType == MVT::i64 || EltType == MVT::f64)
	    return AArch64ISD::DUPLANE64;

	  llvm_unreachable("Invalid vector element type?");
	}

	SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT VT = Op.getValueType();

	ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

	// Convert shuffles that are directly supported on NEON to target-specific
	// DAG nodes, instead of keeping them as shuffles and matching them again
	// during code selection. This is more efficient and avoids the possibility
	// of inconsistencies between legalization and selection.
	ArrayRef<int> ShuffleMask = SVN->getMask();

	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);

	if (SVN->isSplat()) {
	int Lane = SVN->getSplatIndex();
	// If this is undef splat, generate it via "just" vdup, if possible.
	if (Lane == -1)
	Lane = 0;

	if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
	return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
	V1.getOperand(0));
	// Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
	// constant. If so, we can just reference the lane's definition directly.
	if (V1.getOpcode() == ISD::BUILD_VECTOR &&
	!isa<ConstantSDNode>(V1.getOperand(Lane)))
	return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));

	// Otherwise, duplicate from the lane of the input vector.
	unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());

	// SelectionDAGBuilder may have "helpfully" already extracted or conatenated
	// to make a vector of the same size as this SHUFFLE. We can ignore the
	// extract entirely, and canonicalise the concat using WidenVector.
	if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
	V1 = V1.getOperand(0);
	} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
	unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
	Lane -= Idx * VT.getVectorNumElements() / 2;
	V1 = WidenVector(V1.getOperand(Idx), DAG);
	} else if (VT.getSizeInBits() == 64)
	V1 = WidenVector(V1, DAG);

	return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
	}

	if (isREVMask(ShuffleMask, VT, 64))
	return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
	if (isREVMask(ShuffleMask, VT, 32))
	return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
	if (isREVMask(ShuffleMask, VT, 16))
	return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);

	bool ReverseEXT = false;
	unsigned Imm;
	if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
	if (ReverseEXT)
	std::swap(V1, V2);
	Imm *= getExtFactor(V1);
	return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
	DAG.getConstant(Imm, dl, MVT::i32));
	} else if (V2->isUndef() && isSingletonEXTMask(ShuffleMask, VT, Imm)) {
	Imm *= getExtFactor(V1);
	return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
	DAG.getConstant(Imm, dl, MVT::i32));
	}

	unsigned WhichResult;
	if (isZIPMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}
	if (isUZPMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}
	if (isTRNMask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
	}

	if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}
	if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}
	if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
	unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
	return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
	}

	if (SDValue Concat = tryFormConcatFromShuffle(Op, DAG))
	return Concat;

	bool DstIsLeft;
	int Anomaly;
	int NumInputElements = V1.getValueType().getVectorNumElements();
	if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
	SDValue DstVec = DstIsLeft ? V1 : V2;
	SDValue DstLaneV = DAG.getConstant(Anomaly, dl, MVT::i64);

	SDValue SrcVec = V1;
	int SrcLane = ShuffleMask[Anomaly];
	if (SrcLane >= NumInputElements) {
	SrcVec = V2;
	SrcLane -= VT.getVectorNumElements();
	}
	SDValue SrcLaneV = DAG.getConstant(SrcLane, dl, MVT::i64);

	EVT ScalarVT = VT.getVectorElementType();

	if (ScalarVT.getSizeInBits() < 32 && ScalarVT.isInteger())
	ScalarVT = MVT::i32;

	return DAG.getNode(
	ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
	DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
	DstLaneV);
	}

	// If the shuffle is not directly supported and it has 4 elements, use
	// the PerfectShuffle-generated table to synthesize it from other shuffles.
	unsigned NumElts = VT.getVectorNumElements();
	if (NumElts == 4) {
	unsigned PFIndexes[4];
	for (unsigned i = 0; i != 4; ++i) {
	if (ShuffleMask[i] < 0)
	PFIndexes[i] = 8;
	else
	PFIndexes[i] = ShuffleMask[i];
	}

	// Compute the index in the perfect shuffle table.
	unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	PFIndexes[2] * 9 + PFIndexes[3];
	unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	unsigned Cost = (PFEntry >> 30);

	if (Cost <= 4)
	return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
	}

	return GenerateTBL(Op, ShuffleMask, DAG);
	}

	// Expand a constant-splat BUILD_VECTOR into full-width bit patterns.
	//
	// When BVN is a constant splat, the splat value is replicated into CnstBits
	// and (SplatBits ^ SplatUndef) is replicated into UndefBits, each existing
	// value being shifted left by the splat width before every replica is OR-ed
	// in. Returns true on success; returns false (leaving both outputs
	// untouched) when BVN is not a constant splat.
	static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
	                               APInt &UndefBits) {
	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs))
	    return false;

	  const unsigned VecBits = BVN->getValueType(0).getSizeInBits();

	  // One replica of each pattern, already widened to the vector width.
	  const APInt CnstChunk = SplatBits.zextOrTrunc(VecBits);
	  const APInt UndefChunk = (SplatBits ^ SplatUndef).zextOrTrunc(VecBits);

	  for (unsigned Left = VecBits / SplatBitSize; Left != 0; --Left) {
	    CnstBits <<= SplatBitSize;
	    CnstBits |= CnstChunk;
	    UndefBits <<= SplatBitSize;
	    UndefBits |= UndefChunk;
	  }
	  return true;
	}

	// Lower a vector AND whose second operand is a constant-splat BUILD_VECTOR
	// into a BIC-with-immediate node when the inverted constant is encodable as
	// one of the AdvSIMD modified-immediate types 1-6. Two candidate bit
	// patterns are tried in order: the inverted constant, then the inverted
	// "undef toggled" pattern. If neither matches (or operand 1 is not a
	// resolvable BUILD_VECTOR) the original node is returned unchanged.
	SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();
	  SDValue LHS = Op.getOperand(0);

	  BuildVectorSDNode *BVN =
	      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
	  if (!BVN)
	    return Op;

	  APInt CnstBits(VT.getSizeInBits(), 0);
	  APInt UndefBits(VT.getSizeInBits(), 0);
	  // We can always fall back to a non-immediate AND.
	  if (!resolveBuildVector(BVN, CnstBits, UndefBits))
	    return Op;

	  const bool IsQuad = VT.getSizeInBits() == 128;
	  const MVT WordVecTy = IsQuad ? MVT::v4i32 : MVT::v2i32;
	  const MVT HalfVecTy = IsQuad ? MVT::v8i16 : MVT::v4i16;

	  // Build (NVCAST (BICi LHS, imm, shift)) for an already-encoded immediate.
	  auto EmitBIC = [&](uint64_t EncodedImm, MVT BicTy, unsigned Shift) {
	    SDValue Bic = DAG.getNode(AArch64ISD::BICi, dl, BicTy, LHS,
	                              DAG.getConstant(EncodedImm, dl, MVT::i32),
	                              DAG.getConstant(Shift, dl, MVT::i32));
	    return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Bic);
	  };

	  for (unsigned Attempt = 0; Attempt != 2; ++Attempt) {
	    // We only have BIC vector immediate instruction, which is and-not, so
	    // both candidates are inverted before matching.
	    const APInt Candidate = (Attempt == 0) ? ~CnstBits : ~UndefBits;

	    // The immediate forms need identical high and low 64-bit halves.
	    if (Candidate.getHiBits(64) != Candidate.getLoBits(64))
	      continue;

	    const uint64_t Imm = Candidate.zextOrTrunc(64).getZExtValue();

	    if (AArch64_AM::isAdvSIMDModImmType1(Imm))
	      return EmitBIC(AArch64_AM::encodeAdvSIMDModImmType1(Imm), WordVecTy, 0);
	    if (AArch64_AM::isAdvSIMDModImmType2(Imm))
	      return EmitBIC(AArch64_AM::encodeAdvSIMDModImmType2(Imm), WordVecTy, 8);
	    if (AArch64_AM::isAdvSIMDModImmType3(Imm))
	      return EmitBIC(AArch64_AM::encodeAdvSIMDModImmType3(Imm), WordVecTy, 16);
	    if (AArch64_AM::isAdvSIMDModImmType4(Imm))
	      return EmitBIC(AArch64_AM::encodeAdvSIMDModImmType4(Imm), WordVecTy, 24);
	    if (AArch64_AM::isAdvSIMDModImmType5(Imm))
	      return EmitBIC(AArch64_AM::encodeAdvSIMDModImmType5(Imm), HalfVecTy, 0);
	    if (AArch64_AM::isAdvSIMDModImmType6(Imm))
	      return EmitBIC(AArch64_AM::encodeAdvSIMDModImmType6(Imm), HalfVecTy, 8);
	  }

	  // We can always fall back to a non-immediate AND.
	  return Op;
	}

	// Specialized code to quickly find if PotentialBVec is a BuildVector that
	// consists of only the same constant int value, returned in reference arg
	// ConstVal
	static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
	uint64_t &ConstVal) {
	BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
	if (!Bvec)
	return false;
	ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
	if (!FirstElt)
	return false;
	EVT VT = Bvec->getValueType(0);
	unsigned NumElts = VT.getVectorNumElements();
	for (unsigned i = 1; i < NumElts; ++i)
	if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
	return false;
	ConstVal = FirstElt->getZExtValue();
	return true;
	}

	// Return the intrinsic ID carried by an INTRINSIC_WO_CHAIN node (its first
	// operand), or Intrinsic::not_intrinsic for any other opcode or for an ID
	// that is not below Intrinsic::num_intrinsics.
	static unsigned getIntrinsicID(const SDNode *N) {
	  if (N->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
	    return Intrinsic::not_intrinsic;

	  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
	  if (IID >= Intrinsic::num_intrinsics)
	    return Intrinsic::not_intrinsic;
	  return IID;
	}

	// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
	// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
	// BUILD_VECTOR with constant element C1 and C2 is a constant shift amount.
	//
	// SLI keeps the low C2 bits of each destination element and SRI keeps the
	// high C2 bits, so the AND mask must preserve exactly those bits:
	//   shift left  (SLI): C1 == low C2 bits set   == ~(Ones(ElemSize) << C2)
	//   shift right (SRI): C1 == high C2 bits set  == ~(Ones(ElemSize) >> C2)
	// Any other mask would make the combined instruction compute a different
	// value than the original or/and/shift sequence.
	//
	// Returns the INTRINSIC_WO_CHAIN node for aarch64_neon_vsli/vsri on success
	// and an empty SDValue when the pattern does not match.
	static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  if (!VT.isVector())
	    return SDValue();

	  SDLoc DL(N);

	  // Is the first op an AND?
	  const SDValue And = N->getOperand(0);
	  if (And.getOpcode() != ISD::AND)
	    return SDValue();

	  // Is the second op an shl or lshr?
	  SDValue Shift = N->getOperand(1);
	  // This will have been turned into: AArch64ISD::VSHL vector, #shift
	  // or AArch64ISD::VLSHR vector, #shift
	  unsigned ShiftOpc = Shift.getOpcode();
	  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
	    return SDValue();
	  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;

	  // Is the shift amount constant?
	  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
	  if (!C2node)
	    return SDValue();

	  // Is the and mask vector all constant?
	  uint64_t C1;
	  if (!isAllConstantBuildVector(And.getOperand(1), C1))
	    return SDValue();

	  // Does C1 preserve exactly the bits that the shift-insert leaves alone,
	  // taking into account how much one can shift elements of a particular
	  // size?
	  uint64_t C2 = C2node->getZExtValue();
	  unsigned ElemSizeInBits = VT.getScalarSizeInBits();
	  if (C2 > ElemSizeInBits)
	    return SDValue();

	  // Compare at element width with APInt. This avoids the undefined
	  // "1 << ElemSizeInBits" for 32- and 64-bit elements and discards any
	  // bits of C1 above the element width (the constant operand may be wider
	  // than the vector element).
	  const unsigned KeptBits = static_cast<unsigned>(C2);
	  const APInt C1AsAPInt(ElemSizeInBits, C1);
	  const APInt RequiredC1 =
	      IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, KeptBits)
	                   : APInt::getLowBitsSet(ElemSizeInBits, KeptBits);
	  if (C1AsAPInt != RequiredC1)
	    return SDValue();

	  SDValue X = And.getOperand(0);
	  SDValue Y = Shift.getOperand(0);

	  unsigned Intrin =
	      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
	  SDValue ResultSLI =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                  DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
	                  Shift.getOperand(1));

	  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
	  DEBUG(N->dump(&DAG));
	  DEBUG(dbgs() << "into: \n");
	  DEBUG(ResultSLI->dump(&DAG));

	  ++NumShiftInserts;
	  return ResultSLI;
	}

	// Lower a vector OR. When EnableAArch64SlrGeneration is set, an S[LR]I
	// formation is attempted first. Otherwise, if either operand is a
	// constant-splat BUILD_VECTOR encodable as an AdvSIMD modified-immediate of
	// types 1-6, an ORR-with-immediate node is produced from the other operand.
	// Two candidate bit patterns are tried in order: the constant itself, then
	// the "undef toggled" pattern. If nothing matches the original node is
	// returned unchanged.
	SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
	                                             SelectionDAG &DAG) const {
	  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
	  if (EnableAArch64SlrGeneration) {
	    if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
	      return Res;
	  }

	  SDLoc dl(Op);
	  EVT VT = Op.getValueType();

	  // OR commutes, so the BUILD_VECTOR may sit on either side; operand 0 is
	  // preferred when both qualify.
	  const unsigned BVIdx =
	      isa<BuildVectorSDNode>(Op.getOperand(0).getNode()) ? 0 : 1;
	  BuildVectorSDNode *BVN =
	      dyn_cast<BuildVectorSDNode>(Op.getOperand(BVIdx).getNode());
	  if (!BVN)
	    return Op;
	  SDValue LHS = Op.getOperand(1 - BVIdx);

	  APInt CnstBits(VT.getSizeInBits(), 0);
	  APInt UndefBits(VT.getSizeInBits(), 0);
	  // We can always fall back to a non-immediate OR.
	  if (!resolveBuildVector(BVN, CnstBits, UndefBits))
	    return Op;

	  const bool IsQuad = VT.getSizeInBits() == 128;
	  const MVT WordVecTy = IsQuad ? MVT::v4i32 : MVT::v2i32;
	  const MVT HalfVecTy = IsQuad ? MVT::v8i16 : MVT::v4i16;

	  // Build (NVCAST (ORRi LHS, imm, shift)) for an already-encoded immediate.
	  auto EmitORR = [&](uint64_t EncodedImm, MVT OrrTy, unsigned Shift) {
	    SDValue Orr = DAG.getNode(AArch64ISD::ORRi, dl, OrrTy, LHS,
	                              DAG.getConstant(EncodedImm, dl, MVT::i32),
	                              DAG.getConstant(Shift, dl, MVT::i32));
	    return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Orr);
	  };

	  for (unsigned Attempt = 0; Attempt != 2; ++Attempt) {
	    const APInt &Candidate = (Attempt == 0) ? CnstBits : UndefBits;

	    // The immediate forms need identical high and low 64-bit halves.
	    if (Candidate.getHiBits(64) != Candidate.getLoBits(64))
	      continue;

	    const uint64_t Imm = Candidate.zextOrTrunc(64).getZExtValue();

	    if (AArch64_AM::isAdvSIMDModImmType1(Imm))
	      return EmitORR(AArch64_AM::encodeAdvSIMDModImmType1(Imm), WordVecTy, 0);
	    if (AArch64_AM::isAdvSIMDModImmType2(Imm))
	      return EmitORR(AArch64_AM::encodeAdvSIMDModImmType2(Imm), WordVecTy, 8);
	    if (AArch64_AM::isAdvSIMDModImmType3(Imm))
	      return EmitORR(AArch64_AM::encodeAdvSIMDModImmType3(Imm), WordVecTy, 16);
	    if (AArch64_AM::isAdvSIMDModImmType4(Imm))
	      return EmitORR(AArch64_AM::encodeAdvSIMDModImmType4(Imm), WordVecTy, 24);
	    if (AArch64_AM::isAdvSIMDModImmType5(Imm))
	      return EmitORR(AArch64_AM::encodeAdvSIMDModImmType5(Imm), HalfVecTy, 0);
	    if (AArch64_AM::isAdvSIMDModImmType6(Imm))
	      return EmitORR(AArch64_AM::encodeAdvSIMDModImmType6(Imm), HalfVecTy, 8);
	  }

	  // We can always fall back to a non-immediate OR.
	  return Op;
	}

	// Normalize the operands of a BUILD_VECTOR whose elements are integers of
	// at most 16 bits: every constant lane is truncated to the element width
	// and re-materialized as an i32 constant; other lanes are passed through.
	// Floating-point vectors and vectors with wider elements are returned
	// unchanged.
	static SDValue NormalizeBuildVector(SDValue Op,
	                                    SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
	  EVT VT = Op.getValueType();
	  EVT EltTy = VT.getVectorElementType();

	  if (EltTy.isFloatingPoint())
	    return Op;
	  const unsigned EltBits = EltTy.getSizeInBits();
	  if (EltBits > 16)
	    return Op;

	  SDLoc dl(Op);
	  SmallVector<SDValue, 16> Ops;
	  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
	    SDValue Lane = Op.getOperand(I);
	    if (ConstantSDNode *CstLane = dyn_cast<ConstantSDNode>(Lane)) {
	      // Constructing the APInt at element width drops the excess high bits.
	      const uint64_t Truncated =
	          APInt(EltBits, CstLane->getZExtValue()).getZExtValue();
	      Lane = DAG.getConstant(Truncated, dl, MVT::i32);
	    }
	    Ops.push_back(Lane);
	  }
	  return DAG.getBuildVector(VT, dl, Ops);
	}

	// Custom lowering for BUILD_VECTOR. After normalizing the operands, the
	// following strategies are tried in order:
	//   1) If the node resolves to a constant splat, match it against the
	//      AdvSIMD modified-immediate encodings and emit a single MOVI / FMOV /
	//      MVNI style node (cast to VT with NVCAST). The match is attempted
	//      twice: with the constant bits, then with the "undef toggled" bits.
	//   2) All-undef -> UNDEF; only lane 0 defined -> SCALAR_TO_VECTOR.
	//   3) A single repeated value -> DUP / DUPLANE (floating-point constant
	//      splats are retried as integer vectors through a recursive call).
	//   4) A single constant plus non-constant lanes -> DUP of the constant
	//      followed by INSERT_VECTOR_ELT for the remaining lanes.
	//   5) For at least four elements, ReconstructShuffle.
	//   6) Otherwise a chain of INSERT_VECTOR_ELT.
	// An empty SDValue is returned when the default expansion should be used.
	SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
	SelectionDAG &DAG) const {
	SDLoc dl(Op);
	EVT VT = Op.getValueType();
	Op = NormalizeBuildVector(Op, DAG);
	BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());

	APInt CnstBits(VT.getSizeInBits(), 0);
	APInt UndefBits(VT.getSizeInBits(), 0);
	if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
	// We make use of a little bit of goto ickiness in order to avoid having to
	// duplicate the immediate matching logic for the undef toggled case.
	bool SecondTry = false;
	AttemptModImm:

	// The immediate forms are only tried when the high and low 64-bit halves
	// of the candidate pattern agree.
	if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
	CnstBits = CnstBits.zextOrTrunc(64);
	uint64_t CnstVal = CnstBits.getZExtValue();

	// Certain magic vector constants (used to express things like NOT
	// and NEG) are passed through unmodified. This allows codegen patterns
	// for these operations to match. Special-purpose patterns will lower
	// these immediates to MOVIs if it proves necessary.
	if (VT.isInteger() && (CnstVal == 0 \|\| CnstVal == ~0ULL))
	return Op;

	// The many faces of MOVI...
	if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
	if (VT.getSizeInBits() == 128) {
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// Support the V64 version via subregister insertion.
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(16, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(24, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(264, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(272, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
	SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// The few faces of FMOV...
	if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
	SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
	VT.getSizeInBits() == 128) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
	SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
	DAG.getConstant(CnstVal, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	// The many faces of MVNI...
	// From here on the immediate types are matched against the inverted value.
	CnstVal = ~CnstVal;
	if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(16, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(24, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(0, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(8, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(264, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}

	if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
	CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
	MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
	SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
	DAG.getConstant(CnstVal, dl, MVT::i32),
	DAG.getConstant(272, dl, MVT::i32));
	return DAG.getNode(AArch64ISD::NVCAST, dl, VT, Mov);
	}
	}

	// Nothing matched on this pass. Retry exactly once with UndefBits as the
	// candidate pattern, then give up on the immediate forms.
	if (SecondTry)
	goto FailedModImm;
	SecondTry = true;
	CnstBits = UndefBits;
	goto AttemptModImm;
	}
	FailedModImm:

	// Scan through the operands to find some interesting properties we can
	// exploit:
	// 1) If only one value is used, we can use a DUP, or
	// 2) if only the low element is not undef, we can just insert that, or
	// 3) if only one constant value is used (w/ some non-constant lanes),
	// we can splat the constant value into the whole vector then fill
	// in the non-constant lanes.
	// 4) FIXME: If different constant values are used, but we can intelligently
	// select the values we'll be overwriting for the non-constant
	// lanes such that we can directly materialize the vector
	// some other way (MOVI, e.g.), we can be sneaky.
	unsigned NumElts = VT.getVectorNumElements();
	bool isOnlyLowElement = true;
	bool usesOnlyOneValue = true;
	bool usesOnlyOneConstantValue = true;
	bool isConstant = true;
	unsigned NumConstantLanes = 0;
	SDValue Value;
	SDValue ConstantValue;
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	// Undef lanes do not influence any of the properties tracked below.
	if (V.isUndef())
	continue;
	if (i > 0)
	isOnlyLowElement = false;
	if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
	isConstant = false;

	if (isa<ConstantSDNode>(V) \|\| isa<ConstantFPSDNode>(V)) {
	++NumConstantLanes;
	if (!ConstantValue.getNode())
	ConstantValue = V;
	else if (ConstantValue != V)
	usesOnlyOneConstantValue = false;
	}

	if (!Value.getNode())
	Value = V;
	else if (V != Value)
	usesOnlyOneValue = false;
	}

	// No defined lane was seen at all: the whole vector is undef.
	if (!Value.getNode())
	return DAG.getUNDEF(VT);

	if (isOnlyLowElement)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);

	// Use DUP for non-constant splats. For f32 constant splats, reduce to
	// i32 and try again.
	if (usesOnlyOneValue) {
	if (!isConstant) {
	if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Value.getValueType() != VT)
	return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);

	// This is actually a DUPLANExx operation, which keeps everything vectory.

	// DUPLANE works on 128-bit vectors, widen it if necessary.
	SDValue Lane = Value.getOperand(1);
	Value = Value.getOperand(0);
	if (Value.getValueSizeInBits() == 64)
	Value = WidenVector(Value, DAG);

	unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
	return DAG.getNode(Opcode, dl, VT, Value, Lane);
	}

	// Constant floating-point splat: bitcast every lane to the integer type
	// of the same width, lower the resulting integer BUILD_VECTOR through a
	// recursive call, and bitcast a successful result back to VT.
	if (VT.getVectorElementType().isFloatingPoint()) {
	SmallVector<SDValue, 8> Ops;
	EVT EltTy = VT.getVectorElementType();
	assert ((EltTy == MVT::f16 \|\| EltTy == MVT::f32 \|\| EltTy == MVT::f64) &&
	"Unsupported floating-point vector type");
	MVT NewType = MVT::getIntegerVT(EltTy.getSizeInBits());
	for (unsigned i = 0; i < NumElts; ++i)
	Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
	SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
	Val = LowerBUILD_VECTOR(Val, DAG);
	if (Val.getNode())
	return DAG.getNode(ISD::BITCAST, dl, VT, Val);
	}
	}

	// If there was only one constant value used and for more than one lane,
	// start by splatting that value, then replace the non-constant lanes. This
	// is better than the default, which will perform a separate initialization
	// for each lane.
	if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
	SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
	// Now insert the non-constant lanes.
	for (unsigned i = 0; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
	// Note that type legalization likely mucked about with the VT of the
	// source operand, so we may have to convert it here before inserting.
	Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
	}
	}
	return Val;
	}

	// If all elements are constants and the case above didn't get hit, fall back
	// to the default expansion, which will generate a load from the constant
	// pool.
	if (isConstant)
	return SDValue();

	// Empirical tests suggest this is rarely worth it for vectors of length <= 2.
	if (NumElts >= 4) {
	if (SDValue shuffle = ReconstructShuffle(Op, DAG))
	return shuffle;
	}

	// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
	// know the default expansion would otherwise fall back on something even
	// worse. For a vector with one or two non-undef values, that's
	// scalar_to_vector for the elements followed by a shuffle (provided the
	// shuffle is valid for the target) and materialization element by element
	// on the stack followed by a load for everything else.
	if (!isConstant && !usesOnlyOneValue) {
	SDValue Vec = DAG.getUNDEF(VT);
	SDValue Op0 = Op.getOperand(0);
	unsigned i = 0;

	// Use SCALAR_TO_VECTOR for lane zero to
	// a) Avoid a RMW dependency on the full vector register, and
	// b) Allow the register coalescer to fold away the copy if the
	// value is already in an S or D register, and we're forced to emit an
	// INSERT_SUBREG that we can't fold anywhere.
	//
	// We also allow types like i8 and i16 which are illegal scalar but legal
	// vector element types. After type-legalization the inserted value is
	// extended (i32) and it is safe to cast them to the vector type by ignoring
	// the upper bits of the lowest lane (e.g. v8i8, v4i16).
	if (!Op0.isUndef()) {
	Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
	++i;
	}
	// Insert the remaining defined lanes one at a time; undef lanes are left
	// as they are in Vec.
	for (; i < NumElts; ++i) {
	SDValue V = Op.getOperand(i);
	if (V.isUndef())
	continue;
	SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i64);
	Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
	}
	return Vec;
	}

	// Just use the default expansion. We failed to find a better alternative.
	return SDValue();
	}

	// Custom lowering for INSERT_VECTOR_ELT.
	//  - A non-constant or out-of-range lane index yields an empty SDValue.
	//  - The listed 128-bit vector types are returned unchanged.
	//  - The listed 64-bit vector types are widened, the element is inserted
	//    into the widened vector, and the result is narrowed again.
	//  - Any other vector type yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");

	  // Check for non-constant or out of range lane.
	  EVT VT = Op.getOperand(0).getValueType();
	  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(2));
	  if (!CI)
	    return SDValue();
	  if (CI->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // None of the handled types is an extended type.
	  if (!VT.isSimple())
	    return SDValue();

	  switch (VT.getSimpleVT().SimpleTy) {
	  // Insertion/extraction are legal for V128 types.
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	  case MVT::v2i64:
	  case MVT::v4f32:
	  case MVT::v2f64:
	  case MVT::v8f16:
	    return Op;

	  // V64 types are handled below.
	  case MVT::v8i8:
	  case MVT::v4i16:
	  case MVT::v2i32:
	  case MVT::v1i64:
	  case MVT::v2f32:
	  case MVT::v4f16:
	    break;

	  default:
	    return SDValue();
	  }

	  // For V64 types, we perform insertion by expanding the value
	  // to a V128 type and perform the insertion on that.
	  SDLoc DL(Op);
	  SDValue Widened = WidenVector(Op.getOperand(0), DAG);
	  SDValue Inserted =
	      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Widened.getValueType(), Widened,
	                  Op.getOperand(1), Op.getOperand(2));

	  // Re-narrow the resultant vector.
	  return NarrowVector(Inserted, DAG);
	}

	// Custom lowering for EXTRACT_VECTOR_ELT.
	//  - A non-constant or out-of-range lane index yields an empty SDValue.
	//  - The listed 128-bit vector types are returned unchanged.
	//  - The listed 64-bit vector types are widened and the element is
	//    extracted from the widened vector; i8 and i16 elements are extracted
	//    as i32.
	//  - Any other vector type yields an empty SDValue.
	SDValue
	AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");

	  // Check for non-constant or out of range lane.
	  EVT VT = Op.getOperand(0).getValueType();
	  ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!CI)
	    return SDValue();
	  if (CI->getZExtValue() >= VT.getVectorNumElements())
	    return SDValue();

	  // None of the handled types is an extended type.
	  if (!VT.isSimple())
	    return SDValue();

	  switch (VT.getSimpleVT().SimpleTy) {
	  // Insertion/extraction are legal for V128 types.
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	  case MVT::v2i64:
	  case MVT::v4f32:
	  case MVT::v2f64:
	  case MVT::v8f16:
	    return Op;

	  // V64 types are handled below.
	  case MVT::v8i8:
	  case MVT::v4i16:
	  case MVT::v2i32:
	  case MVT::v1i64:
	  case MVT::v2f32:
	  case MVT::v4f16:
	    break;

	  default:
	    return SDValue();
	  }

	  // For V64 types, we perform extraction by expanding the value
	  // to a V128 type and perform the extraction on that.
	  SDLoc DL(Op);
	  SDValue Widened = WidenVector(Op.getOperand(0), DAG);

	  // i8 and i16 elements are extracted as i32; every other element type is
	  // extracted as itself.
	  EVT ExtrTy = Widened.getValueType().getVectorElementType();
	  const bool KeepsElementType = ExtrTy != MVT::i16 && ExtrTy != MVT::i8;
	  if (!KeepsElementType)
	    ExtrTy = MVT::i32;

	  // For extractions, we just return the result directly.
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, Widened,
	                     Op.getOperand(1));
	}

	// Custom lowering for EXTRACT_SUBVECTOR. The node is kept as-is when it
	// extracts starting at element 0, or when the result is 64 bits wide and
	// starts 64 bits into the source vector. Every other case (non-vector
	// source, non-constant index, any other offset) yields an empty SDValue.
	SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  EVT VT = Op.getOperand(0).getValueType();
	  SDLoc dl(Op);

	  // Just in case...
	  if (!VT.isVector())
	    return SDValue();

	  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
	  if (!Cst)
	    return SDValue();
	  const unsigned FirstElt = Cst->getZExtValue();

	  // This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
	  if (FirstElt == 0)
	    return Op;

	  // If this is extracting the upper 64-bits of a 128-bit vector, we match
	  // that directly.
	  const unsigned ResultBits = Op.getValueSizeInBits();
	  const bool StartsAtBit64 = FirstElt * VT.getScalarSizeInBits() == 64;
	  if (ResultBits == 64 && StartsAtBit64)
	    return Op;

	  return SDValue();
	}

	bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
	EVT VT) const {
	if (VT.getVectorNumElements() == 4 &&
	(VT.is128BitVector() \|\| VT.is64BitVector())) {
	unsigned PFIndexes[4];
	for (unsigned i = 0; i != 4; ++i) {
	if (M[i] < 0)
	PFIndexes[i] = 8;
	else
	PFIndexes[i] = M[i];
	}

	// Compute the index in the perfect shuffle table.
	unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
	PFIndexes[2] * 9 + PFIndexes[3];
	unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
	unsigned Cost = (PFEntry >> 30);

	if (Cost <= 4)
	return true;
	}

	bool DummyBool;
	int DummyInt;
	unsigned DummyUnsigned;

	return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) \|\| isREVMask(M, VT, 64) \|\|
	isREVMask(M, VT, 32) \|\| isREVMask(M, VT, 16) \|\|
	isEXTMask(M, VT, DummyBool, DummyUnsigned) \|\|
	// isTBLMask(M, VT) \|\| // FIXME: Port TBL support from ARM.
	isTRNMask(M, VT, DummyUnsigned) \|\| isUZPMask(M, VT, DummyUnsigned) \|\|
	isZIPMask(M, VT, DummyUnsigned) \|\|
	isTRN_v_undef_Mask(M, VT, DummyUnsigned) \|\|
	isUZP_v_undef_Mask(M, VT, DummyUnsigned) \|\|
	isZIP_v_undef_Mask(M, VT, DummyUnsigned) \|\|
	isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) \|\|
	isConcatMask(M, VT, VT.getSizeInBits() == 128));
	}

	/// getVShiftImm - Check if this is a valid build_vector for the immediate
	/// operand of a vector shift operation, where all the elements of the
	/// build_vector must have the same constant integer value.
	///
	/// \param Op          Candidate shift-amount operand; bitcasts are looked
	///                    through.
	/// \param ElementBits Element width passed to isConstantSplat; a splat wider
	///                    than this is rejected.
	/// \param Cnt         Receives the sign-extended splat value on success.
	/// \returns true if \p Op is a constant splat build_vector, false otherwise
	///          (in which case \p Cnt is left untouched).
	static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
	  // Ignore bit_converts.
	  while (Op.getOpcode() == ISD::BITCAST)
	    Op = Op.getOperand(0);
	  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
	  APInt SplatBits, SplatUndef;
	  unsigned SplatBitSize;
	  bool HasAnyUndefs;
	  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
	                                    HasAnyUndefs, ElementBits) ||
	      SplatBitSize > ElementBits)
	    return false;
	  Cnt = SplatBits.getSExtValue();
	  return true;
	}

	/// isVShiftLImm - Decide whether \p Op is a constant-splat build_vector usable
	/// as the immediate of a vector shift left. On success \p Cnt holds the shift
	/// amount, which must satisfy:
	///   0 <= Cnt <  ElementBits   for an ordinary left shift, or
	///   0 <= Cnt <= ElementBits   for a long left shift (\p isLong).
	static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  const int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  if (Cnt < 0)
	    return false;
	  // The long form admits one extra value: a shift by the full element width.
	  return isLong ? Cnt <= EltBits : Cnt < EltBits;
	}

	/// isVShiftRImm - Decide whether \p Op is a constant-splat build_vector usable
	/// as the immediate of a vector shift right. On success \p Cnt holds the shift
	/// amount, which must satisfy:
	///   1 <= Cnt <= ElementBits       for an ordinary right shift, or
	///   1 <= Cnt <= ElementBits / 2   when \p isNarrow is set.
	static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
	  assert(VT.isVector() && "vector shift count is not a vector type");
	  const int64_t EltBits = VT.getScalarSizeInBits();
	  if (!getVShiftImm(Op, EltBits, Cnt))
	    return false;
	  const int64_t MaxShift = isNarrow ? EltBits / 2 : EltBits;
	  return 1 <= Cnt && Cnt <= MaxShift;
	}

	/// Custom lowering for vector SHL / SRA / SRL.
	///
	/// A shift whose amount operand is not a vector is returned unchanged.
	/// Constant-splat amounts that fit the immediate range become VSHL, VASHR or
	/// VLSHR nodes. Everything else goes through the NEON ushl/sshl intrinsics;
	/// right shifts negate the amount first because those intrinsics shift right
	/// for negative amounts.
	SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
	                                                      SelectionDAG &DAG) const {
	  EVT VT = Op.getValueType();
	  SDLoc DL(Op);
	  int64_t Cnt;

	  SDValue Src = Op.getOperand(0);
	  SDValue Amount = Op.getOperand(1);

	  if (!Amount.getValueType().isVector())
	    return Op;
	  unsigned EltSize = VT.getScalarSizeInBits();

	  const unsigned ShiftOpcode = Op.getOpcode();
	  switch (ShiftOpcode) {
	  default:
	    llvm_unreachable("unexpected shift opcode");

	  case ISD::SHL: {
	    // Left shift by immediate.
	    if (isVShiftLImm(Amount, VT, false, Cnt) && Cnt < EltSize)
	      return DAG.getNode(AArch64ISD::VSHL, DL, VT, Src,
	                         DAG.getConstant(Cnt, DL, MVT::i32));
	    // Left shift by register.
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, DL,
	                                       MVT::i32),
	                       Src, Amount);
	  }

	  case ISD::SRA:
	  case ISD::SRL: {
	    const bool IsArithmetic = ShiftOpcode == ISD::SRA;

	    // Right shift by immediate.
	    if (isVShiftRImm(Amount, VT, false, Cnt) && Cnt < EltSize) {
	      unsigned ImmOpc = IsArithmetic ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
	      return DAG.getNode(ImmOpc, DL, VT, Src,
	                         DAG.getConstant(Cnt, DL, MVT::i32));
	    }

	    // Right shift by register. There is no shift-right-register instruction,
	    // but the shift-left-register instruction takes a signed amount where
	    // negative values specify a right shift.
	    unsigned IntrinsicID = IsArithmetic ? Intrinsic::aarch64_neon_sshl
	                                        : Intrinsic::aarch64_neon_ushl;
	    // Negate the shift amount.
	    SDValue Negated = DAG.getNode(AArch64ISD::NEG, DL, VT, Amount);
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
	                       DAG.getConstant(IntrinsicID, DL, MVT::i32), Src,
	                       Negated);
	  }
	  }

	  return SDValue();
	}

	/// Build the DAG node(s) for a vector comparison of \p LHS against \p RHS
	/// under condition code \p CC, producing a value of type \p VT.
	///
	/// \p VT must have the same total bit width as the operand type (asserted).
	/// When \p RHS is a build_vector that resolveBuildVector resolves to all-zero
	/// constant bits, the compare-against-zero node variants (…z) are used where
	/// one exists. "Less than" style conditions without a dedicated node are
	/// emitted by swapping the operands of the matching "greater than" node.
	///
	/// \returns the comparison node, or an empty SDValue when \p CC has no case
	/// below (and, for floating-point LT, when \p NoNans is false).
	static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
	AArch64CC::CondCode CC, bool NoNans, EVT VT,
	const SDLoc &dl, SelectionDAG &DAG) {
	EVT SrcVT = LHS.getValueType();
	assert(VT.getSizeInBits() == SrcVT.getSizeInBits() &&
	"function only supposed to emit natural comparisons");

	// Detect a constant all-zero RHS so the "compare with zero" nodes can be used.
	BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
	APInt CnstBits(VT.getSizeInBits(), 0);
	APInt UndefBits(VT.getSizeInBits(), 0);
	bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
	bool IsZero = IsCnst && (CnstBits == 0);

	// Floating-point element types use the FCM* node family.
	if (SrcVT.getVectorElementType().isFloatingPoint()) {
	switch (CC) {
	default:
	return SDValue();
	case AArch64CC::NE: {
	// NE is emitted as the bitwise NOT of an equality compare.
	SDValue Fcmeq;
	if (IsZero)
	Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
	else
	Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
	return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
	}
	case AArch64CC::EQ:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
	case AArch64CC::GE:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
	case AArch64CC::GT:
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
	case AArch64CC::LS:
	// Non-zero case: FCMGE with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
	case AArch64CC::LT:
	// LT is only handled when NaNs may be ignored.
	if (!NoNans)
	return SDValue();
	// If we ignore NaNs then we can use to the MI implementation.
	LLVM_FALLTHROUGH;
	case AArch64CC::MI:
	// Non-zero case: FCMGT with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
	}
	}

	// Remaining (non floating-point) element types use the CM* node family.
	switch (CC) {
	default:
	return SDValue();
	case AArch64CC::NE: {
	// NE is emitted as the bitwise NOT of an equality compare.
	SDValue Cmeq;
	if (IsZero)
	Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
	else
	Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
	return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
	}
	case AArch64CC::EQ:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
	case AArch64CC::GE:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
	case AArch64CC::GT:
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
	case AArch64CC::LE:
	// Non-zero case: CMGE with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
	case AArch64CC::LS:
	// CMHS with the operands swapped; no compare-with-zero form is used.
	return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
	case AArch64CC::LO:
	// CMHI with the operands swapped; no compare-with-zero form is used.
	return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
	case AArch64CC::LT:
	// Non-zero case: CMGT with the operands swapped.
	if (IsZero)
	return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
	return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
	case AArch64CC::HI:
	return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
	case AArch64CC::HS:
	return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
	}
	}

	/// Custom lowering for a vector SETCC.
	///
	/// Integer element types map to a single AArch64 condition code and one
	/// EmitVectorComparison call. f16 elements are not handled here (an empty
	/// SDValue is returned). f32/f64 comparisons may need two comparisons OR'ed
	/// together and/or a final inversion, as reported by
	/// changeVectorFPCCToAArch64CC. The result is sign-extended or truncated to
	/// the type of \p Op.
	///
	/// \returns the lowered value, or an empty SDValue when a required comparison
	/// cannot be emitted.
	SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  // Comparisons are emitted on an integer vector of the operand's shape.
	  EVT CmpVT = LHS.getValueType().changeVectorElementTypeToInteger();
	  SDLoc dl(Op);

	  if (LHS.getValueType().getVectorElementType().isInteger()) {
	    assert(LHS.getValueType() == RHS.getValueType());
	    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
	    SDValue Cmp =
	        EmitVectorComparison(LHS, RHS, AArch64CC, false, CmpVT, dl, DAG);
	    return DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());
	  }

	  if (LHS.getValueType().getVectorElementType() == MVT::f16)
	    return SDValue();

	  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
	         LHS.getValueType().getVectorElementType() == MVT::f64);

	  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
	  // clean. Some of them require two branches to implement.
	  AArch64CC::CondCode CC1, CC2;
	  bool ShouldInvert;
	  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);

	  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
	  SDValue Cmp =
	      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, CmpVT, dl, DAG);
	  if (!Cmp.getNode())
	    return SDValue();

	  // CC2 == AL means no second comparison is required.
	  if (CC2 != AArch64CC::AL) {
	    SDValue Cmp2 =
	        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, CmpVT, dl, DAG);
	    if (!Cmp2.getNode())
	      return SDValue();

	    Cmp = DAG.getNode(ISD::OR, dl, CmpVT, Cmp, Cmp2);
	  }

	  Cmp = DAG.getSExtOrTrunc(Cmp, dl, Op.getValueType());

	  if (ShouldInvert)
	    return DAG.getNOT(dl, Cmp, Cmp.getValueType());

	  return Cmp;
	}

	/// Emit an across-vector reduction node with opcode \p Op over the vector
	/// operand of \p ScalarOp, then extract lane 0 of the reduced vector as the
	/// scalar result (typed like \p ScalarOp).
	static SDValue getReductionSDNode(unsigned Op, SDLoc DL, SDValue ScalarOp,
	                                  SelectionDAG &DAG) {
	  SDValue Input = ScalarOp.getOperand(0);
	  // The reduction node keeps the input vector's type.
	  SDValue Reduced = DAG.getNode(Op, DL, Input.getSimpleValueType(), Input);
	  SDValue LaneZero = DAG.getConstant(0, DL, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarOp.getValueType(),
	                     Reduced, LaneZero);
	}

	/// Custom lowering for the VECREDUCE_* nodes.
	///
	/// ADD/SMAX/SMIN/UMAX/UMIN map to the matching AArch64ISD across-vector node
	/// via getReductionSDNode. FMAX/FMIN are emitted as the fmaxnmv/fminnmv NEON
	/// intrinsics and assert that the node carries the no-NaNs flag. Any other
	/// opcode is unreachable.
	SDValue AArch64TargetLowering::LowerVECREDUCE(SDValue Op,
	                                              SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  unsigned AcrossOpc;
	  switch (Op.getOpcode()) {
	  case ISD::VECREDUCE_ADD:
	    AcrossOpc = AArch64ISD::UADDV;
	    break;
	  case ISD::VECREDUCE_SMAX:
	    AcrossOpc = AArch64ISD::SMAXV;
	    break;
	  case ISD::VECREDUCE_SMIN:
	    AcrossOpc = AArch64ISD::SMINV;
	    break;
	  case ISD::VECREDUCE_UMAX:
	    AcrossOpc = AArch64ISD::UMAXV;
	    break;
	  case ISD::VECREDUCE_UMIN:
	    AcrossOpc = AArch64ISD::UMINV;
	    break;
	  case ISD::VECREDUCE_FMAX: {
	    assert(Op->getFlags().hasNoNaNs() && "fmax vector reduction needs NoNaN flag");
	    SDValue IntrinsicID =
	        DAG.getConstant(Intrinsic::aarch64_neon_fmaxnmv, dl, MVT::i32);
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
	                       IntrinsicID, Op.getOperand(0));
	  }
	  case ISD::VECREDUCE_FMIN: {
	    assert(Op->getFlags().hasNoNaNs() && "fmin vector reduction needs NoNaN flag");
	    SDValue IntrinsicID =
	        DAG.getConstant(Intrinsic::aarch64_neon_fminnmv, dl, MVT::i32);
	    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, Op.getValueType(),
	                       IntrinsicID, Op.getOperand(0));
	  }
	  default:
	    llvm_unreachable("Unhandled reduction");
	  }

	  // Integer reductions share one emission path.
	  return getReductionSDNode(AcrossOpc, dl, Op, DAG);
	}

	/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
	/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
	/// specified in the intrinsic calls.
	///
	/// Also handles the exclusive load/store intrinsics (ldxr/ldaxr, stxr/stlxr,
	/// ldxp/ldaxp, stxp/stlxp), which are marked volatile.
	///
	/// \param Info      Filled in with opcode, memory type, pointer, offset,
	///                  alignment and read/write/volatile flags on success.
	/// \param I         The intrinsic call being inspected.
	/// \param Intrinsic The intrinsic ID of \p I.
	/// \returns true if \p Info was populated, false for any other intrinsic.
	bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                               const CallInst &I,
	                                               unsigned Intrinsic) const {
	  auto &DL = I.getModule()->getDataLayout();
	  switch (Intrinsic) {
	  case Intrinsic::aarch64_neon_ld2:
	  case Intrinsic::aarch64_neon_ld3:
	  case Intrinsic::aarch64_neon_ld4:
	  case Intrinsic::aarch64_neon_ld1x2:
	  case Intrinsic::aarch64_neon_ld1x3:
	  case Intrinsic::aarch64_neon_ld1x4:
	  case Intrinsic::aarch64_neon_ld2lane:
	  case Intrinsic::aarch64_neon_ld3lane:
	  case Intrinsic::aarch64_neon_ld4lane:
	  case Intrinsic::aarch64_neon_ld2r:
	  case Intrinsic::aarch64_neon_ld3r:
	  case Intrinsic::aarch64_neon_ld4r: {
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    // Conservatively set memVT to the entire set of vectors loaded.
	    uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // The address is the last call argument.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    Info.vol = false; // volatile loads with NEON intrinsics not supported
	    Info.readMem = true;
	    Info.writeMem = false;
	    return true;
	  }
	  case Intrinsic::aarch64_neon_st2:
	  case Intrinsic::aarch64_neon_st3:
	  case Intrinsic::aarch64_neon_st4:
	  case Intrinsic::aarch64_neon_st1x2:
	  case Intrinsic::aarch64_neon_st1x3:
	  case Intrinsic::aarch64_neon_st1x4:
	  case Intrinsic::aarch64_neon_st2lane:
	  case Intrinsic::aarch64_neon_st3lane:
	  case Intrinsic::aarch64_neon_st4lane: {
	    Info.opc = ISD::INTRINSIC_VOID;
	    // Conservatively set memVT to the entire set of vectors stored. The
	    // vector operands are the leading arguments, so the scan starts at
	    // argument 0 and stops at the first non-vector argument; starting at 1
	    // would leave the first stored vector out of the size.
	    unsigned NumElts = 0;
	    for (unsigned ArgI = 0, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
	      Type *ArgTy = I.getArgOperand(ArgI)->getType();
	      if (!ArgTy->isVectorTy())
	        break;
	      NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
	    }
	    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
	    // The address is the last call argument.
	    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
	    Info.offset = 0;
	    Info.align = 0;
	    Info.vol = false; // volatile stores with NEON intrinsics not supported
	    Info.readMem = false;
	    Info.writeMem = true;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxr:
	  case Intrinsic::aarch64_ldxr: {
	    // Memory type and alignment come from the pointee type of argument 0.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.vol = true;
	    Info.readMem = true;
	    Info.writeMem = false;
	    return true;
	  }
	  case Intrinsic::aarch64_stlxr:
	  case Intrinsic::aarch64_stxr: {
	    // Memory type and alignment come from the pointee type of argument 1.
	    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::getVT(PtrTy->getElementType());
	    Info.ptrVal = I.getArgOperand(1);
	    Info.offset = 0;
	    Info.align = DL.getABITypeAlignment(PtrTy->getElementType());
	    Info.vol = true;
	    Info.readMem = false;
	    Info.writeMem = true;
	    return true;
	  }
	  case Intrinsic::aarch64_ldaxp:
	  case Intrinsic::aarch64_ldxp:
	    // Pair loads are described as one 16-byte-aligned i128 access.
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(0);
	    Info.offset = 0;
	    Info.align = 16;
	    Info.vol = true;
	    Info.readMem = true;
	    Info.writeMem = false;
	    return true;
	  case Intrinsic::aarch64_stlxp:
	  case Intrinsic::aarch64_stxp:
	    // Pair stores are described as one 16-byte-aligned i128 access; the
	    // address is argument 2.
	    Info.opc = ISD::INTRINSIC_W_CHAIN;
	    Info.memVT = MVT::i128;
	    Info.ptrVal = I.getArgOperand(2);
	    Info.offset = 0;
	    Info.align = 16;
	    Info.vol = true;
	    Info.readMem = false;
	    Info.writeMem = true;
	    return true;
	  default:
	    break;
	  }

	  return false;
	}

	// Truncations from 64-bit GPR to 32-bit GPR is free.
	bool AArch64TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}
	// EVT flavour of isTruncateFree: a truncation between scalar integer value
	// types is free when the source is strictly wider than the destination.
	// Vector and non-integer types are never reported as free.
	bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 > NumBits2;
	}

	/// Check if it is profitable to hoist instruction in then/else to if.
	/// Not profitable if I and its user can form a FMA instruction
	/// because we prefer FMSUB/FMADD.
	///
	/// \returns true unless \p I is an FMul with a single FAdd/FSub user, FMA is
	/// faster than FMul+FAdd and legal or custom for the value type, and FP
	/// fusion is allowed (fast fusion or unsafe FP math).
	bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
	  if (I->getOpcode() != Instruction::FMul)
	    return true;

	  if (!I->hasOneUse())
	    return true;

	  Instruction *User = I->user_back();

	  // Only an FAdd/FSub user can fuse with the multiply.
	  if (User &&
	      !(User->getOpcode() == Instruction::FSub ||
	        User->getOpcode() == Instruction::FAdd))
	    return true;

	  const TargetOptions &Options = getTargetMachine().Options;
	  const DataLayout &DL = I->getModule()->getDataLayout();
	  EVT VT = getValueType(DL, User->getOperand(0)->getType());

	  return !(isFMAFasterThanFMulAndFAdd(VT) &&
	           isOperationLegalOrCustom(ISD::FMA, VT) &&
	           (Options.AllowFPOpFusion == FPOpFusion::Fast ||
	            Options.UnsafeFPMath));
	}

	// All 32-bit GPR operations implicitly zero the high-half of the corresponding
	// 64-bit GPR.
	bool AArch64TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 == 32 && NumBits2 == 64;
	}
	// EVT flavour of isZExtFree: only a scalar integer extension from exactly
	// 32 bits to exactly 64 bits is reported as free.
	bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  if (VT1.isVector() || VT2.isVector() || !VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 == 32 && NumBits2 == 64;
	}

	// SDValue flavour of isZExtFree. In addition to the type-only rule, a
	// zero-extension of a LOAD result is free when both types are simple scalar
	// integers and the loaded type is at most 32 bits wide.
	bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  const EVT SrcVT = Val.getValueType();
	  if (isZExtFree(SrcVT, VT2))
	    return true;

	  const bool IsLoad = Val.getOpcode() == ISD::LOAD;
	  if (!IsLoad)
	    return false;

	  auto IsSimpleScalarInt = [](EVT VT) {
	    return VT.isSimple() && !VT.isVector() && VT.isInteger();
	  };

	  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
	  if (!IsSimpleScalarInt(SrcVT) || !IsSimpleScalarInt(VT2))
	    return false;
	  return SrcVT.getSizeInBits() <= 32;
	}

	/// Return true if the extension \p Ext is free, i.e. every one of its uses
	/// can absorb it: a left shift by a constant, a GEP index whose scaling
	/// implies a shift amount between 1 and 4, or a truncation straight back to
	/// the extension's source type. FP extensions and vector extensions are never
	/// free.
	bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
	  if (isa<FPExtInst>(Ext))
	    return false;

	  // Vector types are not free.
	  if (Ext->getType()->isVectorTy())
	    return false;

	  for (const Use &U : Ext->uses()) {
	    // The extension is free if we can fold it with a left shift in an
	    // addressing mode or an arithmetic operation: add, sub, and cmp.

	    // Is there a shift?
	    const Instruction *Instr = cast<Instruction>(U.getUser());

	    // Is this a constant shift?
	    switch (Instr->getOpcode()) {
	    case Instruction::Shl:
	      if (!isa<ConstantInt>(Instr->getOperand(1)))
	        return false;
	      break;
	    case Instruction::GetElementPtr: {
	      gep_type_iterator GTI = gep_type_begin(Instr);
	      auto &DL = Ext->getModule()->getDataLayout();
	      // Operand 0 of a GEP is the base pointer, hence the -1.
	      std::advance(GTI, U.getOperandNo()-1);
	      Type *IdxTy = GTI.getIndexedType();
	      // This extension will end up with a shift because of the scaling factor.
	      // 8-bit sized types have a scaling factor of 1, thus a shift amount of 0.
	      // Get the shift amount based on the scaling factor:
	      // log2(sizeof(IdxTy)) - log2(8).
	      uint64_t ShiftAmt =
	          countTrailingZeros(DL.getTypeStoreSizeInBits(IdxTy)) - 3;
	      // Is the constant foldable in the shift of the addressing mode?
	      // I.e., shift amount is between 1 and 4 inclusive.
	      if (ShiftAmt == 0 || ShiftAmt > 4)
	        return false;
	      break;
	    }
	    case Instruction::Trunc:
	      // Check if this is a noop.
	      // trunc(sext ty1 to ty2) to ty1.
	      if (Instr->getType() == Ext->getOperand(0)->getType())
	        continue;
	      LLVM_FALLTHROUGH;
	    default:
	      return false;
	    }

	    // At this point we can use the bfm family, so this extension is free
	    // for that use.
	  }
	  return true;
	}

	/// Return true if a paired load is available for \p LoadedType, i.e. it is a
	/// simple integer or floating-point type that is 32 or 64 bits wide.
	/// \p RequiredAligment is set to 0 whenever the type passes the initial
	/// simple int/FP check (even if the size check then fails); it is left
	/// untouched otherwise.
	bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
	                                          unsigned &RequiredAligment) const {
	  if (!LoadedType.isSimple() ||
	      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
	    return false;
	  // Cyclone supports unaligned accesses.
	  RequiredAligment = 0;
	  unsigned NumBits = LoadedType.getSizeInBits();
	  return NumBits == 32 || NumBits == 64;
	}

	/// Compute how many interleaved accesses are generated when lowering an
	/// access of type \p VecTy: the vector's size in bits divided by 128,
	/// rounded up.
	unsigned
	AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
	                                                 const DataLayout &DL) const {
	  const auto TotalBits = DL.getTypeSizeInBits(VecTy);
	  // Ceiling division by the 128-bit access width.
	  return (TotalBits + 127) / 128;
	}

	/// Return the target-specific memory-operand flags for \p I:
	/// MOStridedAccess when the subtarget's processor family is Falkor and the
	/// instruction carries FALKOR_STRIDED_ACCESS_MD metadata, MONone otherwise.
	MachineMemOperand::Flags
	AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
	  const bool OnFalkor =
	      Subtarget->getProcFamily() == AArch64Subtarget::Falkor;
	  if (OnFalkor && I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
	    return MOStridedAccess;
	  return MachineMemOperand::MONone;
	}

	/// Return true if \p VecTy can be used for an interleaved load or store:
	/// it has at least two elements, each element is 8, 16, 32 or 64 bits wide,
	/// and the whole vector is either 64 bits or a multiple of 128 bits.
	bool AArch64TargetLowering::isLegalInterleavedAccessType(
	    VectorType *VecTy, const DataLayout &DL) const {

	  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
	  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());

	  // Ensure the number of vector elements is greater than 1.
	  if (VecTy->getNumElements() < 2)
	    return false;

	  // Ensure the element type is legal.
	  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
	    return false;

	  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
	  // 128 will be split into multiple interleaved accesses.
	  return VecSize == 64 || VecSize % 128 == 0;
	}

	/// \brief Lower an interleaved load into a ldN intrinsic.
	///
	/// E.g. Lower an interleaved load (Factor = 2):
	/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
	/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
	/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
	///
	/// Into:
	/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.aarch64.neon.ld2(%ptr)
	/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
	/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
	bool AArch64TargetLowering::lowerInterleavedLoad(
	LoadInst LI, ArrayRef<ShuffleVectorInst > Shuffles,
	ArrayRef<unsigned> Indices, unsigned Factor) const {
	assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	"Invalid interleave factor");
	assert(!Shuffles.empty() && "Empty shufflevector input");
	assert(Shuffles.size() == Indices.size() &&
	"Unmatched number of shufflevectors and indices");

	const DataLayout &DL = LI->getModule()->getDataLayout();

	VectorType *VecTy = Shuffles[0]->getType();

	// Skip if we do not have NEON and skip illegal vector types. We can
	// "legalize" wide vector types into multiple interleaved accesses as long as
	// the vector types are divisible by 128.
	if (!Subtarget->hasNEON() \|\| !isLegalInterleavedAccessType(VecTy, DL))
	return false;

	unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);

	// A pointer vector can not be the return type of the ldN intrinsics. Need to
	// load integer vectors first and then convert to pointer vectors.
	Type *EltTy = VecTy->getVectorElementType();
	if (EltTy->isPointerTy())
	VecTy =
	VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());

	IRBuilder<> Builder(LI);

	// The base address of the load.
	Value *BaseAddr = LI->getPointerOperand();

	if (NumLoads > 1) {
	// If we're going to generate more than one load, reset the sub-vector type
	// to something legal.
	VecTy = VectorType::get(VecTy->getVectorElementType(),
	VecTy->getVectorNumElements() / NumLoads);

	// We will compute the pointer operand of each load from the original base
	// address using GEPs. Cast the base address to a pointer to the scalar
	// element type.
	BaseAddr = Builder.CreateBitCast(
	BaseAddr, VecTy->getVectorElementType()->getPointerTo(
	LI->getPointerAddressSpace()));
	}

	Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
	Type *Tys[2] = {VecTy, PtrTy};
	static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
	Intrinsic::aarch64_neon_ld3,
	Intrinsic::aarch64_neon_ld4};
	Function *LdNFunc =
	Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);

	// Holds sub-vectors extracted from the load intrinsic return values. The
	// sub-vectors are associated with the shufflevector instructions they will
	// replace.
	DenseMap<ShuffleVectorInst , SmallVector<Value , 4>> SubVecs;

	for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {

	// If we're generating more than one load, compute the base address of
	// subsequent loads as an offset from the previous.
	if (LoadCount > 0)
	BaseAddr = Builder.CreateConstGEP1_32(
	BaseAddr, VecTy->getVectorNumElements() * Factor);

	CallInst *LdN = Builder.CreateCall(
	LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");

	// Extract and store the sub-vectors returned by the load intrinsic.
	for (unsigned i = 0; i < Shuffles.size(); i++) {
	ShuffleVectorInst *SVI = Shuffles[i];
	unsigned Index = Indices[i];

	Value *SubVec = Builder.CreateExtractValue(LdN, Index);

	// Convert the integer vector to pointer vector if the element is pointer.
	if (EltTy->isPointerTy())
	SubVec = Builder.CreateIntToPtr(
	SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
	VecTy->getVectorNumElements()));
	SubVecs[SVI].push_back(SubVec);
	}
	}

	// Replace uses of the shufflevector instructions with the sub-vectors
	// returned by the load intrinsic. If a shufflevector instruction is
	// associated with more than one sub-vector, those sub-vectors will be
	// concatenated into a single wide vector.
	for (ShuffleVectorInst *SVI : Shuffles) {
	auto &SubVec = SubVecs[SVI];
	auto *WideVec =
	SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
	SVI->replaceAllUsesWith(WideVec);
	}

	return true;
	}

	/// \brief Lower an interleaved store into a stN intrinsic.
	///
	/// E.g. Lower an interleaved store (Factor = 3):
	/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
	/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
	/// store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	/// Into:
	/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
	/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
	/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
	/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// Note that the new shufflevectors will be removed and we'll only generate one
	/// st3 instruction in CodeGen.
	///
	/// Example for a more general valid mask (Factor 3). Lower:
	/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
	/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
	/// store <12 x i32> %i.vec, <12 x i32>* %ptr
	///
	/// Into:
	/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
	/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
	/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
	/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
	///
	/// \param SI     The store of the interleaved vector.
	/// \param SVI    The shufflevector producing the interleaved vector.
	/// \param Factor The interleave factor (2, 3 or 4).
	/// \returns true if the store was lowered, false if NEON is unavailable or
	///          the per-field sub-vector type is not a legal interleaved type.
	bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
	                                                  ShuffleVectorInst *SVI,
	                                                  unsigned Factor) const {
	  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
	         "Invalid interleave factor");

	  VectorType *VecTy = SVI->getType();
	  assert(VecTy->getVectorNumElements() % Factor == 0 &&
	         "Invalid interleaved store");

	  unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
	  Type *EltTy = VecTy->getVectorElementType();
	  VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);

	  const DataLayout &DL = SI->getModule()->getDataLayout();

	  // Skip if we do not have NEON and skip illegal vector types. We can
	  // "legalize" wide vector types into multiple interleaved accesses as long as
	  // the vector types are divisible by 128.
	  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
	    return false;

	  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);

	  Value *Op0 = SVI->getOperand(0);
	  Value *Op1 = SVI->getOperand(1);
	  IRBuilder<> Builder(SI);

	  // StN intrinsics don't support pointer vectors as arguments. Convert pointer
	  // vectors to integer vectors.
	  if (EltTy->isPointerTy()) {
	    Type *IntTy = DL.getIntPtrType(EltTy);
	    // A shufflevector operand is always a vector, so use cast<> (which
	    // asserts) rather than dereferencing an unchecked dyn_cast<> result.
	    unsigned NumOpElts =
	        cast<VectorType>(Op0->getType())->getVectorNumElements();

	    // Convert to the corresponding integer vector.
	    Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
	    Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
	    Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);

	    SubVecTy = VectorType::get(IntTy, LaneLen);
	  }

	  // The base address of the store.
	  Value *BaseAddr = SI->getPointerOperand();

	  if (NumStores > 1) {
	    // If we're going to generate more than one store, reset the lane length
	    // and sub-vector type to something legal.
	    LaneLen /= NumStores;
	    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);

	    // We will compute the pointer operand of each store from the original base
	    // address using GEPs. Cast the base address to a pointer to the scalar
	    // element type.
	    BaseAddr = Builder.CreateBitCast(
	        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
	                      SI->getPointerAddressSpace()));
	  }

	  auto Mask = SVI->getShuffleMask();

	  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
	  Type *Tys[2] = {SubVecTy, PtrTy};
	  // Indexed by Factor - 2.
	  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
	                                             Intrinsic::aarch64_neon_st3,
	                                             Intrinsic::aarch64_neon_st4};
	  Function *StNFunc =
	      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);

	  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {

	    SmallVector<Value *, 5> Ops;

	    // Split the shufflevector operands into sub vectors for the new stN call.
	    for (unsigned i = 0; i < Factor; i++) {
	      // Mask position of lane 0 of field i within this store's chunk.
	      unsigned IdxI = StoreCount * LaneLen * Factor + i;
	      if (Mask[IdxI] >= 0) {
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
	      } else {
	        // Lane 0 is undef: recover the start of the sequence from the first
	        // defined lane j of the same field in the same chunk, which sits
	        // j * Factor mask positions after IdxI and holds StartMask + j.
	        unsigned StartMask = 0;
	        for (unsigned j = 1; j < LaneLen; j++) {
	          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
	          if (Mask[IdxJ] >= 0) {
	            StartMask = Mask[IdxJ] - j;
	            break;
	          }
	        }
	        // Note: Filling undef gaps with random elements is ok, since
	        // those elements were being written anyway (with undefs).
	        // In the case of all undefs we're defaulting to using elems from 0
	        // Note: StartMask cannot be negative, it's checked in
	        // isReInterleaveMask
	        Ops.push_back(Builder.CreateShuffleVector(
	            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
	      }
	    }

	    // If we're generating more than one store, we compute the base address of
	    // subsequent stores as an offset from the previous.
	    if (StoreCount > 0)
	      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);

	    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
	    Builder.CreateCall(StNFunc, Ops);
	  }
	  return true;
	}

	/// Return true if both \p SrcAlign and \p DstAlign are compatible with
	/// \p AlignCheck: each one is either 0 or a multiple of \p AlignCheck.
	/// \p AlignCheck must be non-zero.
	static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
	                       unsigned AlignCheck) {
	  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
	          (DstAlign == 0 || DstAlign % AlignCheck == 0));
	}

	/// Pick the value type to use when expanding a memory operation of \p Size
	/// bytes with the given destination/source alignments.
	///
	/// Tries f128 (non-memset only, FP/SIMD available, function not marked
	/// NoImplicitFloat, at least 16 bytes), then i64 (at least 8 bytes), then
	/// i32 (at least 4 bytes). Each candidate is accepted when both alignments
	/// suit it or when misaligned accesses of that type are allowed and fast.
	/// \returns the chosen type, or MVT::Other when none applies.
	EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
	                                               unsigned SrcAlign, bool IsMemset,
	                                               bool ZeroMemset,
	                                               bool MemcpyStrSrc,
	                                               MachineFunction &MF) const {
	  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
	  // instruction to materialize the v2i64 zero and one store (with restrictive
	  // addressing mode). Just do two i64 store of zero-registers.
	  bool Fast;
	  const Function *F = MF.getFunction();
	  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
	      !F->hasFnAttribute(Attribute::NoImplicitFloat) &&
	      (memOpAlign(DstAlign, SrcAlign, 16) ||
	       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
	    return MVT::f128;

	  if (Size >= 8 &&
	      (memOpAlign(DstAlign, SrcAlign, 8) ||
	       (allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
	    return MVT::i64;

	  if (Size >= 4 &&
	      (memOpAlign(DstAlign, SrcAlign, 4) ||
	       (allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
	    return MVT::i32;

	  return MVT::Other;
	}

	// 12-bit optionally shifted immediates are legal for adds.
	// Returns true when |Immed| fits in 12 bits, or has its low 12 bits clear
	// and fits in 24 bits (a 12-bit value shifted left by 12).
	bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
	  // Avoid UB for INT64_MIN.
	  if (Immed == std::numeric_limits<int64_t>::min())
	    return false;
	  // Same encoding for add/sub, just flip the sign.
	  Immed = std::abs(Immed);
	  return ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0));
	}

	// Integer compares are emitted as ADDS/SUBS, which means an immediate is
	// legal for a compare exactly when it is legal for an add or a sub.
	bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
	  const bool LegalForAdd = isLegalAddImmediate(Immed);
	  return LegalForAdd;
	}

	/// isLegalAddressingMode - Return true if the addressing mode represented
	/// by AM is legal for this target, for a load/store of the specified type.
	///
	/// \param DL Data layout used to size \p Ty.
	/// \param AM The addressing mode (base GV, base reg, offset, scale).
	/// \param Ty The accessed type; only power-of-two bit sizes enable the
	///           scaled-offset and scaled-register forms.
	/// \param AS Address space (not consulted here).
	bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                                  const AddrMode &AM, Type *Ty,
	                                                  unsigned AS) const {
	  // AArch64 has five basic addressing modes:
	  // reg
	  // reg + 9-bit signed offset
	  // reg + SIZE_IN_BYTES * 12-bit unsigned offset
	  // reg1 + reg2
	  // reg + SIZE_IN_BYTES * reg

	  // No global is ever allowed as a base.
	  if (AM.BaseGV)
	    return false;

	  // No reg+reg+imm addressing.
	  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
	    return false;

	  // check reg + imm case:
	  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
	  // NumBytes stays 0 for unsized types and for non-power-of-two bit sizes.
	  uint64_t NumBytes = 0;
	  if (Ty->isSized()) {
	    uint64_t NumBits = DL.getTypeSizeInBits(Ty);
	    NumBytes = NumBits / 8;
	    if (!isPowerOf2_64(NumBits))
	      NumBytes = 0;
	  }

	  if (!AM.Scale) {
	    int64_t Offset = AM.BaseOffs;

	    // 9-bit signed offset
	    if (isInt<9>(Offset))
	      return true;

	    // 12-bit unsigned offset
	    // (shift is only used below when NumBytes is non-zero.)
	    unsigned shift = Log2_64(NumBytes);
	    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
	        // Must be a multiple of NumBytes (NumBytes is a power of 2)
	        (Offset >> shift) << shift == Offset)
	      return true;
	    return false;
	  }

	  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2

	  return AM.Scale == 1 || (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes);
	}

	/// Return the extra cost of using the scaling factor in \p AM for an access
	/// of type \p Ty: -1 when the addressing mode is not legal, 1 when a scale
	/// other than 0 or 1 is used, and 0 otherwise.
	int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                                const AddrMode &AM, Type *Ty,
	                                                unsigned AS) const {
	  // Scaling factors are not free at all.
	  //   Operands                     | Rt Latency
	  //   -------------------------------------------
	  //   Rt, [Xn, Xm]                 | 4
	  //   -------------------------------------------
	  //   Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
	  //   Rt, [Xn, Wm, <extend> #imm]  |
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scale represents reg2 * scale, so charge 1 unless it is 0 or 1.
	  const bool NeedsScaling = AM.Scale != 0 && AM.Scale != 1;
	  return NeedsScaling ? 1 : 0;
	}

	/// Return true when the scalar type underlying \p VT is f32 or f64; every
	/// other type (including non-simple ones) yields false.
	bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  const EVT ScalarVT = VT.getScalarType();
	  if (!ScalarVT.isSimple())
	    return false;

	  switch (ScalarVT.getSimpleVT().SimpleTy) {
	  case MVT::f32:
	  case MVT::f64:
	    return true;
	  default:
	    return false;
	  }
	}

	// Return the list of scratch registers: X16, X17 and LR, followed by a 0
	// terminator. The calling-convention argument is ignored; the same static
	// table is returned for every convention.
	const MCPhysReg *
	AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
	// LR is a callee-save register, but we must treat it as clobbered by any call
	// site. Hence we include LR in the scratch registers, which are in turn added
	// as implicit-defs for stackmaps and patchpoints.
	static const MCPhysReg ScratchRegs[] = {
	AArch64::X16, AArch64::X17, AArch64::LR, 0
	};
	return ScratchRegs;
	}

	/// Return false only when \p N is an unsigned bit extraction of the form
	/// ((x >> C) & mask) on i32/i64, so that it is left alone and can be lowered
	/// to UBFX. Every other node returns true.
	bool
	AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
	  EVT VT = N->getValueType(0);

	  // Only an i32/i64 AND with a constant right-hand side can be the extraction.
	  if (N->getOpcode() != ISD::AND)
	    return true;
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return true;
	  if (!isa<ConstantSDNode>(N->getOperand(1)))
	    return true;

	  // The constant must be a low-bit mask applied to a constant right shift.
	  const uint64_t Mask = N->getConstantOperandVal(1);
	  const bool IsBitExtract =
	      isMask_64(Mask) && N->getOperand(0).getOpcode() == ISD::SRL &&
	      isa<ConstantSDNode>(N->getOperand(0)->getOperand(1));
	  return !IsBitExtract;
	}

	bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	Type *Ty) const {
	assert(Ty->isIntegerTy());

	unsigned BitSize = Ty->getPrimitiveSizeInBits();
	if (BitSize == 0)
	return false;

	int64_t Val = Imm.getSExtValue();
	if (Val == 0 \|\| AArch64_AM::isLogicalImmediate(Val, BitSize))
	return true;

	if ((int64_t)Val < 0)
	Val = ~Val;
	if (BitSize == 32)
	Val &= (1LL << 32) - 1;

	unsigned LZ = countLeadingZeros((uint64_t)Val);
	unsigned Shift = (63 - LZ) / 16;
	// MOVZ is free so return true for one or fewer MOVK.
	return Shift < 3;
	}

	/// Turn vector tests of the signbit in the form of:
	///   xor (sra X, elt_size(X)-1), -1
	/// into:
	///   cmge X, X, #0
	/// Returns an empty SDValue when the pattern does not match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const AArch64Subtarget *Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!Subtarget->hasNEON())
	    return SDValue();
	  if (!VT.isVector())
	    return SDValue();

	  // The xor must be a 'not' (second operand all ones) of a single-use
	  // arithmetic shift right.
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  if (Sra.getOpcode() != AArch64ISD::VASHR)
	    return SDValue();
	  if (!Sra.hasOneUse())
	    return SDValue();
	  if (!ISD::isBuildVectorAllOnes(AllOnes.getNode()))
	    return SDValue();

	  // The shift must smear the sign bit across each element, i.e. shift by
	  // (element width - 1).
	  auto *Amount = dyn_cast<ConstantSDNode>(Sra.getOperand(1));
	  if (!Amount)
	    return SDValue();
	  EVT EltTy = Sra.getValueType().getVectorElementType();
	  if (Amount->getZExtValue() != EltTy.getSizeInBits() - 1)
	    return SDValue();

	  return DAG.getNode(AArch64ISD::CMGEz, SDLoc(N), VT, Sra.getOperand(0));
	}

	// Generate SUBS and CSEL for integer abs.
	//
	// Matches XOR(ADD(X, Y), Y) where Y is SRA(X, size(X)-1) and rewrites it as
	// CSEL(X, 0 - X, PL, flags-of SUBS(X, 0)). Returns an empty SDValue when the
	// pattern is not present.
	static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);
	  SDValue Sum = N->getOperand(0);
	  SDValue Sign = N->getOperand(1);
	  SDLoc DL(N);

	  // Pattern checks, in the same order as the operands are dereferenced.
	  if (!VT.isInteger())
	    return SDValue();
	  if (N->getOpcode() != ISD::XOR)
	    return SDValue();
	  if (Sum.getOpcode() != ISD::ADD)
	    return SDValue();
	  if (Sum.getOperand(1) != Sign)
	    return SDValue();
	  if (Sign.getOpcode() != ISD::SRA)
	    return SDValue();
	  if (Sign.getOperand(0) != Sum.getOperand(0))
	    return SDValue();

	  // The shift amount must be the constant size(X)-1.
	  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Sign.getOperand(1));
	  if (!ShAmt)
	    return SDValue();
	  if (!(ShAmt->getAPIntValue() == VT.getSizeInBits() - 1))
	    return SDValue();

	  SDValue X = Sum.getOperand(0);
	  SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), X);
	  // Generate SUBS & CSEL.
	  SDValue Cmp = DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
	                            X, DAG.getConstant(0, DL, VT));
	  return DAG.getNode(AArch64ISD::CSEL, DL, VT, X, Neg,
	                     DAG.getConstant(AArch64CC::PL, DL, MVT::i32),
	                     SDValue(Cmp.getNode(), 1));
	}

	// XOR combine entry point. Does nothing before operation legalization;
	// afterwards it first tries the vector sign-bit test fold and, if that does
	// not apply, the integer abs fold.
	static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const AArch64Subtarget *Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue SignTest = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
	  if (SignTest)
	    return SignTest;

	  return performIntegerAbsCombine(N, DAG);
	}

	/// Lower a signed division of \p N by \p Divisor, where the divisor is a
	/// power of two or the negation of one, into a compare/add/select/shift
	/// sequence (plus a final negate for negative divisors).
	///
	/// \param N        The SDIV node being lowered.
	/// \param Divisor  The constant divisor.
	/// \param DAG      The DAG in which new nodes are created.
	/// \param Created  Optional (may be null) list that receives the intermediate
	///                 nodes built here.
	/// \return SDValue(N, 0) when integer division is reported as cheap, an empty
	///         SDValue when the type is not i32/i64 or the divisor is not
	///         +/- a power of two, and the replacement value otherwise.
	SDValue
	AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
	                                     SelectionDAG &DAG,
	                                     std::vector<SDNode *> *Created) const {
	  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
	  if (isIntDivCheap(N->getValueType(0), Attr))
	    return SDValue(N, 0); // Lower SDIV as SDIV

	  // fold (sdiv X, pow2)
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();
	  if (!Divisor.isPowerOf2() && !(-Divisor).isPowerOf2())
	    return SDValue();

	  SDLoc DL(N);
	  SDValue N0 = N->getOperand(0);
	  unsigned Lg2 = Divisor.countTrailingZeros();
	  SDValue Zero = DAG.getConstant(0, DL, VT);
	  SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);

	  // Add (N0 < 0) ? Pow2 - 1 : 0;
	  SDValue CCVal;
	  SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
	  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
	  SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);

	  if (Created) {
	    Created->push_back(Cmp.getNode());
	    Created->push_back(Add.getNode());
	    Created->push_back(CSel.getNode());
	  }

	  // Divide by pow2.
	  SDValue SRA =
	      DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));

	  // If we're dividing by a positive value, we're done. Otherwise, we must
	  // negate the result.
	  if (Divisor.isNonNegative())
	    return SRA;

	  if (Created)
	    Created->push_back(SRA.getNode());
	  return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
	}

	// Rewrite a multiply by certain constants as shift + add/sub (optionally
	// followed by a second shift or a negate).
	//
	// Returns an empty SDValue when nothing is done: before operation
	// legalization, when the right-hand operand is not a constant, when the
	// constant has none of the supported shapes, or when the multiply looks like
	// it could instead be folded into smul/umul/madd/msub. Subtarget is not
	// referenced in the body.
	static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// The below optimizations require a constant RHS.
	if (!isa<ConstantSDNode>(N->getOperand(1)))
	return SDValue();

	ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
	const APInt &ConstValue = C->getAPIntValue();

	// Multiplication of a power of two plus/minus one can be done more
	// cheaply as shift+add/sub. For now, this is true unilaterally. If
	// future CPUs have a cheaper MADD instruction, this may need to be
	// gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
	// 64-bit is 5 cycles, so this is always a win.
	// More aggressively, some multiplications N0 * C can be lowered to
	// shift+add+shift if the constant C = A * B where A = 2^N + 1 and B = 2^M,
	// e.g. 6 = 3 * 2 = (2 + 1) * 2.
	// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
	// which equals to (1+2)*16-(1+2).
	SDValue N0 = N->getOperand(0);
	// TrailingZeroes is used to test if the mul can be lowered to
	// shift+add+shift.
	unsigned TrailingZeroes = ConstValue.countTrailingZeros();
	if (TrailingZeroes) {
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into smul or umul.
	if (N0->hasOneUse() && (isSignExtended(N0.getNode(), DAG) \|\|
	isZeroExtended(N0.getNode(), DAG)))
	return SDValue();
	// Conservatively do not lower to shift+add+shift if the mul might be
	// folded into madd or msub.
	if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ADD \|\|
	N->use_begin()->getOpcode() == ISD::SUB))
	return SDValue();
	}
	// Use ShiftedConstValue instead of ConstValue to support both shift+add/sub
	// and shift+add+shift.
	APInt ShiftedConstValue = ConstValue.ashr(TrailingZeroes);

	// Both are assigned on every path that does not return early below.
	unsigned ShiftAmt, AddSubOpc;
	// Is the shifted value the LHS operand of the add/sub?
	bool ShiftValUseIsN0 = true;
	// Do we need to negate the result?
	bool NegateResult = false;

	if (ConstValue.isNonNegative()) {
	// (mul x, 2^N + 1) => (add (shl x, N), x)
	// (mul x, 2^N - 1) => (sub (shl x, N), x)
	// (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
	APInt SCVMinus1 = ShiftedConstValue - 1;
	APInt CVPlus1 = ConstValue + 1;
	if (SCVMinus1.isPowerOf2()) {
	ShiftAmt = SCVMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	} else if (CVPlus1.isPowerOf2()) {
	ShiftAmt = CVPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	} else
	return SDValue();
	} else {
	// (mul x, -(2^N - 1)) => (sub x, (shl x, N))
	// (mul x, -(2^N + 1)) => - (add (shl x, N), x)
	APInt CVNegPlus1 = -ConstValue + 1;
	APInt CVNegMinus1 = -ConstValue - 1;
	if (CVNegPlus1.isPowerOf2()) {
	ShiftAmt = CVNegPlus1.logBase2();
	AddSubOpc = ISD::SUB;
	ShiftValUseIsN0 = false;
	} else if (CVNegMinus1.isPowerOf2()) {
	ShiftAmt = CVNegMinus1.logBase2();
	AddSubOpc = ISD::ADD;
	NegateResult = true;
	} else
	return SDValue();
	}

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
	DAG.getConstant(ShiftAmt, DL, MVT::i64));

	// Place the shifted value on the side of the add/sub chosen above.
	SDValue AddSubN0 = ShiftValUseIsN0 ? ShiftedVal : N0;
	SDValue AddSubN1 = ShiftValUseIsN0 ? N0 : ShiftedVal;
	SDValue Res = DAG.getNode(AddSubOpc, DL, VT, AddSubN0, AddSubN1);
	assert(!(NegateResult && TrailingZeroes) &&
	"NegateResult and TrailingZeroes cannot both be true for now.");
	// Negate the result.
	if (NegateResult)
	return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Res);
	// Shift the result.
	if (TrailingZeroes)
	return DAG.getNode(ISD::SHL, DL, VT, Res,
	DAG.getConstant(TrailingZeroes, DL, MVT::i64));
	return Res;
	}

	// Take advantage of vector comparisons producing 0 or -1 in each lane to
	// optimize away a unary operation applied to a masked constant.
	//
	// The general transformation is:
	//   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	//     AND(VECTOR_CMP(x,y), constant2)
	//   constant2 = UNARYOP(constant)
	//
	// Returns an empty SDValue when the pattern does not match.
	static SDValue performVectorCompareAndMaskUnaryOpCombine(SDNode *N,
	                                                         SelectionDAG &DAG) {
	  // Bail out unless this is a vector operation whose operand is a bitwise AND
	  // of a SETCC, with operand and result of the same total size.
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();
	  SDValue Masked = N->getOperand(0);
	  if (Masked->getOpcode() != ISD::AND)
	    return SDValue();
	  if (Masked->getOperand(0)->getOpcode() != ISD::SETCC)
	    return SDValue();
	  if (VT.getSizeInBits() != Masked->getValueType(0).getSizeInBits())
	    return SDValue();

	  // The other AND operand must be a constant build_vector. Non-constant
	  // splats could be handled too, but that would not remove any operation; it
	  // would only add a scalar step before moving to the vector unit.
	  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Masked->getOperand(1));
	  if (!BV)
	    return SDValue();
	  if (!BV->isConstant())
	    return SDValue();

	  SDLoc DL(N);
	  EVT IntVT = BV->getValueType(0);
	  // Apply the unary operation to the constant itself.
	  SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
	  // The AND works on the integer vector type, so bitcast in and back out.
	  SDValue MaskConst = DAG.getNode(ISD::BITCAST, DL, IntVT, SourceConst);
	  SDValue NewAnd =
	      DAG.getNode(ISD::AND, DL, IntVT, Masked->getOperand(0), MaskConst);
	  return DAG.getNode(ISD::BITCAST, DL, VT, NewAnd);
	}

	// Combine for integer-to-floating-point conversions.
	//
	// First tries the vector compare-and-mask fold. Otherwise, for f32/f64
	// results whose source has the same width and comes from a single-use,
	// non-volatile normal load (NEON required), the integer load is replaced by a
	// floating-point load feeding SITOF/UITOF. Returns an empty SDValue when
	// neither applies.
	static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG,
	                                     const AArch64Subtarget *Subtarget) {
	  // Optimize away the conversion when it's conditionally from a constant.
	  // Vectors only.
	  if (SDValue Folded = performVectorCompareAndMaskUnaryOpCombine(N, DAG))
	    return Folded;

	  EVT VT = N->getValueType(0);
	  if (VT != MVT::f32 && VT != MVT::f64)
	    return SDValue();

	  // Only optimize when the source and destination types have the same width.
	  if (VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
	    return SDValue();

	  // If the result of an integer load is only used by an integer-to-float
	  // conversion, use a fp load and an AdvSIMD scalar SCVTF/UCVTF instead.
	  // This eliminates an "integer-to-vector-move" UOP and improves throughput.
	  SDValue Src = N->getOperand(0);
	  if (!Subtarget->hasNEON())
	    return SDValue();
	  if (!ISD::isNormalLoad(Src.getNode()))
	    return SDValue();
	  if (!Src.hasOneUse())
	    return SDValue();
	  // Do not change the width of a volatile load.
	  if (cast<LoadSDNode>(Src)->isVolatile())
	    return SDValue();

	  LoadSDNode *IntLoad = cast<LoadSDNode>(Src);
	  SDValue FPLoad =
	      DAG.getLoad(VT, SDLoc(N), IntLoad->getChain(), IntLoad->getBasePtr(),
	                  IntLoad->getPointerInfo(), IntLoad->getAlignment(),
	                  IntLoad->getMemOperand()->getFlags());

	  // Make sure successors of the original load stay after it by updating them
	  // to use the new Chain.
	  DAG.ReplaceAllUsesOfValueWith(SDValue(IntLoad, 1), FPLoad.getValue(1));

	  unsigned Opcode = AArch64ISD::UITOF;
	  if (N->getOpcode() == ISD::SINT_TO_FP)
	    Opcode = AArch64ISD::SITOF;
	  return DAG.getNode(Opcode, SDLoc(N), VT, FPLoad);
	}

	/// Fold a floating-point multiply by power of two into floating-point to
	/// fixed-point conversion.
	///
	/// \p N is the FP-to-integer conversion node; its operand must be a vector
	/// FMUL whose second operand is a build_vector. Returns an empty SDValue when
	/// NEON is unavailable or any of the shape checks below fails; otherwise
	/// returns a vcvtfp2fxs/vcvtfp2fxu intrinsic node (truncated when the integer
	/// element is narrower than the float element).
	static SDValue performFpToIntCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (!Subtarget->hasNEON())
	return SDValue();

	SDValue Op = N->getOperand(0);
	if (!Op.getValueType().isVector() \|\| !Op.getValueType().isSimple() \|\|
	Op.getOpcode() != ISD::FMUL)
	return SDValue();

	SDValue ConstVec = Op->getOperand(1);
	if (!isa<BuildVectorSDNode>(ConstVec))
	return SDValue();

	// Only 32- and 64-bit floating-point elements are handled.
	MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
	uint32_t FloatBits = FloatTy.getSizeInBits();
	if (FloatBits != 32 && FloatBits != 64)
	return SDValue();

	// Only 16-, 32- and 64-bit integer result elements are handled.
	MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
	uint32_t IntBits = IntTy.getSizeInBits();
	if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	return SDValue();

	// Avoid conversions where iN is larger than the float (e.g., float -> i64).
	if (IntBits > FloatBits)
	return SDValue();

	// C is presumably log2 of the splatted power-of-two multiplier (inferred from
	// the helper's name); -1, 0 and anything above Bits are rejected.
	BitVector UndefElements;
	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	int32_t Bits = IntBits == 64 ? 64 : 32;
	int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, Bits + 1);
	if (C == -1 \|\| C == 0 \|\| C > Bits)
	return SDValue();

	// Pick the integer vector type with the same lane count and element width
	// as the floating-point input. Only 2 and 4 lanes are supported.
	MVT ResTy;
	unsigned NumLanes = Op.getValueType().getVectorNumElements();
	switch (NumLanes) {
	default:
	return SDValue();
	case 2:
	ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	break;
	case 4:
	ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	break;
	}

	if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	return SDValue();

	assert((ResTy != MVT::v4i64 \|\| DCI.isBeforeLegalizeOps()) &&
	"Illegal vector type after legalization");

	SDLoc DL(N);
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
	unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfp2fxs
	: Intrinsic::aarch64_neon_vcvtfp2fxu;
	// The intrinsic takes the original (unscaled) FMUL input plus C.
	SDValue FixConv =
	DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, ResTy,
	DAG.getConstant(IntrinsicOpcode, DL, MVT::i32),
	Op->getOperand(0), DAG.getConstant(C, DL, MVT::i32));
	// We can handle smaller integers by generating an extra trunc.
	if (IntBits < FloatBits)
	FixConv = DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), FixConv);

	return FixConv;
	}

	/// Fold a floating-point divide by power of two into fixed-point to
	/// floating-point conversion.
	///
	/// \p N is the FDIV node; its first operand must be a vector SINT_TO_FP or
	/// UINT_TO_FP and its second operand a build_vector. Returns an empty SDValue
	/// when NEON is unavailable or any of the shape checks below fails; otherwise
	/// returns a vcvtfxs2fp/vcvtfxu2fp intrinsic node.
	static SDValue performFDivCombine(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const AArch64Subtarget *Subtarget) {
	if (!Subtarget->hasNEON())
	return SDValue();

	SDValue Op = N->getOperand(0);
	unsigned Opc = Op->getOpcode();
	if (!Op.getValueType().isVector() \|\| !Op.getValueType().isSimple() \|\|
	!Op.getOperand(0).getValueType().isSimple() \|\|
	(Opc != ISD::SINT_TO_FP && Opc != ISD::UINT_TO_FP))
	return SDValue();

	SDValue ConstVec = N->getOperand(1);
	if (!isa<BuildVectorSDNode>(ConstVec))
	return SDValue();

	// Only 16-, 32- and 64-bit integer source elements are handled.
	MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
	int32_t IntBits = IntTy.getSizeInBits();
	if (IntBits != 16 && IntBits != 32 && IntBits != 64)
	return SDValue();

	// Only 32- and 64-bit floating-point result elements are handled.
	MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
	int32_t FloatBits = FloatTy.getSizeInBits();
	if (FloatBits != 32 && FloatBits != 64)
	return SDValue();

	// Avoid conversions where iN is larger than the float (e.g., i64 -> float).
	if (IntBits > FloatBits)
	return SDValue();

	// C is presumably log2 of the splatted power-of-two divisor (inferred from
	// the helper's name); -1, 0 and anything above FloatBits are rejected.
	BitVector UndefElements;
	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(ConstVec);
	int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, FloatBits + 1);
	if (C == -1 \|\| C == 0 \|\| C > FloatBits)
	return SDValue();

	// Pick the integer vector type with the same lane count and element width
	// as the floating-point result. Only 2 and 4 lanes are supported.
	MVT ResTy;
	unsigned NumLanes = Op.getValueType().getVectorNumElements();
	switch (NumLanes) {
	default:
	return SDValue();
	case 2:
	ResTy = FloatBits == 32 ? MVT::v2i32 : MVT::v2i64;
	break;
	case 4:
	ResTy = FloatBits == 32 ? MVT::v4i32 : MVT::v4i64;
	break;
	}

	if (ResTy == MVT::v4i64 && DCI.isBeforeLegalizeOps())
	return SDValue();

	SDLoc DL(N);
	SDValue ConvInput = Op.getOperand(0);
	bool IsSigned = Opc == ISD::SINT_TO_FP;
	// Narrower integer inputs are first extended to the float element width,
	// using the signedness of the original conversion.
	if (IntBits < FloatBits)
	ConvInput = DAG.getNode(IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, DL,
	ResTy, ConvInput);

	unsigned IntrinsicOpcode = IsSigned ? Intrinsic::aarch64_neon_vcvtfxs2fp
	: Intrinsic::aarch64_neon_vcvtfxu2fp;
	return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
	DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
	DAG.getConstant(C, DL, MVT::i32));
	}

	/// An EXTR instruction is made up of two shifts, ORed together. This helper
	/// searches for and classifies those shifts.
	///
	/// On success (\p N is a SHL or SRL by a constant) returns true with \p Src
	/// set to the shifted value, \p ShiftAmount to the constant amount, and
	/// \p FromHi set to true for SRL and false for SHL. Note that \p FromHi is
	/// written even when the shift amount turns out not to be constant.
	static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
	                         bool &FromHi) {
	  switch (N.getOpcode()) {
	  case ISD::SHL:
	    FromHi = false;
	    break;
	  case ISD::SRL:
	    FromHi = true;
	    break;
	  default:
	    return false;
	  }

	  if (!isa<ConstantSDNode>(N.getOperand(1)))
	    return false;

	  ShiftAmount = N->getConstantOperandVal(1);
	  Src = N->getOperand(0);
	  return true;
	}

	/// EXTR instruction extracts a contiguous chunk of bits from two existing
	/// registers viewed as a high/low pair. This function looks for the pattern:
	/// <tt>(or (shl VAL1, #N), (srl VAL2, #RegWidth-N))</tt> and replaces it
	/// with an EXTR. Can't quite be done in TableGen because the two immediates
	/// aren't independent.
	///
	/// Returns an empty SDValue when \p N (which must be an OR) does not match.
	static SDValue tryCombineToEXTR(SDNode *N,
	                                TargetLowering::DAGCombinerInfo &DCI) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected root");

	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  // Classify each OR operand as a constant SHL or SRL.
	  SDValue First, Second;
	  uint32_t FirstShift = 0, SecondShift = 0;
	  bool FirstIsSRL = false, SecondIsSRL = false;
	  if (!findEXTRHalf(N->getOperand(0), First, FirstShift, FirstIsSRL))
	    return SDValue();
	  if (!findEXTRHalf(N->getOperand(1), Second, SecondShift, SecondIsSRL))
	    return SDValue();

	  // One half must be a SHL and the other a SRL; two shifts in the same
	  // direction are not an EXTR.
	  if (FirstIsSRL == SecondIsSRL)
	    return SDValue();

	  // The two shift amounts must add up to the register width.
	  if (FirstShift + SecondShift != VT.getSizeInBits())
	    return SDValue();

	  // Normalize so that the SHL half comes first.
	  if (FirstIsSRL) {
	    std::swap(First, Second);
	    std::swap(FirstShift, SecondShift);
	  }

	  return DAG.getNode(AArch64ISD::EXTR, DL, VT, First, Second,
	                     DAG.getConstant(SecondShift, DL, MVT::i64));
	}

	// Try to turn a vector (or (and A, Mask), (and B, ~Mask)) with constant
	// build_vector masks into an AArch64ISD::BSL node. Returns an empty SDValue
	// when the pattern is not found.
	static SDValue tryCombineToBSL(SDNode *N,
	                               TargetLowering::DAGCombinerInfo &DCI) {
	  EVT VT = N->getValueType(0);
	  SelectionDAG &DAG = DCI.DAG;
	  SDLoc DL(N);

	  if (!VT.isVector())
	    return SDValue();

	  // Both OR operands must be ANDs.
	  SDValue LHSAnd = N->getOperand(0);
	  if (LHSAnd.getOpcode() != ISD::AND)
	    return SDValue();
	  SDValue RHSAnd = N->getOperand(1);
	  if (RHSAnd.getOpcode() != ISD::AND)
	    return SDValue();

	  // We only have to look for constant vectors here since the general, variable
	  // case can be handled in TableGen.
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const uint64_t EltMask = EltBits == 64 ? -1ULL : ((1ULL << EltBits) - 1);
	  const unsigned NumElts = VT.getVectorNumElements();

	  // True when every lane of A is a constant equal to the bitwise complement
	  // (within the element width) of the matching constant lane of B.
	  auto LanesAreComplementary = [&](BuildVectorSDNode *A,
	                                   BuildVectorSDNode *B) {
	    for (unsigned Lane = 0; Lane != NumElts; ++Lane) {
	      ConstantSDNode *CA = dyn_cast<ConstantSDNode>(A->getOperand(Lane));
	      ConstantSDNode *CB = dyn_cast<ConstantSDNode>(B->getOperand(Lane));
	      if (!CA)
	        return false;
	      if (!CB)
	        return false;
	      if (CA->getZExtValue() != (EltMask & ~CB->getZExtValue()))
	        return false;
	    }
	    return true;
	  };

	  // Try every pairing of one operand from each AND, second operands first.
	  for (int i = 1; i >= 0; --i) {
	    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(LHSAnd->getOperand(i));
	    if (!BVN0)
	      continue;
	    for (int j = 1; j >= 0; --j) {
	      BuildVectorSDNode *BVN1 =
	          dyn_cast<BuildVectorSDNode>(RHSAnd->getOperand(j));
	      if (!BVN1)
	        continue;
	      if (LanesAreComplementary(BVN0, BVN1))
	        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
	                           LHSAnd->getOperand(1 - i),
	                           RHSAnd->getOperand(1 - j));
	    }
	  }

	  return SDValue();
	}

	// OR combine entry point. For legal result types, first attempts to form an
	// EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and then a BSL.
	// Returns an empty SDValue when the type is not legal or neither applies.
	static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                                const AArch64Subtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(N->getValueType(0)))
	    return SDValue();

	  if (SDValue Extr = tryCombineToEXTR(N, DCI))
	    return Extr;

	  // An empty result from the BSL attempt is the "no change" answer too.
	  return tryCombineToBSL(N, DCI);
	}

	// Canonicalize (srl (bswap i32 x), 16) to (rotr (bswap i32 x), 16), if the
	// high 16-bits of x are zero. Similarly, canonicalize (srl (bswap i64 x), 32)
	// to (rotr (bswap i64 x), 32), if the high 32-bits of x are zero.
	// Returns an empty SDValue in every other case.
	static SDValue performSRLCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);
	  if (VT != MVT::i32 && VT != MVT::i64)
	    return SDValue();

	  SDValue Swapped = N->getOperand(0);
	  if (Swapped.getOpcode() != ISD::BSWAP)
	    return SDValue();

	  SDValue Amount = N->getOperand(1);
	  ConstantSDNode *AmountC = dyn_cast<ConstantSDNode>(Amount);
	  if (!AmountC)
	    return SDValue();

	  // The shift must be by exactly half the register width...
	  const unsigned Width = VT == MVT::i32 ? 32 : 64;
	  const unsigned Half = Width / 2;
	  if (AmountC->getZExtValue() != Half)
	    return SDValue();

	  // ...and the upper half of the bswap input must be known zero.
	  if (!DAG.MaskedValueIsZero(Swapped.getOperand(0),
	                             APInt::getHighBitsSet(Width, Half)))
	    return SDValue();

	  return DAG.getNode(ISD::ROTR, SDLoc(N), VT, Swapped, Amount);
	}

	// Remove redundant bitcasts around an extract of the low or high half of a
	// vector, producing a direct EXTRACT_SUBVECTOR (high half) or EXTRACT_SUBREG
	// of dsub (low half) on the un-bitcast source. Returns an empty SDValue
	// before operation legalization or when the pattern does not match.
	static SDValue performBitcastCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	// Wait 'til after everything is legalized to try this. That way we have
	// legal vector types and such.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// Remove extraneous bitcasts around an extract_subvector.
	// For example,
	// (v4i16 (bitconvert
	// (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
	// becomes
	// (extract_subvector ((v8i16 ...), (i64 4)))

	// Only interested in 64-bit vectors as the ultimate result.
	EVT VT = N->getValueType(0);
	if (!VT.isVector())
	return SDValue();
	if (VT.getSimpleVT().getSizeInBits() != 64)
	return SDValue();
	// Is the operand an extract_subvector starting at the beginning or halfway
	// point of the vector? A low half may also come through as an
	// EXTRACT_SUBREG, so look for that, too.
	SDValue Op0 = N->getOperand(0);
	if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
	!(Op0->isMachineOpcode() &&
	Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
	return SDValue();
	// For EXTRACT_SUBVECTOR this is the starting element index; for
	// EXTRACT_SUBREG it is the sub-register index.
	uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
	if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
	return SDValue();
	} else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
	if (idx != AArch64::dsub)
	return SDValue();
	// The dsub reference is equivalent to a lane zero subvector reference.
	idx = 0;
	}
	// Look through the bitcast of the input to the extract.
	if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
	return SDValue();
	SDValue Source = Op0->getOperand(0)->getOperand(0);
	// If the source type has twice the number of elements as our destination
	// type, we know this is an extract of the high or low half of the vector.
	EVT SVT = Source->getValueType(0);
	if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
	return SDValue();

	DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");

	// Create the simplified form to just extract the low or high half of the
	// vector directly rather than bothering with the bitcasts.
	SDLoc dl(N);
	unsigned NumElements = VT.getVectorNumElements();
	if (idx) {
	// High half: extract starting at element NumElements of the source.
	SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
	} else {
	// Low half: take the dsub sub-register of the source.
	SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
	return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
	Source, SubReg),
	0);
	}
	}

	// Combines for CONCAT_VECTORS. Three rewrites are attempted in order:
	//  1. concat of two truncates with an illegal intermediate type becomes a
	//     truncate of a shuffle (tried at any stage);
	//  2. after operation legalization, concat of a value with itself in a
	//     two-element result becomes a DUPLANE64;
	//  3. after operation legalization, a bitcast on the right-hand operand is
	//     hoisted outside the concat.
	// Returns an empty SDValue when none applies.
	static SDValue performConcatVectorsCombine(SDNode *N,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	SDLoc dl(N);
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);

	// Optimize concat_vectors of truncated vectors, where the intermediate
	// type is illegal, to avoid said illegality, e.g.,
	// (v4i16 (concat_vectors (v2i16 (truncate (v2i64))),
	// (v2i16 (truncate (v2i64)))))
	// ->
	// (v4i16 (truncate (vector_shuffle (v4i32 (bitcast (v2i64))),
	// (v4i32 (bitcast (v2i64))),
	// <0, 2, 4, 6>)))
	// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
	// on both input and result type, so we might generate worse code.
	// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
	if (N->getNumOperands() == 2 &&
	N0->getOpcode() == ISD::TRUNCATE &&
	N1->getOpcode() == ISD::TRUNCATE) {
	SDValue N00 = N0->getOperand(0);
	SDValue N10 = N1->getOperand(0);
	EVT N00VT = N00.getValueType();

	if (N00VT == N10.getValueType() &&
	(N00VT == MVT::v2i64 \|\| N00VT == MVT::v4i32) &&
	N00VT.getScalarSizeInBits() == 4 * VT.getScalarSizeInBits()) {
	MVT MidVT = (N00VT == MVT::v2i64 ? MVT::v4i32 : MVT::v8i16);
	// Select every even-numbered element of the two bitcast inputs.
	SmallVector<int, 8> Mask(MidVT.getVectorNumElements());
	for (size_t i = 0; i < Mask.size(); ++i)
	Mask[i] = i * 2;
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getVectorShuffle(
	MidVT, dl,
	DAG.getNode(ISD::BITCAST, dl, MidVT, N00),
	DAG.getNode(ISD::BITCAST, dl, MidVT, N10), Mask));
	}
	}

	// Wait 'til after everything is legalized to try this. That way we have
	// legal vector types and such.
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
	// splat. The indexed instructions are going to be expecting a DUPLANE64, so
	// canonicalise to that.
	if (N0 == N1 && VT.getVectorNumElements() == 2) {
	assert(VT.getScalarSizeInBits() == 64);
	return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT, WidenVector(N0, DAG),
	DAG.getConstant(0, dl, MVT::i64));
	}

	// Canonicalise concat_vectors so that the right-hand vector has as few
	// bit-casts as possible before its real operation. The primary matching
	// destination for these operations will be the narrowing "2" instructions,
	// which depend on the operation being performed on this right-hand vector.
	// For example,
	// (concat_vectors LHS, (v1i64 (bitconvert (v4i16 RHS))))
	// becomes
	// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))

	if (N1->getOpcode() != ISD::BITCAST)
	return SDValue();
	SDValue RHS = N1->getOperand(0);
	MVT RHSTy = RHS.getValueType().getSimpleVT();
	// If the RHS is not a vector, this is not the pattern we're looking for.
	if (!RHSTy.isVector())
	return SDValue();

	DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");

	// The concat is performed in the RHS element type with twice the lanes, then
	// bitcast back to the original result type.
	MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
	RHSTy.getVectorNumElements() * 2);
	return DAG.getNode(ISD::BITCAST, dl, VT,
	DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
	DAG.getNode(ISD::BITCAST, dl, RHSTy, N0),
	RHS));
	}

	// Transform a scalar fixed-point conversion of a value taken from a lane
	// extract into a lane extract of a vector conversion. E.g., from foo1 to foo2:
	//   double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
	//   double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
	//
	// The second form interacts better with instruction selection and the
	// register allocator to avoid cross-class register copies that aren't
	// coalescable due to a lane reference.
	//
	// Returns an empty SDValue before operation legalization or when the
	// converted operand is not an EXTRACT_VECTOR_ELT.
	static SDValue tryCombineFixedPointConvert(SDNode *N,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           SelectionDAG &DAG) {
	  // Wait 'til after everything is legalized to try this. That way we have
	  // legal vector types and such.
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // The converted operand must originate from a lane extract; no additional
	  // predication is needed beyond that.
	  SDValue Scalar = N->getOperand(1);
	  if (Scalar.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	    return SDValue();

	  SDValue IID = N->getOperand(0);
	  SDValue Shift = N->getOperand(2);
	  SDValue Vec = Scalar.getOperand(0);
	  SDValue Lane = Scalar.getOperand(1);
	  SDLoc DL(N);

	  // The vector width should be 128 bits by the time we get here, even
	  // if it started as 64 bits (the extract_vector handling will have
	  // done so).
	  assert(Vec.getValueSizeInBits() == 128 &&
	         "unexpected vector size on extract_vector_elt!");

	  // Map the integer vector type to the FP vector type of the same shape.
	  EVT VecResTy;
	  if (Vec.getValueType() == MVT::v4i32)
	    VecResTy = MVT::v4f32;
	  else if (Vec.getValueType() == MVT::v2i64)
	    VecResTy = MVT::v2f64;
	  else
	    llvm_unreachable("unexpected vector type!");

	  SDValue VecConvert =
	      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0),
	                     VecConvert, Lane);
	}

	// AArch64 high-vector "long" operations are formed by performing the non-high
	// version on an extract_subvector of each operand which gets the high half:
	//
	//   (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
	//
	// Some operands have no explicit extract_high but can be made compatible with
	// one for free, for example:
	//
	//   (dupv64 scalar) --> (extract_high (dup128 scalar))
	//
	// This routine performs that conversion for DUPs and for the immediate
	// DUP-like nodes (MOVI/MVNI variants), once outer routines have determined
	// that everything else is in order. Returns an empty SDValue for any other
	// opcode or when the value is not a 64-bit vector.
	static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
	  switch (N.getOpcode()) {
	  default:
	    // FMOV could be supported, but isn't very useful, as it would only occur
	    // if you passed a bitcasted floating point immediate to an eligible long
	    // integer op (addl, smull, ...).
	    return SDValue();
	  case AArch64ISD::DUP:
	  case AArch64ISD::DUPLANE8:
	  case AArch64ISD::DUPLANE16:
	  case AArch64ISD::DUPLANE32:
	  case AArch64ISD::DUPLANE64:
	  case AArch64ISD::MOVI:
	  case AArch64ISD::MOVIshift:
	  case AArch64ISD::MOVIedit:
	  case AArch64ISD::MOVImsl:
	  case AArch64ISD::MVNIshift:
	  case AArch64ISD::MVNImsl:
	    break;
	  }

	  MVT HalfTy = N.getSimpleValueType();
	  if (!HalfTy.is64BitVector())
	    return SDValue();

	  // Rebuild the same node at twice the lane count, then take its upper half.
	  const unsigned HalfElts = HalfTy.getVectorNumElements();
	  MVT FullTy = MVT::getVectorVT(HalfTy.getVectorElementType(), HalfElts * 2);

	  SDLoc dl(N);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfTy,
	                     DAG.getNode(N->getOpcode(), dl, FullTy, N->ops()),
	                     DAG.getConstant(HalfElts, dl, MVT::i64));
	}

	// Returns true if N is an EXTRACT_SUBVECTOR, or a BITCAST whose operand is an
	// EXTRACT_SUBVECTOR.
	static bool isEssentiallyExtractSubvector(SDValue N) {
	  const unsigned Opc = N.getOpcode();
	  if (Opc == ISD::BITCAST)
	    return N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
	  return Opc == ISD::EXTRACT_SUBVECTOR;
	}

	/// \brief Helper structure to keep track of ISD::SET_CC operands.
	///
	/// Filled in by isSetCC() when the inspected node is a generic ISD::SETCC.
	struct GenericSetCCInfo {
	// Points at operand 0 of the SETCC node (first compared value).
	const SDValue *Opnd0;
	// Points at operand 1 of the SETCC node (second compared value).
	const SDValue *Opnd1;
	// Condition code taken from operand 2 of the SETCC node.
	ISD::CondCode CC;
	};

	/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
	///
	/// Filled in by isSetCC() when the inspected node is an AArch64ISD::CSEL.
	struct AArch64SetCCInfo {
	// Points at operand 3 of the CSEL node (the comparison feeding the select).
	const SDValue *Cmp;
	// Condition code from operand 2 of the CSEL; isSetCC() inverts it when the
	// CSEL's true value is not the constant 1.
	AArch64CC::CondCode CC;
	};

	/// \brief Helper structure to keep track of SetCC information.
	///
	/// Only one member is valid at a time; SetCCInfoAndKind::IsAArch64 records
	/// which one.
	union SetCCInfo {
	// Valid when the node was a generic ISD::SETCC.
	GenericSetCCInfo Generic;
	// Valid when the node was an AArch64ISD::CSEL.
	AArch64SetCCInfo AArch64;
	};

	/// \brief Helper structure to be able to read SetCC information. When the
	/// IsAArch64 field is true, Info holds an AArch64SetCCInfo; otherwise Info
	/// holds a GenericSetCCInfo.
	struct SetCCInfoAndKind {
	// The payload; which union member is active is given by IsAArch64.
	SetCCInfo Info;
	// Discriminator for Info (true: Info.AArch64, false: Info.Generic).
	bool IsAArch64;
	};

	/// \brief Check whether or not \p Op is a SET_CC operation, either a generic
	/// one or an AArch64 lowered one.
	/// \p SetCCInfo is filled accordingly.
	/// \post SetCCInfo is meaningful only when this function returns true.
	/// \return True when Op is a kind of SET_CC operation.
	static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
	  // If this is a setcc, this is straight forward: record both compared
	  // operands and the condition code.
	  if (Op.getOpcode() == ISD::SETCC) {
	    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
	    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
	    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	    SetCCInfo.IsAArch64 = false;
	    return true;
	  }
	  // Otherwise, check if this is a matching csel instruction.
	  // In other words:
	  // - csel 1, 0, cc
	  // - csel 0, 1, !cc
	  if (Op.getOpcode() != AArch64ISD::CSEL)
	    return false;
	  // Set the information about the operands.
	  // TODO: we want the operands of the Cmp not the csel
	  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
	  SetCCInfo.IsAArch64 = true;
	  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // Check that the operands matches the constraints:
	  // (1) Both operands must be constants.
	  // (2) One must be 1 and the other must be 0.
	  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
	  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));

	  // Check (1).
	  if (!TValue || !FValue)
	    return false;

	  // Check (2).
	  if (!TValue->isOne()) {
	    // Update the comparison when we are interested in !cc: treat
	    // "csel 0, 1, cc" as "csel 1, 0, !cc".
	    std::swap(TValue, FValue);
	    SetCCInfo.Info.AArch64.CC =
	        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
	  }
	  // Note that SetCCInfo has already been written to even if this returns
	  // false; callers must only read it on a true result.
	  return TValue->isOne() && FValue->isNullValue();
	}

	// Returns true if Op is setcc or zext of setcc.
	static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
	  // Direct match first.
	  if (isSetCC(Op, Info))
	    return true;
	  // Otherwise only a zero-extend wrapping a setcc qualifies.
	  if (Op.getOpcode() != ISD::ZERO_EXTEND)
	    return false;
	  return isSetCC(Op->getOperand(0), Info);
	}

	// The folding we want to perform is:
	// (add x, [zext] (setcc cc ...) )
	// -->
	// (csel x, (add x, 1), !cc ...)
	//
	// The latter will get matched to a CSINC instruction.
	static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
	  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
	  SDValue SetCCSide = Op->getOperand(0);
	  SDValue Addend = Op->getOperand(1);
	  SetCCInfoAndKind InfoAndKind;

	  // Arrange for SetCCSide to be the ([zext of a] setcc) operand and Addend
	  // the other one. If neither operand is a SET_CC, give up.
	  if (!isSetCCOrZExtSetCC(SetCCSide, InfoAndKind)) {
	    std::swap(SetCCSide, Addend);
	    if (!isSetCCOrZExtSetCC(SetCCSide, InfoAndKind))
	      return SDValue();
	  }

	  // FIXME: This could be generalized to work for FP comparisons.
	  // Only i32/i64 comparisons are handled.
	  EVT CmpVT;
	  if (InfoAndKind.IsAArch64)
	    CmpVT = InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType();
	  else
	    CmpVT = InfoAndKind.Info.Generic.Opnd0->getValueType();
	  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
	    return SDValue();

	  // Build the inverted condition and the comparison that feeds the CSEL.
	  SDLoc dl(Op);
	  SDValue CCVal;
	  SDValue Cmp;
	  if (InfoAndKind.IsAArch64) {
	    const AArch64SetCCInfo &Lowered = InfoAndKind.Info.AArch64;
	    CCVal = DAG.getConstant(AArch64CC::getInvertedCondCode(Lowered.CC), dl,
	                            MVT::i32);
	    Cmp = *Lowered.Cmp;
	  } else {
	    const GenericSetCCInfo &Generic = InfoAndKind.Info.Generic;
	    // getAArch64Cmp receives CCVal by name and is expected to fill it in.
	    Cmp = getAArch64Cmp(*Generic.Opnd0, *Generic.Opnd1,
	                        ISD::getSetCCInverse(Generic.CC, true), CCVal, DAG,
	                        dl);
	  }

	  // (csel x, (add x, 1), !cc, cmp)
	  EVT VT = Op->getValueType(0);
	  SDValue PlusOne =
	      DAG.getNode(ISD::ADD, dl, VT, Addend, DAG.getConstant(1, dl, VT));
	  return DAG.getNode(AArch64ISD::CSEL, dl, VT, Addend, PlusOne, CCVal, Cmp);
	}

	// The basic add/sub long vector instructions have variants with "2" on the end
	// which act on the high-half of their inputs. They are normally matched by
	// patterns like:
	//
	// (add (zeroext (extract_high LHS)),
	// (zeroext (extract_high RHS)))
	// -> uaddl2 vD, vN, vM
	//
	// However, if one of the extracts is something like a duplicate, this
	// instruction can still be used profitably. This function puts the DAG into a
	// more appropriate form for those patterns to trigger.
	static SDValue performAddSubLongCombine(SDNode *N,
	                                        TargetLowering::DAGCombinerInfo &DCI,
	                                        SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Non-128-bit results are not long-vector candidates; the only thing tried
	  // for them is the setcc+add fold, and only for ADD.
	  MVT VT = N->getSimpleValueType(0);
	  if (!VT.is128BitVector()) {
	    if (N->getOpcode() == ISD::ADD)
	      return performSetccAddFolding(N, DAG);
	    return SDValue();
	  }

	  // Make sure both branches are extended in the same way.
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
	       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
	      LHS.getOpcode() != RHS.getOpcode())
	    return SDValue();

	  unsigned ExtType = LHS.getOpcode();

	  // It's not worth doing if at least one of the inputs isn't already an
	  // extract, but we don't know which it'll be so we have to try both.
	  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
	    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
	    if (!RHS.getNode())
	      return SDValue();

	    // Re-apply the original extension on top of the new extract_high.
	    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
	  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
	    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
	    if (!LHS.getNode())
	      return SDValue();

	    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
	  }

	  // Rebuild the add/sub with the (possibly rewritten) operands.
	  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
	}

	// Massage DAGs which we can use the high-half "long" operations on into
	// something isel will recognize better. E.g.
	//
	// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
	// (aarch64_neon_umull (extract_high (v2i64 vec))
	// (extract_high (v2i64 (dup128 scalar))))
	//
	static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // Operand 0 is the intrinsic ID; the two vector inputs follow it.
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  assert(LHS.getValueType().is64BitVector() &&
	         RHS.getValueType().is64BitVector() &&
	         "unexpected shape for long operation");

	  // Either node could be a DUP, but it's not worth doing both of them (you'd
	  // just as well use the non-high version) so look for a corresponding extract
	  // operation on the other "wing" and pick the opposite side for widening.
	  SDValue *DupSide = nullptr;
	  if (isEssentiallyExtractSubvector(LHS))
	    DupSide = &RHS;
	  else if (isEssentiallyExtractSubvector(RHS))
	    DupSide = &LHS;

	  if (DupSide) {
	    *DupSide = tryExtendDUPToExtractHigh(*DupSide, DAG);
	    if (!DupSide->getNode())
	      return SDValue();
	  }

	  // Re-emit the intrinsic with the (possibly rewritten) operands.
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0), LHS, RHS);
	}

	// Try to turn a NEON shift intrinsic whose shift amount (operand 2) is a
	// constant, or a constant splat, into the matching AArch64ISD immediate-shift
	// node. Returns an empty SDValue when the amount is not constant or is out of
	// range for the selected form.
	static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
	  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
	  unsigned ElemBits = ElemTy.getSizeInBits();

	  // Extract the shift amount from a splat build_vector or a plain constant.
	  int64_t ShiftAmount;
	  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
	    APInt SplatValue, SplatUndef;
	    unsigned SplatBitSize;
	    bool HasAnyUndefs;
	    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
	                              HasAnyUndefs, ElemBits) ||
	        SplatBitSize != ElemBits)
	      return SDValue();

	    ShiftAmount = SplatValue.getSExtValue();
	  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
	    ShiftAmount = CVN->getSExtValue();
	  } else
	    return SDValue();

	  // Map the intrinsic to its immediate-form opcode. The rounding shifts
	  // (srshl/urshl) map to right-shift nodes and are only matched below for
	  // negative amounts.
	  unsigned Opcode;
	  bool IsRightShift;
	  switch (IID) {
	  default:
	    llvm_unreachable("Unknown shift intrinsic");
	  case Intrinsic::aarch64_neon_sqshl:
	    Opcode = AArch64ISD::SQSHL_I;
	    IsRightShift = false;
	    break;
	  case Intrinsic::aarch64_neon_uqshl:
	    Opcode = AArch64ISD::UQSHL_I;
	    IsRightShift = false;
	    break;
	  case Intrinsic::aarch64_neon_srshl:
	    Opcode = AArch64ISD::SRSHR_I;
	    IsRightShift = true;
	    break;
	  case Intrinsic::aarch64_neon_urshl:
	    Opcode = AArch64ISD::URSHR_I;
	    IsRightShift = true;
	    break;
	  case Intrinsic::aarch64_neon_sqshlu:
	    Opcode = AArch64ISD::SQSHLU_I;
	    IsRightShift = false;
	    break;
	  }

	  // Right shifts accept amounts in [-ElemBits, -1] and emit the negated
	  // (positive) immediate; left shifts accept amounts in [0, ElemBits).
	  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits) {
	    SDLoc dl(N);
	    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
	                       DAG.getConstant(-ShiftAmount, dl, MVT::i32));
	  } else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount < ElemBits) {
	    SDLoc dl(N);
	    return DAG.getNode(Opcode, dl, N->getValueType(0), N->getOperand(1),
	                       DAG.getConstant(ShiftAmount, dl, MVT::i32));
	  }

	  return SDValue();
	}

	// The CRC32[BH] instructions ignore the high bits of their data operand. Since
	// the intrinsics must be legal and take an i32, this means there's almost
	// certainly going to be a zext in the DAG which we can eliminate.
	static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
	  // The data operand (operand 2) must be an AND ...
	  SDValue AndN = N->getOperand(2);
	  if (AndN.getOpcode() != ISD::AND)
	    return SDValue();

	  // ... whose second operand is exactly the constant Mask.
	  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
	  if (!CMask || CMask->getZExtValue() != Mask)
	    return SDValue();

	  // Re-emit the intrinsic using the un-masked value directly.
	  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
	                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
	}

	// Rewrites an across-lanes intrinsic as node Opc applied to the intrinsic's
	// vector operand (producing a vector of the same type), followed by an
	// extract of element 0.
	static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
	                                           SelectionDAG &DAG) {
	  SDLoc dl(N);
	  SDValue Vec = N->getOperand(1);
	  SDValue Reduced = DAG.getNode(Opc, dl, Vec.getSimpleValueType(), Vec);
	  SDValue Lane0 = DAG.getConstant(0, dl, MVT::i64);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Reduced,
	                     Lane0);
	}

	// Dispatches an intrinsic node to the combine helper that handles its
	// intrinsic ID. Returns an empty SDValue for intrinsics with no combine.
	static SDValue performIntrinsicCombine(SDNode *N,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const AArch64Subtarget *Subtarget) {
	  SelectionDAG &DAG = DCI.DAG;

	  // Re-express a two-operand intrinsic as the generic node GenericOpc.
	  auto AsGenericBinOp = [&](unsigned GenericOpc) {
	    return DAG.getNode(GenericOpc, SDLoc(N), N->getValueType(0),
	                       N->getOperand(1), N->getOperand(2));
	  };

	  const unsigned IID = getIntrinsicID(N);
	  switch (IID) {
	  default:
	    break;

	  // Fixed-point conversions.
	  case Intrinsic::aarch64_neon_vcvtfxs2fp:
	  case Intrinsic::aarch64_neon_vcvtfxu2fp:
	    return tryCombineFixedPointConvert(N, DCI, DAG);

	  // Across-lanes reductions.
	  case Intrinsic::aarch64_neon_saddv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SADDV, N, DAG);
	  case Intrinsic::aarch64_neon_uaddv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UADDV, N, DAG);
	  case Intrinsic::aarch64_neon_sminv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SMINV, N, DAG);
	  case Intrinsic::aarch64_neon_uminv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UMINV, N, DAG);
	  case Intrinsic::aarch64_neon_smaxv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::SMAXV, N, DAG);
	  case Intrinsic::aarch64_neon_umaxv:
	    return combineAcrossLanesIntrinsic(AArch64ISD::UMAXV, N, DAG);

	  // Floating-point min/max map onto generic ISD nodes.
	  case Intrinsic::aarch64_neon_fmax:
	    return AsGenericBinOp(ISD::FMAXNAN);
	  case Intrinsic::aarch64_neon_fmin:
	    return AsGenericBinOp(ISD::FMINNAN);
	  case Intrinsic::aarch64_neon_fmaxnm:
	    return AsGenericBinOp(ISD::FMAXNUM);
	  case Intrinsic::aarch64_neon_fminnm:
	    return AsGenericBinOp(ISD::FMINNUM);

	  // Long multiplies that may use the high-half form.
	  case Intrinsic::aarch64_neon_smull:
	  case Intrinsic::aarch64_neon_umull:
	  case Intrinsic::aarch64_neon_pmull:
	  case Intrinsic::aarch64_neon_sqdmull:
	    return tryCombineLongOpWithDup(IID, N, DCI, DAG);

	  // Shifts with a constant amount.
	  case Intrinsic::aarch64_neon_sqshl:
	  case Intrinsic::aarch64_neon_uqshl:
	  case Intrinsic::aarch64_neon_sqshlu:
	  case Intrinsic::aarch64_neon_srshl:
	  case Intrinsic::aarch64_neon_urshl:
	    return tryCombineShiftImm(IID, N, DAG);

	  // CRC32 on bytes / halfwords: drop a redundant mask of the data operand.
	  case Intrinsic::aarch64_crc32b:
	  case Intrinsic::aarch64_crc32cb:
	    return tryCombineCRC32(0xff, N, DAG);
	  case Intrinsic::aarch64_crc32h:
	  case Intrinsic::aarch64_crc32ch:
	    return tryCombineCRC32(0xffff, N, DAG);
	  }
	  return SDValue();
	}

	// Combine for vector extend nodes. Two independent transformations:
	//  1. After op legalization, (zext (sabd/uabd (extract_high ...), (DUP ...)))
	//     has its DUP rewritten via tryCombineLongOpWithDup.
	//  2. Before op legalization, an extend from a 64-bit simple vector to an
	//     illegal result type is done as one widening step followed by a split
	//     into two half-width extends that are concatenated.
	static SDValue performExtendCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
	  // we can convert that DUP into another extract_high (of a bigger DUP), which
	  // helps the backend to decide that an sabdl2 would be useful, saving a real
	  // extract_high operation.
	  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
	      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
	    SDNode *ABDNode = N->getOperand(0).getNode();
	    unsigned IID = getIntrinsicID(ABDNode);
	    if (IID == Intrinsic::aarch64_neon_sabd ||
	        IID == Intrinsic::aarch64_neon_uabd) {
	      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
	      if (!NewABD.getNode())
	        return SDValue();

	      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
	                         NewABD);
	    }
	  }

	  // This is effectively a custom type legalization for AArch64.
	  //
	  // Type legalization will split an extend of a small, legal, type to a larger
	  // illegal type by first splitting the destination type, often creating
	  // illegal source types, which then get legalized in isel-confusing ways,
	  // leading to really terrible codegen. E.g.,
	  //   %result = v8i32 sext v8i8 %value
	  // becomes
	  //   %losrc = extract_subreg %value, ...
	  //   %hisrc = extract_subreg %value, ...
	  //   %lo = v4i32 sext v4i8 %losrc
	  //   %hi = v4i32 sext v4i8 %hisrc
	  // Things go rapidly downhill from there.
	  //
	  // For AArch64, the [sz]ext vector instructions can only go up one element
	  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
	  // take two instructions.
	  //
	  // This implies that the most efficient way to do the extend from v8i8
	  // to two v4i32 values is to first extend the v8i8 to v8i16, then do
	  // the normal splitting to happen for the v8i16->v8i32.

	  // This is pre-legalization to catch some cases where the default
	  // type legalization will create ill-tempered code.
	  if (!DCI.isBeforeLegalizeOps())
	    return SDValue();

	  // We're only interested in cleaning things up for non-legal vector types
	  // here. If both the source and destination are legal, things will just
	  // work naturally without any fiddling.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT ResVT = N->getValueType(0);
	  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
	    return SDValue();
	  // If the vector type isn't a simple VT, it's beyond the scope of what
	  // we're worried about here. Let legalization do its thing and hope for
	  // the best.
	  SDValue Src = N->getOperand(0);
	  EVT SrcVT = Src->getValueType(0);
	  if (!ResVT.isSimple() || !SrcVT.isSimple())
	    return SDValue();

	  // If the source VT is a 64-bit vector, we can play games and get the
	  // better results we want.
	  if (SrcVT.getSizeInBits() != 64)
	    return SDValue();

	  // First step: extend the source by one element size (same lane count,
	  // element width doubled).
	  unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
	  unsigned ElementCount = SrcVT.getVectorNumElements();
	  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
	  SDLoc DL(N);
	  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);

	  // Now split the rest of the operation into two halves, each with a 64
	  // bit source.
	  EVT LoVT, HiVT;
	  SDValue Lo, Hi;
	  unsigned NumElements = ResVT.getVectorNumElements();
	  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
	  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
	                                 ResVT.getVectorElementType(), NumElements / 2);

	  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
	                               LoVT.getVectorNumElements());
	  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
	                   DAG.getConstant(0, DL, MVT::i64));
	  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
	                   DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
	  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
	  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);

	  // Now combine the parts back together so we still have a single result
	  // like the combiner expects.
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
	}

	// Replaces the vector store St with NumVecElts chained scalar stores of
	// SplatVal at consecutive element offsets, and returns the last store.
	static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
	                               SDValue SplatVal, unsigned NumVecElts) {
	  const unsigned StoreAlign = St.getAlignment();
	  const unsigned EltBytes = SplatVal.getValueType().getSizeInBits() / 8;

	  // Create scalar stores. This is at least as good as the code sequence for a
	  // split unaligned store which is a dup.s, ext.b, and two stores.
	  // Most of the time the three stores should be replaced by store pair
	  // instructions (stp).
	  SDLoc DL(&St);
	  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
	  SDValue Base = St.getBasePtr();

	  // Element 0 goes to the original address, chained on the original store's
	  // chain.
	  SDValue LastStore =
	      DAG.getStore(St.getChain(), DL, SplatVal, Base, PtrInfo, StoreAlign,
	                   St.getMemOperand()->getFlags());

	  // Since this runs in ISel, an (add base, const) address would not be merged
	  // with the per-element add created below, which may degrade results. Peel
	  // the constant off and fold it into the offsets instead.
	  uint64_t FoldedOffset = 0;
	  if (Base->getOpcode() == ISD::ADD &&
	      isa<ConstantSDNode>(Base->getOperand(1))) {
	    FoldedOffset = cast<ConstantSDNode>(Base->getOperand(1))->getSExtValue();
	    Base = Base->getOperand(0);
	  }

	  // Remaining elements: each store is chained on the previous one.
	  unsigned Offset = EltBytes;
	  for (unsigned Remaining = NumVecElts - 1; Remaining != 0; --Remaining) {
	    unsigned Alignment = MinAlign(StoreAlign, Offset);
	    SDValue EltAddr =
	        DAG.getNode(ISD::ADD, DL, MVT::i64, Base,
	                    DAG.getConstant(FoldedOffset + Offset, DL, MVT::i64));
	    LastStore = DAG.getStore(LastStore.getValue(0), DL, SplatVal, EltAddr,
	                             PtrInfo.getWithOffset(Offset), Alignment,
	                             St.getMemOperand()->getFlags());
	    Offset += EltBytes;
	  }
	  return LastStore;
	}

	/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
	/// load store optimizer pass will merge them to store pair stores. This should
	/// be better than a movi to create the vector zero followed by a vector store
	/// if the zero constant is not re-used, since one instructions and one register
	/// live range will be removed.
	///
	/// For example, the final generated code should be:
	///
	/// stp xzr, xzr, [x0]
	///
	/// instead of:
	///
	/// movi v0.2d, #0
	/// str q0, [x0]
	///
	static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue StVal = St.getValue();
	  EVT VT = StVal.getValueType();

	  // It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
	  // 2, 3 or 4 i32 elements.
	  int NumVecElts = VT.getVectorNumElements();
	  if (!(((NumVecElts == 2 || NumVecElts == 3) &&
	         VT.getVectorElementType().getSizeInBits() == 64) ||
	        ((NumVecElts == 2 || NumVecElts == 3 || NumVecElts == 4) &&
	         VT.getVectorElementType().getSizeInBits() == 32)))
	    return SDValue();

	  // Only an explicit BUILD_VECTOR is inspected for zero elements.
	  if (StVal.getOpcode() != ISD::BUILD_VECTOR)
	    return SDValue();

	  // If the zero constant has more than one use then the vector store could be
	  // better since the constant mov will be amortized and stp q instructions
	  // should be able to be formed.
	  if (!StVal.hasOneUse())
	    return SDValue();

	  // If the immediate offset of the address operand is too large for the stp
	  // instruction, then bail out.
	  if (DAG.isBaseWithConstantOffset(St.getBasePtr())) {
	    int64_t Offset = St.getBasePtr()->getConstantOperandVal(1);
	    if (Offset < -512 || Offset > 504)
	      return SDValue();
	  }

	  // Every element must be an integer or floating-point zero constant.
	  for (int I = 0; I < NumVecElts; ++I) {
	    SDValue EltVal = StVal.getOperand(I);
	    if (!isNullConstant(EltVal) && !isNullFPConstant(EltVal))
	      return SDValue();
	  }

	  // Use WZR/XZR here to prevent DAGCombiner::MergeConsecutiveStores from
	  // undoing this transformation.
	  SDValue SplatVal = VT.getVectorElementType().getSizeInBits() == 32
	                         ? DAG.getRegister(AArch64::WZR, MVT::i32)
	                         : DAG.getRegister(AArch64::XZR, MVT::i64);
	  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
	}

	/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
	/// value. The load store optimizer pass will merge them to store pair stores.
	/// This has better performance than a splat of the scalar followed by a split
	/// vector store. Even if the stores are not merged it is four stores vs a dup,
	/// followed by an ext.b and two stores.
	static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
	  SDValue Cursor = St.getValue();
	  EVT VT = Cursor.getValueType();

	  // Don't replace floating point stores, they possibly won't be transformed to
	  // stp because of the store pair suppress pass.
	  if (VT.isFloatingPoint())
	    return SDValue();

	  // We can express a splat as store pair(s) for 2 or 4 elements.
	  unsigned NumVecElts = VT.getVectorNumElements();
	  if (NumVecElts != 4 && NumVecElts != 2)
	    return SDValue();

	  // Check that this is a splat by walking a chain of NumVecElts
	  // INSERT_VECTOR_ELT nodes. One bit per lane records which lanes still have
	  // to be written, i.e. 0 and 1 for v2i64 and 0, 1, 2, 3 for v4i32.
	  std::bitset<4> PendingLanes((1 << NumVecElts) - 1);
	  SDValue SplatVal;
	  for (unsigned Step = 0; Step < NumVecElts; ++Step) {
	    // Every link of the chain must be an insert.
	    if (Cursor.getOpcode() != ISD::INSERT_VECTOR_ELT)
	      return SDValue();

	    // The first insert fixes the splat value; all others must agree with it.
	    SDValue Inserted = Cursor.getOperand(1);
	    if (Step == 0)
	      SplatVal = Inserted;
	    else if (Inserted != SplatVal)
	      return SDValue();

	    // The lane index must be a constant within the vector.
	    ConstantSDNode *LaneNode = dyn_cast<ConstantSDNode>(Cursor.getOperand(2));
	    if (!LaneNode)
	      return SDValue();
	    uint64_t Lane = LaneNode->getZExtValue();
	    if (Lane >= NumVecElts)
	      return SDValue();
	    PendingLanes.reset(Lane);

	    // Move on to the vector this insert was applied to.
	    Cursor = Cursor.getOperand(0);
	  }

	  // Check that all vector element locations were inserted to.
	  if (PendingLanes.any())
	    return SDValue();

	  return splitStoreSplat(DAG, St, SplatVal, NumVecElts);
	}

	// Store combine that runs only before legalization. It first tries to turn a
	// store of a zero vector into scalar stores; then, on subtargets where
	// misaligned 128-bit stores are slow, it scalarizes splat stores or splits an
	// under-aligned 128-bit vector store into two 64-bit halves.
	static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG,
	                           const AArch64Subtarget *Subtarget) {
	  if (!DCI.isBeforeLegalize())
	    return SDValue();

	  // Leave volatile and indexed stores alone.
	  StoreSDNode *S = cast<StoreSDNode>(N);
	  if (S->isVolatile() || S->isIndexed())
	    return SDValue();

	  SDValue StVal = S->getValue();
	  EVT VT = StVal.getValueType();
	  if (!VT.isVector())
	    return SDValue();

	  // If we get a splat of zeros, convert this vector store to a store of
	  // scalars. They will be merged into store pairs of xzr thereby removing one
	  // instruction and one register.
	  if (SDValue ReplacedZeroSplat = replaceZeroVectorStore(DAG, *S))
	    return ReplacedZeroSplat;

	  // FIXME: The logic for deciding if an unaligned store should be split should
	  // be included in TLI.allowsMisalignedMemoryAccesses(), and there should be
	  // a call to that function here.

	  if (!Subtarget->isMisaligned128StoreSlow())
	    return SDValue();

	  // Don't split at -Oz.
	  if (DAG.getMachineFunction().getFunction()->optForMinSize())
	    return SDValue();

	  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
	  // those up regresses performance on micro-benchmarks and olden/bh.
	  if (VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
	    return SDValue();

	  // Split unaligned 16B stores. They are terrible for performance.
	  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
	  // extensions can use this to mark that it does not want splitting to happen
	  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
	  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
	  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
	      S->getAlignment() <= 2)
	    return SDValue();

	  // If we get a splat of a scalar convert this vector store to a store of
	  // scalars. They will be merged into store pairs thereby removing two
	  // instructions.
	  if (SDValue ReplacedSplat = replaceSplatVectorStore(DAG, *S))
	    return ReplacedSplat;

	  SDLoc DL(S);
	  unsigned NumElts = VT.getVectorNumElements() / 2;
	  // Split VT into two.
	  EVT HalfVT =
	      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
	  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(0, DL, MVT::i64));
	  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
	                                   DAG.getConstant(NumElts, DL, MVT::i64));
	  // Low half at the original address, high half 8 bytes further on, chained
	  // on the first store.
	  SDValue BasePtr = S->getBasePtr();
	  SDValue NewST1 =
	      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
	                   S->getAlignment(), S->getMemOperand()->getFlags());
	  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
	                                  DAG.getConstant(8, DL, MVT::i64));
	  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
	                      S->getPointerInfo(), S->getAlignment(),
	                      S->getMemOperand()->getFlags());
	}

	/// Target-specific DAG combine function for post-increment LD1 (lane) and
	/// post-increment LD1R.
	static SDValue performPostLD1Combine(SDNode *N,
	                                     TargetLowering::DAGCombinerInfo &DCI,
	                                     bool IsLaneOp) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SelectionDAG &DAG = DCI.DAG;
	  EVT VT = N->getValueType(0);

	  // The loaded scalar is operand 1 for the lane form and operand 0 otherwise.
	  unsigned LoadIdx = IsLaneOp ? 1 : 0;
	  SDNode *LD = N->getOperand(LoadIdx).getNode();
	  // If it is not LOAD, can not do such combine.
	  if (LD->getOpcode() != ISD::LOAD)
	    return SDValue();

	  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
	  EVT MemVT = LoadSDN->getMemoryVT();
	  // Check if memory operand is the same type as the vector element.
	  if (MemVT != VT.getVectorElementType())
	    return SDValue();

	  // Check if there are other uses. If so, do not combine as it will introduce
	  // an extra load.
	  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
	       ++UI) {
	    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
	      continue;
	    if (*UI != N)
	      return SDValue();
	  }

	  SDValue Addr = LD->getOperand(1);
	  SDValue Vector = N->getOperand(0);
	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
	       Addr.getNode()->use_end(); UI != UE; ++UI) {
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD
	        || UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // Check that the add is independent of the load. Otherwise, folding it
	    // would create a cycle.
	    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
	      continue;
	    // Also check that add is not used in the vector operand. This would also
	    // create a cycle.
	    if (User->isPredecessorOf(Vector.getNode()))
	      continue;

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      uint32_t IncVal = CInc->getZExtValue();
	      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
	      if (IncVal != NumBytes)
	        continue;
	      // A matching constant increment is passed as XZR instead of the
	      // constant itself.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }

	    // Finally, check that the vector doesn't depend on the load.
	    // Again, this would create a cycle.
	    // The load depending on the vector is fine, as that's the case for the
	    // LD1*post we'll eventually generate anyway.
	    if (LoadSDN->isPredecessorOf(Vector.getNode()))
	      continue;

	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(LD->getOperand(0)); // Chain
	    if (IsLaneOp) {
	      Ops.push_back(Vector); // The vector to be inserted
	      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
	    }
	    Ops.push_back(Addr);
	    Ops.push_back(Inc);

	    // Results: the vector, the written-back address, and the chain.
	    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
	    SDVTList SDTys = DAG.getVTList(Tys);
	    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
	                                           MemVT,
	                                           LoadSDN->getMemOperand());

	    // Update the uses.
	    SDValue NewResults[] = {
	      SDValue(LD, 0), // The result of load
	      SDValue(UpdN.getNode(), 2) // Chain
	    };
	    DCI.CombineTo(LD, NewResults);
	    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0)); // Dup/Inserted Result
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1)); // Write back register

	    // Only the first suitable increment is folded.
	    break;
	  }
	  return SDValue();
	}

	/// Simplify ``Addr`` given that the top byte of it is ignored by HW during
	/// address translation.
	// Returns true when Addr was simplified and the change was committed through
	// DCI, false otherwise.
	static bool performTBISimplification(SDValue Addr,
	TargetLowering::DAGCombinerInfo &DCI,
	SelectionDAG &DAG) {
	// Only the low 56 bits of the 64-bit address are demanded; the top byte is
	// left out of the mask.
	APInt DemandedMask = APInt::getLowBitsSet(64, 56);
	KnownBits Known;
	// NOTE(review): the two '-' lines and two '+' lines below are the old and new
	// sides of the enclosing patch hunk, not four statements. The new side passes
	// the negated "before legalize" predicates; presumably the constructor takes
	// "types are legal" / "ops are legal" flags -- confirm against the
	// TargetLoweringOpt declaration.
	- TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
	- DCI.isBeforeLegalizeOps());
	+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	+ !DCI.isBeforeLegalizeOps());
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.SimplifyDemandedBits(Addr, DemandedMask, Known, TLO)) {
	// Hand the recorded replacement to the combiner.
	DCI.CommitTargetLoweringOpt(TLO);
	return true;
	}
	return false;
	}

	// Store combine entry point: first try to split/scalarize the store, then, on
	// subtargets that ignore the address top byte, try to simplify the address
	// operand (operand 2).
	static SDValue performSTORECombine(SDNode *N,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   SelectionDAG &DAG,
	                                   const AArch64Subtarget *Subtarget) {
	  SDValue Split = splitStores(N, DCI, DAG, Subtarget);
	  if (Split)
	    return Split;

	  if (!Subtarget->supportsAddressTopByteIgnored())
	    return SDValue();

	  // Returning N itself reports that the node was updated in place.
	  if (performTBISimplification(N->getOperand(2), DCI, DAG))
	    return SDValue(N, 0);

	  return SDValue();
	}


	/// Target-specific DAG combine function for NEON load/store intrinsics
	/// to merge base address updates.
	static SDValue performNEONPostLDSTCombine(SDNode *N,
	                                          TargetLowering::DAGCombinerInfo &DCI,
	                                          SelectionDAG &DAG) {
	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  // The address is always the last operand of the intrinsic node.
	  unsigned AddrOpIdx = N->getNumOperands() - 1;
	  SDValue Addr = N->getOperand(AddrOpIdx);

	  // Search for a use of the address operand that is an increment.
	  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
	       UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
	    SDNode *User = *UI;
	    if (User->getOpcode() != ISD::ADD ||
	        UI.getUse().getResNo() != Addr.getResNo())
	      continue;

	    // Check that the add is independent of the load/store. Otherwise, folding
	    // it would create a cycle.
	    if (User->isPredecessorOf(N) || N->isPredecessorOf(User))
	      continue;

	    // Find the new opcode for the updating load/store.
	    bool IsStore = false;
	    bool IsLaneOp = false;
	    bool IsDupOp = false;
	    unsigned NewOpc = 0;
	    unsigned NumVecs = 0;
	    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    switch (IntNo) {
	    default: llvm_unreachable("unexpected intrinsic for Neon base update");
	    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
	      NumVecs = 2; break;
	    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
	      NumVecs = 3; break;
	    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
	      NumVecs = 4; break;
	    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
	      NumVecs = 2; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
	      NumVecs = 3; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
	      NumVecs = 4; IsStore = true; break;
	    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
	      NumVecs = 2; break;
	    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
	      NumVecs = 3; break;
	    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
	      NumVecs = 4; break;
	    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
	      NumVecs = 2; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
	      NumVecs = 3; IsStore = true; break;
	    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
	      NumVecs = 4; IsStore = true; break;
	    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
	      NumVecs = 2; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
	      NumVecs = 3; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
	      NumVecs = 4; IsDupOp = true; break;
	    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
	      NumVecs = 2; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
	      NumVecs = 3; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
	      NumVecs = 4; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
	      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
	      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
	    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
	      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
	    }

	    // Stores take the vector type from their first data operand; loads take
	    // it from their first result.
	    EVT VecTy;
	    if (IsStore)
	      VecTy = N->getOperand(2).getValueType();
	    else
	      VecTy = N->getValueType(0);

	    // If the increment is a constant, it must match the memory ref size.
	    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
	    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
	      uint32_t IncVal = CInc->getZExtValue();
	      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
	      // Lane and dup forms touch one element per vector, not whole vectors.
	      if (IsLaneOp || IsDupOp)
	        NumBytes /= VecTy.getVectorNumElements();
	      if (IncVal != NumBytes)
	        continue;
	      // A matching constant increment is passed as XZR instead of the
	      // constant itself.
	      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
	    }
	    SmallVector<SDValue, 8> Ops;
	    Ops.push_back(N->getOperand(0)); // Incoming chain
	    // Load lane and store have vector list as input.
	    if (IsLaneOp || IsStore)
	      for (unsigned i = 2; i < AddrOpIdx; ++i)
	        Ops.push_back(N->getOperand(i));
	    Ops.push_back(Addr); // Base register
	    Ops.push_back(Inc);

	    // Return Types.
	    EVT Tys[6];
	    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
	    unsigned n;
	    for (n = 0; n < NumResultVecs; ++n)
	      Tys[n] = VecTy;
	    Tys[n++] = MVT::i64;  // Type of write back register
	    Tys[n] = MVT::Other;  // Type of the chain
	    SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));

	    MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
	    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
	                                           MemInt->getMemoryVT(),
	                                           MemInt->getMemOperand());

	    // Update the uses: N's vector results map one-to-one, N's chain maps to
	    // the new node's chain (after the write-back result), and the ADD is
	    // replaced by the write-back result.
	    std::vector<SDValue> NewResults;
	    for (unsigned i = 0; i < NumResultVecs; ++i) {
	      NewResults.push_back(SDValue(UpdN.getNode(), i));
	    }
	    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
	    DCI.CombineTo(N, NewResults);
	    DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));

	    // Only the first suitable increment is folded.
	    break;
	  }
	  return SDValue();
	}

	// Checks whether the value V is known to be `width` bits wide (callers in this
	// file pass 8 or 16) and reports how it was extended.
	//
	// \param V        The value to inspect.
	// \param width    The nominal bit width being tested for.
	// \param ExtType  Always written: the load's own extension type for a LOAD,
	//                 SEXTLOAD/ZEXTLOAD for AssertSext/AssertZext, and
	//                 NON_EXTLOAD in every other case (including failure).
	// \returns true if V is a load or assert of exactly that width, or a constant
	//          whose signed value lies strictly inside (-2^(width-1), 2^(width-1)).
	static
	bool checkValueWidth(SDValue V, unsigned width, ISD::LoadExtType &ExtType) {
	  ExtType = ISD::NON_EXTLOAD;
	  SDNode *Node = V.getNode();

	  // True when VT is exactly the integer type that matches the requested width.
	  auto MatchesWidth = [width](EVT VT) {
	    return (VT == MVT::i8 && width == 8) || (VT == MVT::i16 && width == 16);
	  };

	  switch (Node->getOpcode()) {
	  default:
	    break;
	  case ISD::LOAD: {
	    LoadSDNode *LoadNode = cast<LoadSDNode>(Node);
	    if (!MatchesWidth(LoadNode->getMemoryVT()))
	      return false;
	    ExtType = LoadNode->getExtensionType();
	    return true;
	  }
	  case ISD::AssertSext:
	  case ISD::AssertZext: {
	    VTSDNode *TypeNode = cast<VTSDNode>(Node->getOperand(1));
	    if (!MatchesWidth(TypeNode->getVT()))
	      return false;
	    ExtType = Node->getOpcode() == ISD::AssertSext ? ISD::SEXTLOAD
	                                                   : ISD::ZEXTLOAD;
	    return true;
	  }
	  case ISD::Constant:
	  case ISD::TargetConstant: {
	    // Equivalent to |Imm| < 2^(width-1), but written as a two-sided range
	    // test so that INT64_MIN is rejected instead of being negated, which
	    // would overflow.
	    const int64_t Imm = cast<ConstantSDNode>(Node)->getSExtValue();
	    const int64_t Limit = 1LL << (width - 1);
	    return Imm > -Limit && Imm < Limit;
	  }
	  }

	  return false;
	}

	// This function does a whole lot of voodoo to determine if the tests are
	// equivalent without and with a mask. Essentially what happens is that given a
	// DAG resembling:
	//
	// +-------------+ +-------------+ +-------------+ +-------------+
	// \| Input \| \| AddConstant \| \| CompConstant\| \| CC \|
	// +-------------+ +-------------+ +-------------+ +-------------+
	// \| \| \| \|
	// V V \| +----------+
	// +-------------+ +----+ \| \|
	// \| ADD \| \|0xff\| \| \|
	// +-------------+ +----+ \| \|
	// \| \| \| \|
	// V V \| \|
	// +-------------+ \| \|
	// \| AND \| \| \|
	// +-------------+ \| \|
	// \| \| \|
	// +-----+ \| \|
	// \| \| \|
	// V V V
	// +-------------+
	// \| CMP \|
	// +-------------+
	//
	// The AND node may be safely removed for some combinations of inputs. In
	// particular we need to take into account the extension type of the Input,
	// the exact values of AddConstant, CompConstant, and CC, along with the nominal
	// width of the input (this can work for inputs of any width; the above graph
	// is specific to 8 bits).
	//
	// The specific equations were worked out by generating output tables for each
	// AArch64CC value in terms of AddConstant (w1) and CompConstant (w2). The
	// problem was simplified by working with 4 bit inputs, which means we only
	// needed to reason about 24 distinct bit patterns: 8 patterns unique to zero
	// extension (8,15), 8 patterns unique to sign extensions (-8,-1), and 8
	// patterns present in both extensions (0,7). For every distinct set of
	// AddConstant and CompConstant bit patterns we can consider the masked and
	// unmasked versions to be equivalent if the result of this function is true for
	// all 16 distinct bit patterns of the current extension type of Input (w0).
	//
	// sub w8, w0, w1
	// and w10, w8, #0x0f
	// cmp w8, w2
	// cset w9, AArch64CC
	// cmp w10, w2
	// cset w11, AArch64CC
	// cmp w9, w11
	// cset w0, eq
	// ret
	//
	// Since the above function shows when the outputs are equivalent, it defines
	// when it is safe to remove the AND. Unfortunately it only runs on AArch64 and
	// would be expensive to run during compiles. The equations below were written
	// in a test harness that confirmed they gave equivalent outputs to the above
	// function for all inputs, so they can be used to determine if the removal is
	// legal instead.
	//
	// isEquivalentMaskless() is the code for testing if the AND can be removed
	// factored out of the DAG recognition as the DAG can take several forms.

	// Decides whether a comparison against CompConstant under condition code CC
	// gives the same answer with and without the `width`-bit mask applied to
	// (Input + AddConstant). Returns true when the mask may be dropped.
	static bool isEquivalentMaskless(unsigned CC, unsigned width,
	                                 ISD::LoadExtType ExtType, int AddConstant,
	                                 int CompConstant) {
	  // The equations are expressed only with symbolic values and a handful of
	  // well known constants (0, 1, -1, MaxUInt) so that they hold for every bit
	  // width. Note that MaxUInt is computed as 2^width.
	  const int MaxUInt = (1 << width);

	  // A sign extended input is treated as a zero extended one whose add
	  // constant has been displaced by half of the integer range; that way only
	  // the zero extended equations have to be written down.
	  if (ExtType == ISD::SEXTLOAD)
	    AddConstant -= (1 << (width-1));

	  switch(CC) {
	  case AArch64CC::LE:
	  case AArch64CC::GT:
	    return (AddConstant == 0) ||
	           (CompConstant == MaxUInt - 1 && AddConstant < 0) ||
	           (AddConstant >= 0 && CompConstant < 0) ||
	           (AddConstant <= 0 && CompConstant <= 0 &&
	            CompConstant < AddConstant);
	  case AArch64CC::LT:
	  case AArch64CC::GE:
	    return (AddConstant == 0) ||
	           (AddConstant >= 0 && CompConstant <= 0) ||
	           (AddConstant <= 0 && CompConstant <= 0 &&
	            CompConstant <= AddConstant);
	  case AArch64CC::HI:
	  case AArch64CC::LS:
	    return (AddConstant >= 0 && CompConstant < 0) ||
	           (AddConstant <= 0 && CompConstant >= -1 &&
	            CompConstant < AddConstant + MaxUInt);
	  case AArch64CC::PL:
	  case AArch64CC::MI:
	    return (AddConstant == 0) ||
	           (AddConstant > 0 && CompConstant <= 0) ||
	           (AddConstant < 0 && CompConstant <= AddConstant);
	  case AArch64CC::LO:
	  case AArch64CC::HS:
	    return (AddConstant >= 0 && CompConstant <= 0) ||
	           (AddConstant <= 0 && CompConstant >= 0 &&
	            CompConstant <= AddConstant + MaxUInt);
	  case AArch64CC::EQ:
	  case AArch64CC::NE:
	    return (AddConstant > 0 && CompConstant < 0) ||
	           (AddConstant < 0 && CompConstant >= 0 &&
	            CompConstant < AddConstant + MaxUInt) ||
	           (AddConstant >= 0 && CompConstant >= 0 &&
	            CompConstant >= AddConstant) ||
	           (AddConstant <= 0 && CompConstant < 0 &&
	            CompConstant < AddConstant);
	  case AArch64CC::VS:
	  case AArch64CC::VC:
	  case AArch64CC::AL:
	  case AArch64CC::NV:
	    return true;
	  case AArch64CC::Invalid:
	    break;
	  }

	  // Invalid or unrecognised condition codes never permit the removal.
	  return false;
	}

	// Tries to drop a redundant 0xff / 0xffff mask from the pattern
	//   (SUBS (AND (ADD x, AddImm), Mask), CmpImm)
	// feeding operand CmpIndex of N, where operand CCIndex of N holds the
	// condition code. On success every use of the old SUBS is redirected to a new
	// SUBS on the unmasked ADD and N itself is returned; otherwise an empty
	// SDValue is returned.
	static
	SDValue performCONDCombine(SDNode *N,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           SelectionDAG &DAG, unsigned CCIndex,
	                           unsigned CmpIndex) {
	  unsigned CC = cast<ConstantSDNode>(N->getOperand(CCIndex))->getSExtValue();
	  SDNode *SubsNode = N->getOperand(CmpIndex).getNode();
	  const unsigned CondOpcode = SubsNode->getOpcode();
	  if (CondOpcode != AArch64ISD::SUBS)
	    return SDValue();

	  // The SUBS must be fed by an AND whose mask selects exactly 8 or 16 bits.
	  SDNode *AndNode = SubsNode->getOperand(0).getNode();
	  if (AndNode->getOpcode() != ISD::AND)
	    return SDValue();

	  unsigned MaskBits = 0;
	  if (auto *MaskC = dyn_cast<ConstantSDNode>(AndNode->getOperand(1))) {
	    const uint32_t MaskImm = MaskC->getZExtValue();
	    switch (MaskImm) {
	    case 255:
	      MaskBits = 8;
	      break;
	    case 65535:
	      MaskBits = 16;
	      break;
	    default:
	      break;
	    }
	  }
	  if (MaskBits == 0)
	    return SDValue();

	  // ...and the AND must in turn be fed by an ADD.
	  SDValue AddValue = AndNode->getOperand(0);
	  if (AddValue.getOpcode() != ISD::ADD)
	    return SDValue();

	  // The DAG has the right shape; collect the leaves and validate them.
	  SDValue AddLHS = AddValue.getNode()->getOperand(0);
	  SDValue AddRHS = AddValue.getNode()->getOperand(1);
	  SDValue CmpRHS = SubsNode->getOperand(1);

	  // Both the add displacement and the compared value have to be constants.
	  auto *AddImm = dyn_cast<ConstantSDNode>(AddRHS.getNode());
	  auto *CmpImm = dyn_cast<ConstantSDNode>(CmpRHS.getNode());
	  if (!AddImm || !CmpImm)
	    return SDValue();

	  // Every leaf has to fit the mask width. The checks run in this order on
	  // purpose: each call overwrites ExtType, so the value that survives is the
	  // extension kind of the non-constant ADD input.
	  ISD::LoadExtType ExtType;
	  const bool WidthsOK = checkValueWidth(CmpRHS, MaskBits, ExtType) &&
	                        checkValueWidth(AddRHS, MaskBits, ExtType) &&
	                        checkValueWidth(AddLHS, MaskBits, ExtType);
	  if (!WidthsOK)
	    return SDValue();

	  if (!isEquivalentMaskless(CC, MaskBits, ExtType, AddImm->getSExtValue(),
	                            CmpImm->getSExtValue()))
	    return SDValue();

	  // The mask is superfluous: rebuild the SUBS directly on top of the ADD.
	  SDVTList VTs = DAG.getVTList(SubsNode->getValueType(0),
	                               SubsNode->getValueType(1));
	  SDValue Ops[] = { AddValue, CmpRHS };
	  SDValue Unmasked = DAG.getNode(CondOpcode, SDLoc(SubsNode), VTs, Ops);
	  DAG.ReplaceAllUsesWith(SubsNode, Unmasked.getNode());

	  return SDValue(N, 0);
	}

	// Folds an EQ/NE conditional branch on a compare-with-zero into CBZ / CBNZ.
	// The replacement is installed through DCI.CombineTo, so this function always
	// returns an empty SDValue.
	static SDValue performBRCONDCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI,
	                                    SelectionDAG &DAG) {
	  // Give the mask-removal combine a first shot at the node.
	  if (SDValue NV = performCONDCombine(N, DCI, DAG, 2, 3))
	    N = NV.getNode();

	  SDValue CCVal = N->getOperand(2);
	  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
	  const unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
	  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
	    return SDValue();

	  SDValue Cmp = N->getOperand(3);
	  switch (Cmp.getOpcode()) {
	  case AArch64ISD::ADDS:
	  case AArch64ISD::SUBS:
	    break;
	  default:
	    return SDValue();
	  }

	  // The arithmetic result must be dead and the flags must have a single user.
	  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
	    return SDValue();

	  SDValue LHS = Cmp.getOperand(0);
	  SDValue RHS = Cmp.getOperand(1);

	  assert(LHS.getValueType() == RHS.getValueType() &&
	         "Expected the value type to be the same for both operands!");
	  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
	    return SDValue();

	  // Canonicalise so that a zero operand, if any, ends up on the right.
	  if (isNullConstant(LHS))
	    std::swap(LHS, RHS);
	  if (!isNullConstant(RHS))
	    return SDValue();

	  // Shifted operands are left alone.
	  switch (LHS.getOpcode()) {
	  case ISD::SHL:
	  case ISD::SRA:
	  case ISD::SRL:
	    return SDValue();
	  default:
	    break;
	  }

	  // Fold the compare into the branch instruction.
	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(1);
	  const unsigned BrOpc =
	      (CC == AArch64CC::EQ) ? AArch64ISD::CBZ : AArch64ISD::CBNZ;
	  SDValue BR = DAG.getNode(BrOpc, SDLoc(N), MVT::Other, Chain, LHS, Dest);

	  // Do not add new nodes to DAG combiner worklist.
	  DCI.CombineTo(N, BR, false);

	  return SDValue();
	}

	// Looks through simple single-use operations feeding a tbz/tbnz and returns
	// the innermost value that can be tested instead. Bit is updated to the bit
	// to test on the returned value, and Invert is flipped whenever the sense of
	// the test has to be reversed. This has to be done here (rather than by the
	// generic DAG combines) because AArch64ISD::TBZ is matched during
	// legalization.
	static SDValue getTestBitOperand(SDValue Op, unsigned &Bit, bool &Invert,
	                                 SelectionDAG &DAG) {
	  // Each iteration either peels one operation off Op and goes round again, or
	  // returns the current Op when it cannot be simplified any further.
	  for (;;) {
	    if (!Op->hasOneUse())
	      return Op;

	    // Undef and constant-foldable inputs (and of 0, test of undefined shifted
	    // bits, ...) are not handled; they should already have been simplified.

	    // (tbz (trunc x), b) -> (tbz x, b)
	    // Only here so that more of the cases below get a chance to match.
	    if (Op->getOpcode() == ISD::TRUNCATE &&
	        Bit < Op->getValueType(0).getSizeInBits()) {
	      Op = Op->getOperand(0);
	      continue;
	    }

	    if (Op->getNumOperands() != 2)
	      return Op;

	    auto *C = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	    if (!C)
	      return Op;

	    switch (Op->getOpcode()) {
	    default:
	      return Op;

	    // (tbz (and x, m), b) -> (tbz x, b)
	    case ISD::AND: {
	      if (!((C->getZExtValue() >> Bit) & 1))
	        return Op;
	      Op = Op->getOperand(0);
	      continue;
	    }

	    // (tbz (shl x, c), b) -> (tbz x, b-c)
	    case ISD::SHL: {
	      const uint64_t Shift = C->getZExtValue();
	      if (Shift <= Bit &&
	          (Bit - Shift) < Op->getValueType(0).getSizeInBits()) {
	        Bit = Bit - Shift;
	        Op = Op->getOperand(0);
	        continue;
	      }
	      return Op;
	    }

	    // (tbz (sra x, c), b) -> (tbz x, b+c), or (tbz x, msb) if b+c is > # bits
	    // in x
	    case ISD::SRA: {
	      Bit = Bit + C->getZExtValue();
	      if (Bit >= Op->getValueType(0).getSizeInBits())
	        Bit = Op->getValueType(0).getSizeInBits() - 1;
	      Op = Op->getOperand(0);
	      continue;
	    }

	    // (tbz (srl x, c), b) -> (tbz x, b+c)
	    case ISD::SRL: {
	      const uint64_t Shift = C->getZExtValue();
	      if ((Bit + Shift) < Op->getValueType(0).getSizeInBits()) {
	        Bit = Bit + Shift;
	        Op = Op->getOperand(0);
	        continue;
	      }
	      return Op;
	    }

	    // (tbz (xor x, -1), b) -> (tbnz x, b)
	    case ISD::XOR: {
	      if ((C->getZExtValue() >> Bit) & 1)
	        Invert = !Invert;
	      Op = Op->getOperand(0);
	      continue;
	    }
	    }
	  }
	}

	// Simplifies a TBZ/TBNZ by testing a bit of a simpler source value when
	// getTestBitOperand can find one. Returns the rebuilt branch, or an empty
	// SDValue when the tested operand is unchanged.
	static SDValue performTBZCombine(SDNode *N,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 SelectionDAG &DAG) {
	  unsigned Bit = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
	  bool Invert = false;

	  SDValue OldSrc = N->getOperand(1);
	  SDValue NewSrc = getTestBitOperand(OldSrc, Bit, Invert, DAG);
	  if (OldSrc == NewSrc)
	    return SDValue();

	  // Swap TBZ <-> TBNZ when the look-through inverted the tested bit.
	  unsigned NewOpc = N->getOpcode();
	  if (Invert) {
	    if (NewOpc != AArch64ISD::TBZ) {
	      assert(NewOpc == AArch64ISD::TBNZ);
	      NewOpc = AArch64ISD::TBZ;
	    } else {
	      NewOpc = AArch64ISD::TBNZ;
	    }
	  }

	  SDLoc DL(N);
	  SDValue Chain = N->getOperand(0);
	  SDValue Dest = N->getOperand(3);
	  SDValue BitNo = DAG.getConstant(Bit, DL, MVT::i64);
	  return DAG.getNode(NewOpc, DL, MVT::Other, Chain, NewSrc, BitNo, Dest);
	}

	// vselect (v1i1 setcc) ->
	//     vselect (v1iXX setcc)  (XX is the size of the compared operand type)
	// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
	// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
	// such VSELECT.
	static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  EVT CCVT = Cond.getValueType();

	  // Only a v1i1 SETCC condition is of interest.
	  if (Cond.getOpcode() != ISD::SETCC)
	    return SDValue();
	  if (CCVT.getVectorNumElements() != 1)
	    return SDValue();
	  if (CCVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  // The rewrite is only done when the select result is exactly as wide as
	  // the operands being compared.
	  EVT ResVT = N->getValueType(0);
	  EVT CmpVT = Cond.getOperand(0).getValueType();
	  if (ResVT.getSizeInBits() != CmpVT.getSizeInBits())
	    return SDValue();

	  // Re-issue the comparison with an integer vector result of the compared
	  // type, then select on that.
	  SDValue TrueVal = N->getOperand(1);
	  SDValue FalseVal = N->getOperand(2);
	  ISD::CondCode Pred = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	  SDValue WideCC =
	      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
	                   Cond.getOperand(0), Cond.getOperand(1), Pred);
	  return DAG.getNode(ISD::VSELECT, SDLoc(N), ResVT, WideCC, TrueVal, FalseVal);
	}

	/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
	/// the compare-mask instructions rather than going via NZCV, even if LHS and
	/// RHS are really scalar. This replaces any scalar setcc in the above pattern
	/// with a vector one followed by a DUP shuffle on the result.
	static SDValue performSelectCombine(SDNode *N,
	                                    TargetLowering::DAGCombinerInfo &DCI) {
	  SelectionDAG &DAG = DCI.DAG;
	  SDValue Cond = N->getOperand(0);
	  EVT ResVT = N->getValueType(0);

	  if (Cond.getOpcode() != ISD::SETCC)
	    return SDValue();

	  // The SETCC result is expected to be i1 (initial DAG) or i32 (the lowered
	  // scalar SetCCResultType). Vector results are not expected because selects
	  // fed by vector SETCCs are assumed to be canonicalized to VSELECT.
	  assert((Cond.getValueType() == MVT::i1 || Cond.getValueType() == MVT::i32) &&
	         "Scalar-SETCC feeding SELECT has unexpected result type!");

	  EVT SrcVT = Cond.getOperand(0).getValueType();

	  // A setcc on i1 operands is not worth vectorising: there are no legal
	  // vectors of i1.
	  if (SrcVT == MVT::i1)
	    return SDValue();

	  // Zero mask elements means the comparison is wider than the select result.
	  // The largest real NEON comparison is 64 bits per lane, so the result would
	  // be at most 32 bits and an illegal vector. Bail out for now.
	  int NumMaskElts = ResVT.getSizeInBits() / SrcVT.getSizeInBits();
	  if (!ResVT.isVector() || NumMaskElts == 0)
	    return SDValue();

	  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumMaskElts);
	  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();

	  // The vector condition type must cover ResVT exactly; it will not when the
	  // SETCC operand size does not divide the ResVT size (e.g., f64 vs v3f32).
	  if (CCVT.getSizeInBits() != ResVT.getSizeInBits())
	    return SDValue();

	  // Make sure we didn't create illegal types, if we're not supposed to.
	  assert(DCI.isBeforeLegalize() ||
	         DAG.getTargetLoweringInfo().isTypeLegal(SrcVT));

	  // Do the comparison as a vector operation; only lane 0 carries the scalars
	  // we actually care about.
	  SDLoc DL(Cond);
	  SDValue VecLHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, Cond.getOperand(0));
	  SDValue VecRHS =
	      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, Cond.getOperand(1));
	  SDValue VecCC =
	      DAG.getNode(ISD::SETCC, DL, CCVT, VecLHS, VecRHS, Cond.getOperand(2));

	  // Broadcast lane 0 of the comparison mask to every other lane, then view it
	  // as an integer vector shaped like the result.
	  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
	  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, VecCC, VecCC, DUPMask);
	  Mask = DAG.getNode(ISD::BITCAST, DL,
	                     ResVT.changeVectorElementTypeToInteger(), Mask);

	  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
	}

	/// Drops an NVCAST whose operand already has the result type; returns the
	/// operand in that case and an empty SDValue otherwise.
	static SDValue performNVCASTCombine(SDNode *N) {
	  SDValue Src = N->getOperand(0);
	  const bool IsNoOp = N->getValueType(0) == Src.getValueType();
	  return IsNoOp ? Src : SDValue();
	}

	// Target hook: dispatches N to the AArch64-specific combine that handles its
	// opcode. Returns whatever that combine returns, or an empty SDValue when no
	// combine is registered for the opcode.
	SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
	                                                 DAGCombinerInfo &DCI) const {
	  SelectionDAG &DAG = DCI.DAG;
	  const unsigned Opcode = N->getOpcode();

	  switch (Opcode) {
	  // Generic arithmetic / logic nodes.
	  case ISD::ADD:
	  case ISD::SUB:
	    return performAddSubLongCombine(N, DCI, DAG);
	  case ISD::XOR:
	    return performXorCombine(N, DAG, DCI, Subtarget);
	  case ISD::MUL:
	    return performMulCombine(N, DAG, DCI, Subtarget);
	  case ISD::OR:
	    return performORCombine(N, DCI, Subtarget);
	  case ISD::SRL:
	    return performSRLCombine(N, DCI);

	  // Floating point and conversions.
	  case ISD::SINT_TO_FP:
	  case ISD::UINT_TO_FP:
	    return performIntToFpCombine(N, DAG, Subtarget);
	  case ISD::FP_TO_SINT:
	  case ISD::FP_TO_UINT:
	    return performFpToIntCombine(N, DAG, DCI, Subtarget);
	  case ISD::FDIV:
	    return performFDivCombine(N, DAG, DCI, Subtarget);

	  // Extensions, casts and vector shuffling.
	  case ISD::ANY_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::SIGN_EXTEND:
	    return performExtendCombine(N, DCI, DAG);
	  case ISD::BITCAST:
	    return performBitcastCombine(N, DCI, DAG);
	  case ISD::CONCAT_VECTORS:
	    return performConcatVectorsCombine(N, DCI, DAG);
	  case ISD::INSERT_VECTOR_ELT:
	    return performPostLD1Combine(N, DCI, true);
	  case AArch64ISD::DUP:
	    return performPostLD1Combine(N, DCI, false);
	  case AArch64ISD::NVCAST:
	    return performNVCASTCombine(N);

	  // Selects.
	  case ISD::SELECT:
	    return performSelectCombine(N, DCI);
	  case ISD::VSELECT:
	    return performVSelectCombine(N, DAG);
	  case AArch64ISD::CSEL:
	    return performCONDCombine(N, DCI, DAG, 2, 3);

	  // Memory operations.
	  case ISD::LOAD:
	    if (performTBISimplification(N->getOperand(1), DCI, DAG))
	      return SDValue(N, 0);
	    break;
	  case ISD::STORE:
	    return performSTORECombine(N, DCI, DAG, Subtarget);

	  // Branches.
	  case AArch64ISD::BRCOND:
	    return performBRCONDCombine(N, DCI, DAG);
	  case AArch64ISD::TBNZ:
	  case AArch64ISD::TBZ:
	    return performTBZCombine(N, DCI, DAG);

	  // Intrinsics without a chain have their own dispatcher.
	  case ISD::INTRINSIC_WO_CHAIN:
	    return performIntrinsicCombine(N, DCI, Subtarget);

	  // Chained intrinsics: only the NEON structured loads/stores are combined.
	  case ISD::INTRINSIC_VOID:
	  case ISD::INTRINSIC_W_CHAIN: {
	    const uint64_t IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	    switch (IID) {
	    case Intrinsic::aarch64_neon_ld2:
	    case Intrinsic::aarch64_neon_ld3:
	    case Intrinsic::aarch64_neon_ld4:
	    case Intrinsic::aarch64_neon_ld1x2:
	    case Intrinsic::aarch64_neon_ld1x3:
	    case Intrinsic::aarch64_neon_ld1x4:
	    case Intrinsic::aarch64_neon_ld2lane:
	    case Intrinsic::aarch64_neon_ld3lane:
	    case Intrinsic::aarch64_neon_ld4lane:
	    case Intrinsic::aarch64_neon_ld2r:
	    case Intrinsic::aarch64_neon_ld3r:
	    case Intrinsic::aarch64_neon_ld4r:
	    case Intrinsic::aarch64_neon_st2:
	    case Intrinsic::aarch64_neon_st3:
	    case Intrinsic::aarch64_neon_st4:
	    case Intrinsic::aarch64_neon_st1x2:
	    case Intrinsic::aarch64_neon_st1x3:
	    case Intrinsic::aarch64_neon_st1x4:
	    case Intrinsic::aarch64_neon_st2lane:
	    case Intrinsic::aarch64_neon_st3lane:
	    case Intrinsic::aarch64_neon_st4lane:
	      return performNEONPostLDSTCombine(N, DCI, DAG);
	    default:
	      break;
	    }
	    break;
	  }

	  default:
	    break;
	  }

	  return SDValue();
	}

	// Check if the return value is used as only a return value, as otherwise
	// we can't perform a tail-call. In particular, we need to check for
	// target ISD nodes that are returns and any other "odd" constructs
	// that the generic analysis code won't necessarily catch.
	//
	// \param N      The node producing the value; it must define exactly one
	//               value and that value must have exactly one use.
	// \param Chain  In: the current chain. Out (only on success): the chain to
	//               use for the tail call, which is the CopyToReg's incoming
	//               chain when the single user is a CopyToReg.
	// \returns true if the single user is a glue-free CopyToReg or an FP_EXTEND
	//          whose users are all AArch64ISD::RET_FLAG nodes (at least one).
	bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
	                                               SDValue &Chain) const {
	  if (N->getNumValues() != 1)
	    return false;
	  if (!N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // The single user of N. The iterator has to be dereferenced to obtain the
	  // user node itself.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
	        MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  // Every user of the copy must be a return, and there must be at least one.
	  bool HasRet = false;
	  for (SDNode *Node : Copy->uses()) {
	    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
	      return false;
	    HasRet = true;
	  }

	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	// Return whether an instruction can potentially be optimized to a tail
	// call. This will cause the optimizers to attempt to move, or duplicate,
	// return instructions to help enable tail call optimizations for this
	// instruction.
	bool AArch64TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  // The answer is simply whether the call itself is marked as a tail call.
	  const bool MarkedTail = CI->isTailCall();
	  return MarkedTail;
	}

	// Splits an ADD/SUB address computation into the pieces needed for an
	// indexed load/store.
	//
	// Base is written as soon as Op is known to be an ADD or SUB, even if the
	// function later fails. Offset and IsInc are only written on success. AM is
	// not touched here. Returns true when operand 1 of Op is a constant whose
	// (negated, for SUB) value fits a signed 9 bit immediate.
	bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
	                                                   SDValue &Offset,
	                                                   ISD::MemIndexedMode &AM,
	                                                   bool &IsInc,
	                                                   SelectionDAG &DAG) const {
	  const bool IsAdd = Op->getOpcode() == ISD::ADD;
	  const bool IsSub = Op->getOpcode() == ISD::SUB;
	  if (!IsAdd && !IsSub)
	    return false;

	  Base = Op->getOperand(0);

	  // All of the indexed addressing mode instructions take a signed
	  // 9 bit immediate offset.
	  auto *Imm = dyn_cast<ConstantSDNode>(Op->getOperand(1));
	  if (!Imm)
	    return false;

	  int64_t Delta = Imm->getSExtValue();
	  if (IsSub)
	    Delta = -(uint64_t)Delta; // negate in unsigned arithmetic
	  if (!isInt<9>(Delta))
	    return false;

	  IsInc = IsAdd;
	  Offset = Op->getOperand(1);
	  return true;
	}

	// Determines whether the load or store N can use pre-indexed addressing.
	//
	// \param N       The memory node; anything other than a LoadSDNode or
	//                StoreSDNode is rejected.
	// \param Base    Out: base of the address computation.
	// \param Offset  Out: constant offset operand of the address computation.
	// \param AM      Out: ISD::PRE_INC or ISD::PRE_DEC; only written on success.
	// \returns true if N's base pointer can be decomposed by
	//          getIndexedAddressParts.
	bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
	                                                      SDValue &Offset,
	                                                      ISD::MemIndexedMode &AM,
	                                                      SelectionDAG &DAG) const {
	  SDValue Ptr;
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	    Ptr = LD->getBasePtr();
	  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	    Ptr = ST->getBasePtr();
	  } else
	    return false;

	  bool IsInc;
	  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
	    return false;
	  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
	  return true;
	}

	bool AArch64TargetLowering::getPostIndexedAddressParts(
	SDNode N, SDNode Op, SDValue &Base, SDValue &Offset,
	ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
	EVT VT;
	SDValue Ptr;
	if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
	VT = LD->getMemoryVT();
	Ptr = LD->getBasePtr();
	} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
	VT = ST->getMemoryVT();
	Ptr = ST->getBasePtr();
	} else
	return false;

	bool IsInc;
	if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
	return false;
	// Post-indexing updates the base, so it's not a valid transform
	// if that's not the same as the load's pointer.
	if (Ptr != Base)
	return false;
	AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
	return true;
	}

	// Custom result replacement for a BITCAST from f16 to i16: the f16 value is
	// inserted into the hsub subregister of an undefined f32, bitcast to i32 and
	// truncated to i16. Any other type combination leaves Results untouched.
	static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
	                                  SelectionDAG &DAG) {
	  SDValue Src = N->getOperand(0);
	  const bool IsF16ToI16 =
	      N->getValueType(0) == MVT::i16 && Src.getValueType() == MVT::f16;
	  if (!IsF16ToI16)
	    return;

	  SDLoc DL(N);
	  SDNode *Widened =
	      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::f32,
	                         DAG.getUNDEF(MVT::i32), Src,
	                         DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32));
	  SDValue AsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SDValue(Widened, 0));
	  Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, AsInt));
	}

	// Replaces an across-lanes reduction on a vector that has to be split: the
	// two halves of operand 0 are first combined lane-wise with InterOp, then
	// AcrossOp is applied to that half-width value. The single result is appended
	// to Results.
	static void ReplaceReductionResults(SDNode *N,
	                                    SmallVectorImpl<SDValue> &Results,
	                                    SelectionDAG &DAG, unsigned InterOp,
	                                    unsigned AcrossOp) {
	  SDLoc dl(N);

	  // Only the low half's type is needed; both intermediate nodes use it.
	  const EVT HalfVT = DAG.GetSplitDestVTs(N->getValueType(0)).first;
	  const std::pair<SDValue, SDValue> Halves = DAG.SplitVectorOperand(N, 0);

	  SDValue Combined =
	      DAG.getNode(InterOp, dl, HalfVT, Halves.first, Halves.second);
	  Results.push_back(DAG.getNode(AcrossOp, dl, HalfVT, Combined));
	}

	// Splits the i128 value N into its low and high i64 halves, returned as
	// (Lo, Hi).
	static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
	  SDLoc DL(N);

	  // Low half: plain truncation.
	  SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, N);

	  // High half: shift the upper 64 bits down, then truncate.
	  SDValue ShiftAmt = DAG.getConstant(64, DL, MVT::i64);
	  SDValue Upper = DAG.getNode(ISD::SRL, DL, MVT::i128, N, ShiftAmt);
	  SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i64, Upper);

	  return {Lo, Hi};
	}

	// Replaces a 128-bit ATOMIC_CMP_SWAP with the CMP_SWAP_128 machine node. The
	// i128 operands 2 and 3 are each split into two i64 halves, and N's memory
	// operand is attached to the new node. Results receives values 0, 1 and 3 of
	// the new node (its two i64 results and the chain; value 2, the i32, is not
	// forwarded).
	static void ReplaceCMP_SWAP_128Results(SDNode *N,
	                                       SmallVectorImpl<SDValue> & Results,
	                                       SelectionDAG &DAG) {
	  assert(N->getValueType(0) == MVT::i128 &&
	         "AtomicCmpSwap on types less than 128 should be legal");

	  // Operand layout of N as used here: 0 = chain, 1 = address,
	  // 2 and 3 = the two i128 values.
	  const std::pair<SDValue, SDValue> Desired = splitInt128(N->getOperand(2), DAG);
	  const std::pair<SDValue, SDValue> New = splitInt128(N->getOperand(3), DAG);

	  SDValue Addr = N->getOperand(1);
	  SDValue InChain = N->getOperand(0);
	  SDValue Ops[] = {Addr,      Desired.first, Desired.second,
	                   New.first, New.second,    InChain};

	  SDVTList ResultTys =
	      DAG.getVTList(MVT::i64, MVT::i64, MVT::i32, MVT::Other);
	  SDNode *CmpSwap =
	      DAG.getMachineNode(AArch64::CMP_SWAP_128, SDLoc(N), ResultTys, Ops);

	  // Carry the original memory operand over to the machine node.
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1);
	  MemOp[0] = cast<MemSDNode>(N)->getMemOperand();
	  cast<MachineSDNode>(CmpSwap)->setMemRefs(MemOp, MemOp + 1);

	  for (unsigned ResNo : {0u, 1u, 3u})
	    Results.push_back(SDValue(CmpSwap, ResNo));
	}

	// Target hook: produces replacement values for the results of N, dispatching
	// on N's opcode. Replacement values are appended to Results; leaving Results
	// empty hands the node back to the generic code. Opcodes not listed here are
	// a programming error.
	void AArch64TargetLowering::ReplaceNodeResults(
	    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
	  const unsigned Opcode = N->getOpcode();

	  switch (Opcode) {
	  case ISD::BITCAST:
	    ReplaceBITCASTResults(N, Results, DAG);
	    return;

	  case ISD::VECREDUCE_ADD:
	  case ISD::VECREDUCE_SMAX:
	  case ISD::VECREDUCE_SMIN:
	  case ISD::VECREDUCE_UMAX:
	  case ISD::VECREDUCE_UMIN:
	    Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
	    return;

	  // Across-lanes reductions: combine the two halves with the matching
	  // lane-wise operation, then re-apply the node's own opcode to the result.
	  case AArch64ISD::SADDV:
	  case AArch64ISD::UADDV:
	    ReplaceReductionResults(N, Results, DAG, ISD::ADD, Opcode);
	    return;
	  case AArch64ISD::SMINV:
	    ReplaceReductionResults(N, Results, DAG, ISD::SMIN, Opcode);
	    return;
	  case AArch64ISD::UMINV:
	    ReplaceReductionResults(N, Results, DAG, ISD::UMIN, Opcode);
	    return;
	  case AArch64ISD::SMAXV:
	    ReplaceReductionResults(N, Results, DAG, ISD::SMAX, Opcode);
	    return;
	  case AArch64ISD::UMAXV:
	    ReplaceReductionResults(N, Results, DAG, ISD::UMAX, Opcode);
	    return;

	  case ISD::FP_TO_UINT:
	  case ISD::FP_TO_SINT:
	    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
	    // Let normal code take care of it by not adding anything to Results.
	    return;

	  case ISD::ATOMIC_CMP_SWAP:
	    ReplaceCMP_SWAP_128Results(N, Results, DAG);
	    return;

	  default:
	    llvm_unreachable("Don't know how to custom expand this");
	  }
	}

	// Android and Fuchsia targets defer to the generic TargetLowering answer;
	// every other target reports true.
	bool AArch64TargetLowering::useLoadStackGuardNode() const {
	  const bool DefersToGeneric =
	      Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia();
	  return DefersToGeneric ? TargetLowering::useLoadStackGuardNode() : true;
	}

	unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
	  // FDIVs sharing a divisor are rewritten as FMULs by the reciprocal once
	  // there are at least this many of them.
	  const unsigned MinRepeatedFDivs = 3;
	  return MinRepeatedFDivs;
	}

	// Chooses the type legalization action for vector type VT: the single-element
	// types v1i8, v1i16, v1i32 and v1f32 are widened; everything else uses the
	// generic preference.
	TargetLoweringBase::LegalizeTypeAction
	AArch64TargetLowering::getPreferredVectorAction(EVT VT) const {
	  switch (VT.getSimpleVT().SimpleTy) {
	  // During type legalization, we prefer to widen v1i8, v1i16, v1i32 to v8i8,
	  // v4i16, v2i32 instead of to promote.
	  case MVT::v1i8:
	  case MVT::v1i16:
	  case MVT::v1i32:
	  case MVT::v1f32:
	    return TypeWidenVector;
	  default:
	    return TargetLoweringBase::getPreferredVectorAction(VT);
	  }
	}

	// Stores narrower than 128 bits are already atomic, and anything wider is
	// left to the default libcall, so only an exactly 128-bit store is expanded
	// in IR.
	bool AArch64TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  const unsigned Bits = StoredTy->getPrimitiveSizeInBits();
	  return Bits == 128;
	}

	// Loads narrower than 128 bits are already atomic, and anything wider is
	// left to the default libcall, so only an exactly 128-bit load is expanded
	// (to a load-linked / store-conditional sequence).
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  const unsigned Bits = LI->getType()->getPrimitiveSizeInBits();
	  if (Bits == 128)
	    return AtomicExpansionKind::LLSC;
	  return AtomicExpansionKind::None;
	}

	// Chooses how an atomicrmw is expanded. ldxr/stxr cover operations up to 128
	// bits; anything wider is not expanded here.
	TargetLowering::AtomicExpansionKind
	AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  const unsigned Bits = AI->getType()->getPrimitiveSizeInBits();
	  if (Bits > 128)
	    return AtomicExpansionKind::None;

	  // LL/SC is needed for Nand (not supported in LSE), for 128-bit operations
	  // (left to LL/SC), and whenever the subtarget lacks LSE.
	  const bool IsNand = AI->getOperation() == AtomicRMWInst::Nand;
	  if (IsNand || Bits == 128 || !Subtarget->hasLSE())
	    return AtomicExpansionKind::LLSC;

	  // LSE subtarget and a sub-128-bit operation: leave it intact.
	  return AtomicExpansionKind::None;
	}

	// Decides whether a cmpxchg is expanded in IR.
	bool AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
	    AtomicCmpXchgInst *AI) const {
	  // With LSE available the cmpxchg is left intact for codegen.
	  const bool HasLSE = Subtarget->hasLSE();
	  if (HasLSE)
	    return false;

	  // At -O0, fast-regalloc cannot cope with the live vregs necessary to
	  // implement cmpxchg without spilling. If the address being exchanged is
	  // also on the stack and close enough to the spill slot, the monitor can end
	  // up being cleared every time so that the atomic operation never succeeds.
	  // Hence at -O0 a late-expanded pseudo-inst is used instead, and the IR
	  // expansion is only requested when optimizing.
	  const bool IsOptimizing = getTargetMachine().getOptLevel() != 0;
	  return IsOptimizing;
	}

	// Emits a load-linked (load-exclusive) of the value pointed to by Addr.
	//
	// \param Builder  IR builder positioned where the load should be emitted.
	// \param Addr     Pointer to the value; its pointee type is the result type.
	// \param Ord      Ordering; acquire-or-stronger selects the ldaxr/ldaxp
	//                 intrinsics, otherwise ldxr/ldxp are used.
	// \returns the loaded value, of the pointee type of Addr.
	Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
	                                             AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
	  bool IsAcquire = isAcquireOrStronger(Ord);

	  // Since i128 isn't legal and intrinsics don't get type-lowered, the
	  // ldxp/ldaxp intrinsic must return {i64, i64} and we have to recombine them
	  // into a single i128 here.
	  if (ValTy->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
	    Function *Ldxr = Intrinsic::getDeclaration(M, Int);

	    // The pair intrinsics take an i8* address.
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");

	    // Widen both halves to the full type and merge them: Lo | (Hi << 64).
	    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
	    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
	    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
	    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
	    return Builder.CreateOr(
	        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
	  }

	  // Narrower types: the intrinsic is overloaded on the pointer type, and its
	  // result is truncated or bitcast back to the pointee type.
	  Type *Tys[] = { Addr->getType() };
	  Intrinsic::ID Int =
	      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
	  Function *Ldxr = Intrinsic::getDeclaration(M, Int, Tys);

	  return Builder.CreateTruncOrBitCast(Builder.CreateCall(Ldxr, Addr), ValTy);
	}

	// Emit a call to the aarch64 clrex intrinsic at the builder's current
	// insertion point.
	void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance(
	    IRBuilder<> &Builder) const {
	  Module *Mod = Builder.GetInsertBlock()->getParent()->getParent();
	  Function *ClearExclusive =
	      Intrinsic::getDeclaration(Mod, Intrinsic::aarch64_clrex);
	  Builder.CreateCall(ClearExclusive);
	}

	// Emit a store-conditional (exclusive store) of Val to Addr and return the
	// call to the store-exclusive intrinsic. Release-or-stronger orderings
	// select the releasing intrinsics (stlxp/stlxr) instead of stxp/stxr.
	//
	// Val and Addr are IR values handled by pointer: the body dereferences both.
	Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
	                                                   Value *Val, Value *Addr,
	                                                   AtomicOrdering Ord) const {
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  bool IsRelease = isReleaseOrStronger(Ord);

	  // Since the intrinsics must have legal type, the i128 intrinsics take two
	  // parameters: "i64, i64". We must marshal Val into the appropriate form
	  // before the call.
	  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
	    Intrinsic::ID Int =
	        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
	    Function *Stxr = Intrinsic::getDeclaration(M, Int);
	    Type *Int64Ty = Type::getInt64Ty(M->getContext());

	    // Split into the low 64 bits and the high 64 bits (Val >> 64).
	    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
	    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
	    // The pair intrinsics take an i8* address.
	    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
	    return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
	  }

	  // Narrower accesses use the single-register intrinsic, overloaded on the
	  // pointer type.
	  Intrinsic::ID Int =
	      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
	  Type *Tys[] = { Addr->getType() };
	  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);

	  // Widen (or bitcast) Val to the intrinsic's first parameter type.
	  return Builder.CreateCall(Stxr,
	                            {Builder.CreateZExtOrBitCast(
	                                 Val, Stxr->getFunctionType()->getParamType(0)),
	                             Addr});
	}

	// Only array-typed arguments request consecutive registers; the calling
	// convention and the variadic flag are not consulted.
	bool AArch64TargetLowering::functionArgumentNeedsConsecutiveRegisters(
	    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
	  const bool IsArrayArg = Ty->isArrayTy();
	  return IsArrayArg;
	}

	// Always answers "no", regardless of the context or value type passed in
	// (both parameters are unnamed and unused).
	bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
	EVT) const {
	return false;
	}

	// Build the address of a slot at a fixed offset from the thread pointer and
	// return it cast to an i8** in address space 0. Offset is applied as a
	// single GEP index on the thread-pointer intrinsic's result (a byte offset,
	// assuming that intrinsic yields an i8* - confirm).
	static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
	  Module *Mod = IRB.GetInsertBlock()->getParent()->getParent();
	  Function *GetThreadPointer =
	      Intrinsic::getDeclaration(Mod, Intrinsic::thread_pointer);

	  Value *ThreadPtr = IRB.CreateCall(GetThreadPointer);
	  Value *SlotAddr = IRB.CreateConstGEP1_32(ThreadPtr, Offset);
	  Type *SlotPtrTy = Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0);
	  return IRB.CreatePointerCast(SlotAddr, SlotPtrTy);
	}

	// Return the location of the stack guard. Android and Fuchsia keep it in a
	// fixed TLS slot; every other target defers to the generic implementation.
	Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  // Android: see the definition of TLS_SLOT_STACK_GUARD in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  const int AndroidStackGuardOffset = 0x28;
	  // Fuchsia: <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this
	  // value.
	  const int FuchsiaStackGuardOffset = -0x10;

	  if (Subtarget->isTargetAndroid()) {
	    return UseTlsOffset(IRB, AndroidStackGuardOffset);
	  } else if (Subtarget->isTargetFuchsia()) {
	    return UseTlsOffset(IRB, FuchsiaStackGuardOffset);
	  }

	  return TargetLowering::getIRStackGuard(IRB);
	}

	// Return the location of the SafeStack pointer. Android and Fuchsia keep it
	// in a fixed TLS slot; every other target defers to the generic
	// implementation.
	Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  // Android: see the definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  const int AndroidSafeStackOffset = 0x48;
	  // Fuchsia: <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
	  const int FuchsiaUnsafeSPOffset = -0x8;

	  if (Subtarget->isTargetAndroid()) {
	    return UseTlsOffset(IRB, AndroidSafeStackOffset);
	  } else if (Subtarget->isTargetFuchsia()) {
	    return UseTlsOffset(IRB, FuchsiaUnsafeSPOffset);
	  }

	  return TargetLowering::getSafeStackPointerLocation(IRB);
	}

	// Report whether sinking an 'and' mask into the block of its compare user
	// is worthwhile.
	bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
	    const Instruction &AndI) const {
	  // Sinking is only requested when the mask is a constant with a single bit
	  // set, since the and/cmp/br is then likely to fold into one tbz
	  // instruction. Other masks might benefit too, but that would require
	  // checking that the cmp is not folded into the br to form a cbz.
	  if (auto *SingleBitMask = dyn_cast<ConstantInt>(AndI.getOperand(1)))
	    return SingleBitMask->getUniqueInteger().isPowerOf2();
	  return false;
	}

	// Mark the function that owns Entry as using split callee-saved-register
	// handling.
	void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  // Set IsSplitCSR in the function's AArch64FunctionInfo.
	  auto *ParentFn = Entry->getParent();
	  ParentFn->getInfo<AArch64FunctionInfo>()->setIsSplitCSR(true);
	}

	// For every callee-saved register handled "via copy", save it into a fresh
	// virtual register at the start of Entry and copy it back into the physical
	// register right before the first terminator of each block in Exits.
	// Does nothing when the register info reports no such registers.
	void AArch64TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const AArch64RegisterInfo *TRI = Subtarget->getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // IStart points at a register list terminated by a 0 entry: walk it with
	  // a pointer and stop at the terminator.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (AArch64::GPR64RegClass.contains(*I))
	      RC = &AArch64::GPR64RegClass;
	    else if (AArch64::FPR64RegClass.contains(*I))
	      RC = &AArch64::FPR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction()->hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	// Integer division is treated as cheap only for scalar types in functions
	// marked minsize.
	bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // AArch64 has no vector integer division, so a vector divide would have to
	  // be scalarized; the alternative sequence can stay in vector form and wins
	  // even on size.
	  if (VT.isVector())
	    return false;

	  // Integer division is expensive, but when aggressively optimizing for
	  // code size a div instruction is usually smaller than the alternative
	  // sequence.
	  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
	}

	// Size in bits of the va_list type for this subtarget. Darwin and Windows
	// targets use a single pointer; every other target uses three pointers plus
	// 2 * 32 bits (presumably the AAPCS64 va_list layout - confirm).
	unsigned
	AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
	  if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows())
	    return getPointerTy(DL).getSizeInBits();

	  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
	}
	Index: head/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td
	===================================================================
	--- head/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td (revision 322319)
	+++ head/contrib/llvm/lib/Target/AArch64/AArch64InstrInfo.td (revision 322320)
	@@ -1,6126 +1,6154 @@
	//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -- tablegen --=//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// AArch64 Instruction definitions.
	//
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// ARM Instruction Predicate Definitions.
	//
	def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
	AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
	def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
	AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
	def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
	AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
	def HasNEON : Predicate<"Subtarget->hasNEON()">,
	AssemblerPredicate<"FeatureNEON", "neon">;
	def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
	AssemblerPredicate<"FeatureCrypto", "crypto">;
	def HasCRC : Predicate<"Subtarget->hasCRC()">,
	AssemblerPredicate<"FeatureCRC", "crc">;
	def HasLSE : Predicate<"Subtarget->hasLSE()">,
	AssemblerPredicate<"FeatureLSE", "lse">;
	def HasRAS : Predicate<"Subtarget->hasRAS()">,
	AssemblerPredicate<"FeatureRAS", "ras">;
	def HasRDM : Predicate<"Subtarget->hasRDM()">,
	AssemblerPredicate<"FeatureRDM", "rdm">;
	def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
	def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
	AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
	def HasSPE : Predicate<"Subtarget->hasSPE()">,
	AssemblerPredicate<"FeatureSPE", "spe">;
	+def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
	+ AssemblerPredicate<"FeatureFuseAES",
	+ "fuse-aes">;
	def HasSVE : Predicate<"Subtarget->hasSVE()">,
	AssemblerPredicate<"FeatureSVE", "sve">;

	def IsLE : Predicate<"Subtarget->isLittleEndian()">;
	def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
	def UseAlternateSExtLoadCVTF32
	: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;

	def UseNegativeImmediates
	: Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
	"NegativeImmediates">;


	//===----------------------------------------------------------------------===//
	// AArch64-specific DAG Nodes.
	//

	// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
	def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
	[SDTCisSameAs<0, 2>,
	SDTCisSameAs<0, 3>,
	SDTCisInt<0>, SDTCisVT<1, i32>]>;

	// SDTBinaryArithWithFlagsIn - RES1 = op LHS, RHS, FLAGS
	def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
	[SDTCisSameAs<0, 1>,
	SDTCisSameAs<0, 2>,
	SDTCisInt<0>,
	SDTCisVT<3, i32>]>;

	// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
	def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
	[SDTCisSameAs<0, 2>,
	SDTCisSameAs<0, 3>,
	SDTCisInt<0>,
	SDTCisVT<1, i32>,
	SDTCisVT<4, i32>]>;

	def SDT_AArch64Brcond : SDTypeProfile<0, 3,
	[SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
	SDTCisVT<2, i32>]>;
	def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
	def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
	SDTCisVT<2, OtherVT>]>;


	def SDT_AArch64CSel : SDTypeProfile<1, 4,
	[SDTCisSameAs<0, 1>,
	SDTCisSameAs<0, 2>,
	SDTCisInt<3>,
	SDTCisVT<4, i32>]>;
	def SDT_AArch64CCMP : SDTypeProfile<1, 5,
	[SDTCisVT<0, i32>,
	SDTCisInt<1>,
	SDTCisSameAs<1, 2>,
	SDTCisInt<3>,
	SDTCisInt<4>,
	SDTCisVT<5, i32>]>;
	def SDT_AArch64FCCMP : SDTypeProfile<1, 5,
	[SDTCisVT<0, i32>,
	SDTCisFP<1>,
	SDTCisSameAs<1, 2>,
	SDTCisInt<3>,
	SDTCisInt<4>,
	SDTCisVT<5, i32>]>;
	def SDT_AArch64FCmp : SDTypeProfile<0, 2,
	[SDTCisFP<0>,
	SDTCisSameAs<0, 1>]>;
	def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
	def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
	def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
	SDTCisSameAs<0, 1>,
	SDTCisSameAs<0, 2>]>;
	def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
	def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
	def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisInt<2>, SDTCisInt<3>]>;
	def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>, SDTCisInt<3>]>;
	def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;

	def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
	def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
	def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
	def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>]>;
	def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
	SDTCisSameAs<0,2>,
	SDTCisSameAs<0,3>]>;
	def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
	def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;

	def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;

	def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
	SDTCisPtrTy<1>]>;

	// Generates the general dynamic sequences, i.e.
	// adrp x0, :tlsdesc:var
	// ldr x1, [x0, #:tlsdesc_lo12:var]
	// add x0, x0, #:tlsdesc_lo12:var
	// .tlsdesccall var
	// blr x1

	// The TPIDR_EL0 offset is put directly in X0, hence the profile has no
	// result; its single operand is the variable.
	def SDT_AArch64TLSDescCallSeq : SDTypeProfile<0,1,
	[SDTCisPtrTy<0>]>;

	def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
	[SDTCisVT<0, i64>, SDTCisVT<1, i32>,
	SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
	SDTCisSameAs<1, 4>]>;


	// Node definitions.
	def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
	def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
	def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
	def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
	SDCallSeqStart<[ SDTCisVT<0, i32>,
	SDTCisVT<1, i32> ]>,
	[SDNPHasChain, SDNPOutGlue]>;
	def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",
	SDCallSeqEnd<[ SDTCisVT<0, i32>,
	SDTCisVT<1, i32> ]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
	def AArch64call : SDNode<"AArch64ISD::CALL",
	SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
	[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
	SDNPVariadic]>;
	def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
	[SDNPHasChain]>;
	def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
	[SDNPHasChain]>;
	def AArch64cbnz : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
	[SDNPHasChain]>;
	def AArch64tbz : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
	[SDNPHasChain]>;
	def AArch64tbnz : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
	[SDNPHasChain]>;


	def AArch64csel : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
	def AArch64csinv : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
	def AArch64csneg : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
	def AArch64csinc : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
	def AArch64retflag : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
	def AArch64adc : SDNode<"AArch64ISD::ADC", SDTBinaryArithWithFlagsIn >;
	def AArch64sbc : SDNode<"AArch64ISD::SBC", SDTBinaryArithWithFlagsIn>;
	def AArch64add_flag : SDNode<"AArch64ISD::ADDS", SDTBinaryArithWithFlagsOut,
	[SDNPCommutative]>;
	def AArch64sub_flag : SDNode<"AArch64ISD::SUBS", SDTBinaryArithWithFlagsOut>;
	def AArch64and_flag : SDNode<"AArch64ISD::ANDS", SDTBinaryArithWithFlagsOut,
	[SDNPCommutative]>;
	def AArch64adc_flag : SDNode<"AArch64ISD::ADCS", SDTBinaryArithWithFlagsInOut>;
	def AArch64sbc_flag : SDNode<"AArch64ISD::SBCS", SDTBinaryArithWithFlagsInOut>;

	def AArch64ccmp : SDNode<"AArch64ISD::CCMP", SDT_AArch64CCMP>;
	def AArch64ccmn : SDNode<"AArch64ISD::CCMN", SDT_AArch64CCMP>;
	def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;

	def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;

	def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;

	def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
	def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
	def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
	def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
	def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;

	def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
	def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
	def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
	def AArch64uzp2 : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
	def AArch64trn1 : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
	def AArch64trn2 : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;

	def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
	def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
	def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
	def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
	def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
	def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
	def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;

	def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
	def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
	def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
	def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;

	def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
	def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
	def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
	def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
	def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
	def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
	def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
	def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;

	def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
	def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
	def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;

	def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
	def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
	def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
	def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
	def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;

	def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
	def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
	def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;

	def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
	def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
	def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
	def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
	def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
	def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
	(AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;

	def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
	def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
	def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
	def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
	def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;

	def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
	def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;

	def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;

	def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
	[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;

	def AArch64Prefetch : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
	[SDNPHasChain, SDNPSideEffect]>;

	def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
	def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;

	def AArch64tlsdesc_callseq : SDNode<"AArch64ISD::TLSDESC_CALLSEQ",
	SDT_AArch64TLSDescCallSeq,
	[SDNPInGlue, SDNPOutGlue, SDNPHasChain,
	SDNPVariadic]>;


	def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
	SDT_AArch64WrapperLarge>;

	def AArch64NvCast : SDNode<"AArch64ISD::NVCAST", SDTUnaryOp>;

	def SDT_AArch64mull : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
	SDTCisSameAs<1, 2>]>;
	def AArch64smull : SDNode<"AArch64ISD::SMULL", SDT_AArch64mull>;
	def AArch64umull : SDNode<"AArch64ISD::UMULL", SDT_AArch64mull>;

	def AArch64frecpe : SDNode<"AArch64ISD::FRECPE", SDTFPUnaryOp>;
	def AArch64frecps : SDNode<"AArch64ISD::FRECPS", SDTFPBinOp>;
	def AArch64frsqrte : SDNode<"AArch64ISD::FRSQRTE", SDTFPUnaryOp>;
	def AArch64frsqrts : SDNode<"AArch64ISD::FRSQRTS", SDTFPBinOp>;

	def AArch64saddv : SDNode<"AArch64ISD::SADDV", SDT_AArch64UnaryVec>;
	def AArch64uaddv : SDNode<"AArch64ISD::UADDV", SDT_AArch64UnaryVec>;
	def AArch64sminv : SDNode<"AArch64ISD::SMINV", SDT_AArch64UnaryVec>;
	def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
	def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
	def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;

	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//

	// AArch64 Instruction Predicate Definitions.
	// We could compute these on a per-module basis but doing so requires accessing
	// the Function object through the <Target>Subtarget and objections were raised
	// to that (see post-commit review comments for r301750).
	let RecomputePerFunction = 1 in {
	def ForCodeSize : Predicate<"MF->getFunction()->optForSize()">;
	def NotForCodeSize : Predicate<"!MF->getFunction()->optForSize()">;
	}

	include "AArch64InstrFormats.td"

	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// Miscellaneous instructions.
	//===----------------------------------------------------------------------===//

	let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
	// We set Sched to empty list because we expect these instructions to simply get
	// removed in most cases.
	def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
	[(AArch64callseq_start timm:$amt1, timm:$amt2)]>,
	Sched<[]>;
	def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
	[(AArch64callseq_end timm:$amt1, timm:$amt2)]>,
	Sched<[]>;
	} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1

	let isReMaterializable = 1, isCodeGenOnly = 1 in {
	// FIXME: The following pseudo instructions are only needed because remat
	// cannot handle multiple instructions. When that changes, they can be
	// removed, along with the AArch64Wrapper node.

	let AddedComplexity = 10 in
	def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
	[(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
	Sched<[WriteLDAdr]>;

	// The MOVaddr instruction should match only when the add is not folded
	// into a load or store address.
	def MOVaddr
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
	tglobaladdr:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrJT
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
	tjumptable:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrCP
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
	tconstpool:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrBA
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
	tblockaddress:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrTLS
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
	tglobaltlsaddr:$low))]>,
	Sched<[WriteAdrAdr]>;
	def MOVaddrEXT
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
	[(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
	texternalsym:$low))]>,
	Sched<[WriteAdrAdr]>;

	} // isReMaterializable, isCodeGenOnly

	def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
	(LOADgot tglobaltlsaddr:$addr)>;

	def : Pat<(AArch64LOADgot texternalsym:$addr),
	(LOADgot texternalsym:$addr)>;

	def : Pat<(AArch64LOADgot tconstpool:$addr),
	(LOADgot tconstpool:$addr)>;

	//===----------------------------------------------------------------------===//
	// System instructions.
	//===----------------------------------------------------------------------===//

	def HINT : HintI<"hint">;
	def : InstAlias<"nop", (HINT 0b000)>;
	def : InstAlias<"yield",(HINT 0b001)>;
	def : InstAlias<"wfe", (HINT 0b010)>;
	def : InstAlias<"wfi", (HINT 0b011)>;
	def : InstAlias<"sev", (HINT 0b100)>;
	def : InstAlias<"sevl", (HINT 0b101)>;
	def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;

	// v8.2a Statistical Profiling extension
	def : InstAlias<"psb $op", (HINT psbhint_op:$op)>, Requires<[HasSPE]>;

	// As far as LLVM is concerned this writes to the system's exclusive monitors.
	let mayLoad = 1, mayStore = 1 in
	def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;

	// NOTE: ideally, this would have mayStore = 0, mayLoad = 0, but we cannot
	// model patterns with sufficiently fine granularity.
	let mayLoad = ?, mayStore = ? in {
	def DMB : CRmSystemI<barrier_op, 0b101, "dmb",
	[(int_aarch64_dmb (i32 imm32_0_15:$CRm))]>;

	def DSB : CRmSystemI<barrier_op, 0b100, "dsb",
	[(int_aarch64_dsb (i32 imm32_0_15:$CRm))]>;

	def ISB : CRmSystemI<barrier_op, 0b110, "isb",
	[(int_aarch64_isb (i32 imm32_0_15:$CRm))]>;
	}

	def : InstAlias<"clrex", (CLREX 0xf)>;
	def : InstAlias<"isb", (ISB 0xf)>;

	def MRS : MRSI;
	def MSR : MSRI;
	def MSRpstateImm1 : MSRpstateImm0_1;
	def MSRpstateImm4 : MSRpstateImm0_15;

	// The thread pointer (on Linux, at least, where this has been implemented) is
	// TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects.
	let hasSideEffects = 0 in
	def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
	[(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>;

	// The cycle counter PMC register is PMCCNTR_EL0.
	let Predicates = [HasPerfMon] in
	def : Pat<(readcyclecounter), (MRS 0xdce8)>;

	// Generic system instructions
	def SYSxt : SystemXtI<0, "sys">;
	def SYSLxt : SystemLXtI<1, "sysl">;

	def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
	(SYSxt imm0_7:$op1, sys_cr_op:$Cn,
	sys_cr_op:$Cm, imm0_7:$op2, XZR)>;

	//===----------------------------------------------------------------------===//
	// Move immediate instructions.
	//===----------------------------------------------------------------------===//

	defm MOVK : InsertImmediate<0b11, "movk">;
	defm MOVN : MoveImmediate<0b00, "movn">;

	let PostEncoderMethod = "fixMOVZ" in
	defm MOVZ : MoveImmediate<0b10, "movz">;

	// First group of aliases covers an implicit "lsl #0".
	def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
	def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;

	// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
	def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;

	def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
	def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;

	def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
	def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;

	// Final group of aliases covers true "mov $Rd, $imm" cases.
	// For one (basename, instruction, register class, width, shift) combination
	// this multiclass defines:
	//  - an AsmOperandClass whose Name, PredicateMethod and RenderMethod strings
	//    are assembled from basename, width and shift;
	//  - an i32 Operand that is parsed through that operand class;
	//  - an anonymous "mov $Rd, $imm" alias of INST that uses the given shift.
	multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
	int width, int shift> {
	def _asmoperand : AsmOperandClass {
	let Name = basename # width # "_lsl" # shift # "MovAlias";
	let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
	# shift # ">";
	let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
	}

	def _movimm : Operand<i32> {
	let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
	}

	def : InstAlias<"mov $Rd, $imm",
	(INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
	}

	defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
	defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;

	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
	defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;

	defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
	defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;

	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
	defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;

	let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
	isAsCheapAsAMove = 1 in {
	// FIXME: The following pseudo instructions are only needed because remat
	// cannot handle multiple instructions. When that changes, we can select
	// directly to the real instructions and get rid of these pseudos.

	def MOVi32imm
	: Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
	[(set GPR32:$dst, imm:$src)]>,
	Sched<[WriteImm]>;
	def MOVi64imm
	: Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
	[(set GPR64:$dst, imm:$src)]>,
	Sched<[WriteImm]>;
	} // isReMaterializable, isCodeGenOnly

	// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
	// eventual expansion code fewer bits to worry about getting right. Marshalling
	// the types is a little tricky though:
	// Matches an i64 immediate that is unchanged by masking to its low 32 bits,
	// i.e. one whose upper 32 bits are all zero.
	def i64imm_32bit : ImmLeaf<i64, [{
	return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
	}]>;

	// Matches an i64 immediate whose value, read as a signed 64-bit integer,
	// lies within the int32_t range.
	def s64imm_32bit : ImmLeaf<i64, [{
	int64_t Imm64 = static_cast<int64_t>(Imm);
	return Imm64 >= std::numeric_limits<int32_t>::min() &&
	Imm64 <= std::numeric_limits<int32_t>::max();
	}]>;

	// Re-emits an immediate node as an i32 target constant built from the
	// node's zero-extended value.
	def trunc_imm : SDNodeXForm<imm, [{
	return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i32);
	}]>;

	def : Pat<(i64 i64imm_32bit:$src),
	(SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;

	// Materialize FP constants via MOVi32imm/MOVi64imm (MachO large code model).
	// Turns an FP immediate into an i32 target constant carrying the value's
	// bit pattern (the APFloat bit-cast to an integer, zero-extended).
	def bitcast_fpimm_to_i32 : SDNodeXForm<fpimm, [{
	return CurDAG->getTargetConstant(
	N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i32);
	}]>;

	def bitcast_fpimm_to_i64 : SDNodeXForm<fpimm, [{
	return CurDAG->getTargetConstant(
	N->getValueAPF().bitcastToAPInt().getZExtValue(), SDLoc(N), MVT::i64);
	}]>;


	def : Pat<(f32 fpimm:$in),
	(COPY_TO_REGCLASS (MOVi32imm (bitcast_fpimm_to_i32 f32:$in)), FPR32)>;
	def : Pat<(f64 fpimm:$in),
	(COPY_TO_REGCLASS (MOVi64imm (bitcast_fpimm_to_i64 f64:$in)), FPR64)>;


	// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
	// sequences.
	// Every pattern below has the same shape: MOVZXi on $g0 with shift 0, followed
	// by MOVKXi on $g1, $g2 and $g3 with shifts 16, 32 and 48. The four patterns
	// differ only in the kind of address operand they accept.

	// Global addresses.
	def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
	tglobaladdr:$g1, tglobaladdr:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g0, 0),
	tglobaladdr:$g1, 16),
	tglobaladdr:$g2, 32),
	tglobaladdr:$g3, 48)>;

	// Block addresses.
	def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
	tblockaddress:$g1, tblockaddress:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g0, 0),
	tblockaddress:$g1, 16),
	tblockaddress:$g2, 32),
	tblockaddress:$g3, 48)>;

	// Constant-pool entries.
	def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
	tconstpool:$g1, tconstpool:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g0, 0),
	tconstpool:$g1, 16),
	tconstpool:$g2, 32),
	tconstpool:$g3, 48)>;

	// Jump tables.
	def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
	tjumptable:$g1, tjumptable:$g0),
	(MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g0, 0),
	tjumptable:$g1, 16),
	tjumptable:$g2, 32),
	tjumptable:$g3, 48)>;


	//===----------------------------------------------------------------------===//
	// Arithmetic instructions.
	//===----------------------------------------------------------------------===//

	// Add/subtract with carry.
	defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
	defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;

	// "ngc"/"ngcs" are assembler aliases of SBC/SBCS with the zero register
	// (WZR/XZR) as the first source operand.
	def : InstAlias<"ngc $dst, $src", (SBCWr GPR32:$dst, WZR, GPR32:$src)>;
	def : InstAlias<"ngc $dst, $src", (SBCXr GPR64:$dst, XZR, GPR64:$src)>;
	def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
	def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;

	// Add/subtract
	// NOTE(review): AddSub/AddSubS are defined elsewhere; the extra mnemonic
	// strings passed below presumably name the opposite-direction and compare
	// aliases -- confirm against the multiclass definitions.
	defm ADD : AddSub<0, "add", "sub", add>;
	defm SUB : AddSub<1, "sub", "add">;

	// "mov" with a stack-pointer operand is an alias of ADD (immediate) with both
	// trailing immediate operands equal to 0. For each width, one alias uses the
	// *sponly class for the destination and the other uses it for the source.
	def : InstAlias<"mov $dst, $src",
	(ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
	def : InstAlias<"mov $dst, $src",
	(ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
	def : InstAlias<"mov $dst, $src",
	(ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
	def : InstAlias<"mov $dst, $src",
	(ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;

	// Flag-setting add/subtract, selected for the AArch64add_flag/AArch64sub_flag
	// nodes.
	defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn", "subs", "cmp">;
	defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp", "adds", "cmn">;

	// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
	// The patterns cover the immediate, register-register, shifted-register and
	// extended-register operand forms, each in a 32-bit and a 64-bit version.
	def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
	(SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
	def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
	(SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
	def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
	(SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
	def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
	(SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
	def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
	(SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
	def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
	(SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
	// Only the extended-register forms carry AddedComplexity = 1.
	// NOTE(review): presumably so that they win over the plain register forms
	// when both match -- confirm.
	let AddedComplexity = 1 in {
	def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
	(SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
	def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
	(SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
	}

	// Because of the immediate format for add/sub-imm instructions, the
	// expression (add x, -1) must be transformed to a subtract of 1, and
	// (sub x, -1) to an add of 1. These patterns capture that transformation:
	// an add of a negated immediate selects the flag-setting SUBS{W,X}ri, and a
	// sub of a negated immediate selects ADD{W,X}ri.
	let AddedComplexity = 1 in {
	def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	}

	// Because of the immediate format for add/sub-imm instructions, the
	// flag-setting expression (AArch64add_flag x, -1) must be transformed to
	// (SUBS{W,X}ri x, 1), and (AArch64sub_flag x, -1) to (ADDS{W,X}ri x, 1).
	// These patterns capture that transformation.
	let AddedComplexity = 1 in {
	def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
	(ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
	def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
	(ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
	}

	// "neg" is an alias of SUB (shifted register) with the zero register as the
	// first source. The forms without an explicit shift pass 0 as the shift
	// operand and carry a larger trailing InstAlias argument (3) than the forms
	// that take a shift (2).
	def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
	def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
	def : InstAlias<"neg $dst, $src$shift",
	(SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
	def : InstAlias<"neg $dst, $src$shift",
	(SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;

	// "negs" is the same alias scheme applied to the flag-setting SUBS.
	def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
	def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
	def : InstAlias<"negs $dst, $src$shift",
	(SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
	def : InstAlias<"negs $dst, $src$shift",
	(SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;


	// Unsigned/Signed divide
	defm UDIV : Div<0, "udiv", udiv>;
	defm SDIV : Div<1, "sdiv", sdiv>;

	// The int_aarch64_udiv/int_aarch64_sdiv intrinsics select the same
	// UDIV/SDIV register instructions, in 32-bit and 64-bit versions.
	def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>;
	def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>;
	def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>;
	def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>;

	// Variable shift
	// Each defm pairs an assembly mnemonic with the generic DAG shift/rotate node
	// it is selected for (sra, shl, srl, rotr).
	defm ASRV : Shift<0b10, "asr", sra>;
	defm LSLV : Shift<0b00, "lsl", shl>;
	defm LSRV : Shift<0b01, "lsr", srl>;
	defm RORV : Shift<0b11, "ror", rotr>;

	// The "<op>v" spellings are registered through ShiftAlias (defined elsewhere)
	// against the W and X register forms of the instructions above.
	def : ShiftAlias<"asrv", ASRVWr, GPR32>;
	def : ShiftAlias<"asrv", ASRVXr, GPR64>;
	def : ShiftAlias<"lslv", LSLVWr, GPR32>;
	def : ShiftAlias<"lslv", LSLVXr, GPR64>;
	def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
	def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
	def : ShiftAlias<"rorv", RORVWr, GPR32>;
	def : ShiftAlias<"rorv", RORVXr, GPR64>;

	// Multiply-add
	let AddedComplexity = 5 in {
	defm MADD : MulAccum<0, "madd", add>;
	defm MSUB : MulAccum<1, "msub", sub>;

	// A plain multiply selects MADD with the zero register as the third operand.
	def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
	(MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
	def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
	(MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;

	// A negated product selects MSUB with the zero register as the third operand.
	// The negation may wrap the whole product or only the first multiplicand.
	def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
	(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
	def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
	(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
	def : Pat<(i32 (mul (ineg GPR32:$Rn), GPR32:$Rm)),
	(MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
	def : Pat<(i64 (mul (ineg GPR64:$Rn), GPR64:$Rm)),
	(MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
	} // AddedComplexity = 5

	// Widening multiply-accumulate: two 32-bit multiplicands (sign- or
	// zero-extended) and a 64-bit accumulator, producing an i64 result.
	let AddedComplexity = 5 in {
	def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
	def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
	def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
	def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;

	// Product of two extended 32-bit registers with no accumulator: the zero
	// register is passed as the third operand.
	def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
	(SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
	def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
	(UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;

	// Negated product of two extended 32-bit registers.
	def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
	(SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
	def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
	(UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;

	// One multiplicand is a constant that fits in 32 bits (s64imm_32bit for the
	// signed forms, i64imm_32bit for the unsigned forms); it is materialized with
	// MOVi32imm. The sext_inreg variants take the low 32 bits of a 64-bit
	// register through the sub_32 subregister.
	def : Pat<(i64 (mul (sext GPR32:$Rn), (s64imm_32bit:$C))),
	(SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (mul (zext GPR32:$Rn), (i64imm_32bit:$C))),
	(UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C))),
	(SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), XZR)>;

	// Negated product with a 32-bit constant multiplicand.
	def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
	(SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
	(UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), XZR)>;
	def : Pat<(i64 (ineg (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)))),
	(SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), XZR)>;

	// Product with a 32-bit constant, added to the accumulator $Ra.
	def : Pat<(i64 (add (mul (sext GPR32:$Rn), (s64imm_32bit:$C)), GPR64:$Ra)),
	(SMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (add (mul (zext GPR32:$Rn), (i64imm_32bit:$C)), GPR64:$Ra)),
	(UMADDLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (add (mul (sext_inreg GPR64:$Rn, i32), (s64imm_32bit:$C)),
	GPR64:$Ra)),
	(SMADDLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;

	// Product with a 32-bit constant, subtracted from the accumulator $Ra.
	def : Pat<(i64 (sub GPR64:$Ra, (mul (sext GPR32:$Rn), (s64imm_32bit:$C)))),
	(SMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (sub GPR64:$Ra, (mul (zext GPR32:$Rn), (i64imm_32bit:$C)))),
	(UMSUBLrrr GPR32:$Rn, (MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	def : Pat<(i64 (sub GPR64:$Ra, (mul (sext_inreg GPR64:$Rn, i32),
	(s64imm_32bit:$C)))),
	(SMSUBLrrr (i32 (EXTRACT_SUBREG GPR64:$Rn, sub_32)),
	(MOVi32imm (trunc_imm imm:$C)), GPR64:$Ra)>;
	} // AddedComplexity = 5

	// Assembly mnemonics layered on the multiply-accumulate instructions.
	// NOTE(review): the *Alias classes are defined elsewhere; presumably they fix
	// the accumulator operand to the zero register, matching the mul/ineg
	// selection patterns -- confirm.
	def : MulAccumWAlias<"mul", MADDWrrr>;
	def : MulAccumXAlias<"mul", MADDXrrr>;
	def : MulAccumWAlias<"mneg", MSUBWrrr>;
	def : MulAccumXAlias<"mneg", MSUBXrrr>;
	def : WideMulAccumAlias<"smull", SMADDLrrr>;
	def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
	def : WideMulAccumAlias<"umull", UMADDLrrr>;
	def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;

	// Multiply-high
	// "smulh" is selected for the mulhs node and "umulh" for the mulhu node.
	def SMULHrr : MulHi<0b010, "smulh", mulhs>;
	def UMULHrr : MulHi<0b110, "umulh", mulhu>;

	// CRC32
	// Each record binds one int_aarch64_crc32* intrinsic to its mnemonic. Only the
	// X variants pass 1 as the first template argument and use GPR64; the B, H and
	// W variants use GPR32.
	def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
	def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
	def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
	def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;

	// CRC32C variants: identical arguments except that the third one is 1 and the
	// intrinsic/mnemonic carry the "c" infix.
	def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
	def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
	def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
	def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;

	// v8.1 atomic CAS
	// In this group and the two below, the two integer arguments and the suffix
	// string vary together: (1, 0) -> "a", (0, 1) -> "l", (1, 1) -> "al".
	// NOTE(review): presumably the acquire and release bits -- confirm against the
	// multiclass definitions.
	defm CAS : CompareAndSwap<0, 0, "">;
	defm CASA : CompareAndSwap<1, 0, "a">;
	defm CASL : CompareAndSwap<0, 1, "l">;
	defm CASAL : CompareAndSwap<1, 1, "al">;

	// v8.1 atomic CASP
	defm CASP : CompareAndSwapPair<0, 0, "">;
	defm CASPA : CompareAndSwapPair<1, 0, "a">;
	defm CASPL : CompareAndSwapPair<0, 1, "l">;
	defm CASPAL : CompareAndSwapPair<1, 1, "al">;

	// v8.1 atomic SWP
	defm SWP : Swap<0, 0, "">;
	defm SWPA : Swap<1, 0, "a">;
	defm SWPL : Swap<0, 1, "l">;
	defm SWPAL : Swap<1, 1, "al">;

	// v8.1 atomic LD<OP>(register). Performs load and then ST<OP>(register)
	// There is one group of four records per operation. The 3-bit first argument
	// is distinct per operation (0b000 add ... 0b111 umin); the remaining two
	// integers and the suffix follow the ""/"a"/"l"/"al" scheme within each group.
	defm LDADD : LDOPregister<0b000, "add", 0, 0, "">;
	defm LDADDA : LDOPregister<0b000, "add", 1, 0, "a">;
	defm LDADDL : LDOPregister<0b000, "add", 0, 1, "l">;
	defm LDADDAL : LDOPregister<0b000, "add", 1, 1, "al">;

	defm LDCLR : LDOPregister<0b001, "clr", 0, 0, "">;
	defm LDCLRA : LDOPregister<0b001, "clr", 1, 0, "a">;
	defm LDCLRL : LDOPregister<0b001, "clr", 0, 1, "l">;
	defm LDCLRAL : LDOPregister<0b001, "clr", 1, 1, "al">;

	defm LDEOR : LDOPregister<0b010, "eor", 0, 0, "">;
	defm LDEORA : LDOPregister<0b010, "eor", 1, 0, "a">;
	defm LDEORL : LDOPregister<0b010, "eor", 0, 1, "l">;
	defm LDEORAL : LDOPregister<0b010, "eor", 1, 1, "al">;

	defm LDSET : LDOPregister<0b011, "set", 0, 0, "">;
	defm LDSETA : LDOPregister<0b011, "set", 1, 0, "a">;
	defm LDSETL : LDOPregister<0b011, "set", 0, 1, "l">;
	defm LDSETAL : LDOPregister<0b011, "set", 1, 1, "al">;

	defm LDSMAX : LDOPregister<0b100, "smax", 0, 0, "">;
	defm LDSMAXA : LDOPregister<0b100, "smax", 1, 0, "a">;
	defm LDSMAXL : LDOPregister<0b100, "smax", 0, 1, "l">;
	defm LDSMAXAL : LDOPregister<0b100, "smax", 1, 1, "al">;

	defm LDSMIN : LDOPregister<0b101, "smin", 0, 0, "">;
	defm LDSMINA : LDOPregister<0b101, "smin", 1, 0, "a">;
	defm LDSMINL : LDOPregister<0b101, "smin", 0, 1, "l">;
	defm LDSMINAL : LDOPregister<0b101, "smin", 1, 1, "al">;

	defm LDUMAX : LDOPregister<0b110, "umax", 0, 0, "">;
	defm LDUMAXA : LDOPregister<0b110, "umax", 1, 0, "a">;
	defm LDUMAXL : LDOPregister<0b110, "umax", 0, 1, "l">;
	defm LDUMAXAL : LDOPregister<0b110, "umax", 1, 1, "al">;

	defm LDUMIN : LDOPregister<0b111, "umin", 0, 0, "">;
	defm LDUMINA : LDOPregister<0b111, "umin", 1, 0, "a">;
	defm LDUMINL : LDOPregister<0b111, "umin", 0, 1, "l">;
	defm LDUMINAL : LDOPregister<0b111, "umin", 1, 1, "al">;

	// v8.1 atomic ST<OP>(register) as aliases to "LD<OP>(register)" when Rt=xZR.
	// Each line ties a store mnemonic to the LD<OP> instruction family it aliases.
	defm : STOPregister<"stadd","LDADD">; // STADDx
	defm : STOPregister<"stclr","LDCLR">; // STCLRx
	defm : STOPregister<"steor","LDEOR">; // STEORx
	defm : STOPregister<"stset","LDSET">; // STSETx
	defm : STOPregister<"stsmax","LDSMAX">;// STSMAXx
	defm : STOPregister<"stsmin","LDSMIN">;// STSMINx
	defm : STOPregister<"stumax","LDUMAX">;// STUMAXx
	defm : STOPregister<"stumin","LDUMIN">;// STUMINx

	//===----------------------------------------------------------------------===//
	// Logical instructions.
	//===----------------------------------------------------------------------===//

	// (immediate)
	// NOTE(review): the trailing mnemonic ("bics", "bic", "eon", "orn") is
	// consumed by LogicalImm/LogicalImmS, which are defined elsewhere; presumably
	// it names the alias accepted for the inverted immediate -- confirm.
	defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag, "bics">;
	defm AND : LogicalImm<0b00, "and", and, "bic">;
	defm EOR : LogicalImm<0b10, "eor", xor, "eon">;
	defm ORR : LogicalImm<0b01, "orr", or, "orn">;

	// FIXME: these aliases are canonical sometimes (when movz can't be
	// used). Actually, it seems to be working right now, but putting logical_immXX
	// here is a bit dodgy on the AsmParser side too.
	// "mov Rd, #imm" maps to ORR (immediate) with the zero register as the source.
	def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
	logical_imm32:$imm), 0>;
	def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
	logical_imm64:$imm), 0>;


	// (register)
	// The inverted forms (BICS, BIC, ORN) are matched through inline BinOpFrag
	// fragments that apply `not` to the right-hand operand; EON applies `not` to
	// the whole xor instead.
	defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
	defm BICS : LogicalRegS<0b11, 1, "bics",
	BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
	defm AND : LogicalReg<0b00, 0, "and", and>;
	defm BIC : LogicalReg<0b00, 1, "bic",
	BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
	defm EON : LogicalReg<0b10, 1, "eon",
	BinOpFrag<(not (xor node:$LHS, node:$RHS))>>;
	defm EOR : LogicalReg<0b10, 0, "eor", xor>;
	defm ORN : LogicalReg<0b01, 1, "orn",
	BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
	defm ORR : LogicalReg<0b01, 0, "orr", or>;

	// Register "mov" is ORR (shifted register) with the zero register as the first
	// source and a shift operand of 0.
	def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
	def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;

	// "mvn" is ORN with the zero register as the first source, here without a
	// shift.
	def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
	def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;

	// "mvn" with an explicit shift on the source register.
	def : InstAlias<"mvn $Wd, $Wm$sh",
	(ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
	def : InstAlias<"mvn $Xd, $Xm$sh",
	(ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;

	// "tst" is ANDS with the zero register as the destination: immediate form.
	def : InstAlias<"tst $src1, $src2",
	(ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
	def : InstAlias<"tst $src1, $src2",
	(ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;

	// "tst" register form without a shift.
	def : InstAlias<"tst $src1, $src2",
	(ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
	def : InstAlias<"tst $src1, $src2",
	(ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;

	// "tst" register form with an explicit shift.
	def : InstAlias<"tst $src1, $src2$sh",
	(ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
	def : InstAlias<"tst $src1, $src2$sh",
	(ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;


	// Bitwise NOT of a register selects ORN with the zero register as the first
	// operand.
	def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
	def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;


	//===----------------------------------------------------------------------===//
	// One operand data processing instructions.
	//===----------------------------------------------------------------------===//

	// CLS is declared without a selection node here; it is matched by the explicit
	// ctlz-based patterns further down. CLZ and RBIT are selected for ctlz and
	// bitreverse respectively.
	defm CLS : OneOperandData<0b101, "cls">;
	defm CLZ : OneOperandData<0b100, "clz", ctlz>;
	defm RBIT : OneOperandData<0b000, "rbit", bitreverse>;

	// The 32-bit REV16 matches a byte swap followed by a rotate right by 16. The
	// 64-bit REV16 is given null_frag, i.e. no selection pattern.
	def REV16Wr : OneWRegData<0b001, "rev16",
	UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
	def REV16Xr : OneXRegData<0b001, "rev16", null_frag>;

	// Count-trailing-zeros is selected as CLZ applied to the bit-reversed input.
	def : Pat<(cttz GPR32:$Rn),
	(CLZWr (RBITWr GPR32:$Rn))>;
	def : Pat<(cttz GPR64:$Rn),
	(CLZXr (RBITXr GPR64:$Rn))>;
	// The expression ctlz((((x >>s (width-1)) ^ x) << 1) | 1) selects CLS.
	def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
	(i32 1))),
	(CLSWr GPR32:$Rn)>;
	def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
	(i64 1))),
	(CLSXr GPR64:$Rn)>;

	// Unlike the other one operand instructions, the instructions with the "rev"
	// mnemonic do not just differ in the size bit, but actually use different
	// opcode bits for the different sizes.
	def REVWr : OneWRegData<0b010, "rev", bswap>;
	def REVXr : OneXRegData<0b011, "rev", bswap>;
	// REV32 on a 64-bit register matches a byte swap followed by a rotate right
	// by 32.
	def REV32Xr : OneXRegData<0b010, "rev32",
	UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;

	// "rev64" is accepted as another spelling of the 64-bit REV.
	def : InstAlias<"rev64 $Rd, $Rn", (REVXr GPR64:$Rd, GPR64:$Rn), 0>;

	// The bswap commutes with the rotr so we want a pattern for both possible
	// orders.
	def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
	def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;

	//===----------------------------------------------------------------------===//
	// Bitfield immediate extraction instruction.
	//===----------------------------------------------------------------------===//
	// The `let` applies only to the EXTR defm that follows it.
	let hasSideEffects = 0 in
	defm EXTR : ExtractImm<"extr">;
	// "ror" by an immediate is EXTR with the same register given for both source
	// operands.
	def : InstAlias<"ror $dst, $src, $shift",
	(EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
	def : InstAlias<"ror $dst, $src, $shift",
	(EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;

	// A rotate right by a constant selects the same EXTR form.
	def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
	(EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
	def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
	(EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;

	//===----------------------------------------------------------------------===//
	// Other bitfield immediate instructions.
	//===----------------------------------------------------------------------===//
	// Bitfield move instructions; none of them is marked as having side effects.
	// BFM uses a separate multiclass (BitfieldImmWith2RegArgs) from SBFM/UBFM.
	let hasSideEffects = 0 in {
	defm BFM : BitfieldImmWith2RegArgs<0b01, "bfm">;
	defm SBFM : BitfieldImm<0b00, "sbfm">;
	defm UBFM : BitfieldImm<0b10, "ubfm">;
	}

	// Immediate transforms that turn a shift amount into an i64 target constant.
	// The *_a/*_b pairs supply the two UBFM immediates for the constant shl
	// patterns further down.

	// (32 - shift_amt) mod 32
	def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// 31 - shift_amt
	def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 31 - N->getZExtValue();
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// min(7, 31 - shift_amt)
	def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 31 - N->getZExtValue();
	enc = enc > 7 ? 7 : enc;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// min(15, 31 - shift_amt)
	def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 31 - N->getZExtValue();
	enc = enc > 15 ? 15 : enc;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// (64 - shift_amt) mod 64
	def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// 63 - shift_amt
	def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 63 - N->getZExtValue();
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// min(7, 63 - shift_amt)
	def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 63 - N->getZExtValue();
	enc = enc > 7 ? 7 : enc;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// min(15, 63 - shift_amt)
	def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 63 - N->getZExtValue();
	enc = enc > 15 ? 15 : enc;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// min(31, 63 - shift_amt)
	def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
	uint64_t enc = 63 - N->getZExtValue();
	enc = enc > 31 ? 31 : enc;
	return CurDAG->getTargetConstant(enc, SDLoc(N), MVT::i64);
	}]>;

	// Shift left by a constant selects UBFM; its two immediates are derived from
	// the shift amount through the i32shift_a/_b and i64shift_a/_b transforms.
	def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
	(UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
	(i64 (i32shift_b imm0_31:$imm)))>;
	def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
	(UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_b imm0_63:$imm)))>;

	// Arithmetic shift right by a constant selects SBFM with the shift amount as
	// the first immediate and (width - 1) as the second.
	// NOTE(review): the reason for AddedComplexity = 10 is not visible here.
	let AddedComplexity = 10 in {
	def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
	(SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
	def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
	(SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
	}

	// SBFM-based aliases: "asr" uses ($shift, width - 1); the sign-extend
	// mnemonics use (0, 7), (0, 15) and (0, 31).
	def : InstAlias<"asr $dst, $src, $shift",
	(SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
	def : InstAlias<"asr $dst, $src, $shift",
	(SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
	def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
	def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
	def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
	def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
	def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;

	// Logical shift right by a constant selects UBFM with the shift amount as the
	// first immediate and (width - 1) as the second.
	def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
	(UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
	def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
	(UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;

	// UBFM-based aliases: "lsr" uses ($shift, width - 1); the zero-extend
	// mnemonics use (0, 7), (0, 15) and (0, 31).
	def : InstAlias<"lsr $dst, $src, $shift",
	(UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
	def : InstAlias<"lsr $dst, $src, $shift",
	(UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
	def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
	def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
	def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
	def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
	def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;

	//===----------------------------------------------------------------------===//
	// Conditional comparison instructions.
	//===----------------------------------------------------------------------===//
	// "ccmn" is selected for the AArch64ccmn node and "ccmp" for AArch64ccmp; the
	// first template argument is the only other difference between them.
	defm CCMN : CondComparison<0, "ccmn", AArch64ccmn>;
	defm CCMP : CondComparison<1, "ccmp", AArch64ccmp>;

	//===----------------------------------------------------------------------===//
	// Conditional select instructions.
	//===----------------------------------------------------------------------===//
	defm CSEL : CondSelect<0, 0b00, "csel">;

	// Fragment for "x + 1", used as the operation applied by CSINC.
	def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
	defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
	defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
	defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;

	// The explicit AArch64csinv/csneg/csinc nodes select the corresponding
	// instruction directly, with the operands passed through unchanged.
	def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
	def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;

	// A csel whose operands include the constants 0, 1 or -1 is folded into
	// CSINC (for 1) or CSINV (for -1) using the zero register. When the constant
	// is the true value, the register operand is moved to the first position and
	// the condition code is passed through inv_cond_XFORM.
	def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
	(CSINCWr WZR, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
	(CSINCXr XZR, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR32:$tval, (i32 1), (i32 imm:$cc), NZCV),
	(CSINCWr GPR32:$tval, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR64:$tval, (i64 1), (i32 imm:$cc), NZCV),
	(CSINCXr GPR64:$tval, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i32 1), GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINCWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
	def : Pat<(AArch64csel (i64 1), GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINCXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;
	def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
	(CSINVWr WZR, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
	(CSINVXr XZR, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR32:$tval, (i32 -1), (i32 imm:$cc), NZCV),
	(CSINVWr GPR32:$tval, WZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel GPR64:$tval, (i64 -1), (i32 imm:$cc), NZCV),
	(CSINVXr GPR64:$tval, XZR, (i32 imm:$cc))>;
	def : Pat<(AArch64csel (i32 -1), GPR32:$fval, (i32 imm:$cc), NZCV),
	(CSINVWr GPR32:$fval, WZR, (i32 (inv_cond_XFORM imm:$cc)))>;
	def : Pat<(AArch64csel (i64 -1), GPR64:$fval, (i32 imm:$cc), NZCV),
	(CSINVXr GPR64:$fval, XZR, (i32 (inv_cond_XFORM imm:$cc)))>;

	// The inverse of the condition code from the alias instruction is what is used
	// in the aliased instruction. The parser already inverts the condition code
	// for these aliases.

	// "cset"/"csetm": CSINC/CSINV with the zero register for both sources.
	def : InstAlias<"cset $dst, $cc",
	(CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
	def : InstAlias<"cset $dst, $cc",
	(CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;

	def : InstAlias<"csetm $dst, $cc",
	(CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
	def : InstAlias<"csetm $dst, $cc",
	(CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;

	// "cinc"/"cinv"/"cneg": CSINC/CSINV/CSNEG with the same register for both
	// sources.
	def : InstAlias<"cinc $dst, $src, $cc",
	(CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
	def : InstAlias<"cinc $dst, $src, $cc",
	(CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

	def : InstAlias<"cinv $dst, $src, $cc",
	(CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
	def : InstAlias<"cinv $dst, $src, $cc",
	(CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

	def : InstAlias<"cneg $dst, $src, $cc",
	(CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
	def : InstAlias<"cneg $dst, $src, $cc",
	(CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;

	//===----------------------------------------------------------------------===//
	// PC-relative instructions.
	//===----------------------------------------------------------------------===//
	// Both ADR and ADRP are marked rematerializable.
	let isReMaterializable = 1 in {
	// ADR has an empty pattern list, so it is never chosen by a pattern here.
	let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
	def ADR : ADRI<0, "adr", adrlabel, []>;
	} // hasSideEffects = 0

	// ADRP is selected for the AArch64adrp node applied to a global address.
	def ADRP : ADRI<1, "adrp", adrplabel,
	[(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
	} // isReMaterializable = 1

	// page address of a constant pool entry, block address
	def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
	def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;

	//===----------------------------------------------------------------------===//
	// Unconditional branch (register) instructions.
	//===----------------------------------------------------------------------===//

	// Return-style instructions: all three are marked as returns, terminators and
	// barriers. RET has an empty pattern list.
	let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
	def RET : BranchReg<0b0010, "ret", []>;
	def DRPS : SpecialReturn<0b0101, "drps">;
	def ERET : SpecialReturn<0b0100, "eret">;
	} // isReturn = 1, isTerminator = 1, isBarrier = 1

	// Default to the LR register.
	def : InstAlias<"ret", (RET LR)>;

	// Call through a register, selected for AArch64call on a GPR64; declared as
	// defining LR and using SP.
	let isCall = 1, Defs = [LR], Uses = [SP] in {
	def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
	} // isCall

	// Indirect branch through a register, selected for brind.
	let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
	def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
	} // isBranch, isTerminator, isBarrier, isIndirectBranch

	// Create a separate pseudo-instruction for codegen to use so that we don't
	// flag lr as used in every function. It'll be restored before the RET by the
	// epilogue if it's legitimately used.
	// The pseudo has no operands and is selected for the AArch64retflag node.
	def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]>,
	Sched<[WriteBrReg]> {
	let isTerminator = 1;
	let isBarrier = 1;
	let isReturn = 1;
	}

	// This is a directive-like pseudo-instruction. The purpose is to insert an
	// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
	// (which in the usual case is a BLR).
	// It has no selection pattern and an empty scheduling list; it is printed as
	// the ".tlsdesccall" directive followed by the symbol operand.
	let hasSideEffects = 1 in
	def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []>, Sched<[]> {
	let AsmString = ".tlsdesccall $sym";
	}

	// FIXME: maybe the scratch register used shouldn't be fixed to X1?
	// FIXME: can "hasSideEffects" be dropped?
	// Codegen-only call pseudo selected for AArch64tlsdesc_callseq on a TLS global
	// address; it is declared as defining LR, X0 and X1.
	let isCall = 1, Defs = [LR, X0, X1], hasSideEffects = 1,
	isCodeGenOnly = 1 in
	def TLSDESC_CALLSEQ
	: Pseudo<(outs), (ins i64imm:$sym),
	[(AArch64tlsdesc_callseq tglobaltlsaddr:$sym)]>,
	Sched<[WriteI, WriteLD, WriteI, WriteBrReg]>;
	// External symbols select the same pseudo.
	def : Pat<(AArch64tlsdesc_callseq texternalsym:$sym),
	(TLSDESC_CALLSEQ texternalsym:$sym)>;

	//===----------------------------------------------------------------------===//
	// Conditional branch (immediate) instruction.
	//===----------------------------------------------------------------------===//
	// Bcc takes everything from the BranchCond class, which is defined elsewhere.
	def Bcc : BranchCond;

	//===----------------------------------------------------------------------===//
	// Compare-and-branch instructions.
	//===----------------------------------------------------------------------===//
	// "cbz"/"cbnz" are selected for the AArch64cbz/AArch64cbnz nodes.
	defm CBZ : CmpBranch<0, "cbz", AArch64cbz>;
	defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;

	//===----------------------------------------------------------------------===//
	// Test-bit-and-branch instructions.
	//===----------------------------------------------------------------------===//
	// "tbz"/"tbnz" are selected for the AArch64tbz/AArch64tbnz nodes.
	defm TBZ : TestBranch<0, "tbz", AArch64tbz>;
	defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;

	//===----------------------------------------------------------------------===//
	// Unconditional branch (immediate) instructions.
	//===----------------------------------------------------------------------===//
	// Unconditional branch to a basic block, selected for the br node.
	let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
	def B : BranchImm<0, "b", [(br bb:$addr)]>;
	} // isBranch, isTerminator, isBarrier

	// Direct call, selected for AArch64call on a global address; declared as
	// defining LR and using SP.
	let isCall = 1, Defs = [LR], Uses = [SP] in {
	def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
	} // isCall
	// Calls to external symbols select BL as well.
	def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;

	//===----------------------------------------------------------------------===//
	// Exception generation instructions.
	//===----------------------------------------------------------------------===//
	// Each record passes a 3-bit and a 2-bit field plus the mnemonic to the
	// ExceptionGeneration class (defined elsewhere).
	def BRK : ExceptionGeneration<0b001, 0b00, "brk">;
	def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
	def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
	def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
	def HLT : ExceptionGeneration<0b010, 0b00, "hlt">;
	def HVC : ExceptionGeneration<0b000, 0b10, "hvc">;
	def SMC : ExceptionGeneration<0b000, 0b11, "smc">;
	def SVC : ExceptionGeneration<0b000, 0b01, "svc">;

	// DCPSn defaults to an immediate operand of zero if unspecified.
	def : InstAlias<"dcps1", (DCPS1 0)>;
	def : InstAlias<"dcps2", (DCPS2 0)>;
	def : InstAlias<"dcps3", (DCPS3 0)>;

	//===----------------------------------------------------------------------===//
	// Load instructions.
	//===----------------------------------------------------------------------===//

	// Pair (indexed, offset)
	// In every group below the offset operand class follows the register class:
	// GPR32/FPR32 use simm7s4, GPR64/FPR64 use simm7s8 and FPR128 uses simm7s16.
	// The LDPSW records are the exception: they pair GPR64 with simm7s4.
	defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
	defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
	defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
	defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
	defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;

	defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;

	// Pair (pre-indexed)
	def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
	def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
	def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
	def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
	def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;

	def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;

	// Pair (post-indexed)
	def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
	def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
	def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
	def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
	def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;

	def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;


	// Pair (no allocate)
	// No LDPSW counterpart is defined in this group.
	defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
	defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
	defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
	defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
	defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;

	//---
	// (register offset)
	//---

	// Integer
	// Each defm supplies the register class, mnemonic, result value type and the
	// load operator used for selection. The patterns later in this file refer to
	// the generated records as <name>roW / <name>roX.
	defm LDRBB : Load8RO<0b00, 0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
	defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
	defm LDRW : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
	defm LDRX : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;

	// Floating-point
	// The FPR8 form is declared with the "untyped" value type.
	defm LDRB : Load8RO<0b00, 1, 0b01, FPR8, "ldr", untyped, load>;
	defm LDRH : Load16RO<0b01, 1, 0b01, FPR16, "ldr", f16, load>;
	defm LDRS : Load32RO<0b10, 1, 0b01, FPR32, "ldr", f32, load>;
	defm LDRD : Load64RO<0b11, 1, 0b01, FPR64, "ldr", f64, load>;
	defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;

	// Load sign-extended half-word (W suffix: i32 result, X suffix: i64 result)
	defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
	defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;

	// Load sign-extended byte (W suffix: i32 result, X suffix: i64 result)
	defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
	defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;

	// Load sign-extended word
	defm LDRSW : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;

	// Pre-fetch.
	defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;

	// For regular load, we do not have any alignment requirement.
	// Thus, it is safe to directly map the vector loads with interesting
	// addressing modes.
	// FIXME: We could do the same for bitconvert to floating point vectors.
	//
	// ScalToVecROLoadPat emits two anonymous patterns (one for the W-register
	// offset form, one for the X-register offset form) that select
	//   (VecTy (scalar_to_vector (ScalTy (loadop <register-offset address>))))
	// to a load into a sub-register of an otherwise undefined vector register.
	//   ro       - addressing-mode record supplying Wpat/Xpat and Wext/Xext
	//   loadop   - load operator to match
	//   ScalTy   - type of the loaded scalar
	//   VecTy    - resulting vector type
	//   LOADW/X  - instructions used for the W / X offset forms
	//   sub      - sub-register index the loaded value is inserted at
	multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
	ValueType ScalTy, ValueType VecTy,
	Instruction LOADW, Instruction LOADX,
	SubRegIndex sub> {
	// 32-bit (W) offset register.
	def : Pat<(VecTy (scalar_to_vector (ScalTy
	(loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
	(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
	(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
	sub)>;

	// 64-bit (X) offset register.
	def : Pat<(VecTy (scalar_to_vector (ScalTy
	(loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
	(INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
	(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
	sub)>;
	}

	// Instantiate the scalar_to_vector load patterns for each element size.
	// All of them are given AddedComplexity = 10.
	let AddedComplexity = 10 in {
	// 8-bit elements go through the FPR8 loads and the bsub sub-register.
	defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v8i8, LDRBroW, LDRBroX, bsub>;
	defm : ScalToVecROLoadPat<ro8, extloadi8, i32, v16i8, LDRBroW, LDRBroX, bsub>;

	// 16-bit elements: FPR16 loads, hsub.
	defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
	defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;

	defm : ScalToVecROLoadPat<ro16, load, i32, v4f16, LDRHroW, LDRHroX, hsub>;
	defm : ScalToVecROLoadPat<ro16, load, i32, v8f16, LDRHroW, LDRHroX, hsub>;

	// 32-bit elements: FPR32 loads, ssub.
	defm : ScalToVecROLoadPat<ro32, load, i32, v2i32, LDRSroW, LDRSroX, ssub>;
	defm : ScalToVecROLoadPat<ro32, load, i32, v4i32, LDRSroW, LDRSroX, ssub>;

	defm : ScalToVecROLoadPat<ro32, load, f32, v2f32, LDRSroW, LDRSroX, ssub>;
	defm : ScalToVecROLoadPat<ro32, load, f32, v4f32, LDRSroW, LDRSroX, ssub>;

	// 64-bit elements: FPR64 loads, dsub.
	defm : ScalToVecROLoadPat<ro64, load, i64, v2i64, LDRDroW, LDRDroX, dsub>;

	defm : ScalToVecROLoadPat<ro64, load, f64, v2f64, LDRDroW, LDRDroX, dsub>;


	// v1i64 is selected directly to the FPR64 load, without INSERT_SUBREG.
	def : Pat <(v1i64 (scalar_to_vector (i64
	(load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
	ro_Wextend64:$extend))))),
	(LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;

	def : Pat <(v1i64 (scalar_to_vector (i64
	(load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
	ro_Xextend64:$extend))))),
	(LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
	}

	// Match a plain vector load of type VecTy with a register-offset address and
	// select it to LOADW (W offset register) or LOADX (X offset register).
	// Used below for both the 64-bit (FPR64) and the 128-bit (FPR128) types.
	multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
	Instruction LOADW, Instruction LOADX> {

	def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
	(LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
	(LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	let AddedComplexity = 10 in {
	// Match all load 64 bits width whose type is compatible with FPR64
	let Predicates = [IsLE] in {
	// We must do vector loads with LD1 in big-endian.
	defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
	}

	// The single-element 64-bit types are not guarded by the IsLE predicate.
	defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
	defm : VecROLoadPat<ro64, v1f64, LDRDroW, LDRDroX>;

	// Match all load 128 bits width whose type is compatible with FPR128
	let Predicates = [IsLE] in {
	// We must do vector loads with LD1 in big-endian.
	defm : VecROLoadPat<ro128, v2i64, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v2f64, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v4i32, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
	defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
	}
	} // AddedComplexity = 10

	// zextload -> i64
	// Select an extending load that produces an i64 to the given load
	// instruction (INSTW for a W offset register, INSTX for an X offset
	// register), wrapping its result with SUBREG_TO_REG at sub_32.
	multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
	Instruction INSTW, Instruction INSTX> {
	def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
	(SUBREG_TO_REG (i64 0),
	(INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
	sub_32)>;

	def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
	(SUBREG_TO_REG (i64 0),
	(INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
	sub_32)>;
	}

	// i64 results of zero-extending and any-extending register-offset loads,
	// all mapped onto the zero-extending byte / half-word / word loads.
	let AddedComplexity = 10 in {
	defm : ExtLoadTo64ROPat<ro8, zextloadi8, LDRBBroW, LDRBBroX>;
	defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
	defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW, LDRWroX>;

	// zextloadi1 -> zextloadi8
	defm : ExtLoadTo64ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;

	// extload -> zextload
	defm : ExtLoadTo64ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
	defm : ExtLoadTo64ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
	defm : ExtLoadTo64ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;

	// extloadi1 -> zextloadi8
	defm : ExtLoadTo64ROPat<ro8, extloadi1, LDRBBroW, LDRBBroX>;
	}


	// extload / zextload -> i32
	// Select an extending load that produces an i32 directly to the given load
	// instruction (INSTW for a W offset register, INSTX for an X offset
	// register); no sub-register wrapping is needed for the 32-bit result.
	multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
	Instruction INSTW, Instruction INSTX> {
	def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
	(INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
	(INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;

	}

	// i32 results of any-extending (and i1 zero-extending) register-offset loads.
	let AddedComplexity = 10 in {
	// extload -> zextload
	defm : ExtLoadTo32ROPat<ro8, extloadi8, LDRBBroW, LDRBBroX>;
	defm : ExtLoadTo32ROPat<ro16, extloadi16, LDRHHroW, LDRHHroX>;
	defm : ExtLoadTo32ROPat<ro32, extloadi32, LDRWroW, LDRWroX>;

	// zextloadi1 -> zextloadi8
	defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
	}

	//---
	// (unsigned immediate)
	//---
	// Each defm pairs a register class with the uimm12sN offset operand and the
	// am_indexedN address pattern of the same access size. The patterns below
	// refer to the generated instructions as <name>ui.
	defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
	[(set GPR64:$Rt,
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
	defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
	[(set GPR32:$Rt,
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
	defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
	[(set FPR8:$Rt,
	(load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
	defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
	[(set (f16 FPR16:$Rt),
	(load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
	defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
	[(set (f32 FPR32:$Rt),
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
	defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
	[(set (f64 FPR64:$Rt),
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
	defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
	[(set (f128 FPR128:$Rt),
	(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;

	// For regular load, we do not have any alignment requirement.
	// Thus, it is safe to directly map the vector loads with interesting
	// addressing modes.
	// FIXME: We could do the same for bitconvert to floating point vectors.
	//
	// Unsigned-immediate counterparts of the register-offset scalar_to_vector
	// patterns: the scalar is loaded with the FP/SIMD load of matching width and
	// inserted at bsub/hsub/ssub/dsub of an IMPLICIT_DEF vector.
	def : Pat <(v8i8 (scalar_to_vector (i32
	(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	(INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
	def : Pat <(v16i8 (scalar_to_vector (i32
	(extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
	def : Pat <(v4i16 (scalar_to_vector (i32
	(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	(INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
	def : Pat <(v8i16 (scalar_to_vector (i32
	(extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
	def : Pat <(v2i32 (scalar_to_vector (i32
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
	(INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
	def : Pat <(v4i32 (scalar_to_vector (i32
	(load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
	// v1i64 is selected directly to the FPR64 load, without INSERT_SUBREG.
	def : Pat <(v1i64 (scalar_to_vector (i64
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat <(v2i64 (scalar_to_vector (i64
	(load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
	(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;

	// Match all load 64 bits width whose type is compatible with FPR64
	let Predicates = [IsLE] in {
	// We must use LD1 to perform vector loads in big-endian.
	def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	}
	// The single-element 64-bit types are not guarded by the IsLE predicate.
	def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
	(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;

	// Match all load 128 bits width whose type is compatible with FPR128
	let Predicates = [IsLE] in {
	// We must use LD1 to perform vector loads in big-endian.
	def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
	}
	// The f128 scalar load is not guarded by the IsLE predicate.
	def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
	(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;

	// Zero-extending half-word and byte loads into GPR32 (unsigned immediate).
	defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
	[(set GPR32:$Rt,
	(zextloadi16 (am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset)))]>;
	defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
	[(set GPR32:$Rt,
	(zextloadi8 (am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset)))]>;
	// zextload -> i64
	// i64 results reuse the GPR32 loads, wrapped in SUBREG_TO_REG at sub_32.
	def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;

	// zextloadi1 -> zextloadi8
	def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;

	// extload -> zextload
	// Any-extending loads are selected to the zero-extending instructions.
	def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
	(LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
	def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
	def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
	def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
	def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;

	// load sign-extended half-word (W: GPR32 result, X: GPR64 result)
	defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
	[(set GPR32:$Rt,
	(sextloadi16 (am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset)))]>;
	defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
	[(set GPR64:$Rt,
	(sextloadi16 (am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset)))]>;

	// load sign-extended byte (W: GPR32 result, X: GPR64 result)
	defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
	[(set GPR32:$Rt,
	(sextloadi8 (am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset)))]>;
	defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
	[(set GPR64:$Rt,
	(sextloadi8 (am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset)))]>;

	// load sign-extended word
	defm LDRSW : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
	[(set GPR64:$Rt,
	(sextloadi32 (am_indexed32 GPR64sp:$Rn,
	uimm12s4:$offset)))]>;

	// load zero-extended word
	// The i64 result reuses the GPR32 load, wrapped in SUBREG_TO_REG at sub_32.
	def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
	(SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;

	// Pre-fetch.
	def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
	[(AArch64Prefetch imm:$Rt,
	(am_indexed64 GPR64sp:$Rn,
	uimm12s8:$offset))]>;

	// "prfm" with no offset is accepted as an alias of PRFMui with offset 0.
	def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;

	//---
	// (literal)
	// Literal loads are defined for GPR32, GPR64, FPR32, FPR64 and FPR128 only.
	def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
	def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
	def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
	def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
	def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;

	// load sign-extended word
	def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;

	// prefetch
	// PRFMl is defined with an empty pattern list; the pattern on the following
	// line is left disabled.
	def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
	// [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;

	//---
	// (unscaled immediate)
	// Each defm pairs a register class with the am_unscaledN address pattern of
	// the same access size; the offset operand is always simm9. The patterns
	// below refer to the generated instructions as <name>i.
	defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
	[(set GPR64:$Rt,
	(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
	[(set GPR32:$Rt,
	(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
	[(set FPR8:$Rt,
	(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
	[(set FPR16:$Rt,
	(load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
	[(set (f32 FPR32:$Rt),
	(load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
	[(set (f64 FPR64:$Rt),
	(load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
	[(set (f128 FPR128:$Rt),
	(load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;

	// Zero-extending half-word load into GPR32.
	defm LDURHH
	: LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
	[(set GPR32:$Rt,
	(zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
	// Zero-extending byte load into GPR32 (unscaled immediate). The address is
	// matched with am_unscaled8 so that the addressing mode agrees with the
	// 8-bit access size of zextloadi8, as in every other byte-sized unscaled
	// pattern in this file.
	defm LDURBB
	: LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
	[(set GPR32:$Rt,
	(zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;

	// Match all load 64 bits width whose type is compatible with FPR64
	// The multi-element vector types are restricted to little-endian (IsLE).
	let Predicates = [IsLE] in {
	def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v4f16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;
	}
	// The single-element 64-bit types are not guarded by the IsLE predicate.
	def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
	(LDURDi GPR64sp:$Rn, simm9:$offset)>;

	// Match all load 128 bits width whose type is compatible with FPR128
	let Predicates = [IsLE] in {
	def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(v8f16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
	(LDURQi GPR64sp:$Rn, simm9:$offset)>;
	}

	// anyext -> zext
	// Any-extending unscaled loads are selected to the zero-extending
	// instructions; i64 results wrap the GPR32 load in SUBREG_TO_REG at sub_32.
	def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(LDURHHi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	// unscaled zext
	// Same mapping for the explicitly zero-extending loads.
	def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(LDURHHi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(LDURBBi GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;


	//---
	// LDR mnemonics fall back to LDUR for negative or unaligned offsets.

	// Define new assembler match classes as we want to only match these when
	// they don't otherwise match the scaled addressing mode for LDR/STR. Don't
	// associate a DiagnosticType either, as we want the diagnostic for the
	// canonical form (the scaled operand) to take precedence.
	//
	// Width is spliced into both the class name ("SImm9OffsetFB<Width>") and the
	// predicate method ("isSImm9OffsetFB<Width>"); operands are rendered with
	// addImmOperands.
	class SImm9OffsetOperand<int Width> : AsmOperandClass {
	let Name = "SImm9OffsetFB" # Width;
	let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
	let RenderMethod = "addImmOperands";
	}

	// One fallback match class per access width (8, 16, 32, 64, 128).
	def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
	def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
	def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
	def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
	def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;

	// i64 operands bound to the match classes above; used by the LDR -> LDUR
	// aliases that follow.
	def simm9_offset_fb8 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB8Operand;
	}
	def simm9_offset_fb16 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB16Operand;
	}
	def simm9_offset_fb32 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB32Operand;
	}
	def simm9_offset_fb64 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB64Operand;
	}
	def simm9_offset_fb128 : Operand<i64> {
	let ParserMatchClass = SImm9OffsetFB128Operand;
	}

	// "ldr" written with a fallback offset is accepted as the corresponding LDUR
	// instruction. The fallback operand width follows the register class
	// (X/D: 64, W/S: 32, H: 16, B: 8, Q: 128). Every alias passes 0 as the final
	// InstAlias argument.
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"ldr $Rt, [$Rn, $offset]",
	(LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;

	// zextload -> i64
	// NOTE(review): these two patterns repeat the i64 zextloadi8 / zextloadi16
	// am_unscaled patterns already defined in the "unscaled zext" group earlier
	// in this file.
	def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
	def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
	(SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;

	// load sign-extended half-word (W: GPR32 result, X: GPR64 result)
	defm LDURSHW
	: LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
	[(set GPR32:$Rt,
	(sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURSHX
	: LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
	[(set GPR64:$Rt,
	(sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;

	// load sign-extended byte (W: GPR32 result, X: GPR64 result)
	defm LDURSBW
	: LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
	[(set GPR32:$Rt,
	(sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
	defm LDURSBX
	: LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
	[(set GPR64:$Rt,
	(sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;

	// load sign-extended word
	defm LDURSW
	: LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
	[(set GPR64:$Rt,
	(sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;

	// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
	// The fallback operand width follows the access size of the mnemonic
	// (b: 8, h: 16, w: 32), not the destination register class.
	def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
	(LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
	(LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
	(LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
	(LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
	(LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
	(LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
	(LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;

	// Pre-fetch.
	// Unscaled-immediate prefetch, selected from AArch64Prefetch with an
	// am_unscaled64 address.
	defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
	[(AArch64Prefetch imm:$Rt,
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;

	//---
	// (unscaled immediate, unprivileged)
	// Only general-purpose register forms are defined here, and no selection
	// patterns are attached at this point.
	defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
	defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;

	defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
	defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;

	// load sign-extended half-word
	defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
	defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;

	// load sign-extended byte
	defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
	defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;

	// load sign-extended word
	defm LDTRSW : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;

	//---
	// (immediate pre-indexed)
	// The template arguments mirror those of the matching unsigned-immediate
	// definitions earlier in this file; no patterns are attached here.
	def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
	def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
	def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8, "ldr">;
	def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
	def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
	def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
	def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;

	// load sign-extended half-word
	def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
	def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;

	// load sign-extended byte
	def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
	def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;

	// load zero-extended byte and half-word
	def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
	def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;

	// load sign-extended word
	def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;

	//---
	// (immediate post-indexed)
	// Same set of records as the pre-indexed group, built from LoadPostIdx.
	def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
	def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
	def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8, "ldr">;
	def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
	def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
	def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
	def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;

	// load sign-extended half-word
	def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
	def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;

	// load sign-extended byte
	def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
	def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;

	// load zero-extended byte and half-word
	def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
	def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;

	// load sign-extended word
	def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;

	//===----------------------------------------------------------------------===//
	// Store instructions.
	//===----------------------------------------------------------------------===//

	// Pair (indexed, offset)
	// FIXME: Use dedicated range-checked addressing mode operand here.
	// Name suffix selects the register class: W = GPR32, X = GPR64, S = FPR32,
	// D = FPR64, Q = FPR128. The offset operand is chosen per register class.
	defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
	defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
	defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
	defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
	defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;

	// Pair (pre-indexed)
	def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
	def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
	def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
	def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
	def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;

	// Pair (post-indexed)
	def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
	def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
	def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
	def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
	def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;

	// Pair (no allocate)
	defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
	defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
	defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
	defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
	defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;

	//---
	// (Register offset)

	// Integer
	// Each defm supplies the register class, mnemonic, stored value type and the
	// store operator used for selection. The patterns later in this file refer
	// to the generated records as <name>roW / <name>roX.
	defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
	defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
	defm STRW : Store32RO<0b10, 0, 0b00, GPR32, "str", i32, store>;
	defm STRX : Store64RO<0b11, 0, 0b00, GPR64, "str", i64, store>;


	// Floating-point
	// The FPR8 form is declared with the "untyped" value type.
	defm STRB : Store8RO< 0b00, 1, 0b00, FPR8, "str", untyped, store>;
	defm STRH : Store16RO<0b01, 1, 0b00, FPR16, "str", f16, store>;
	defm STRS : Store32RO<0b10, 1, 0b00, FPR32, "str", f32, store>;
	defm STRD : Store64RO<0b11, 1, 0b00, FPR64, "str", f64, store>;
	defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128, store>;

	// Select a truncating store of a GPR64 value with a register-offset address.
	// The sub_32 sub-register of the source is extracted and stored with STRW
	// (W offset register) or STRX (X offset register). Note that the W/X suffix
	// of these parameters refers to the offset register, not the stored value:
	// both patterns store the 32-bit sub-register.
	multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
	Instruction STRW, Instruction STRX> {

	def : Pat<(storeop GPR64:$Rt,
	(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
	(STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	def : Pat<(storeop GPR64:$Rt,
	(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
	(STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	// i64 truncating stores of 8, 16 and 32 bits reuse the GPR32 store
	// instructions of matching width.
	let AddedComplexity = 10 in {
	// truncstore i64
	defm : TruncStoreFrom64ROPat<ro8, truncstorei8, STRBBroW, STRBBroX>;
	defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
	defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW, STRWroX>;
	}

	// Select a plain store of a VecTy value held in register class FPR, with a
	// register-offset address, to STRW (W offset register) or STRX (X offset
	// register).
	multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
	Instruction STRW, Instruction STRX> {
	def : Pat<(store (VecTy FPR:$Rt),
	(ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
	(STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;

	def : Pat<(store (VecTy FPR:$Rt),
	(ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
	(STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
	}

	let AddedComplexity = 10 in {
	// Match all store 64 bits width whose type is compatible with FPR64
	let Predicates = [IsLE] in {
	// We must use ST1 to store vectors in big-endian.
	defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
	defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
	defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
	defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
	defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
	}

	// The single-element 64-bit types are not guarded by the IsLE predicate.
	defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
	defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;

	// Match all store 128 bits width whose type is compatible with FPR128
	let Predicates = [IsLE] in {
	// We must use ST1 to store vectors in big-endian.
	defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
	defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
	defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
	defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
	defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
	defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
	defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
	}
	} // AddedComplexity = 10

	// Store of lane 0 of a 128-bit vector through a register-offset address.
	// Rather than extracting the element first, the pattern takes the SubRegIdx
	// subregister of the vector register and stores that directly. STy is the
	// scalar type produced by the vector_extract; storeop is the (possibly
	// truncating) store operator to match. As elsewhere, one pattern handles the
	// 32-bit offset register form and one the 64-bit offset register form.
	multiclass VecROStoreLane0Pat<ROAddrMode mode, SDPatternOperator storeop,
	                              ValueType VecTy, ValueType STy,
	                              SubRegIndex SubRegIdx,
	                              Instruction WOffStore, Instruction XOffStore> {
	  // Offset held in a GPR32.
	  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vec), 0)),
	                     (mode.Wpat GPR64sp:$Base, GPR32:$Idx, mode.Wext:$ext)),
	            (WOffStore (EXTRACT_SUBREG VecListOne128:$Vec, SubRegIdx),
	                       GPR64sp:$Base, GPR32:$Idx, mode.Wext:$ext)>;

	  // Offset held in a GPR64.
	  def : Pat<(storeop (STy (vector_extract (VecTy VecListOne128:$Vec), 0)),
	                     (mode.Xpat GPR64sp:$Base, GPR64:$Idx, mode.Xext:$ext)),
	            (XOffStore (EXTRACT_SUBREG VecListOne128:$Vec, SubRegIdx),
	                       GPR64sp:$Base, GPR64:$Idx, mode.Xext:$ext)>;
	}

	// Lane-0 store patterns for halfword (hsub), word (ssub) and doubleword (dsub)
	// elements. The integer 16- and 32-bit element cases are listed for both the
	// truncating store operator and the plain `store` operator.
	// NOTE(review): the complexity of 19 is presumably chosen so these outrank the
	// AddedComplexity = 10 store patterns above -- confirm.
	let AddedComplexity = 19 in {
	defm : VecROStoreLane0Pat<ro16, truncstorei16, v8i16, i32, hsub, STRHroW, STRHroX>;
	defm : VecROStoreLane0Pat<ro16, store , v8i16, i16, hsub, STRHroW, STRHroX>;
	defm : VecROStoreLane0Pat<ro32, truncstorei32, v4i32, i32, ssub, STRSroW, STRSroX>;
	defm : VecROStoreLane0Pat<ro32, store , v4i32, i32, ssub, STRSroW, STRSroX>;
	defm : VecROStoreLane0Pat<ro32, store , v4f32, f32, ssub, STRSroW, STRSroX>;
	defm : VecROStoreLane0Pat<ro64, store , v2i64, i64, dsub, STRDroW, STRDroX>;
	defm : VecROStoreLane0Pat<ro64, store , v2f64, f64, dsub, STRDroW, STRDroX>;
	}

	//---
	// (unsigned immediate)
	// Each defm pairs a register class with an immediate operand (uimm12s1 ..
	// uimm12s16) and an am_indexedN address pattern of the matching access size.
	// NOTE(review): uimm12sN is presumably an unsigned 12-bit offset scaled by N
	// bytes -- confirm against the operand definitions.

	// 64- and 32-bit general-purpose register stores.
	defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
	[(store GPR64:$Rt,
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
	defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
	[(store GPR32:$Rt,
	(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
	// FP/SIMD register stores (FPR8 .. FPR128).
	defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
	[(store FPR8:$Rt,
	(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
	defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
	[(store (f16 FPR16:$Rt),
	(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
	defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
	[(store (f32 FPR32:$Rt),
	(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
	defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
	[(store (f64 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
	// STRQ carries no inline pattern; the FPR128 stores (vector types and f128)
	// are selected by the explicit Pat definitions that follow this group.
	defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;

	// Truncating halfword and byte stores from a 32-bit general-purpose register.
	defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
	[(truncstorei16 GPR32:$Rt,
	(am_indexed16 GPR64sp:$Rn,
	uimm12s2:$offset))]>;
	defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1, "strb",
	[(truncstorei8 GPR32:$Rt,
	(am_indexed8 GPR64sp:$Rn,
	uimm12s1:$offset))]>;

	// Additional selection patterns for the unsigned-immediate store forms:
	// 64-bit vectors via STRDui, 128-bit vectors and f128 via STRQui, and
	// truncating stores of a 64-bit GPR via the narrower GPR32 store instructions.
	// Match all store 64 bits width whose type is compatible with FPR64
	let AddedComplexity = 10 in {
	let Predicates = [IsLE] in {
	// We must use ST1 to store vectors in big-endian.
	def : Pat<(store (v2f32 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(store (v8i8 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(store (v4i16 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(store (v2i32 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(store (v4f16 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	}
	// Single-element 64-bit vectors are matched without the IsLE predicate.
	def : Pat<(store (v1f64 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
	def : Pat<(store (v1i64 FPR64:$Rt),
	(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
	(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;

	// Match all store 128 bits width whose type is compatible with FPR128
	let Predicates = [IsLE] in {
	// We must use ST1 to store vectors in big-endian.
	def : Pat<(store (v4f32 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(store (v2f64 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(store (v16i8 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(store (v8i16 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(store (v4i32 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(store (v2i64 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	def : Pat<(store (v8f16 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
	}
	// The scalar f128 store is matched regardless of endianness.
	def : Pat<(store (f128 FPR128:$Rt),
	(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
	(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;

	// truncstore i64
	// The low 32 bits of the 64-bit source (sub_32) are passed to the GPR32
	// store instruction of the requested width.
	def : Pat<(truncstorei32 GPR64:$Rt,
	(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
	(STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
	def : Pat<(truncstorei16 GPR64:$Rt,
	(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
	(STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
	def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
	(STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;

	} // AddedComplexity = 10

	//---
	// (unscaled immediate)
	// "stur"-family stores. Every form takes a simm9 offset and matches an
	// am_unscaledN address pattern of the corresponding access size.

	// General-purpose register stores.
	defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
	[(store GPR64:$Rt,
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
	[(store GPR32:$Rt,
	(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
	// FP/SIMD register stores (FPR8 .. FPR128).
	defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
	[(store FPR8:$Rt,
	(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
	[(store (f16 FPR16:$Rt),
	(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
	[(store (f32 FPR32:$Rt),
	(am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
	[(store (f64 FPR64:$Rt),
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
	[(store (f128 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
	// Truncating halfword and byte stores from a 32-bit general-purpose register.
	defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
	[(truncstorei16 GPR32:$Rt,
	(am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
	defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
	[(truncstorei8 GPR32:$Rt,
	(am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;

	// Unscaled-offset stores of 64-bit vector values, all selected to STURDi.
	// Match all store 64 bits width whose type is compatible with FPR64
	let Predicates = [IsLE] in {
	// We must use ST1 to store vectors in big-endian.
	def : Pat<(store (v2f32 FPR64:$Rt),
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v8i8 FPR64:$Rt),
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v4i16 FPR64:$Rt),
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v2i32 FPR64:$Rt),
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v4f16 FPR64:$Rt),
	(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	}
	// Single-element 64-bit vectors are matched without the IsLE predicate.
	def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;

	// Match all store 128 bits width whose type is compatible with FPR128
	// Unscaled-offset stores of 128-bit vector values, all selected to STURQi.
	// Each vector type is listed exactly once: v4f32, v2f64, v16i8, v8i16, v4i32,
	// v2i64 and v8f16. The scalar f128 case is covered by the pattern attached to
	// the STURQ definition itself.
	let Predicates = [IsLE] in {
	// We must use ST1 to store vectors in big-endian.
	def : Pat<(store (v4f32 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v2f64 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v16i8 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v8i16 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v4i32 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v2i64 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(store (v8f16 FPR128:$Rt),
	(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
	}

	// unscaled i64 truncating stores
	// The low 32 bits of the 64-bit source (sub_32) are passed to the GPR32
	// unscaled store instruction of the requested width.
	def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
	(STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
	(STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
	def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
	(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;

	//---
	// STR mnemonics fall back to STUR for negative or unaligned offsets.
	// One alias per register class; the simm9_offset_fbN operand is chosen to
	// match the access size of the target STUR instruction. The trailing 0 is the
	// InstAlias emit priority, so these aliases are accepted by the assembler
	// parser but are not used when printing.
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
	def : InstAlias<"str $Rt, [$Rn, $offset]",
	(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;

	// Byte and halfword truncating stores fall back the same way.
	def : InstAlias<"strb $Rt, [$Rn, $offset]",
	(STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
	def : InstAlias<"strh $Rt, [$Rn, $offset]",
	(STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;

	//---
	// (unscaled immediate, unprivileged)
	// "sttr" family. No selection patterns are supplied here, so these are not
	// chosen by instruction selection from this definition alone.
	// Word and doubleword forms.
	defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
	defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;

	// Halfword and byte forms, both taking a 32-bit source register.
	defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
	defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;

	//---
	// (immediate pre-indexed)
	// Pre-indexed "str" instructions for each register class, followed by extra
	// patterns that reuse them for i64 truncating stores and for vector types.
	def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str", pre_store, i32>;
	def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str", pre_store, i64>;
	def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8, "str", pre_store, untyped>;
	def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str", pre_store, f16>;
	def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str", pre_store, f32>;
	def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str", pre_store, f64>;
	def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;

	// Truncating byte and halfword stores from a 32-bit register.
	def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8, i32>;
	def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;

	// truncstore i64
	// The low 32 bits of the 64-bit source (sub_32) are stored with the GPR32
	// pre-indexed instruction of the requested width.
	def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	(STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
	simm9:$off)>;
	def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	(STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
	simm9:$off)>;
	def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	(STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
	simm9:$off)>;

	// 64-bit vector types, stored with STRDpre.
	// NOTE(review): unlike the non-indexed vector store patterns, these carry no
	// IsLE predicate -- confirm that is intended.
	def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;

	// 128-bit vector types, stored with STRQpre.
	def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(pre_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;

	//---
	// (immediate post-indexed)
	// Post-indexed "str" instructions for each register class, followed by extra
	// patterns that reuse them for i64 truncating stores and for vector types.
	def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str", post_store, i32>;
	def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str", post_store, i64>;
	def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8, "str", post_store, untyped>;
	def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str", post_store, f16>;
	def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str", post_store, f32>;
	def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str", post_store, f64>;
	def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;

	// Truncating byte and halfword stores from a 32-bit register.
	def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
	def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;

	// truncstore i64
	// The low 32 bits of the 64-bit source (sub_32) are stored with the GPR32
	// post-indexed instruction of the requested width.
	def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	(STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
	simm9:$off)>;
	def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	(STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
	simm9:$off)>;
	def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
	(STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
	simm9:$off)>;

	// 64-bit vector types, stored with STRDpost.
	// NOTE(review): unlike the non-indexed vector store patterns, these carry no
	// IsLE predicate -- confirm that is intended.
	def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4f16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
	(STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;

	// 128-bit vector types, stored with STRQpost.
	def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
	def : Pat<(post_store (v8f16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
	(STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;

	//===----------------------------------------------------------------------===//
	// Load/store exclusive instructions.
	//===----------------------------------------------------------------------===//

	// In every group below the W/X suffix selects a GPR32 or GPR64 data register;
	// the B and H (byte / halfword) variants also use GPR32.

	// Load-acquire.
	def LDARW : LoadAcquire <0b10, 1, 1, 0, 1, GPR32, "ldar">;
	def LDARX : LoadAcquire <0b11, 1, 1, 0, 1, GPR64, "ldar">;
	def LDARB : LoadAcquire <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
	def LDARH : LoadAcquire <0b01, 1, 1, 0, 1, GPR32, "ldarh">;

	// Load-acquire exclusive.
	def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
	def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
	def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
	def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;

	// Load exclusive.
	def LDXRW : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
	def LDXRX : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
	def LDXRB : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
	def LDXRH : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;

	// Store-release.
	def STLRW : StoreRelease <0b10, 1, 0, 0, 1, GPR32, "stlr">;
	def STLRX : StoreRelease <0b11, 1, 0, 0, 1, GPR64, "stlr">;
	def STLRB : StoreRelease <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
	def STLRH : StoreRelease <0b01, 1, 0, 0, 1, GPR32, "stlrh">;

	// Store-release exclusive.
	def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
	def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
	def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
	def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;

	// Store exclusive.
	def STXRW : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
	def STXRX : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
	def STXRB : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
	def STXRH : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;

	// Load-acquire exclusive pair.
	def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
	def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;

	// Load exclusive pair.
	def LDXPW : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
	def LDXPX : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;

	// Store-release exclusive pair.
	def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
	def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;

	// Store exclusive pair.
	def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
	def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;

	// The LDLAR* / STLLR* definitions differ from LDAR* / STLR* above only in
	// the last encoding field (0 instead of 1) and are gated on HasV8_1a.
	let Predicates = [HasV8_1a] in {
	// v8.1a "Limited Order Region" extension load-acquire instructions
	def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">;
	def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">;
	def LDLARB : LoadAcquire <0b00, 1, 1, 0, 0, GPR32, "ldlarb">;
	def LDLARH : LoadAcquire <0b01, 1, 1, 0, 0, GPR32, "ldlarh">;

	// v8.1a "Limited Order Region" extension store-release instructions
	def STLLRW : StoreRelease <0b10, 1, 0, 0, 0, GPR32, "stllr">;
	def STLLRX : StoreRelease <0b11, 1, 0, 0, 0, GPR64, "stllr">;
	def STLLRB : StoreRelease <0b00, 1, 0, 0, 0, GPR32, "stllrb">;
	def STLLRH : StoreRelease <0b01, 1, 0, 0, 0, GPR32, "stllrh">;
	}

	//===----------------------------------------------------------------------===//
	// Scaled floating point to integer conversion instructions.
	//===----------------------------------------------------------------------===//

	// Scalar FP-to-integer conversions. The FCVT{A,M,N,P}{S,U} forms are selected
	// from the matching int_aarch64_neon_fcvt* intrinsic.
	defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
	defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
	defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
	defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
	defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
	defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
	defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
	defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
	// FCVTZS / FCVTZU are selected from the generic fp_to_sint / fp_to_uint nodes
	// and are instantiated twice: once through FPToIntegerUnscaled and once
	// through FPToIntegerScaled (the fixed-point forms).
	defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
	defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
	defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
	defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;

	// Patterns that select an FP-to-integer conversion intrinsic `cvt` to the
	// instruction family whose name starts with INST.
	//  * A direct conversion of an f16/f32/f64 value picks the unscaled
	//    instruction INST#U{W,X}{H,S,D}r (W = i32 result, X = i64 result).
	//  * A conversion of (fmul value, fixed-point scale constant) picks the scaled
	//    instruction INST#S{W,X}{H,S,D}ri and forwards the scale operand.
	multiclass FPToIntegerIntPats<Intrinsic cvt, string INST> {
	  // Unscaled: f16 source.
	  def : Pat<(i32 (cvt f16:$Src)), (!cast<Instruction>(INST # UWHr) $Src)>;
	  def : Pat<(i64 (cvt f16:$Src)), (!cast<Instruction>(INST # UXHr) $Src)>;
	  // Unscaled: f32 source.
	  def : Pat<(i32 (cvt f32:$Src)), (!cast<Instruction>(INST # UWSr) $Src)>;
	  def : Pat<(i64 (cvt f32:$Src)), (!cast<Instruction>(INST # UXSr) $Src)>;
	  // Unscaled: f64 source.
	  def : Pat<(i32 (cvt f64:$Src)), (!cast<Instruction>(INST # UWDr) $Src)>;
	  def : Pat<(i64 (cvt f64:$Src)), (!cast<Instruction>(INST # UXDr) $Src)>;

	  // Scaled: f16 source.
	  def : Pat<(i32 (cvt (fmul f16:$Src, fixedpoint_f16_i32:$scale))),
	            (!cast<Instruction>(INST # SWHri) $Src, $scale)>;
	  def : Pat<(i64 (cvt (fmul f16:$Src, fixedpoint_f16_i64:$scale))),
	            (!cast<Instruction>(INST # SXHri) $Src, $scale)>;
	  // Scaled: f32 source.
	  def : Pat<(i32 (cvt (fmul f32:$Src, fixedpoint_f32_i32:$scale))),
	            (!cast<Instruction>(INST # SWSri) $Src, $scale)>;
	  def : Pat<(i64 (cvt (fmul f32:$Src, fixedpoint_f32_i64:$scale))),
	            (!cast<Instruction>(INST # SXSri) $Src, $scale)>;
	  // Scaled: f64 source.
	  def : Pat<(i32 (cvt (fmul f64:$Src, fixedpoint_f64_i32:$scale))),
	            (!cast<Instruction>(INST # SWDri) $Src, $scale)>;
	  def : Pat<(i64 (cvt (fmul f64:$Src, fixedpoint_f64_i64:$scale))),
	            (!cast<Instruction>(INST # SXDri) $Src, $scale)>;
	}

	// The intrinsic forms of the round-toward-zero conversions.
	defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzs, "FCVTZS">;
	defm : FPToIntegerIntPats<int_aarch64_neon_fcvtzu, "FCVTZU">;

	// Folds a rounding node followed by an integer conversion into a single
	// conversion instruction. `conv` is the FP-to-integer node, `rnd` the
	// rounding node applied to the source first, and INST the name prefix of the
	// instruction family; the unscaled INST#U{W,X}{S,D}r variant is chosen from
	// the result width (W = i32, X = i64) and source type (S = f32, D = f64).
	multiclass FPToIntegerPats<SDNode conv, SDNode rnd, string INST> {
	  // f32 source.
	  def : Pat<(i32 (conv (rnd f32:$Src))),
	            (!cast<Instruction>(INST # UWSr) f32:$Src)>;
	  def : Pat<(i64 (conv (rnd f32:$Src))),
	            (!cast<Instruction>(INST # UXSr) f32:$Src)>;
	  // f64 source.
	  def : Pat<(i32 (conv (rnd f64:$Src))),
	            (!cast<Instruction>(INST # UWDr) f64:$Src)>;
	  def : Pat<(i64 (conv (rnd f64:$Src))),
	            (!cast<Instruction>(INST # UXDr) f64:$Src)>;
	}

	// fceil -> FCVTP*, ffloor -> FCVTM*, ftrunc -> FCVTZ*, fround -> FCVTA*;
	// the S/U suffix follows the signedness of the integer conversion.
	defm : FPToIntegerPats<fp_to_sint, fceil, "FCVTPS">;
	defm : FPToIntegerPats<fp_to_uint, fceil, "FCVTPU">;
	defm : FPToIntegerPats<fp_to_sint, ffloor, "FCVTMS">;
	defm : FPToIntegerPats<fp_to_uint, ffloor, "FCVTMU">;
	defm : FPToIntegerPats<fp_to_sint, ftrunc, "FCVTZS">;
	defm : FPToIntegerPats<fp_to_uint, ftrunc, "FCVTZU">;
	defm : FPToIntegerPats<fp_to_sint, fround, "FCVTAS">;
	defm : FPToIntegerPats<fp_to_uint, fround, "FCVTAU">;

	//===----------------------------------------------------------------------===//
	// Scaled integer to floating point conversion instructions.
	//===----------------------------------------------------------------------===//

	// Signed (sint_to_fp) and unsigned (uint_to_fp) integer-to-FP conversions; the
	// first template argument distinguishes the two.
	defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
	defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;

	//===----------------------------------------------------------------------===//
	// Unscaled integer to floating point conversion instruction.
	//===----------------------------------------------------------------------===//

	// "fmov" forms produced by the UnscaledConversion multiclass.
	defm FMOV : UnscaledConversion<"fmov">;

	// Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable
	// Both pseudos take no inputs and are selected for an fpimm0 constant of
	// type f32 / f64; they are code-generation only (isCodeGenOnly).
	let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in {
	def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
	Sched<[WriteF]>;
	def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
	Sched<[WriteF]>;
	}
	// Similarly add aliases
	// "fmov $Rd, #0.0" is accepted as a move from the zero register (WZR / XZR).
	// The FPR16 form additionally requires HasFullFP16.
	def : InstAlias<"fmov $Rd, #0.0", (FMOVWHr FPR16:$Rd, WZR), 0>,
	Requires<[HasFullFP16]>;
	def : InstAlias<"fmov $Rd, #0.0", (FMOVWSr FPR32:$Rd, WZR), 0>;
	def : InstAlias<"fmov $Rd, #0.0", (FMOVXDr FPR64:$Rd, XZR), 0>;

	//===----------------------------------------------------------------------===//
	// Floating point conversion instruction.
	//===----------------------------------------------------------------------===//

	// "fcvt" instructions, all generated by the FPConversion multiclass.
	defm FCVT : FPConversion<"fcvt">;

	//===----------------------------------------------------------------------===//
	// Floating point single operand instructions.
	//===----------------------------------------------------------------------===//

	// Scalar one-operand FP instructions. The last template argument, when
	// present, is the DAG node or intrinsic the instruction is selected from;
	// FMOV supplies none.
	defm FABS : SingleOperandFPData<0b0001, "fabs", fabs>;
	defm FMOV : SingleOperandFPData<0b0000, "fmov">;
	defm FNEG : SingleOperandFPData<0b0010, "fneg", fneg>;
	defm FRINTA : SingleOperandFPData<0b1100, "frinta", fround>;
	defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
	defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
	defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
	defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;

	// The frintn intrinsic on a one-element f64 vector reuses the scalar
	// double-precision instruction.
	def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
	(FRINTNDr FPR64:$Rn)>;

	defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
	defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;

	// FSQRT is scheduled with the WriteFDiv resource.
	let SchedRW = [WriteFDiv] in {
	defm FSQRT : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
	}

	//===----------------------------------------------------------------------===//
	// Floating point two operand instructions.
	//===----------------------------------------------------------------------===//

	// Scalar two-operand FP instructions, each selected from the DAG node given as
	// the last template argument. FDIV is scheduled as WriteFDiv; FMUL and FNMUL
	// as WriteFMul.
	defm FADD : TwoOperandFPData<0b0010, "fadd", fadd>;
	let SchedRW = [WriteFDiv] in {
	defm FDIV : TwoOperandFPData<0b0001, "fdiv", fdiv>;
	}
	defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", fmaxnum>;
	defm FMAX : TwoOperandFPData<0b0100, "fmax", fmaxnan>;
	defm FMINNM : TwoOperandFPData<0b0111, "fminnm", fminnum>;
	defm FMIN : TwoOperandFPData<0b0101, "fmin", fminnan>;
	let SchedRW = [WriteFMul] in {
	defm FMUL : TwoOperandFPData<0b0000, "fmul", fmul>;
	defm FNMUL : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
	}
	defm FSUB : TwoOperandFPData<0b0011, "fsub", fsub>;

	// min/max on one-element f64 vectors reuse the scalar double-precision
	// instructions.
	def : Pat<(v1f64 (fmaxnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v1f64 (fminnan (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMINDrr FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v1f64 (fmaxnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v1f64 (fminnum (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;

	//===----------------------------------------------------------------------===//
	// Floating point three operand instructions.
	//===----------------------------------------------------------------------===//

	// Scalar fused multiply-add family, expressed in terms of the `fma` node
	// (LHS * MHS + RHS):
	//   FMADD  : fma(LHS, MHS, RHS)
	//   FMSUB  : fma(LHS, -MHS, RHS)
	//   FNMADD : -fma(LHS, MHS, RHS)
	//   FNMSUB : fma(LHS, MHS, -RHS)
	defm FMADD : ThreeOperandFPData<0, 0, "fmadd", fma>;
	defm FMSUB : ThreeOperandFPData<0, 1, "fmsub",
	TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
	defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
	TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
	defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
	TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;

	// The following def pats catch the case where the LHS of an FMA is negated.
	// The TriOpFrag above catches the case where the middle operand is negated.

	// N.b. FMSUB etc. take the accumulator as the last operand of (ins), unlike
	// the NEON variant.
	// fma with a negated first multiplicand selects FMSUB, for f32 and f64.
	def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
	(FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;

	def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
	(FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;

	// We handled -(a + bc) for FNMADD above, now it's time for "(-a) + (-b)c" and
	// "(-a) + b*(-c)".
	// In both shapes the addend $Ra is negated and exactly one multiplicand is
	// negated: first $Rn, then $Rm.
	def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
	(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;

	def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
	(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;

	def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
	(FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;

	def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
	(FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;

	//===----------------------------------------------------------------------===//
	// Floating point comparison instructions.
	//===----------------------------------------------------------------------===//

	// FCMPE is defined without a selection node; FCMP is selected from
	// AArch64fcmp.
	defm FCMPE : FPComparison<1, "fcmpe">;
	defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;

	//===----------------------------------------------------------------------===//
	// Floating point conditional comparison instructions.
	//===----------------------------------------------------------------------===//

	// FCCMPE is defined without a selection node; FCCMP is selected from
	// AArch64fccmp.
	defm FCCMPE : FPCondComparison<1, "fccmpe">;
	defm FCCMP : FPCondComparison<0, "fccmp", AArch64fccmp>;

	//===----------------------------------------------------------------------===//
	// Floating point conditional select instruction.
	//===----------------------------------------------------------------------===//

	// "fcsel" instructions, all generated by the FPCondSelect multiclass.
	defm FCSEL : FPCondSelect<"fcsel">;

	// CSEL instructions providing f128 types need to be handled by a
	// pseudo-instruction since the eventual code will need to introduce basic
	// blocks and control flow.
	// Pseudo selected for an AArch64csel node producing an f128 value: picks
	// between $Rn and $Rm according to condition code $cond.
	def F128CSEL : Pseudo<(outs FPR128:$Rd),
	(ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
	[(set (f128 FPR128:$Rd),
	(AArch64csel FPR128:$Rn, FPR128:$Rm,
	(i32 imm:$cond), NZCV))]> {
	// Reads the condition flags.
	let Uses = [NZCV];
	// Expanded by target code after selection instead of being emitted directly.
	let usesCustomInserter = 1;
	let hasNoSchedulingInfo = 1;
	}


	//===----------------------------------------------------------------------===//
	// Floating point immediate move.
	//===----------------------------------------------------------------------===//

	// Immediate forms of "fmov"; flagged isReMaterializable.
	let isReMaterializable = 1 in {
	defm FMOV : FPMoveImmediate<"fmov">;
	}

	//===----------------------------------------------------------------------===//
	// Advanced SIMD two vector instructions.
	//===----------------------------------------------------------------------===//

	// UABDL, selected from int_aarch64_neon_uabd, plus patterns that recognise an
	// absolute difference of zero-extended operands written out with generic
	// nodes.
	defm UABDL : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
	int_aarch64_neon_uabd>;
	// Match UABDL in log2-shuffle patterns.
	// abs(zext(a) - zext(b)) on the low (V64) halves.
	def : Pat<(abs (v8i16 (sub (zext (v8i8 V64:$opA)),
	(zext (v8i8 V64:$opB))))),
	(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
	// Same operation with abs spelled as xor(s, add(x, s)), s = ashr(src, 15).
	// NOTE(review): the pattern only requires both shifts to use the same $src;
	// nothing here ties $src to the (sub ...) expression itself -- confirm this
	// cannot match when $src is an unrelated value.
	def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
	(v8i16 (add (sub (zext (v8i8 V64:$opA)),
	(zext (v8i8 V64:$opB))),
	(AArch64vashr v8i16:$src, (i32 15))))),
	(UABDLv8i8_v8i16 V64:$opA, V64:$opB)>;
	// High-half (extract_high) variants of the two patterns above.
	def : Pat<(abs (v8i16 (sub (zext (extract_high_v16i8 V128:$opA)),
	(zext (extract_high_v16i8 V128:$opB))))),
	(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
	def : Pat<(xor (v8i16 (AArch64vashr v8i16:$src, (i32 15))),
	(v8i16 (add (sub (zext (extract_high_v16i8 V128:$opA)),
	(zext (extract_high_v16i8 V128:$opB))),
	(AArch64vashr v8i16:$src, (i32 15))))),
	(UABDLv16i8_v8i16 V128:$opA, V128:$opB)>;
	// 16-bit -> 32-bit element widening, low and high halves.
	def : Pat<(abs (v4i32 (sub (zext (v4i16 V64:$opA)),
	(zext (v4i16 V64:$opB))))),
	(UABDLv4i16_v4i32 V64:$opA, V64:$opB)>;
	def : Pat<(abs (v4i32 (sub (zext (extract_high_v8i16 V128:$opA)),
	(zext (extract_high_v8i16 V128:$opB))))),
	(UABDLv8i16_v4i32 V128:$opA, V128:$opB)>;
	// 32-bit -> 64-bit element widening, low and high halves.
	def : Pat<(abs (v2i64 (sub (zext (v2i32 V64:$opA)),
	(zext (v2i32 V64:$opB))))),
	(UABDLv2i32_v2i64 V64:$opA, V64:$opB)>;
	def : Pat<(abs (v2i64 (sub (zext (extract_high_v4i32 V128:$opA)),
	(zext (extract_high_v4i32 V128:$opB))))),
	(UABDLv4i32_v2i64 V128:$opA, V128:$opB)>;

	// Two-operand vector instructions. Each defm names the multiclass to expand,
	// the assembly mnemonic, and (where present) the DAG operator or intrinsic
	// used for selection. The leading numeric arguments are presumably encoding
	// fields - see the multiclass definitions to confirm their meaning.
	defm ABS : SIMDTwoVectorBHSD<0, 0b01011, "abs", abs>;
	defm CLS : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
	defm CLZ : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
	// Integer compares selected from the AArch64cm*z nodes.
	defm CMEQ : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
	defm CMGE : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
	defm CMGT : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
	defm CMLE : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
	defm CMLT : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
	defm CNT : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
	defm FABS : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;

	// Floating-point compares selected from the AArch64fcm*z nodes.
	defm FCMEQ : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
	defm FCMGE : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
	defm FCMGT : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
	defm FCMLE : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
	defm FCMLT : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
	defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
	defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
	// FCVTL is defined without a selection operator; the patterns below attach
	// the vcvthf2fp intrinsic and fpextend to its records.
	defm FCVTL : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
	def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
	(FCVTLv4i16 V64:$Rn)>;
	// Source taken from elements 4.. of a v8i16: use the V128-operand record.
	def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
	(i64 4)))),
	(FCVTLv8i16 V128:$Rn)>;
	def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
	// Source taken from elements 2.. of a v4f32: use the V128-operand record.
	def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
	(i64 2))))),
	(FCVTLv4i32 V128:$Rn)>;

	// fpextend from half-precision vectors also selects FCVTL: the v4f16 source
	// uses FCVTLv4i16, and a v4f16 extracted at element 4 of a v8f16 uses
	// FCVTLv8i16 on the full 128-bit register.
	def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
	def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
	(i64 4))))),
	(FCVTLv8i16 V128:$Rn)>;

	// Vector FP-to-integer conversions, each selected from its matching
	// int_aarch64_neon_fcvt* intrinsic.
	defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
	defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
	defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
	defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
	// FCVTN is defined without a selection operator; the patterns below attach
	// the vcvtfp2hf intrinsic and fpround to its records.
	defm FCVTN : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
	def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
	(FCVTNv4i16 V128:$Rn)>;
	// concat_vectors(Rd, narrowed) selects the record that takes the existing
	// value as an operand: $Rd is inserted into the dsub subregister of an
	// IMPLICIT_DEF and passed as the first operand.
	def : Pat<(concat_vectors V64:$Rd,
	(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
	(FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
	def : Pat<(v2f32 (fpround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
	def : Pat<(v4f16 (fpround (v4f32 V128:$Rn))), (FCVTNv4i16 V128:$Rn)>;
	def : Pat<(concat_vectors V64:$Rd, (v2f32 (fpround (v2f64 V128:$Rn)))),
	(FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
	defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
	defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
	defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
	int_aarch64_neon_fcvtxn>;
	// FCVTZS/FCVTZU are selected from the generic fp_to_sint / fp_to_uint nodes.
	defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
	defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;

	// The int_aarch64_neon_fcvtzs intrinsic selects the same FCVTZS records as
	// fp_to_sint, one pattern per vector type.
	def : Pat<(v4i16 (int_aarch64_neon_fcvtzs v4f16:$Rn)), (FCVTZSv4f16 $Rn)>;
	def : Pat<(v8i16 (int_aarch64_neon_fcvtzs v8f16:$Rn)), (FCVTZSv8f16 $Rn)>;
	def : Pat<(v2i32 (int_aarch64_neon_fcvtzs v2f32:$Rn)), (FCVTZSv2f32 $Rn)>;
	def : Pat<(v4i32 (int_aarch64_neon_fcvtzs v4f32:$Rn)), (FCVTZSv4f32 $Rn)>;
	def : Pat<(v2i64 (int_aarch64_neon_fcvtzs v2f64:$Rn)), (FCVTZSv2f64 $Rn)>;

	// Likewise, int_aarch64_neon_fcvtzu selects the FCVTZU records.
	def : Pat<(v4i16 (int_aarch64_neon_fcvtzu v4f16:$Rn)), (FCVTZUv4f16 $Rn)>;
	def : Pat<(v8i16 (int_aarch64_neon_fcvtzu v8f16:$Rn)), (FCVTZUv8f16 $Rn)>;
	def : Pat<(v2i32 (int_aarch64_neon_fcvtzu v2f32:$Rn)), (FCVTZUv2f32 $Rn)>;
	def : Pat<(v4i32 (int_aarch64_neon_fcvtzu v4f32:$Rn)), (FCVTZUv4f32 $Rn)>;
	def : Pat<(v2i64 (int_aarch64_neon_fcvtzu v2f64:$Rn)), (FCVTZUv2f64 $Rn)>;

	// Vector FP unary operations. The FRINT* records are selected from the
	// generic rounding nodes (fround, fnearbyint, ffloor, fceil, frint, ftrunc),
	// except FRINTN, which uses an intrinsic.
	defm FNEG : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
	defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
	defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", fround>;
	defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
	defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
	defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
	defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
	defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
	defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
	defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
	defm FSQRT : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
	// NEG is selected from (0 - x), written as a one-operand pattern fragment.
	defm NEG : SIMDTwoVectorBHSD<1, 0b01011, "neg",
	UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
	defm NOT : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
	// Aliases for MVN -> NOT.
	// One alias per byte-vector record; the braces hold the two alternative
	// assembly spellings of the operand list.
	def : InstAlias<"mvn{ $Vd.8b, $Vn.8b\|.8b $Vd, $Vn}",
	(NOTv8i8 V64:$Vd, V64:$Vn)>;
	def : InstAlias<"mvn{ $Vd.16b, $Vn.16b\|.16b $Vd, $Vn}",
	(NOTv16i8 V128:$Vd, V128:$Vn)>;

	// AArch64neg selects the NEG record whose name matches the vector type.
	def : Pat<(AArch64neg (v8i8 V64:$Rn)), (NEGv8i8 V64:$Rn)>;
	def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
	def : Pat<(AArch64neg (v4i16 V64:$Rn)), (NEGv4i16 V64:$Rn)>;
	def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
	def : Pat<(AArch64neg (v2i32 V64:$Rn)), (NEGv2i32 V64:$Rn)>;
	def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
	def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;

	// AArch64not on any element type selects the byte-vector NOT record of the
	// same register width (NOTv8i8 for V64, NOTv16i8 for V128).
	def : Pat<(AArch64not (v8i8 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(AArch64not (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(AArch64not (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v1i64 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;

	// vnot on the wider element types is mapped to the byte-vector NOT records
	// in the same way. No v1i64 vnot pattern appears in this group.
	def : Pat<(vnot (v4i16 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(vnot (v2i32 V64:$Rn)), (NOTv8i8 V64:$Rn)>;
	def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
	def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;

	// Remaining two-operand vector instructions: bit/element reversal, pairwise
	// long adds, integer<->FP conversion, saturating unary ops and narrowing.
	defm RBIT : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
	defm REV16 : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
	defm REV32 : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
	defm REV64 : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
	// SADALP is selected from add(acc, saddlp(x)).
	defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
	BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
	defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
	defm SCVTF : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
	// SHLL takes no template arguments here; its selection patterns are the
	// AArch64vshl patterns defined separately in this file.
	defm SHLL : SIMDVectorLShiftLongBySizeBHS;
	defm SQABS : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
	defm SQNEG : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
	defm SQXTN : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
	defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
	defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
	// UADALP is selected from add(acc, uaddlp(x)).
	defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
	BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
	defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
	int_aarch64_neon_uaddlp>;
	defm UCVTF : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
	defm UQXTN : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
	defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
	defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
	defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
	// XTN is selected from the generic trunc node.
	defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;

	// Floating-point vector types reuse the integer REV32/REV64 records with
	// the same element width (f16 -> *i16 records, f32 -> *i32 records).
	def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
	def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
	def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
	def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
	def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
	def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;

	// Patterns for vector long shift (by element width). These need to match all
	// three of zext, sext and anyext so it's easier to pull the patterns out of the
	// definition.
	//
	// Parameter `ext` is the extension operator to match. Each pattern matches
	// AArch64vshl of an extended source by the source element width (8, 16 or
	// 32) and selects the corresponding SHLL record. The extract_high_* forms
	// select the records that take the full V128 register.
	multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
	def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
	(SHLLv8i8 V64:$Rn)>;
	def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
	(SHLLv16i8 V128:$Rn)>;
	def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
	(SHLLv4i16 V64:$Rn)>;
	def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
	(SHLLv8i16 V128:$Rn)>;
	def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
	(SHLLv2i32 V64:$Rn)>;
	def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
	(SHLLv4i32 V128:$Rn)>;
	}

	// Instantiate the SHLL selection patterns once per extension kind.
	defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
	defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
	defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three vector instructions.
	//===----------------------------------------------------------------------===//

	// Three-operand same-type vector instructions. Each defm gives the
	// multiclass, the mnemonic, and the DAG operator or intrinsic it is
	// selected from.
	defm ADD : SIMDThreeSameVector<0, 0b10000, "add", add>;
	defm ADDP : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
	// Integer compares, selected from the AArch64cm* nodes.
	defm CMEQ : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
	defm CMGE : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
	defm CMGT : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
	defm CMHI : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
	defm CMHS : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
	defm CMTST : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
	defm FABD : SIMDThreeSameVectorFP<1,1,0b010,"fabd", int_aarch64_neon_fabd>;
	defm FACGE : SIMDThreeSameVectorFPCmp<1,0,0b101,"facge",int_aarch64_neon_facge>;
	defm FACGT : SIMDThreeSameVectorFPCmp<1,1,0b101,"facgt",int_aarch64_neon_facgt>;
	// FADDP uses the same int_aarch64_neon_addp intrinsic as the integer ADDP.
	defm FADDP : SIMDThreeSameVectorFP<1,0,0b010,"faddp",int_aarch64_neon_addp>;
	defm FADD : SIMDThreeSameVectorFP<0,0,0b010,"fadd", fadd>;
	defm FCMEQ : SIMDThreeSameVectorFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
	defm FCMGE : SIMDThreeSameVectorFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
	defm FCMGT : SIMDThreeSameVectorFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
	defm FDIV : SIMDThreeSameVectorFP<1,0,0b111,"fdiv", fdiv>;
	// FP min/max: the *NM records select from fmaxnum/fminnum, the plain
	// FMAX/FMIN records from fmaxnan/fminnan, and the pairwise (*P) records
	// from intrinsics.
	defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
	defm FMAXNM : SIMDThreeSameVectorFP<0,0,0b000,"fmaxnm", fmaxnum>;
	defm FMAXP : SIMDThreeSameVectorFP<1,0,0b110,"fmaxp", int_aarch64_neon_fmaxp>;
	defm FMAX : SIMDThreeSameVectorFP<0,0,0b110,"fmax", fmaxnan>;
	defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b000,"fminnmp", int_aarch64_neon_fminnmp>;
	defm FMINNM : SIMDThreeSameVectorFP<0,1,0b000,"fminnm", fminnum>;
	defm FMINP : SIMDThreeSameVectorFP<1,1,0b110,"fminp", int_aarch64_neon_fminp>;
	defm FMIN : SIMDThreeSameVectorFP<0,1,0b110,"fmin", fminnan>;

	// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
	// instruction expects the addend first, while the fma intrinsic puts it last.
	// In both fragments $LHS (the instruction's first operand) is the fma addend.
	defm FMLA : SIMDThreeSameVectorFPTied<0, 0, 0b001, "fmla",
	TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
	// FMLS: fma with its second multiplicand operand negated.
	defm FMLS : SIMDThreeSameVectorFPTied<0, 1, 0b001, "fmls",
	TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;

	// The following def pats catch the case where the LHS of an FMA is negated.
	// The TriOpFrag above catches the case where the middle operand is negated.
	// Patterns are provided for v2f32, v4f32 and v2f64 only.
	def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
	(FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;

	def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
	(FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;

	def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
	(FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;

	// Remaining three-operand same-type vector instructions: FP multiply and
	// reciprocal steps, integer multiply-accumulate, absolute difference,
	// halving, min/max, saturating arithmetic and register-controlled shifts.
	defm FMULX : SIMDThreeSameVectorFP<0,0,0b011,"fmulx", int_aarch64_neon_fmulx>;
	defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
	defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
	defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
	defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
	// MLA / MLS are selected from acc + (a * b) and acc - (a * b).
	defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
	TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
	defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
	TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
	defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
	defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
	// SABA is selected from acc + sabd(a, b).
	defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
	defm SABD : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
	defm SHADD : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
	defm SHSUB : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
	defm SMAXP : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
	defm SMAX : SIMDThreeSameVectorBHS<0,0b01100,"smax", smax>;
	defm SMINP : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
	defm SMIN : SIMDThreeSameVectorBHS<0,0b01101,"smin", smin>;
	defm SQADD : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
	defm SQDMULH : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
	defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
	defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
	defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
	defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
	defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
	defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
	defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
	// UABA is selected from acc + uabd(a, b).
	defm UABA : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
	defm UABD : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
	defm UHADD : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
	defm UHSUB : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
	defm UMAXP : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
	defm UMAX : SIMDThreeSameVectorBHS<1,0b01100,"umax", umax>;
	defm UMINP : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
	defm UMIN : SIMDThreeSameVectorBHS<1,0b01101,"umin", umin>;
	defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
	defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
	defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
	defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
	defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
	defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
	defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
	// SQRDMLAH / SQRDMLSH pass the saturating add / sub intrinsic as the
	// accumulate operator. How the multiclass combines it with the multiply is
	// defined in SIMDThreeSameVectorSQRDMLxHTiedHS, not here.
	defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
	int_aarch64_neon_sqadd>;
	defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
	int_aarch64_neon_sqsub>;

	// Bitwise logical vector instructions.
	defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
	// BIC: a & ~b.
	defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
	BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
	// BIF is defined without a selection operator.
	defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
	defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
	// BSL: (mask & a) | (~mask & b), where the mask is the first operand.
	defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
	TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
	defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
	// ORN: a | ~b.
	defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
	BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
	defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;


	// The AArch64bsl node on any 64-bit integer vector type selects BSLv8i8.
	def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
	def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
	def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
	def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
	(BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;

	// ...and on any 128-bit integer vector type selects BSLv16i8.
	def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
	def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
	def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
	def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
	(BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;

	// Vector register "mov" is an alias of ORR with both sources equal to $src.
	// Every arrangement of a 128-bit register maps to ORRv16i8. Only the .16b
	// spelling passes 1 as the final argument; the others pass 0.
	// NOTE(review): the final argument presumably controls whether the alias is
	// used when printing - confirm against the InstAlias class.
	def : InstAlias<"mov{\t$dst.16b, $src.16b\|.16b\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
	def : InstAlias<"mov{\t$dst.8h, $src.8h\|.8h\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
	def : InstAlias<"mov{\t$dst.4s, $src.4s\|.4s\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
	def : InstAlias<"mov{\t$dst.2d, $src.2d\|.2d\t$dst, $src}",
	(ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;

	// 64-bit register forms map to ORRv8i8; only the .8b spelling passes 1.
	def : InstAlias<"mov{\t$dst.8b, $src.8b\|.8b\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
	def : InstAlias<"mov{\t$dst.4h, $src.4h\|.4h\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
	def : InstAlias<"mov{\t$dst.2s, $src.2s\|.2s\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
	def : InstAlias<"mov{\t$dst.1d, $src.1d\|.1d\t$dst, $src}",
	(ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;

	// Integer "less-than"-style compare aliases. Each alias maps to the
	// opposite-direction compare record with the two source operands swapped
	// ($src2 is passed before $src1). One alias is provided per arrangement,
	// and all of them pass 0 as the final InstAlias argument.

	// cmls -> CMHS with swapped sources.
	def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmls.8b\t$dst, $src1, $src2}",
	(CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmls.16b\t$dst, $src1, $src2}",
	(CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmls.4h\t$dst, $src1, $src2}",
	(CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmls.8h\t$dst, $src1, $src2}",
	(CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmls.2s\t$dst, $src1, $src2}",
	(CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmls.4s\t$dst, $src1, $src2}",
	(CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmls.2d\t$dst, $src1, $src2}",
	(CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// cmlo -> CMHI with swapped sources.
	def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmlo.8b\t$dst, $src1, $src2}",
	(CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmlo.16b\t$dst, $src1, $src2}",
	(CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmlo.4h\t$dst, $src1, $src2}",
	(CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmlo.8h\t$dst, $src1, $src2}",
	(CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmlo.2s\t$dst, $src1, $src2}",
	(CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmlo.4s\t$dst, $src1, $src2}",
	(CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmlo.2d\t$dst, $src1, $src2}",
	(CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// cmle -> CMGE with swapped sources.
	def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmle.8b\t$dst, $src1, $src2}",
	(CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmle.16b\t$dst, $src1, $src2}",
	(CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmle.4h\t$dst, $src1, $src2}",
	(CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmle.8h\t$dst, $src1, $src2}",
	(CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmle.2s\t$dst, $src1, $src2}",
	(CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmle.4s\t$dst, $src1, $src2}",
	(CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmle.2d\t$dst, $src1, $src2}",
	(CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// cmlt -> CMGT with swapped sources.
	def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
	"\|cmlt.8b\t$dst, $src1, $src2}",
	(CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
	"\|cmlt.16b\t$dst, $src1, $src2}",
	(CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
	"\|cmlt.4h\t$dst, $src1, $src2}",
	(CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
	"\|cmlt.8h\t$dst, $src1, $src2}",
	(CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
	"\|cmlt.2s\t$dst, $src1, $src2}",
	(CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
	"\|cmlt.4s\t$dst, $src1, $src2}",
	(CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
	"\|cmlt.2d\t$dst, $src1, $src2}",
	(CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// Floating-point "less-than"-style compare aliases. Each maps to the
	// opposite-direction record with the two source operands swapped. The
	// half-precision (.4h / .8h) aliases are guarded by
	// Predicates = [HasNEON, HasFullFP16]; the .2s / .4s / .2d aliases sit
	// outside those let-scopes.

	// fcmle -> FCMGE with swapped sources.
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{fcmle\t$dst.4h, $src1.4h, $src2.4h" #
	"\|fcmle.4h\t$dst, $src1, $src2}",
	(FCMGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmle\t$dst.8h, $src1.8h, $src2.8h" #
	"\|fcmle.8h\t$dst, $src1, $src2}",
	(FCMGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
	"\|fcmle.2s\t$dst, $src1, $src2}",
	(FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
	"\|fcmle.4s\t$dst, $src1, $src2}",
	(FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
	"\|fcmle.2d\t$dst, $src1, $src2}",
	(FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// fcmlt -> FCMGT with swapped sources.
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{fcmlt\t$dst.4h, $src1.4h, $src2.4h" #
	"\|fcmlt.4h\t$dst, $src1, $src2}",
	(FCMGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmlt\t$dst.8h, $src1.8h, $src2.8h" #
	"\|fcmlt.8h\t$dst, $src1, $src2}",
	(FCMGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
	"\|fcmlt.2s\t$dst, $src1, $src2}",
	(FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
	"\|fcmlt.4s\t$dst, $src1, $src2}",
	(FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
	"\|fcmlt.2d\t$dst, $src1, $src2}",
	(FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// facle -> FACGE with swapped sources.
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{facle\t$dst.4h, $src1.4h, $src2.4h" #
	"\|facle.4h\t$dst, $src1, $src2}",
	(FACGEv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{facle\t$dst.8h, $src1.8h, $src2.8h" #
	"\|facle.8h\t$dst, $src1, $src2}",
	(FACGEv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
	"\|facle.2s\t$dst, $src1, $src2}",
	(FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
	"\|facle.4s\t$dst, $src1, $src2}",
	(FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
	"\|facle.2d\t$dst, $src1, $src2}",
	(FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	// faclt -> FACGT with swapped sources.
	let Predicates = [HasNEON, HasFullFP16] in {
	def : InstAlias<"{faclt\t$dst.4h, $src1.4h, $src2.4h" #
	"\|faclt.4h\t$dst, $src1, $src2}",
	(FACGTv4f16 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{faclt\t$dst.8h, $src1.8h, $src2.8h" #
	"\|faclt.8h\t$dst, $src1, $src2}",
	(FACGTv8f16 V128:$dst, V128:$src2, V128:$src1), 0>;
	}
	def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
	"\|faclt.2s\t$dst, $src1, $src2}",
	(FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
	def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
	"\|faclt.4s\t$dst, $src1, $src2}",
	(FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
	def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
	"\|faclt.2d\t$dst, $src1, $src2}",
	(FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three scalar instructions.
	//===----------------------------------------------------------------------===//

	// Three-operand scalar instructions. The defm prefixes repeat the vector
	// mnemonics; the scalar multiclasses generate differently suffixed records
	// (for example CMHSv1i64, FABD64, FCMGE32, as referenced elsewhere in this
	// file).
	defm ADD : SIMDThreeScalarD<0, 0b10000, "add", add>;
	defm CMEQ : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
	defm CMGE : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
	defm CMGT : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
	defm CMHI : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
	defm CMHS : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
	defm CMTST : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
	// Scalar FABD is selected from int_aarch64_sisd_fabd. The extra pattern
	// maps the v1f64 form of the vector intrinsic onto FABD64 as well.
	defm FABD : SIMDFPThreeScalar<1, 1, 0b010, "fabd", int_aarch64_sisd_fabd>;
	def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
	(FABD64 FPR64:$Rn, FPR64:$Rm)>;
	defm FACGE : SIMDThreeScalarFPCmp<1, 0, 0b101, "facge",
	int_aarch64_neon_facge>;
	defm FACGT : SIMDThreeScalarFPCmp<1, 1, 0b101, "facgt",
	int_aarch64_neon_facgt>;
	defm FCMEQ : SIMDThreeScalarFPCmp<0, 0, 0b100, "fcmeq", AArch64fcmeq>;
	defm FCMGE : SIMDThreeScalarFPCmp<1, 0, 0b100, "fcmge", AArch64fcmge>;
	defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
	defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx>;
	defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps>;
	defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts>;
	defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
	defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
	defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
	defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
	defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
	defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
	defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
	defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
	defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
	defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
	defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
	defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
	defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
	defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
	// Scalar SQRDMLAH / SQRDMLSH and their selection patterns are available only
	// under the HasRDM predicate. The i32 patterns fuse a saturating add / sub
	// of $Rd with sqrdmulh($Rn, $Rm) into a single instruction.
	let Predicates = [HasRDM] in {
	defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
	defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
	def : Pat<(i32 (int_aarch64_neon_sqadd
	(i32 FPR32:$Rd),
	(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(i32 (int_aarch64_neon_sqsub
	(i32 FPR32:$Rd),
	(i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
	}

	// Scalar "less-than"-style compare aliases: each maps to the
	// opposite-direction scalar record with $src1 and $src2 swapped. Integer
	// aliases use the FPR64 (v1i64) records only.
	def : InstAlias<"cmls $dst, $src1, $src2",
	(CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"cmle $dst, $src1, $src2",
	(CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"cmlo $dst, $src1, $src2",
	(CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"cmlt $dst, $src1, $src2",
	(CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	// Floating-point aliases are given for both the FPR32 and FPR64 records;
	// the same assembly string is used for each width.
	def : InstAlias<"fcmle $dst, $src1, $src2",
	(FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"fcmle $dst, $src1, $src2",
	(FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"fcmlt $dst, $src1, $src2",
	(FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"fcmlt $dst, $src1, $src2",
	(FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"facle $dst, $src1, $src2",
	(FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"facle $dst, $src1, $src2",
	(FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
	def : InstAlias<"faclt $dst, $src1, $src2",
	(FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
	def : InstAlias<"faclt $dst, $src1, $src2",
	(FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three scalar instructions (mixed operands).
	//===----------------------------------------------------------------------===//
	// SQDMULL is selected from int_aarch64_neon_sqdmulls_scalar. SQDMLAL and
	// SQDMLSL are defined without an operator and get their i32 selection
	// patterns below.
	defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
	int_aarch64_neon_sqdmulls_scalar>;
	defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
	defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;

	// sqadd(Rd, sqdmulls_scalar(Rn, Rm)) with an i64 accumulator and i32
	// multiplicands selects SQDMLALi32.
	def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
	(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
	// The sqsub form selects SQDMLSLi32.
	def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
	(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
	(i32 FPR32:$Rm))))),
	(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;

	//===----------------------------------------------------------------------===//
	// Advanced SIMD two scalar instructions.
	//===----------------------------------------------------------------------===//

	// Two-operand scalar instructions. The compare records reuse the
	// AArch64cm*z / AArch64fcm*z nodes. Most of the FP conversion and estimate
	// records (FCVT*, FRECPE, FRECPX, FRSQRTE) are defined without a selection
	// operator.
	defm ABS : SIMDTwoScalarD< 0, 0b01011, "abs", abs>;
	defm CMEQ : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
	defm CMGE : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
	defm CMGT : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
	defm CMLE : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
	defm CMLT : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
	defm FCMEQ : SIMDFPCmpTwoScalar<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
	defm FCMGE : SIMDFPCmpTwoScalar<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
	defm FCMGT : SIMDFPCmpTwoScalar<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
	defm FCMLE : SIMDFPCmpTwoScalar<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
	defm FCMLT : SIMDFPCmpTwoScalar<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
	defm FCVTAS : SIMDFPTwoScalar< 0, 0, 0b11100, "fcvtas">;
	defm FCVTAU : SIMDFPTwoScalar< 1, 0, 0b11100, "fcvtau">;
	defm FCVTMS : SIMDFPTwoScalar< 0, 0, 0b11011, "fcvtms">;
	defm FCVTMU : SIMDFPTwoScalar< 1, 0, 0b11011, "fcvtmu">;
	defm FCVTNS : SIMDFPTwoScalar< 0, 0, 0b11010, "fcvtns">;
	defm FCVTNU : SIMDFPTwoScalar< 1, 0, 0b11010, "fcvtnu">;
	defm FCVTPS : SIMDFPTwoScalar< 0, 1, 0b11010, "fcvtps">;
	defm FCVTPU : SIMDFPTwoScalar< 1, 1, 0b11010, "fcvtpu">;
	// FCVTXN is a single record (def, not defm).
	def FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
	defm FCVTZS : SIMDFPTwoScalar< 0, 1, 0b11011, "fcvtzs">;
	defm FCVTZU : SIMDFPTwoScalar< 1, 1, 0b11011, "fcvtzu">;
	defm FRECPE : SIMDFPTwoScalar< 0, 1, 0b11101, "frecpe">;
	defm FRECPX : SIMDFPTwoScalar< 0, 1, 0b11111, "frecpx">;
	defm FRSQRTE : SIMDFPTwoScalar< 1, 1, 0b11101, "frsqrte">;
	// Scalar NEG is selected from (0 - x), like the vector form.
	defm NEG : SIMDTwoScalarD< 1, 0b01011, "neg",
	UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
	defm SCVTF : SIMDFPTwoScalarCVT< 0, 0, 0b11101, "scvtf", AArch64sitof>;
	defm SQABS : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
	defm SQNEG : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
	// The scalar narrowing records use the int_aarch64_neon_scalar_* intrinsics.
	defm SQXTN : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
	defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
	defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
	int_aarch64_neon_suqadd>;
	defm UCVTF : SIMDFPTwoScalarCVT< 1, 0, 0b11101, "ucvtf", AArch64uitof>;
	defm UQXTN : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
	defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
	int_aarch64_neon_usqadd>;

	// AArch64neg on a v1i64 value selects the v1i64 NEG instruction.
	def : Pat<(AArch64neg (v1i64 V64:$Rn)),
	          (NEGv1i64 V64:$Rn)>;

	// v1f64 -> v1i64 fcvt* intrinsics select the v1i64 instruction of the same
	// name.
	def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
	          (FCVTASv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
	          (FCVTAUv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
	          (FCVTMSv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
	          (FCVTMUv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
	          (FCVTNSv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
	          (FCVTNUv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
	          (FCVTPSv1i64 FPR64:$Rn)>;
	def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
	          (FCVTPUv1i64 FPR64:$Rn)>;

	// frecpe intrinsic on scalar f32/f64 and on v1f64.
	def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
	          (FRECPEv1i32 FPR32:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
	          (FRECPEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
	          (FRECPEv1i64 FPR64:$Rn)>;

	// AArch64frecpe node, scalar and vector types.
	def : Pat<(f32 (AArch64frecpe (f32 FPR32:$Rn))),
	          (FRECPEv1i32 FPR32:$Rn)>;
	def : Pat<(v2f32 (AArch64frecpe (v2f32 V64:$Rn))),
	          (FRECPEv2f32 V64:$Rn)>;
	def : Pat<(v4f32 (AArch64frecpe (v4f32 FPR128:$Rn))),
	          (FRECPEv4f32 FPR128:$Rn)>;
	def : Pat<(f64 (AArch64frecpe (f64 FPR64:$Rn))),
	          (FRECPEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (AArch64frecpe (v1f64 FPR64:$Rn))),
	          (FRECPEv1i64 FPR64:$Rn)>;
	def : Pat<(v2f64 (AArch64frecpe (v2f64 FPR128:$Rn))),
	          (FRECPEv2f64 FPR128:$Rn)>;

	// AArch64frecps node (two operands), scalar and vector types.
	def : Pat<(f32 (AArch64frecps (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
	          (FRECPS32 FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(v2f32 (AArch64frecps (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
	          (FRECPSv2f32 V64:$Rn, V64:$Rm)>;
	def : Pat<(v4f32 (AArch64frecps (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
	          (FRECPSv4f32 FPR128:$Rn, FPR128:$Rm)>;
	def : Pat<(f64 (AArch64frecps (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
	          (FRECPS64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v2f64 (AArch64frecps (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
	          (FRECPSv2f64 FPR128:$Rn, FPR128:$Rm)>;

	// frecpx intrinsic on scalar f32 and f64.
	def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
	          (FRECPXv1i32 FPR32:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
	          (FRECPXv1i64 FPR64:$Rn)>;

	// frsqrte intrinsic on scalar f32/f64 and on v1f64.
	def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
	          (FRSQRTEv1i32 FPR32:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
	          (FRSQRTEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
	          (FRSQRTEv1i64 FPR64:$Rn)>;

	// AArch64frsqrte node, scalar and vector types.
	def : Pat<(f32 (AArch64frsqrte (f32 FPR32:$Rn))),
	          (FRSQRTEv1i32 FPR32:$Rn)>;
	def : Pat<(v2f32 (AArch64frsqrte (v2f32 V64:$Rn))),
	          (FRSQRTEv2f32 V64:$Rn)>;
	def : Pat<(v4f32 (AArch64frsqrte (v4f32 FPR128:$Rn))),
	          (FRSQRTEv4f32 FPR128:$Rn)>;
	def : Pat<(f64 (AArch64frsqrte (f64 FPR64:$Rn))),
	          (FRSQRTEv1i64 FPR64:$Rn)>;
	def : Pat<(v1f64 (AArch64frsqrte (v1f64 FPR64:$Rn))),
	          (FRSQRTEv1i64 FPR64:$Rn)>;
	def : Pat<(v2f64 (AArch64frsqrte (v2f64 FPR128:$Rn))),
	          (FRSQRTEv2f64 FPR128:$Rn)>;

	// AArch64frsqrts node (two operands), scalar and vector types.
	def : Pat<(f32 (AArch64frsqrts (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
	          (FRSQRTS32 FPR32:$Rn, FPR32:$Rm)>;
	def : Pat<(v2f32 (AArch64frsqrts (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
	          (FRSQRTSv2f32 V64:$Rn, V64:$Rm)>;
	def : Pat<(v4f32 (AArch64frsqrts (v4f32 FPR128:$Rn), (v4f32 FPR128:$Rm))),
	          (FRSQRTSv4f32 FPR128:$Rn, FPR128:$Rm)>;
	def : Pat<(f64 (AArch64frsqrts (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
	          (FRSQRTS64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))),
	          (FRSQRTSv2f64 FPR128:$Rn, FPR128:$Rm)>;

	// If an integer is about to be converted to a floating point value,
	// just load it on the floating point unit.
	// Here are the patterns for 8 and 16-bits to float.
	// 8-bits -> float.
	// Select (uint_to_fp (load [Rn, Rm, extend])) as a register-offset load
	// into the floating point register file followed by UCVTF.
	//   DstTy / SrcTy - floating point result type and loaded integer type.
	//   loadop        - load fragment to match.
	//   UCVTF         - conversion instruction applied to the loaded register.
	//   ro            - register-offset addressing mode; supplies the W/X
	//                   complex patterns and their extend operands.
	//   LDRW / LDRX   - loads used for the W-register and X-register offset
	//                   forms respectively.
	//   sub           - sub-register index at which the loaded value is
	//                   inserted into an undefined DstTy register.
	multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
	                             SDPatternOperator loadop, Instruction UCVTF,
	                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
	                             SubRegIndex sub> {
	  // 32-bit (W) offset register.
	  def : Pat<(DstTy (uint_to_fp (SrcTy
	                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
	                                      ro.Wext:$extend))))),
	            (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
	                                  (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
	                                  sub))>;

	  // 64-bit (X) offset register. The extend operand is bound as ro.Xext in
	  // both the source and the result so that it agrees with ro.Xpat / LDRX.
	  def : Pat<(DstTy (uint_to_fp (SrcTy
	                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
	                                      ro.Xext:$extend))))),
	            (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
	                                  (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
	                                  sub))>;
	}

	// Register-offset forms come from the multiclass; the scaled-immediate and
	// unscaled-immediate forms are spelled out below.
	defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
	                         UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
	def : Pat<(f32 (uint_to_fp (i32
	                 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	          (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	                        (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
	def : Pat<(f32 (uint_to_fp (i32
	                 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
	          (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	                        (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
	// 16-bits -> float.
	defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
	                         UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
	def : Pat<(f32 (uint_to_fp (i32
	                 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	          (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	                        (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
	def : Pat<(f32 (uint_to_fp (i32
	                 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
	          (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
	                        (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
	// 32-bits are handled in target specific dag combine:
	// performIntToFpCombine.
	// 64-bits integer to 32-bits floating point, not possible with
	// UCVTF on floating point registers (both source and destination
	// must have the same size).

	// Here are the patterns for 8, 16, 32, and 64-bits to double.
	// 8-bits -> double.
	// Register-offset forms come from the multiclass; the scaled-immediate and
	// unscaled-immediate forms are spelled out below.
	defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
	                         UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
	def : Pat<(f64 (uint_to_fp (i32
	                 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
	          (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	                        (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
	def : Pat<(f64 (uint_to_fp (i32
	                 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
	          (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	                        (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
	// 16-bits -> double.
	defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
	                         UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
	def : Pat<(f64 (uint_to_fp (i32
	                 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
	          (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	                        (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
	def : Pat<(f64 (uint_to_fp (i32
	                 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
	          (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	                        (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
	// 32-bits -> double.
	defm : UIntToFPROLoadPat<f64, i32, load,
	                         UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
	def : Pat<(f64 (uint_to_fp (i32
	                 (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
	          (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	                        (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
	def : Pat<(f64 (uint_to_fp (i32
	                 (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
	          (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	                        (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
	// 64-bits -> double are handled in target specific dag combine:
	// performIntToFpCombine.

	//===----------------------------------------------------------------------===//
	// Advanced SIMD three different-sized vector instructions.
	//===----------------------------------------------------------------------===//

	// Instruction definitions. The last template argument is the intrinsic or
	// pattern fragment handed to the instruction class.
	defm ADDHN   : SIMDNarrowThreeVectorBHS<0, 0b0100, "addhn",
	                                        int_aarch64_neon_addhn>;
	defm SUBHN   : SIMDNarrowThreeVectorBHS<0, 0b0110, "subhn",
	                                        int_aarch64_neon_subhn>;
	defm RADDHN  : SIMDNarrowThreeVectorBHS<1, 0b0100, "raddhn",
	                                        int_aarch64_neon_raddhn>;
	defm RSUBHN  : SIMDNarrowThreeVectorBHS<1, 0b0110, "rsubhn",
	                                        int_aarch64_neon_rsubhn>;
	defm PMULL   : SIMDDifferentThreeVectorBD<0, 0b1110, "pmull",
	                                          int_aarch64_neon_pmull>;
	defm SABAL   : SIMDLongThreeVectorTiedBHSabal<0, 0b0101, "sabal",
	                                              int_aarch64_neon_sabd>;
	defm SABDL   : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
	                                          int_aarch64_neon_sabd>;
	defm SADDL   : SIMDLongThreeVectorBHS<0, 0b0000, "saddl",
	                 BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
	defm SADDW   : SIMDWideThreeVectorBHS<0, 0b0001, "saddw",
	                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
	defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
	                 TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
	                 TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull",
	                                      int_aarch64_neon_smull>;
	defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
	                                               int_aarch64_neon_sqadd>;
	defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
	                                               int_aarch64_neon_sqsub>;
	defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
	                                     int_aarch64_neon_sqdmull>;
	defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
	                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
	defm SSUBW   : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
	                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
	defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
	                                              int_aarch64_neon_uabd>;
	defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
	                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
	defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
	                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
	defm UMLAL   : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
	                 TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMLSL   : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
	                 TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMULL   : SIMDLongThreeVectorBHS<1, 0b1100, "umull",
	                                      int_aarch64_neon_umull>;
	defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
	                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
	defm USUBW   : SIMDWideThreeVectorBHS<1, 0b0011, "usubw",
	                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;

	// Additional patterns for SMULL and UMULL
	// Map a widening multiply node onto the given instruction for each of the
	// three source element widths (v8i8, v4i16, v2i32).
	//   opnode                 - DAG operator to match.
	//   INST8B, INST4H, INST2S - instructions for v8i8, v4i16 and v2i32 sources.
	multiclass Neon_mul_widen_patterns<SDPatternOperator opnode,
	                                   Instruction INST8B,
	                                   Instruction INST4H,
	                                   Instruction INST2S> {
	  def : Pat<(v8i16 (opnode (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
	            (INST8B V64:$Rn, V64:$Rm)>;
	  def : Pat<(v4i32 (opnode (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
	            (INST4H V64:$Rn, V64:$Rm)>;
	  def : Pat<(v2i64 (opnode (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
	            (INST2S V64:$Rn, V64:$Rm)>;
	}

	defm : Neon_mul_widen_patterns<AArch64smull,
	                               SMULLv8i8_v8i16,
	                               SMULLv4i16_v4i32,
	                               SMULLv2i32_v2i64>;
	defm : Neon_mul_widen_patterns<AArch64umull,
	                               UMULLv8i8_v8i16,
	                               UMULLv4i16_v4i32,
	                               UMULLv2i32_v2i64>;

	// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
	// Map a three-operand accumulate fragment (128-bit accumulator $Rd plus two
	// 64-bit sources) onto the given instruction for each element width.
	//   opnode                 - three-operand fragment to match.
	//   INST8B, INST4H, INST2S - instructions for v8i8, v4i16 and v2i32 sources.
	multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
	                                      Instruction INST8B,
	                                      Instruction INST4H,
	                                      Instruction INST2S> {
	  def : Pat<(v8i16 (opnode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
	            (INST8B V128:$Rd, V64:$Rn, V64:$Rm)>;
	  def : Pat<(v4i32 (opnode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm))),
	            (INST4H V128:$Rd, V64:$Rn, V64:$Rm)>;
	  def : Pat<(v2i64 (opnode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm))),
	            (INST2S V128:$Rd, V64:$Rn, V64:$Rm)>;
	}

	// add + smull
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(add node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
	         SMLALv8i8_v8i16, SMLALv4i16_v4i32, SMLALv2i32_v2i64>;
	// add + umull
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(add node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
	         UMLALv8i8_v8i16, UMLALv4i16_v4i32, UMLALv2i32_v2i64>;
	// sub + smull
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>,
	         SMLSLv8i8_v8i16, SMLSLv4i16_v4i32, SMLSLv2i32_v2i64>;
	// sub + umull
	defm : Neon_mulacc_widen_patterns<
	         TriOpFrag<(sub node:$LHS, (AArch64umull node:$MHS, node:$RHS))>,
	         UMLSLv8i8_v8i16, UMLSLv4i16_v4i32, UMLSLv2i32_v2i64>;

	// Patterns for 64-bit pmull
	// Operands already in 64-bit registers.
	def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
	          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
	// Both operands are element 1 of a v2i64: use the v2i64 form on the full
	// 128-bit registers.
	def : Pat<(int_aarch64_neon_pmull64
	            (extractelt (v2i64 V128:$Rn), (i64 1)),
	            (extractelt (v2i64 V128:$Rm), (i64 1))),
	          (PMULLv2i64 V128:$Rn, V128:$Rm)>;

	// CodeGen patterns for addhn and subhn instructions, which can actually be
	// written in LLVM IR without too much difficulty.

	// ADDHN
	// trunc((Rn + Rm) >> half-element-width) into a 64-bit result.
	def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	                                            (i32 8))))),
	          (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
	def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	                                             (i32 16))))),
	          (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
	def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	                                             (i32 32))))),
	          (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
	// Same computation concatenated after an existing 64-bit value $Rd: $Rd is
	// placed in the dsub sub-register of the first operand of the 128-bit
	// result form.
	def : Pat<(concat_vectors
	            (v8i8 V64:$Rd),
	            (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	                                        (i32 8))))),
	          (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	                            V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors
	            (v4i16 V64:$Rd),
	            (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	                                        (i32 16))))),
	          (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	                            V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors
	            (v2i32 V64:$Rd),
	            (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
	                                        (i32 32))))),
	          (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	                            V128:$Rn, V128:$Rm)>;

	// SUBHN
	// trunc((Rn - Rm) >> half-element-width) into a 64-bit result.
	def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	                                            (i32 8))))),
	          (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
	def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	                                             (i32 16))))),
	          (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
	def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	                                             (i32 32))))),
	          (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
	// Same computation concatenated after an existing 64-bit value $Rd: $Rd is
	// placed in the dsub sub-register of the first operand of the 128-bit
	// result form.
	def : Pat<(concat_vectors
	            (v8i8 V64:$Rd),
	            (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	                                        (i32 8))))),
	          (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	                            V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors
	            (v4i16 V64:$Rd),
	            (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	                                        (i32 16))))),
	          (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	                            V128:$Rn, V128:$Rm)>;
	def : Pat<(concat_vectors
	            (v2i32 V64:$Rd),
	            (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
	                                        (i32 32))))),
	          (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
	                            V128:$Rn, V128:$Rm)>;

	//----------------------------------------------------------------------------
	// AdvSIMD bitwise extract from vector instruction.
	//----------------------------------------------------------------------------

	defm EXT : SIMDBitwiseExtract<"ext">;

	// AArch64ext on the remaining vector types reuses the byte-vector
	// instructions: EXTv8i8 for 64-bit types and EXTv16i8 for 128-bit types.
	def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
	          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
	def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
	def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
	          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
	def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
	          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
	def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
	def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
	def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
	def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
	def : Pat<(v4f16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
	          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
	def : Pat<(v8f16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
	          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;

	// We use EXT to handle extract_subvector to copy the upper 64-bits of a
	// 128-bit vector.
	// Every type uses the same expansion: EXTv16i8 of $Rn with itself at byte
	// offset 8, then the dsub sub-register of the result. Only the result type
	// and the starting element index in the source pattern differ.
	def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 8))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 4))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
	def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
	          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;


	//----------------------------------------------------------------------------
	// AdvSIMD zip vector
	//----------------------------------------------------------------------------

	// Each instruction is instantiated from SIMDZipVector with a 3-bit opcode
	// field, its mnemonic, and the AArch64 DAG node of the same name.
	defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
	defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
	defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
	defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
	defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
	defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;

	//----------------------------------------------------------------------------
	// AdvSIMD TBL/TBX instructions
	//----------------------------------------------------------------------------

	defm TBL : SIMDTableLookup<0, "tbl">;
	defm TBX : SIMDTableLookupTied<1, "tbx">;

	// tbl1: first intrinsic operand is named $Rn, second $Ri, in both widths.
	def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn),
	                                       (v8i8 V64:$Ri))),
	          (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
	def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Rn),
	                                        (v16i8 V128:$Ri))),
	          (TBLv16i8One V128:$Rn, V128:$Ri)>;

	// tbx1: as tbl1, with an additional leading $Rd operand.
	def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
	                                       (v16i8 VecListOne128:$Rn),
	                                       (v8i8 V64:$Ri))),
	          (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
	def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
	                                        (v16i8 V128:$Rn),
	                                        (v16i8 V128:$Ri))),
	          (TBXv16i8One V128:$Rd, V128:$Rn, V128:$Ri)>;


	//----------------------------------------------------------------------------
	// AdvSIMD scalar CPY instruction
	//----------------------------------------------------------------------------

	// Instantiates the CPY records from SIMDScalarCPY with mnemonic "cpy".
	defm CPY : SIMDScalarCPY<"cpy">;

	//----------------------------------------------------------------------------
	// AdvSIMD scalar pairwise instructions
	//----------------------------------------------------------------------------

	defm ADDP    : SIMDPairwiseScalarD<0, 0b11011, "addp">;
	defm FADDP   : SIMDFPPairwiseScalar<0, 0b01101, "faddp">;
	defm FMAXNMP : SIMDFPPairwiseScalar<0, 0b01100, "fmaxnmp">;
	defm FMAXP   : SIMDFPPairwiseScalar<0, 0b01111, "fmaxp">;
	defm FMINNMP : SIMDFPPairwiseScalar<1, 0b01100, "fminnmp">;
	defm FMINP   : SIMDFPPairwiseScalar<1, 0b01111, "fminp">;

	// v2i64 saddv/uaddv both use ADDPv2i64p; its result is inserted at dsub of
	// an undefined v2i64.
	def : Pat<(v2i64 (AArch64saddv V128:$Rn)),
	          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
	                         (ADDPv2i64p V128:$Rn), dsub)>;
	def : Pat<(v2i64 (AArch64uaddv V128:$Rn)),
	          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
	                         (ADDPv2i64p V128:$Rn), dsub)>;

	// faddv. The v4f32 case takes two steps: a vector FADDP of $Rn with itself,
	// then a scalar pairwise FADDP on the dsub half of that result.
	def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
	          (FADDPv2i32p V64:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
	          (FADDPv2i32p
	            (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
	def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
	          (FADDPv2i64p V128:$Rn)>;

	// Two-element max/min reductions map directly to the pairwise instruction.
	def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
	          (FMAXNMPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
	          (FMAXNMPv2i64p V128:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
	          (FMAXPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
	          (FMAXPv2i64p V128:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
	          (FMINNMPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
	          (FMINNMPv2i64p V128:$Rn)>;
	def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
	          (FMINPv2i32p V64:$Rn)>;
	def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
	          (FMINPv2i64p V128:$Rn)>;

	//----------------------------------------------------------------------------
	// AdvSIMD INS/DUP instructions
	//----------------------------------------------------------------------------

	// DUP from a general-purpose register (GPR32, or GPR64 for .2d).
	def DUPv8i8gpr  : SIMDDupFromMain<0, {?,?,?,?,1}, ".8b",  v8i8,  V64,  GPR32>;
	def DUPv16i8gpr : SIMDDupFromMain<1, {?,?,?,?,1}, ".16b", v16i8, V128, GPR32>;
	def DUPv4i16gpr : SIMDDupFromMain<0, {?,?,?,1,0}, ".4h",  v4i16, V64,  GPR32>;
	def DUPv8i16gpr : SIMDDupFromMain<1, {?,?,?,1,0}, ".8h",  v8i16, V128, GPR32>;
	def DUPv2i32gpr : SIMDDupFromMain<0, {?,?,1,0,0}, ".2s",  v2i32, V64,  GPR32>;
	def DUPv4i32gpr : SIMDDupFromMain<1, {?,?,1,0,0}, ".4s",  v4i32, V128, GPR32>;
	def DUPv2i64gpr : SIMDDupFromMain<1, {?,1,0,0,0}, ".2d",  v2i64, V128, GPR64>;

	// DUP from a vector element.
	def DUPv2i64lane : SIMDDup64FromElement;
	def DUPv2i32lane : SIMDDup32FromElement<0, ".2s",  v2i32, V64>;
	def DUPv4i32lane : SIMDDup32FromElement<1, ".4s",  v4i32, V128>;
	def DUPv4i16lane : SIMDDup16FromElement<0, ".4h",  v4i16, V64>;
	def DUPv8i16lane : SIMDDup16FromElement<1, ".8h",  v8i16, V128>;
	def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b",  v8i8,  V64>;
	def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;

	// AArch64dup of a floating point scalar: insert the scalar into an
	// undefined 128-bit register and use the lane form of DUP on lane 0.
	def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
	          (v2f32 (DUPv2i32lane
	                   (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
	                   (i64 0)))>;
	def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
	          (v4f32 (DUPv4i32lane
	                   (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
	                   (i64 0)))>;
	def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
	          (v2f64 (DUPv2i64lane
	                   (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
	                   (i64 0)))>;
	def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
	          (v4f16 (DUPv4i16lane
	                   (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
	                   (i64 0)))>;
	def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
	          (v8f16 (DUPv8i16lane
	                   (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
	                   (i64 0)))>;

	// Floating point duplane nodes reuse the integer lane DUP instruction of
	// the same element width.
	def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
	          (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
	def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
	          (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;

	def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
	          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
	def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
	          (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
	def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
	          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;

	// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
	// instruction even if the types don't match: we just have to remap the lane
	// carefully. N.b. this trick only applies to truncations.
	// Immediate transforms: each returns an i64 target constant equal to the
	// matched immediate multiplied by 2, 4 or 8.
	def VecIndex_x2 : SDNodeXForm<imm, [{
	  return CurDAG->getTargetConstant(2 * N->getZExtValue(), SDLoc(N), MVT::i64);
	}]>;
	def VecIndex_x4 : SDNodeXForm<imm, [{
	  return CurDAG->getTargetConstant(4 * N->getZExtValue(), SDLoc(N), MVT::i64);
	}]>;
	def VecIndex_x8 : SDNodeXForm<imm, [{
	  return CurDAG->getTargetConstant(8 * N->getZExtValue(), SDLoc(N), MVT::i64);
	}]>;

	// AArch64dup of an element extracted from a wider-element vector.
	//   ResVT             - result vector type.
	//   Src64VT, Src128VT - 64-bit and 128-bit source vector types.
	//   ScalVT            - type of the extracted scalar.
	//   DUP               - lane DUP instruction to emit.
	//   IdxXFORM          - transform applied to the extract index to obtain
	//                       the lane number given to DUP.
	multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
	                            ValueType Src128VT, ValueType ScalVT,
	                            Instruction DUP, SDNodeXForm IdxXFORM> {
	  // 128-bit source.
	  def : Pat<(ResVT (AArch64dup
	                     (ScalVT (vector_extract (Src128VT V128:$Rn),
	                                             imm:$idx)))),
	            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;

	  // 64-bit source: wrapped into a 128-bit register with SUBREG_TO_REG first.
	  def : Pat<(ResVT (AArch64dup
	                     (ScalVT (vector_extract (Src64VT V64:$Rn),
	                                             imm:$idx)))),
	            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub),
	                 (IdxXFORM imm:$idx))>;
	}

	defm : DUPWithTruncPats<v8i8,  v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
	defm : DUPWithTruncPats<v8i8,  v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
	defm : DUPWithTruncPats<v4i16, v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;

	defm : DUPWithTruncPats<v16i8, v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
	defm : DUPWithTruncPats<v16i8, v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
	defm : DUPWithTruncPats<v8i16, v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;

	// AArch64dup of an i64 element that has been truncated to i32.
	//   ResVT    - result vector type.
	//   DUP      - lane DUP instruction to emit.
	//   IdxXFORM - transform applied to the extract index to obtain the lane
	//              number given to DUP.
	multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
	                               SDNodeXForm IdxXFORM> {
	  // v2i64 source.
	  def : Pat<(ResVT (AArch64dup
	                     (i32 (trunc (extractelt (v2i64 V128:$Rn),
	                                             imm:$idx))))),
	            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;

	  // v1i64 source: wrapped into a 128-bit register with SUBREG_TO_REG first.
	  def : Pat<(ResVT (AArch64dup
	                     (i32 (trunc (extractelt (v1i64 V64:$Rn),
	                                             imm:$idx))))),
	            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub),
	                 (IdxXFORM imm:$idx))>;
	}

	defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,  VecIndex_x8>;
	defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane, VecIndex_x4>;
	defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane, VecIndex_x2>;

	defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
	defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
	defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;

	// SMOV and UMOV definitions, with some extra patterns for convenience
	defm SMOV : SMov;
	defm UMOV : UMov;

	// sext_inreg of an extracted i8 / i16 lane selects SMOV, with the result
	// typed i32 or i64. Each (source, result type) pair is listed exactly once.
	def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
	          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
	          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
	          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
	def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
	          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
	// Sign extension of an extracted i32 lane to i64.
	def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
	          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;

	// The same i8 / i16 sign extensions when the extracted i32 has first been
	// any-extended to i64.
	def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v16i8 V128:$Rn),
	            VectorIndexB:$idx)))), i8),
	          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
	def : Pat<(sext_inreg (i64 (anyext (i32 (vector_extract (v8i16 V128:$Rn),
	            VectorIndexH:$idx)))), i16),
	          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;

	// Extracting i8 or i16 elements will have the zero-extend transformed to
	// an 'and' mask by type legalization since neither i8 nor i16 are legal types
	// for AArch64. Match these patterns here since UMOV already zeroes out the high
	// bits of the destination register.
	// Byte lane masked with 0xff.
	def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
	               (i32 0xff)),
	          (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
	// Halfword lane masked with 0xffff.
	def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
	               (i32 0xffff)),
	          (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;

	defm INS : SIMDIns;

	// scalar_to_vector from a GPR32 into i8 / i16 element vectors: copy the
	// GPR into the FPR32 class and place it at ssub via SUBREG_TO_REG.
	def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
	          (SUBREG_TO_REG (i32 0),
	                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
	def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
	          (SUBREG_TO_REG (i32 0),
	                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;

	def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
	          (SUBREG_TO_REG (i32 0),
	                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
	def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
	          (SUBREG_TO_REG (i32 0),
	                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;

	// Integer scalar already in an FP register: insert into an undefined
	// vector at the matching sub-register.
	def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
	          (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
	                                (i32 FPR32:$Rn), ssub))>;
	def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
	          (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
	                                (i32 FPR32:$Rn), ssub))>;
	def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
	          (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
	                                (i64 FPR64:$Rn), dsub))>;

	// Floating point scalars: likewise a plain sub-register insert.
	def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
	          (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
	def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
	          (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;

	def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
	          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
	def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
	          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
	def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
	          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;

	// vector_insert of a floating point scalar. The scalar is first placed in
	// lane 0 of an undefined 128-bit vector, then INS copies that lane into
	// lane $imm of the destination. For 64-bit destinations the vector is
	// widened to 128 bits before the INS and dsub is extracted afterwards.

	// NOTE(review): the v4f16 pattern uses VectorIndexS (not VectorIndexH) for
	// its index operand; confirm this is intentional.
	def : Pat<(v4f16 (vector_insert (v4f16 V64:$Rn),
	                                (f16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
	          (EXTRACT_SUBREG
	            (INSvi16lane
	              (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
	              VectorIndexS:$imm,
	              (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
	              (i64 0)),
	            dsub)>;

	def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
	                                (f16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
	          (INSvi16lane
	            V128:$Rn, VectorIndexH:$imm,
	            (v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
	            (i64 0))>;

	def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
	                                (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
	          (EXTRACT_SUBREG
	            (INSvi32lane
	              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
	              VectorIndexS:$imm,
	              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
	              (i64 0)),
	            dsub)>;
	def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
	                                (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
	          (INSvi32lane
	            V128:$Rn, VectorIndexS:$imm,
	            (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
	            (i64 0))>;
	def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
	                                (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
	          (INSvi64lane
	            V128:$Rn, VectorIndexD:$imm,
	            (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
	            (i64 0))>;

	// Copy an element at a constant index in one vector into a constant indexed
	// element of another.
	// FIXME refactor to a shared class/def parameterized on vector type, vector
	// index type and INS extension
	// One pattern per element width; the intrinsic operands are forwarded to
	// the INS lane instruction in the same order.
	def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
	                   (v16i8 V128:$Vd), VectorIndexB:$idx,
	                   (v16i8 V128:$Vs), VectorIndexB:$idx2)),
	          (v16i8 (INSvi8lane
	                   V128:$Vd, VectorIndexB:$idx,
	                   V128:$Vs, VectorIndexB:$idx2))>;
	def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
	                   (v8i16 V128:$Vd), VectorIndexH:$idx,
	                   (v8i16 V128:$Vs), VectorIndexH:$idx2)),
	          (v8i16 (INSvi16lane
	                   V128:$Vd, VectorIndexH:$idx,
	                   V128:$Vs, VectorIndexH:$idx2))>;
	def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
	                   (v4i32 V128:$Vd), VectorIndexS:$idx,
	                   (v4i32 V128:$Vs), VectorIndexS:$idx2)),
	          (v4i32 (INSvi32lane
	                   V128:$Vd, VectorIndexS:$idx,
	                   V128:$Vs, VectorIndexS:$idx2))>;
	def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
	                   (v2i64 V128:$Vd), VectorIndexD:$idx,
	                   (v2i64 V128:$Vs), VectorIndexD:$idx2)),
	          (v2i64 (INSvi64lane
	                   V128:$Vd, VectorIndexD:$idx,
	                   V128:$Vs, VectorIndexD:$idx2))>;

	// vector_insert of an element that was itself extracted from a vector
	// becomes one INS lane-to-lane copy. All four combinations of 64-bit and
	// 128-bit destination / source are covered.
	//   VT128, VT64 - 128-bit and 64-bit vector types.
	//   VTScal      - element type.
	//   INS         - lane insert instruction to emit.
	multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
	                                ValueType VTScal, Instruction INS> {
	  // 128-bit destination, 128-bit source.
	  def : Pat<(VT128 (vector_insert
	                     V128:$src,
	                     (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
	                     imm:$Immd)),
	            (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;

	  // 128-bit destination, 64-bit source (source widened via SUBREG_TO_REG).
	  def : Pat<(VT128 (vector_insert
	                     V128:$src,
	                     (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
	                     imm:$Immd)),
	            (INS V128:$src, imm:$Immd,
	                 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;

	  // 64-bit destination, 128-bit source (destination widened, dsub taken
	  // from the result).
	  def : Pat<(VT64 (vector_insert
	                    V64:$src,
	                    (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
	                    imm:$Immd)),
	            (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
	                                 imm:$Immd, V128:$Rn, imm:$Immn),
	                            dsub)>;

	  // 64-bit destination, 64-bit source (both widened).
	  def : Pat<(VT64 (vector_insert
	                    V64:$src,
	                    (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
	                    imm:$Immd)),
	            (EXTRACT_SUBREG
	              (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
	                   (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
	              dsub)>;
	}

	defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
	defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
	defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;


	// Floating point vector extractions are codegen'd as either a sequence of
	// subregister extractions, or a MOV (aka CPY here, alias for DUP) if
	// the lane number is anything other than zero.
	// Lane 0: plain sub-register read (dsub / ssub / hsub by element size).
	def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
	(f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
	def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
	(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
	def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
	(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;

	// Any other lane index: CPY of the indexed element.
	def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
	(f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
	def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
	(f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
	def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
	(f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;

	// All concat_vectors operations are canonicalised to act on i64 vectors for
	// AArch64. In the general case we need an instruction, which might just as
	// well be INS.
	//   DstTy - 128-bit result type.
	//   SrcTy - 64-bit type of each half.
	// $Rd becomes the low half; lane 0 of $Rn is inserted into 64-bit lane 1.
	class ConcatPat<ValueType DstTy, ValueType SrcTy>
	: Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
	(INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
	(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;

	def : ConcatPat<v2i64, v1i64>;
	def : ConcatPat<v2f64, v1f64>;
	def : ConcatPat<v4i32, v2i32>;
	def : ConcatPat<v4f32, v2f32>;
	def : ConcatPat<v8i16, v4i16>;
	def : ConcatPat<v8f16, v4f16>;
	def : ConcatPat<v16i8, v8i8>;

	// If the high lanes are undef, though, we can just ignore them: the 64-bit
	// source is simply placed in the dsub sub-register of an otherwise undefined
	// 128-bit register, so no instruction is needed.
	//   DstTy - 128-bit result type.
	//   SrcTy - 64-bit type of the defined (low) half.
	class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
	: Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
	(INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;

	// Keep this list in step with the ConcatPat instantiations: every type pair
	// that can be concatenated with INS should also get the cheaper undef form.
	def : ConcatUndefPat<v2i64, v1i64>;
	def : ConcatUndefPat<v2f64, v1f64>;
	def : ConcatUndefPat<v4i32, v2i32>;
	def : ConcatUndefPat<v4f32, v2f32>;
	def : ConcatUndefPat<v8i16, v4i16>;
	def : ConcatUndefPat<v8f16, v4f16>;
	def : ConcatUndefPat<v16i8, v8i8>;

	//----------------------------------------------------------------------------
	// AdvSIMD across lanes instructions
	//----------------------------------------------------------------------------

	// Integer across-lanes reductions over B/H/S element vectors.
	defm ADDV : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
	defm SMAXV : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
	defm SMINV : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
	defm UMAXV : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
	defm UMINV : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
	// Widening (long) add across lanes, signed and unsigned.
	defm SADDLV : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
	defm UADDLV : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
	// Floating-point across-lanes min/max; each is tied directly to its intrinsic.
	defm FMAXNMV : SIMDFPAcrossLanes<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
	defm FMAXV : SIMDFPAcrossLanes<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
	defm FMINNMV : SIMDFPAcrossLanes<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
	defm FMINV : SIMDFPAcrossLanes<0b01111, 1, "fminv", int_aarch64_neon_fminv>;

	// Selection patterns for across-lanes operations that are represented by a
	// DAG node returning a vector in which only the low lane is defined, rather
	// than a scalar. In effect, opNode is the same as
	// (scalar_to_vector (IntNode)).
	//   baseOpc - instruction name prefix; the per-type suffix ("v8i8v", ...)
	//             is appended to look up the concrete instruction.
	//   opNode  - the vector-returning node to match.
	multiclass SIMDAcrossLanesIntrinsic<string baseOpc,
	                                    SDPatternOperator opNode> {
	  // If a lane instruction caught the vector_extract around opNode, we can
	  // directly match the latter to the instruction. The scalar result is placed
	  // in the element-sized sub-register of an otherwise undefined vector.
	  def : Pat<(v8i8 (opNode V64:$Rn)),
	            (INSERT_SUBREG
	                (v8i8 (IMPLICIT_DEF)),
	                (!cast<Instruction>(baseOpc # "v8i8v") V64:$Rn),
	                bsub)>;
	  def : Pat<(v16i8 (opNode V128:$Rn)),
	            (INSERT_SUBREG
	                (v16i8 (IMPLICIT_DEF)),
	                (!cast<Instruction>(baseOpc # "v16i8v") V128:$Rn),
	                bsub)>;
	  def : Pat<(v4i16 (opNode V64:$Rn)),
	            (INSERT_SUBREG
	                (v4i16 (IMPLICIT_DEF)),
	                (!cast<Instruction>(baseOpc # "v4i16v") V64:$Rn),
	                hsub)>;
	  def : Pat<(v8i16 (opNode V128:$Rn)),
	            (INSERT_SUBREG
	                (v8i16 (IMPLICIT_DEF)),
	                (!cast<Instruction>(baseOpc # "v8i16v") V128:$Rn),
	                hsub)>;
	  def : Pat<(v4i32 (opNode V128:$Rn)),
	            (INSERT_SUBREG
	                (v4i32 (IMPLICIT_DEF)),
	                (!cast<Instruction>(baseOpc # "v4i32v") V128:$Rn),
	                ssub)>;

	  // If none did, fall back to the explicit patterns, consuming the
	  // vector_extract. The 64-bit forms additionally look through the
	  // insert_subvector into an undef vector.
	  def : Pat<(i32 (vector_extract
	                     (insert_subvector undef, (v8i8 (opNode V64:$Rn)), (i32 0)),
	                     (i64 0))),
	            (EXTRACT_SUBREG
	                (INSERT_SUBREG
	                    (v8i8 (IMPLICIT_DEF)),
	                    (!cast<Instruction>(baseOpc # "v8i8v") V64:$Rn),
	                    bsub),
	                ssub)>;
	  def : Pat<(i32 (vector_extract (v16i8 (opNode V128:$Rn)), (i64 0))),
	            (EXTRACT_SUBREG
	                (INSERT_SUBREG
	                    (v16i8 (IMPLICIT_DEF)),
	                    (!cast<Instruction>(baseOpc # "v16i8v") V128:$Rn),
	                    bsub),
	                ssub)>;
	  def : Pat<(i32 (vector_extract
	                     (insert_subvector undef, (v4i16 (opNode V64:$Rn)), (i32 0)),
	                     (i64 0))),
	            (EXTRACT_SUBREG
	                (INSERT_SUBREG
	                    (v4i16 (IMPLICIT_DEF)),
	                    (!cast<Instruction>(baseOpc # "v4i16v") V64:$Rn),
	                    hsub),
	                ssub)>;
	  def : Pat<(i32 (vector_extract (v8i16 (opNode V128:$Rn)), (i64 0))),
	            (EXTRACT_SUBREG
	                (INSERT_SUBREG
	                    (v8i16 (IMPLICIT_DEF)),
	                    (!cast<Instruction>(baseOpc # "v8i16v") V128:$Rn),
	                    hsub),
	                ssub)>;
	  def : Pat<(i32 (vector_extract (v4i32 (opNode V128:$Rn)), (i64 0))),
	            (EXTRACT_SUBREG
	                (INSERT_SUBREG
	                    (v4i32 (IMPLICIT_DEF)),
	                    (!cast<Instruction>(baseOpc # "v4i32v") V128:$Rn),
	                    ssub),
	                ssub)>;
	}

	// Signed across-lanes patterns: everything from SIMDAcrossLanesIntrinsic plus
	// forms that absorb a following sign extension.
	//   baseOpc - instruction name prefix (per-type suffix is appended).
	//   opNode  - the vector-returning node to match.
	multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc,
	                                          SDPatternOperator opNode>
	    : SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
	  // If there is a sign extension after this intrinsic, consume it: the SMOV
	  // used to move the result out already performs it.
	  def : Pat<(i32 (sext_inreg
	                     (i32 (vector_extract
	                              (insert_subvector undef,
	                                                (opNode (v8i8 V64:$Rn)),
	                                                (i32 0)),
	                              (i64 0))),
	                     i8)),
	            (i32 (SMOVvi8to32
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i8v") V64:$Rn),
	                         bsub),
	                     (i64 0)))>;
	  def : Pat<(i32 (sext_inreg
	                     (i32 (vector_extract (opNode (v16i8 V128:$Rn)), (i64 0))),
	                     i8)),
	            (i32 (SMOVvi8to32
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v16i8v") V128:$Rn),
	                         bsub),
	                     (i64 0)))>;
	  def : Pat<(i32 (sext_inreg
	                     (i32 (vector_extract
	                              (insert_subvector undef,
	                                                (opNode (v4i16 V64:$Rn)),
	                                                (i32 0)),
	                              (i64 0))),
	                     i16)),
	            (i32 (SMOVvi16to32
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v4i16v") V64:$Rn),
	                         hsub),
	                     (i64 0)))>;
	  def : Pat<(i32 (sext_inreg
	                     (i32 (vector_extract (opNode (v8i16 V128:$Rn)), (i64 0))),
	                     i16)),
	            (i32 (SMOVvi16to32
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i16v") V128:$Rn),
	                         hsub),
	                     (i64 0)))>;
	}

	// Unsigned across-lanes patterns: everything from SIMDAcrossLanesIntrinsic
	// plus forms that absorb a following mask of the result.
	//   baseOpc - instruction name prefix (per-type suffix is appended).
	//   opNode  - the vector-returning node to match.
	multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc,
	                                            SDPatternOperator opNode>
	    : SIMDAcrossLanesIntrinsic<baseOpc, opNode> {
	  // If there is a masking operation keeping only what has been actually
	  // generated, consume it.
	  def : Pat<(i32 (and (i32 (vector_extract
	                               (insert_subvector undef,
	                                                 (opNode (v8i8 V64:$Rn)),
	                                                 (i32 0)),
	                               (i64 0))),
	                      maski8_or_more)),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i8v") V64:$Rn),
	                         bsub),
	                     ssub))>;
	  def : Pat<(i32 (and (i32 (vector_extract (opNode (v16i8 V128:$Rn)),
	                                           (i64 0))),
	                      maski8_or_more)),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v16i8v") V128:$Rn),
	                         bsub),
	                     ssub))>;
	  def : Pat<(i32 (and (i32 (vector_extract
	                               (insert_subvector undef,
	                                                 (opNode (v4i16 V64:$Rn)),
	                                                 (i32 0)),
	                               (i64 0))),
	                      maski16_or_more)),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v4i16v") V64:$Rn),
	                         hsub),
	                     ssub))>;
	  def : Pat<(i32 (and (i32 (vector_extract (opNode (v8i16 V128:$Rn)),
	                                           (i64 0))),
	                      maski16_or_more)),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i16v") V128:$Rn),
	                         hsub),
	                     ssub))>;
	}

	// Hook the across-lanes nodes up to their instructions. The shared
	// multiclasses do not cover v2i32, so each operation gets an explicit v2i32
	// pattern that uses the pairwise instruction with both sources set to $Rn.
	defm : SIMDAcrossLanesSignedIntrinsic<"ADDV", AArch64saddv>;
	// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
	def : Pat<(v2i32 (AArch64saddv (v2i32 V64:$Rn))),
	(ADDPv2i32 V64:$Rn, V64:$Rn)>;

	// The unsigned add node selects the same ADDV / ADDP instructions.
	defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV", AArch64uaddv>;
	// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
	def : Pat<(v2i32 (AArch64uaddv (v2i32 V64:$Rn))),
	(ADDPv2i32 V64:$Rn, V64:$Rn)>;

	// Signed maximum across lanes.
	defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", AArch64smaxv>;
	def : Pat<(v2i32 (AArch64smaxv (v2i32 V64:$Rn))),
	(SMAXPv2i32 V64:$Rn, V64:$Rn)>;

	// Signed minimum across lanes.
	defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", AArch64sminv>;
	def : Pat<(v2i32 (AArch64sminv (v2i32 V64:$Rn))),
	(SMINPv2i32 V64:$Rn, V64:$Rn)>;

	// Unsigned maximum across lanes.
	defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", AArch64umaxv>;
	def : Pat<(v2i32 (AArch64umaxv (v2i32 V64:$Rn))),
	(UMAXPv2i32 V64:$Rn, V64:$Rn)>;

	// Unsigned minimum across lanes.
	defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", AArch64uminv>;
	def : Pat<(v2i32 (AArch64uminv (v2i32 V64:$Rn))),
	(UMINPv2i32 V64:$Rn, V64:$Rn)>;

	// Patterns for a signed widening across-lanes intrinsic that returns a
	// scalar.
	//   baseOpc - instruction name prefix (per-type suffix is appended).
	//   intOp   - the intrinsic to match.
	multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
	  // Byte sources: the result sits in hsub and is moved out with a
	  // sign-extending SMOV to give the i32 value.
	  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
	            (i32 (SMOVvi16to32
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i8v") V64:$Rn),
	                         hsub),
	                     (i64 0)))>;
	  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
	            (i32 (SMOVvi16to32
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v16i8v") V128:$Rn),
	                         hsub),
	                     (i64 0)))>;

	  // Halfword sources: the result sits in ssub and is read back as i32.
	  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v4i16v") V64:$Rn),
	                         ssub),
	                     ssub))>;
	  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i16v") V128:$Rn),
	                         ssub),
	                     ssub))>;

	  // Word sources: the result sits in dsub and is read back as i64.
	  def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
	            (i64 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v4i32v") V128:$Rn),
	                         dsub),
	                     dsub))>;
	}

	// Patterns for an unsigned widening across-lanes intrinsic that returns a
	// scalar. Unlike the signed variant, every form reads the result back with a
	// plain EXTRACT_SUBREG.
	//   baseOpc - instruction name prefix (per-type suffix is appended).
	//   intOp   - the intrinsic to match.
	multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
	                                                Intrinsic intOp> {
	  // Byte sources: result inserted at hsub, read back through ssub as i32.
	  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i8v") V64:$Rn),
	                         hsub),
	                     ssub))>;
	  def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v16i8v") V128:$Rn),
	                         hsub),
	                     ssub))>;

	  // Halfword sources: result inserted at ssub and read back as i32.
	  def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v4i16v") V64:$Rn),
	                         ssub),
	                     ssub))>;
	  def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
	            (i32 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v8i16v") V128:$Rn),
	                         ssub),
	                     ssub))>;

	  // Word sources: result inserted at dsub and read back as i64.
	  def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
	            (i64 (EXTRACT_SUBREG
	                     (INSERT_SUBREG
	                         (v16i8 (IMPLICIT_DEF)),
	                         (!cast<Instruction>(baseOpc # "v4i32v") V128:$Rn),
	                         dsub),
	                     dsub))>;
	}

	// Bind the widening add-across-lanes intrinsics to SADDLV / UADDLV.
	defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
	defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;

	// The v2i32 source is not covered by the multiclasses above; it is selected
	// to the pairwise long add instead.
	// The vaddlv_s32 intrinsic gets mapped to SADDLP.
	def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
	(i64 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(SADDLPv2i32_v1i64 V64:$Rn), dsub),
	dsub))>;
	// The vaddlv_u32 intrinsic gets mapped to UADDLP.
	def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
	(i64 (EXTRACT_SUBREG
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
	(UADDLPv2i32_v1i64 V64:$Rn), dsub),
	dsub))>;

	//------------------------------------------------------------------------------
	// AdvSIMD modified immediate instructions
	//------------------------------------------------------------------------------

	// AdvSIMD BIC
	defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
	// AdvSIMD ORR
	defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;

	// Assembly aliases that omit the shift operand; the instruction is built
	// with a shift of 0. Both the "$Vd.<T>" and the "<mnemonic>.<T> $Vd"
	// spellings are accepted; the latter aliases carry a trailing 0 argument.
	def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;

	def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	// Same set of shift-less aliases for ORR.
	def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0)>;
	def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;

	def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	// AdvSIMD FMOV
	// Vector FMOV of an 8-bit immediate (fpimm8 operand), one definition per
	// arrangement. Each selects the AArch64fmov node for its vector type.
	def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1111, V128, fpimm8,
	"fmov", ".2d",
	[(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1111, V64, fpimm8,
	"fmov", ".2s",
	[(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1111, V128, fpimm8,
	"fmov", ".4s",
	[(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	// The half-precision arrangements are only available with full FP16 support.
	let Predicates = [HasNEON, HasFullFP16] in {
	def FMOVv4f16_ns : SIMDModifiedImmVectorNoShift<0, 0, 1, 0b1111, V64, fpimm8,
	"fmov", ".4h",
	[(set (v4f16 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	def FMOVv8f16_ns : SIMDModifiedImmVectorNoShift<1, 0, 1, 0b1111, V128, fpimm8,
	"fmov", ".8h",
	[(set (v8f16 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
	} // Predicates = [HasNEON, HasFullFP16]

	// AdvSIMD MOVI

	// EDIT byte mask: scalar
	// MOVID writes a 64-bit FPR from a simdimmtype10 immediate. It is marked
	// rematerializable and as cheap as a move.
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	def MOVID : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
	[(set FPR64:$Rd, simdimmtype10:$imm8)]>;
	// The movi_edit node has the immediate value already encoded, so we use
	// a plain imm0_255 here.
	def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
	(MOVID imm0_255:$shift)>;

	// All-zeros 64-bit vectors of any integer element type: MOVID with 0.
	def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
	def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
	def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
	def : Pat<(v8i8 immAllZerosV), (MOVID (i32 0))>;

	// All-ones 64-bit vectors: MOVID with encoded immediate 255.
	def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
	def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
	def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
	def : Pat<(v8i8 immAllOnesV), (MOVID (i32 255))>;

	// EDIT byte mask: 2d

	// The movi_edit node has the immediate value already encoded, so we use
	// a plain imm0_255 in the pattern
	// Like MOVID, this is marked rematerializable and as cheap as a move.
	let isReMaterializable = 1, isAsCheapAsAMove = 1 in
	def MOVIv2d_ns : SIMDModifiedImmVectorNoShift<1, 1, 0, 0b1110, V128,
	simdimmtype10,
	"movi", ".2d",
	[(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;

	// All-zeros 128-bit integer vectors: MOVIv2d_ns with 0.
	def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;

	// All-ones 128-bit integer vectors: MOVIv2d_ns with encoded immediate 255.
	def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
	def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
	def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
	def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;

	// A splat of floating-point +0.0 is also selected to MOVIv2d_ns with 0.
	def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
	def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;

	// EDIT per word & halfword: 2s, 4h, 4s, & 8h
	defm MOVI : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;

	// Assembly aliases that omit the shift operand (a shift of 0 is used), in
	// both the "$Vd.<T>" and the "movi.<T> $Vd" spellings.
	def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	// Select the movi_shift node to the shifted MOVI of the matching arrangement,
	// forwarding the 8-bit immediate and the shift amount unchanged.
	def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MOVIv8i16 imm0_255:$imm8, imm:$shift)>;

	// EDIT per word: 2s & 4s with MSL shifter
	// The last bit of the {1,1,0,?} field is left unset here (the '?').
	def MOVIv2s_msl : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
	[(set (v2i32 V64:$Rd),
	(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
	def MOVIv4s_msl : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
	[(set (v4i32 V128:$Rd),
	(AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;

	// Per byte: 8b & 16b
	// These take only the 8-bit immediate; there is no shift operand.
	def MOVIv8b_ns : SIMDModifiedImmVectorNoShift<0, 0, 0, 0b1110, V64, imm0_255,
	"movi", ".8b",
	[(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
	def MOVIv16b_ns : SIMDModifiedImmVectorNoShift<1, 0, 0, 0b1110, V128, imm0_255,
	"movi", ".16b",
	[(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;

	// AdvSIMD MVNI

	// EDIT per word & halfword: 2s, 4h, 4s, & 8h
	defm MVNI : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;

	// Assembly aliases that omit the shift operand (a shift of 0 is used), in
	// both the "$Vd.<T>" and the "mvni.<T> $Vd" spellings.
	def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd, imm0_255:$imm, 0), 0>;
	def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;

	// Select the mvni_shift node to the shifted MVNI of the matching arrangement,
	// forwarding the 8-bit immediate and the shift amount unchanged.
	def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
	def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
	(MVNIv8i16 imm0_255:$imm8, imm:$shift)>;

	// EDIT per word: 2s & 4s with MSL shifter
	// The last bit of the {1,1,0,?} field is left unset here (the '?').
	def MVNIv2s_msl : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
	[(set (v2i32 V64:$Rd),
	(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
	def MVNIv4s_msl : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
	[(set (v4i32 V128:$Rd),
	(AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;

	//----------------------------------------------------------------------------
	// AdvSIMD indexed element
	//----------------------------------------------------------------------------

	// The by-element FMLA/FMLS instructions are defined here without patterns
	// (no pattern operator is passed); selection is added by the defm's below.
	let hasSideEffects = 0 in {
	defm FMLA : SIMDFPIndexedTied<0, 0b0001, "fmla">;
	defm FMLS : SIMDFPIndexedTied<0, 0b0101, "fmls">;
	}

	// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
	// instruction expects the addend first, while the intrinsic expects it last.

	// On the other hand, there are quite a few valid combinatorial options due to
	// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
	// FMLA: both orders of the two multiplicands.
	defm : SIMDFPIndexedTiedPatterns<"FMLA",
	TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
	defm : SIMDFPIndexedTiedPatterns<"FMLA",
	TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;

	// FMLS: fneg on either multiplicand, in either operand position.
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
	defm : SIMDFPIndexedTiedPatterns<"FMLS",
	TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;

	// Select FMLS (by element) when the fneg has been applied to the by-element
	// operand *before* it is duplicated or extracted, i.e. the fneg sits inside
	// the DUPLANE / DUP / vector_extract rather than directly under the fma.
	//   OpNode - a three-operand fragment; it is applied as
	//            (OpNode accumulator, multiplicand, by-element operand).
	// In every pattern the fneg is dropped and the un-negated $Rm is fed to FMLS.
	// Lane index operands use the index class that matches the element size
	// (VectorIndexS for 32-bit lanes, VectorIndexD for 64-bit lanes) on both the
	// matched and the emitted side.
	multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
	// 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
	// and DUP scalar.
	def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
	(AArch64duplane32 (v4f32 (fneg V128:$Rm)),
	VectorIndexS:$idx))),
	(FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
	def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
	(v2f32 (AArch64duplane32
	(v4f32 (insert_subvector undef,
	(v2f32 (fneg V64:$Rm)),
	(i32 0))),
	VectorIndexS:$idx)))),
	(FMLSv2i32_indexed V64:$Rd, V64:$Rn,
	(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
	VectorIndexS:$idx)>;
	def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
	(AArch64dup (f32 (fneg FPR32Op:$Rm))))),
	(FMLSv2i32_indexed V64:$Rd, V64:$Rn,
	(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;

	// 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
	// and DUP scalar.
	def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
	(AArch64duplane32 (v4f32 (fneg V128:$Rm)),
	VectorIndexS:$idx))),
	(FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
	VectorIndexS:$idx)>;
	def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
	(v4f32 (AArch64duplane32
	(v4f32 (insert_subvector undef,
	(v2f32 (fneg V64:$Rm)),
	(i32 0))),
	VectorIndexS:$idx)))),
	(FMLSv4i32_indexed V128:$Rd, V128:$Rn,
	(SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
	VectorIndexS:$idx)>;
	def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
	(AArch64dup (f32 (fneg FPR32Op:$Rm))))),
	(FMLSv4i32_indexed V128:$Rd, V128:$Rn,
	(SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;

	// 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
	// (DUPLANE from 64-bit would be trivial).
	def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
	(AArch64duplane64 (v2f64 (fneg V128:$Rm)),
	VectorIndexD:$idx))),
	(FMLSv2i64_indexed
	V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexD:$idx)>;
	def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
	(AArch64dup (f64 (fneg FPR64Op:$Rm))))),
	(FMLSv2i64_indexed V128:$Rd, V128:$Rn,
	(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;

	// 2 variants for 32-bit scalar version: extract from .2s or from .4s
	def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
	(vector_extract (v4f32 (fneg V128:$Rm)),
	VectorIndexS:$idx))),
	(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
	V128:$Rm, VectorIndexS:$idx)>;
	def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
	(vector_extract (v4f32 (insert_subvector undef,
	(v2f32 (fneg V64:$Rm)),
	(i32 0))),
	VectorIndexS:$idx))),
	(FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
	(SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;

	// 1 variant for 64-bit scalar version: extract from .2d
	// (a v2f64 has two lanes, so the index is a VectorIndexD).
	def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
	(vector_extract (v2f64 (fneg V128:$Rm)),
	VectorIndexD:$idx))),
	(FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
	V128:$Rm, VectorIndexD:$idx)>;
	}

	// Instantiate the negated-by-element FMLS patterns for both orders of the
	// two fma multiplicands; $LHS is the fma addend in each case.
	defm : FMLSIndexedAfterNegPatterns<
	TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
	defm : FMLSIndexedAfterNegPatterns<
	TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;

	// By-element floating-point multiplies.
	defm FMULX : SIMDFPIndexed<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
	defm FMUL : SIMDFPIndexed<0, 0b1001, "fmul", fmul>;

	// Multiplying by a splatted scalar: place the scalar in the low sub-register
	// of an otherwise undefined vector register and use the by-element form with
	// lane 0, instead of materialising the DUP.
	def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
	(FMULv2i32_indexed V64:$Rn,
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
	(i64 0))>;
	def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
	(FMULv4i32_indexed V128:$Rn,
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
	(i64 0))>;
	def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
	(FMULv2i64_indexed V128:$Rn,
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
	(i64 0))>;

	// By-element integer multiply family. The accumulating forms are expressed
	// as a TriOpFrag in which $LHS is the accumulator and $MHS/$RHS are the two
	// multiplicands.
	defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
	defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
	// MLA / MLS: accumulator plus / minus the product.
	defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
	TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
	defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
	TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
	defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
	// Signed long multiply, with accumulate / subtract variants built on smull.
	defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
	TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
	defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
	int_aarch64_neon_smull>;
	// Saturating doubling forms; the accumulate step is the saturating
	// add / subtract intrinsic that is passed in.
	defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
	int_aarch64_neon_sqadd>;
	defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
	int_aarch64_neon_sqsub>;
	defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
	int_aarch64_neon_sqadd>;
	defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
	int_aarch64_neon_sqsub>;
	defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
	// Unsigned long multiply, with accumulate / subtract variants built on umull.
	defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
	TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMLSL : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
	TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
	defm UMULL : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
	int_aarch64_neon_umull>;

	// A scalar sqdmull with the second operand being a vector lane can be
	// handled directly with the indexed instruction encoding.
	def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
	(vector_extract (v4i32 V128:$Vm),
	VectorIndexS:$idx)),
	(SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;

	//----------------------------------------------------------------------------
	// AdvSIMD scalar shift instructions
	//----------------------------------------------------------------------------
	// Scalar conversions between floating point and fixed point that take an
	// immediate operand; defined without patterns.
	defm FCVTZS : SIMDFPScalarRShift<0, 0b11111, "fcvtzs">;
	defm FCVTZU : SIMDFPScalarRShift<1, 0b11111, "fcvtzu">;
	defm SCVTF : SIMDFPScalarRShift<0, 0b11100, "scvtf">;
	defm UCVTF : SIMDFPScalarRShift<1, 0b11100, "ucvtf">;
	// Codegen patterns for the above. We don't put these directly on the
	// instructions because TableGen's type inference can't handle the truth.
	// Having the same base pattern for fp <--> int totally freaks it out.
	// FP -> fixed point: 32-bit, then 64-bit scalar, then the v1 vector types.
	def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
	(FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
	(FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
	(FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
	(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
	// Fixed point -> FP: same three shapes.
	def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
	(SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
	(UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
	def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
	(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
	(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
	def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
	vecshiftR64:$imm)),
	(UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;

	// Scalar shift-by-immediate instructions. The *RA (shift right and
	// accumulate) forms are written as a TriOpFrag that adds the shifted value
	// ($MHS shifted by $RHS) to the accumulator ($LHS). SLI and SRI are defined
	// without a pattern operator here.
	defm SHL : SIMDScalarLShiftD< 0, 0b01010, "shl", AArch64vshl>;
	defm SLI : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
	defm SQRSHRN : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
	int_aarch64_neon_sqrshrn>;
	defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
	int_aarch64_neon_sqrshrun>;
	defm SQSHLU : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
	defm SQSHL : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
	defm SQSHRN : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
	int_aarch64_neon_sqshrn>;
	defm SQSHRUN : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
	int_aarch64_neon_sqshrun>;
	defm SRI : SIMDScalarRShiftDTied< 1, 0b01000, "sri">;
	defm SRSHR : SIMDScalarRShiftD< 0, 0b00100, "srshr", AArch64srshri>;
	defm SRSRA : SIMDScalarRShiftDTied< 0, 0b00110, "srsra",
	TriOpFrag<(add node:$LHS,
	(AArch64srshri node:$MHS, node:$RHS))>>;
	defm SSHR : SIMDScalarRShiftD< 0, 0b00000, "sshr", AArch64vashr>;
	defm SSRA : SIMDScalarRShiftDTied< 0, 0b00010, "ssra",
	TriOpFrag<(add node:$LHS,
	(AArch64vashr node:$MHS, node:$RHS))>>;
	defm UQRSHRN : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
	int_aarch64_neon_uqrshrn>;
	defm UQSHL : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
	defm UQSHRN : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
	int_aarch64_neon_uqshrn>;
	defm URSHR : SIMDScalarRShiftD< 1, 0b00100, "urshr", AArch64urshri>;
	defm URSRA : SIMDScalarRShiftDTied< 1, 0b00110, "ursra",
	TriOpFrag<(add node:$LHS,
	(AArch64urshri node:$MHS, node:$RHS))>>;
	defm USHR : SIMDScalarRShiftD< 1, 0b00000, "ushr", AArch64vlshr>;
	defm USRA : SIMDScalarRShiftDTied< 1, 0b00010, "usra",
	TriOpFrag<(add node:$LHS,
	(AArch64vlshr node:$MHS, node:$RHS))>>;

	//----------------------------------------------------------------------------
	// AdvSIMD vector shift instructions
	//----------------------------------------------------------------------------
	// Vector shift-by-immediate instructions, including the fixed-point
	// conversions that take an immediate.
	defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
	defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
	defm SCVTF: SIMDVectorRShiftToFP<0, 0b11100, "scvtf",
	int_aarch64_neon_vcvtfxs2fp>;
	defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
	int_aarch64_neon_rshrn>;
	defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
	// SHRN is matched as a truncate of an arithmetic right shift.
	defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
	BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
	defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
	// The v1i64 form of the vsli intrinsic is selected to the scalar SLId.
	def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
	(i32 vecshiftL64:$imm))),
	(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
	defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
	int_aarch64_neon_sqrshrn>;
	defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
	int_aarch64_neon_sqrshrun>;
	defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
	defm SQSHL : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
	defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
	int_aarch64_neon_sqshrn>;
	defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
	int_aarch64_neon_sqshrun>;
	defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
	// The v1i64 form of the vsri intrinsic is selected to the scalar SRId.
	def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
	(i32 vecshiftR64:$imm))),
	(SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
	defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
	defm SRSRA : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
	TriOpFrag<(add node:$LHS,
	(AArch64srshri node:$MHS, node:$RHS))> >;
	// SSHLL is matched as a left shift of the sign-extended source.
	defm SSHLL : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
	BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;

	defm SSHR : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
	defm SSRA : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
	TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
	defm UCVTF : SIMDVectorRShiftToFP<1, 0b11100, "ucvtf",
	int_aarch64_neon_vcvtfxu2fp>;
	defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
	int_aarch64_neon_uqrshrn>;
	defm UQSHL : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
	defm UQSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
	int_aarch64_neon_uqshrn>;
	defm URSHR : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
	defm URSRA : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
	TriOpFrag<(add node:$LHS,
	(AArch64urshri node:$MHS, node:$RHS))> >;
	// USHLL is matched as a left shift of the zero-extended source.
	defm USHLL : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
	BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
	defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
	defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
	TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;

	// SHRN patterns for when a logical right shift was used instead of arithmetic
	// (the immediate guarantees no sign bits actually end up in the result so it
	// doesn't matter).
	// These three patterns cover the results that are 64-bit vectors (v8i8, v4i16
	// and v2i32); the source is always a V128 register and the immediate operand
	// class is reused unchanged in the selected instruction.
	def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
	(SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
	def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
	(SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
	def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
	(SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;

	// SHRN patterns whose result is a 128-bit vector: a truncated logical right
	// shift is concatenated onto an existing 64-bit value $Rd. $Rd is first
	// inserted into the dsub subregister of an otherwise undefined register, and
	// that register is passed as the first operand of the *_shift instruction.
	// The immediate operand class in the output matches the one in the source
	// pattern for every element size.
	def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
	(trunc (AArch64vlshr (v8i16 V128:$Rn),
	vecshiftR16Narrow:$imm)))),
	(SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
	V128:$Rn, vecshiftR16Narrow:$imm)>;
	def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
	(trunc (AArch64vlshr (v4i32 V128:$Rn),
	vecshiftR32Narrow:$imm)))),
	(SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
	V128:$Rn, vecshiftR32Narrow:$imm)>;
	def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
	(trunc (AArch64vlshr (v2i64 V128:$Rn),
	vecshiftR64Narrow:$imm)))),
	(SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
	V128:$Rn, vecshiftR64Narrow:$imm)>;

	// Vector sign and zero extensions are implemented with SSHLL and USHLL.
	// Anyexts are implemented as zexts.
	// Extends of a whole 64-bit vector: sext selects the SSHLL form, zext and
	// anyext both select the USHLL form, always with a shift immediate of 0.
	def : Pat<(v8i16 (sext (v8i8 V64:$Rn))), (SSHLLv8i8_shift V64:$Rn, (i32 0))>;
	def : Pat<(v8i16 (zext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
	def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))), (USHLLv8i8_shift V64:$Rn, (i32 0))>;
	def : Pat<(v4i32 (sext (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
	def : Pat<(v4i32 (zext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
	def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
	def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
	def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
	def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
	// Also match an extend from the upper half of a 128 bit source register.
	// The extract_subvector starts at lane 8, 4 or 2 (just past the first
	// v8i8 / v4i16 / v2i32 worth of lanes) and the whole V128 register is handed
	// to the v16i8 / v8i16 / v4i32 variant of the instruction.
	def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
	(USHLLv16i8_shift V128:$Rn, (i32 0))>;
	def : Pat<(v8i16 (zext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
	(USHLLv16i8_shift V128:$Rn, (i32 0))>;
	def : Pat<(v8i16 (sext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
	(SSHLLv16i8_shift V128:$Rn, (i32 0))>;
	def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
	(USHLLv8i16_shift V128:$Rn, (i32 0))>;
	def : Pat<(v4i32 (zext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
	(USHLLv8i16_shift V128:$Rn, (i32 0))>;
	def : Pat<(v4i32 (sext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
	(SSHLLv8i16_shift V128:$Rn, (i32 0))>;
	def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
	(USHLLv4i32_shift V128:$Rn, (i32 0))>;
	def : Pat<(v2i64 (zext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
	(USHLLv4i32_shift V128:$Rn, (i32 0))>;
	def : Pat<(v2i64 (sext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
	(SSHLLv4i32_shift V128:$Rn, (i32 0))>;

	// Vector shift sxtl aliases
	// Each alias expands to the matching SSHLL instruction with the shift
	// immediate fixed to 0. Two spellings are accepted per arrangement: one with
	// the arrangement as a mnemonic suffix (e.g. "sxtl.8h") and one with the
	// arrangement written on each operand (e.g. "$dst.8h, $src1.8b").
	def : InstAlias<"sxtl.8h $dst, $src1",
	(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl $dst.8h, $src1.8b",
	(SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl.4s $dst, $src1",
	(SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl $dst.4s, $src1.4h",
	(SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl.2d $dst, $src1",
	(SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"sxtl $dst.2d, $src1.2s",
	(SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;

	// Vector shift sxtl2 aliases
	// Same scheme as sxtl, but the source operand is a V128 register and the
	// v16i8 / v8i16 / v4i32 instruction variants are used.
	def : InstAlias<"sxtl2.8h $dst, $src1",
	(SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
	(SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2.4s $dst, $src1",
	(SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
	(SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2.2d $dst, $src1",
	(SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
	(SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;

	// Vector shift uxtl aliases
	// Each alias expands to the matching USHLL instruction with the shift
	// immediate fixed to 0. Two spellings are accepted per arrangement: one with
	// the arrangement as a mnemonic suffix (e.g. "uxtl.8h") and one with the
	// arrangement written on each operand (e.g. "$dst.8h, $src1.8b").
	def : InstAlias<"uxtl.8h $dst, $src1",
	(USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl $dst.8h, $src1.8b",
	(USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl.4s $dst, $src1",
	(USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl $dst.4s, $src1.4h",
	(USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl.2d $dst, $src1",
	(USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
	def : InstAlias<"uxtl $dst.2d, $src1.2s",
	(USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;

	// Vector shift uxtl2 aliases
	// Same scheme as uxtl, but the source operand is a V128 register and the
	// v16i8 / v8i16 / v4i32 instruction variants are used.
	def : InstAlias<"uxtl2.8h $dst, $src1",
	(USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
	(USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2.4s $dst, $src1",
	(USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
	(USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2.2d $dst, $src1",
	(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
	def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
	(USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;

	// If an integer is about to be converted to a floating point value,
	// just load it on the floating point unit.
	// These patterns are more complex because floating point loads do not
	// support sign extension.
	// The sign extension has to be explicitly added and is only supported for
	// one step: byte-to-half, half-to-word, word-to-doubleword.
	// SCVTF GPR -> FPR is 9 cycles.
	// SCVTF FPR -> FPR is 4 cycles.
	// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
	// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
	// and still be faster.
	// However, this is not good for code size.
	// 8-bits -> float. 2 sizes step-up.
	// Selects f32 (sint_to_fp (i32 (sextloadi8 ...))).
	// Template arguments:
	//   addrmode - address operand dag matched under sextloadi8.
	//   INST     - load instruction dag whose result is inserted into bsub.
	// The output dag, read inside-out: INST is inserted into the bsub subregister
	// of an undefined f64, widened by SSHLLv8i8_shift with immediate 0, reduced
	// to dsub, widened again by SSHLLv4i16_shift with immediate 0, reduced to
	// ssub and finally converted with SCVTFv1i32.
	// Only enabled under NotForCodeSize and UseAlternateSExtLoadCVTF32.
	class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
	: Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
	(SCVTFv1i32 (f32 (EXTRACT_SUBREG
	(SSHLLv4i16_shift
	(f64
	(EXTRACT_SUBREG
	(SSHLLv8i8_shift
	(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	INST,
	bsub),
	0),
	dsub)),
	0),
	ssub)))>,
	Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

	// One instantiation per address form (ro8 W index, ro8 X index, am_indexed8,
	// am_unscaled8), each paired with the LDRB*/LDURB load that takes the same
	// operands.
	def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
	(LDRBroW GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
	def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
	(LDRBroX GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
	def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
	(LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
	def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
	(LDURBi GPR64sp:$Rn, simm9:$offset)>;

	// 16-bits -> float. 1 size step-up.
	// Selects f32 (sint_to_fp (i32 (sextloadi16 ...))).
	// Template arguments:
	//   addrmode - address operand dag matched under sextloadi16.
	//   INST     - load instruction dag whose result is inserted into hsub.
	// INST is inserted into the hsub subregister of an undefined f64, widened
	// once by SSHLLv4i16_shift with immediate 0, reduced to ssub and converted
	// with SCVTFv1i32. Only enabled under NotForCodeSize.
	class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
	: Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
	(SCVTFv1i32 (f32 (EXTRACT_SUBREG
	(SSHLLv4i16_shift
	(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	INST,
	hsub),
	0),
	ssub)))>, Requires<[NotForCodeSize]>;

	// One instantiation per address form (ro16 W index, ro16 X index,
	// am_indexed16, am_unscaled16), each paired with the LDRH*/LDURH load that
	// takes the same operands.
	def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
	(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
	def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
	(LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
	def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
	def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
	(LDURHi GPR64sp:$Rn, simm9:$offset)>;

	// 32-bits to 32-bits are handled in target specific dag combine:
	// performIntToFpCombine.
	// 64-bits integer to 32-bits floating point, not possible with
	// SCVTF on floating point registers (both source and destination
	// must have the same size).

	// Here are the patterns for 8, 16, 32, and 64-bits to double.
	// 8-bits -> double. 3 size step-up: give up.
	// 16-bits -> double. 2 size step.
	// Selects f64 (sint_to_fp (i32 (sextloadi16 ...))).
	// Template arguments:
	//   addrmode - address operand dag matched under sextloadi16.
	//   INST     - load instruction dag whose result is inserted into hsub.
	// INST is inserted into the hsub subregister of an undefined f64, widened by
	// SSHLLv4i16_shift with immediate 0, reduced to dsub, widened again by
	// SSHLLv2i32_shift with immediate 0, reduced to dsub and converted with
	// SCVTFv1i64.
	// NOTE(review): this f64 pattern is gated on the predicate named
	// UseAlternateSExtLoadCVTF32; presumably that predicate is meant to cover
	// every two-step extension regardless of result type - confirm.
	class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
	: Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
	(SCVTFv1i64 (f64 (EXTRACT_SUBREG
	(SSHLLv2i32_shift
	(f64
	(EXTRACT_SUBREG
	(SSHLLv4i16_shift
	(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	INST,
	hsub),
	0),
	dsub)),
	0),
	dsub)))>,
	Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;

	// One instantiation per address form (ro16 W index, ro16 X index,
	// am_indexed16, am_unscaled16), each paired with the LDRH*/LDURH load that
	// takes the same operands.
	def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
	(LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
	def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
	(LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
	def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
	(LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
	def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
	(LDURHi GPR64sp:$Rn, simm9:$offset)>;
	// 32-bits -> double. 1 size step-up.
	// Selects f64 (sint_to_fp (i32 (load ...))). Despite the class name, the
	// matched node is a plain i32 load; the sign extension is performed by
	// SSHLLv2i32_shift in the output.
	// Template arguments:
	//   addrmode - address operand dag matched under load.
	//   INST     - load instruction dag whose result is inserted into ssub.
	// INST is inserted into the ssub subregister of an undefined f64, widened by
	// SSHLLv2i32_shift with immediate 0, reduced to dsub and converted with
	// SCVTFv1i64. Only enabled under NotForCodeSize.
	class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
	: Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
	(SCVTFv1i64 (f64 (EXTRACT_SUBREG
	(SSHLLv2i32_shift
	(INSERT_SUBREG (f64 (IMPLICIT_DEF)),
	INST,
	ssub),
	0),
	dsub)))>, Requires<[NotForCodeSize]>;

	// One instantiation per address form (ro32 W index, ro32 X index,
	// am_indexed32, am_unscaled32), each paired with the LDRS*/LDURS load that
	// takes the same operands.
	def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
	(LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
	def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
	(LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
	def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
	(LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
	def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
	(LDURSi GPR64sp:$Rn, simm9:$offset)>;

	// 64-bits -> double are handled in target specific dag combine:
	// performIntToFpCombine.


	//----------------------------------------------------------------------------
	// AdvSIMD Load-Store Structure
	//----------------------------------------------------------------------------
	// Multiple-structure loads: one multiclass instantiation per register count
	// (1-4), each given only its assembly mnemonic.
	defm LD1 : SIMDLd1Multiple<"ld1">;
	defm LD2 : SIMDLd2Multiple<"ld2">;
	defm LD3 : SIMDLd3Multiple<"ld3">;
	defm LD4 : SIMDLd4Multiple<"ld4">;

	// Multiple-structure stores, mirroring the loads above.
	defm ST1 : SIMDSt1Multiple<"st1">;
	defm ST2 : SIMDSt2Multiple<"st2">;
	defm ST3 : SIMDSt3Multiple<"st3">;
	defm ST4 : SIMDSt4Multiple<"st4">;

	// Selects a plain vector load of type `ty` from a GPR64sp base address to the
	// instruction INST, which takes only that base register.
	class Ld1Pat<ValueType ty, Instruction INST>
	: Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>;

	// One entry per integer vector type, 128-bit types first, then 64-bit types.
	def : Ld1Pat<v16i8, LD1Onev16b>;
	def : Ld1Pat<v8i16, LD1Onev8h>;
	def : Ld1Pat<v4i32, LD1Onev4s>;
	def : Ld1Pat<v2i64, LD1Onev2d>;
	def : Ld1Pat<v8i8, LD1Onev8b>;
	def : Ld1Pat<v4i16, LD1Onev4h>;
	def : Ld1Pat<v2i32, LD1Onev2s>;
	def : Ld1Pat<v1i64, LD1Onev1d>;

	// Selects a plain vector store of a `ty` value to a GPR64sp base address to
	// the instruction INST, passing the value and the base register through.
	class St1Pat<ValueType ty, Instruction INST>
	: Pat<(store ty:$Vt, GPR64sp:$Rn),
	(INST ty:$Vt, GPR64sp:$Rn)>;

	// One entry per integer vector type, 128-bit types first, then 64-bit types.
	def : St1Pat<v16i8, ST1Onev16b>;
	def : St1Pat<v8i16, ST1Onev8h>;
	def : St1Pat<v4i32, ST1Onev4s>;
	def : St1Pat<v2i64, ST1Onev2d>;
	def : St1Pat<v8i8, ST1Onev8b>;
	def : St1Pat<v4i16, ST1Onev4h>;
	def : St1Pat<v2i32, ST1Onev2s>;
	def : St1Pat<v1i64, ST1Onev1d>;

	//---
	// Single-element
	//---

	// LDnR definitions. The four trailing integers scale as n*1, n*2, n*4, n*8
	// for an n-register list; presumably they are the post-increment byte counts
	// for B/H/S/D elements - confirm against the SIMDLdR multiclass.
	defm LD1R : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
	defm LD2R : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
	defm LD3R : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
	defm LD4R : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
	// Single-lane loads for B/H/S/D elements and 1-4 registers. Every record in
	// this scope is marked mayLoad = 1 and hasSideEffects = 0. The last argument
	// names a GPR64piN operand class whose N equals (registers * element bytes).
	let mayLoad = 1, hasSideEffects = 0 in {
	defm LD1 : SIMDLdSingleBTied<0, 0b000, "ld1", VecListOneb, GPR64pi1>;
	defm LD1 : SIMDLdSingleHTied<0, 0b010, 0, "ld1", VecListOneh, GPR64pi2>;
	defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes, GPR64pi4>;
	defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned, GPR64pi8>;
	defm LD2 : SIMDLdSingleBTied<1, 0b000, "ld2", VecListTwob, GPR64pi2>;
	defm LD2 : SIMDLdSingleHTied<1, 0b010, 0, "ld2", VecListTwoh, GPR64pi4>;
	defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos, GPR64pi8>;
	defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod, GPR64pi16>;
	defm LD3 : SIMDLdSingleBTied<0, 0b001, "ld3", VecListThreeb, GPR64pi3>;
	defm LD3 : SIMDLdSingleHTied<0, 0b011, 0, "ld3", VecListThreeh, GPR64pi6>;
	defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
	defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
	defm LD4 : SIMDLdSingleBTied<1, 0b001, "ld4", VecListFourb, GPR64pi4>;
	defm LD4 : SIMDLdSingleHTied<1, 0b011, 0, "ld4", VecListFourh, GPR64pi8>;
	defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours, GPR64pi16>;
	defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd, GPR64pi32>;
	}

	// An AArch64dup whose scalar operand comes straight from a load at a GPR64sp
	// address is selected to the LD1R instruction for the result arrangement.
	// i8 and i16 elements are matched as extloadi8/extloadi16 producing an i32.
	def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
	(LD1Rv8b GPR64sp:$Rn)>;
	def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
	(LD1Rv16b GPR64sp:$Rn)>;
	def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
	(LD1Rv4h GPR64sp:$Rn)>;
	def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
	(LD1Rv8h GPR64sp:$Rn)>;
	def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
	(LD1Rv2s GPR64sp:$Rn)>;
	def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
	(LD1Rv4s GPR64sp:$Rn)>;
	def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
	(LD1Rv2d GPR64sp:$Rn)>;
	def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
	(LD1Rv1d GPR64sp:$Rn)>;
	// Grab the floating point version too
	// (these reuse the LD1R variant of the same element width as the integer
	// patterns above).
	def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
	(LD1Rv2s GPR64sp:$Rn)>;
	def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
	(LD1Rv4s GPR64sp:$Rn)>;
	def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
	(LD1Rv2d GPR64sp:$Rn)>;
	def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
	(LD1Rv1d GPR64sp:$Rn)>;
	def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
	(LD1Rv4h GPR64sp:$Rn)>;
	def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
	(LD1Rv8h GPR64sp:$Rn)>;

	// Selects "insert a freshly loaded scalar into lane $idx of a 128-bit vector"
	// to a single-lane LD1.
	// Template arguments:
	//   scalar_load - load operator to match (load, extloadi8, extloadi16).
	//   VecIndex    - operand class of the lane index.
	//   VTy         - vector type of the value being inserted into.
	//   STy         - scalar type produced by scalar_load.
	//   LD1         - instruction to select; receives the vector, the lane
	//                 index and the base address.
	class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction LD1>
	: Pat<(vector_insert (VTy VecListOne128:$Rd),
	(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
	(LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;

	def : Ld1Lane128Pat<extloadi8, VectorIndexB, v16i8, i32, LD1i8>;
	def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
	def : Ld1Lane128Pat<load, VectorIndexS, v4i32, i32, LD1i32>;
	def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
	def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
	def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
	def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;

	// 64-bit counterpart of the lane-insert load pattern. The LD1 instruction is
	// fed the 64-bit vector wrapped with SUBREG_TO_REG on dsub, and the result is
	// narrowed back with EXTRACT_SUBREG on dsub.
	// Template arguments:
	//   scalar_load - load operator to match (load, extloadi8, extloadi16).
	//   VecIndex    - operand class of the lane index.
	//   VTy         - 64-bit vector type of the value being inserted into.
	//   STy         - scalar type produced by scalar_load.
	//   LD1         - instruction to select.
	class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction LD1>
	: Pat<(vector_insert (VTy VecListOne64:$Rd),
	(STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
	(EXTRACT_SUBREG
	(LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
	VecIndex:$idx, GPR64sp:$Rn),
	dsub)>;

	def : Ld1Lane64Pat<extloadi8, VectorIndexB, v8i8, i32, LD1i8>;
	def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
	def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
	def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
	def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;


	// Assembly aliases for the single-lane loads, one multiclass per register
	// count.
	defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
	defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
	defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
	defm LD4 : SIMDLdSt4SingleAliases<"ld4">;

	// Stores
	// NOTE(review): the ST1 single-lane stores are defined without an enclosing
	// `let mayStore = 1, hasSideEffects = 0` scope, unlike ST2/ST3/ST4 further
	// down; presumably the flags are inferred from the ST1 lane patterns that
	// follow - confirm.
	defm ST1 : SIMDStSingleB<0, 0b000, "st1", VecListOneb, GPR64pi1>;
	defm ST1 : SIMDStSingleH<0, 0b010, 0, "st1", VecListOneh, GPR64pi2>;
	defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
	defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;

	// Selects "store lane $idx extracted from a 128-bit vector" to a single-lane
	// ST1. The pattern carries AddedComplexity = 19.
	// Template arguments:
	//   scalar_store - store operator to match (store, truncstorei8/16).
	//   VecIndex     - operand class of the lane index.
	//   VTy          - vector type the lane is extracted from.
	//   STy          - scalar type of the extracted lane.
	//   ST1          - instruction to select; receives the vector, the lane
	//                  index and the base address.
	let AddedComplexity = 19 in
	class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1>
	: Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn),
	(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;

	def : St1Lane128Pat<truncstorei8, VectorIndexB, v16i8, i32, ST1i8>;
	def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
	def : St1Lane128Pat<store, VectorIndexS, v4i32, i32, ST1i32>;
	def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
	def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
	def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
	def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;

	// 64-bit counterpart of the lane-extract store pattern: the 64-bit vector is
	// wrapped with SUBREG_TO_REG on dsub before being handed to the ST1
	// instruction. The pattern carries AddedComplexity = 19.
	// Template arguments:
	//   scalar_store - store operator to match (store, truncstorei8/16).
	//   VecIndex     - operand class of the lane index.
	//   VTy          - 64-bit vector type the lane is extracted from.
	//   STy          - scalar type of the extracted lane.
	//   ST1          - instruction to select.
	let AddedComplexity = 19 in
	class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1>
	: Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn),
	(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
	VecIndex:$idx, GPR64sp:$Rn)>;

	def : St1Lane64Pat<truncstorei8, VectorIndexB, v8i8, i32, ST1i8>;
	def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
	def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
	def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
	def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;

	// Post-indexed lane stores from a 64-bit vector. Emits two patterns:
	//   1. the increment equals the constant `offset`: XZR is passed as the last
	//      operand of ST1 (presumably selecting the immediate post-index form -
	//      confirm against the *_POST instruction definitions);
	//   2. the increment is a GPR64 register $Rm, passed through unchanged.
	// In both, the 64-bit vector is wrapped with SUBREG_TO_REG on dsub.
	// Template arguments:
	//   scalar_store - post-indexed store operator to match.
	//   VecIndex     - operand class of the lane index.
	//   VTy / STy    - vector type and extracted scalar type.
	//   ST1          - post-indexed instruction to select.
	//   offset       - constant increment matched by the first pattern.
	multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1,
	int offset> {
	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, offset),
	(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
	VecIndex:$idx, GPR64sp:$Rn, XZR)>;

	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, GPR64:$Rm),
	(ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
	VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
	}

	// The trailing integer is the element size in bytes for each entry (1, 2, 4
	// or 8).
	defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
	defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
	2>;
	defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
	defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
	defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
	defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
	defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;

	// Post-indexed lane stores from a 128-bit vector. Emits two patterns:
	//   1. the increment equals the constant `offset`: XZR is passed as the last
	//      operand of ST1 (presumably selecting the immediate post-index form -
	//      confirm against the *_POST instruction definitions);
	//   2. the increment is a GPR64 register $Rm, passed through unchanged.
	// Template arguments:
	//   scalar_store - post-indexed store operator to match.
	//   VecIndex     - operand class of the lane index.
	//   VTy / STy    - vector type and extracted scalar type.
	//   ST1          - post-indexed instruction to select.
	//   offset       - constant increment matched by the first pattern.
	multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
	ValueType VTy, ValueType STy, Instruction ST1,
	int offset> {
	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, offset),
	(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;

	def : Pat<(scalar_store
	(STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
	GPR64sp:$Rn, GPR64:$Rm),
	(ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
	}

	// The trailing integer is the element size in bytes for each entry (1, 2, 4
	// or 8).
	defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
	1>;
	defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
	2>;
	defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
	defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
	defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
	defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
	defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;

	// Single-lane stores for 2-4 registers and B/H/S/D elements. Every record in
	// this scope is marked mayStore = 1 and hasSideEffects = 0. The last argument
	// names a GPR64piN operand class whose N equals (registers * element bytes).
	let mayStore = 1, hasSideEffects = 0 in {
	defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
	defm ST2 : SIMDStSingleH<1, 0b010, 0, "st2", VecListTwoh, GPR64pi4>;
	defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos, GPR64pi8>;
	defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod, GPR64pi16>;
	defm ST3 : SIMDStSingleB<0, 0b001, "st3", VecListThreeb, GPR64pi3>;
	defm ST3 : SIMDStSingleH<0, 0b011, 0, "st3", VecListThreeh, GPR64pi6>;
	defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
	defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
	defm ST4 : SIMDStSingleB<1, 0b001, "st4", VecListFourb, GPR64pi4>;
	defm ST4 : SIMDStSingleH<1, 0b011, 0, "st4", VecListFourh, GPR64pi8>;
	defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours, GPR64pi16>;
	defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd, GPR64pi32>;
	}

	// Assembly aliases for the single-lane stores; the same alias multiclasses
	// are instantiated for the loads with the "ldN" mnemonics.
	defm ST1 : SIMDLdSt1SingleAliases<"st1">;
	defm ST2 : SIMDLdSt2SingleAliases<"st2">;
	defm ST3 : SIMDLdSt3SingleAliases<"st3">;
	defm ST4 : SIMDLdSt4SingleAliases<"st4">;

	//----------------------------------------------------------------------------
	// Crypto extensions
	//----------------------------------------------------------------------------

	// AES instructions: each def supplies a 4-bit opcode value, the mnemonic and
	// the crypto intrinsic it is selected from. AESE/AESD use the AESTiedInst
	// class while AESMC/AESIMC use AESInst.
	def AESErr : AESTiedInst<0b0100, "aese", int_aarch64_crypto_aese>;
	def AESDrr : AESTiedInst<0b0101, "aesd", int_aarch64_crypto_aesd>;
	def AESMCrr : AESInst< 0b0110, "aesmc", int_aarch64_crypto_aesmc>;
	def AESIMCrr : AESInst< 0b0111, "aesimc", int_aarch64_crypto_aesimc>;
	+
	+// Pseudo instructions for AESMCrr/AESIMCrr with a register constraint required
	+// for AES fusion on some CPUs.
	+let hasSideEffects = 0, mayStore = 0, mayLoad = 0 in {
	+def AESMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
	+ Sched<[WriteV]>;
	+def AESIMCrrTied: Pseudo<(outs V128:$Rd), (ins V128:$Rn), [], "$Rn = $Rd">,
	+ Sched<[WriteV]>;
	+}
	+
	+// Only use constrained versions of AES(I)MC instructions if they are paired with
	+// AESE/AESD.
	+def : Pat<(v16i8 (int_aarch64_crypto_aesmc
	+ (v16i8 (int_aarch64_crypto_aese (v16i8 V128:$src1),
	+ (v16i8 V128:$src2))))),
	+ (v16i8 (AESMCrrTied (v16i8 (AESErr (v16i8 V128:$src1),
	+ (v16i8 V128:$src2)))))>,
	+ Requires<[HasFuseAES]>;
	+
	+def : Pat<(v16i8 (int_aarch64_crypto_aesimc
	+ (v16i8 (int_aarch64_crypto_aesd (v16i8 V128:$src1),
	+ (v16i8 V128:$src2))))),
	+ (v16i8 (AESIMCrrTied (v16i8 (AESDrr (v16i8 V128:$src1),
	+ (v16i8 V128:$src2)))))>,
	+ Requires<[HasFuseAES]>;

	// Three-operand SHA instructions (3-bit opcode value, mnemonic, intrinsic).
	// The class suffix (QSV, VVV, QQV) differs per instruction; presumably it
	// encodes the register classes of the operands - confirm in the format file.
	def SHA1Crrr : SHATiedInstQSV<0b000, "sha1c", int_aarch64_crypto_sha1c>;
	def SHA1Prrr : SHATiedInstQSV<0b001, "sha1p", int_aarch64_crypto_sha1p>;
	def SHA1Mrrr : SHATiedInstQSV<0b010, "sha1m", int_aarch64_crypto_sha1m>;
	def SHA1SU0rrr : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
	def SHA256Hrrr : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
	def SHA256H2rrr : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
	def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;

	// Two-operand SHA instructions (4-bit opcode value, mnemonic, intrinsic).
	def SHA1Hrr : SHAInstSS< 0b0000, "sha1h", int_aarch64_crypto_sha1h>;
	def SHA1SU1rr : SHATiedInstVV<0b0001, "sha1su1", int_aarch64_crypto_sha1su1>;
	def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;

	//----------------------------------------------------------------------------
	// Compiler-pseudos
	//----------------------------------------------------------------------------
	// FIXME: Like for X86, these should go in their own separate .td file.

	// Pattern leaf over an i32 GPR32 value that matches only when isDef32(*N)
	// returns true for the node. isDef32 is defined outside this file section;
	// presumably it recognises 32-bit defs that implicitly zero the upper half,
	// as the zext pattern using this leaf relies on - confirm.
	def def32 : PatLeaf<(i32 GPR32:$src), [{
	return isDef32(*N);
	}]>;

	// In the case of a 32-bit def that is known to implicitly zero-extend,
	// we can use a SUBREG_TO_REG.
	def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;

	// For an anyext, we don't care what the high bits are, so we can perform an
	// INSERT_SUBREG into an IMPLICIT_DEF.
	// anyext: place the 32-bit value in sub_32 of an undefined 64-bit register.
	def : Pat<(i64 (anyext GPR32:$src)),
	(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;

	// When we need to explicitly zero-extend, we use a 32-bit MOV instruction and
	// then assert the extension has happened.
	// The MOV is spelled as ORRWrs with WZR as the first source and a shift
	// operand of 0.
	// NOTE(review): SUBREG_TO_REG takes (i32 0) here but (i64 0) in the
	// def32-based zext pattern earlier in this section - confirm the difference
	// is harmless.
	def : Pat<(i64 (zext GPR32:$src)),
	(SUBREG_TO_REG (i32 0), (ORRWrs WZR, GPR32:$src, 0), sub_32)>;

	// To sign extend, we use a signed bitfield move instruction (SBFM) on the
	// containing super-reg.
	// In every SBFM below the two immediates are 0 and (width - 1) of the value
	// being sign-extended: 31 for i32, 15 for i16, 7 for i8 and 0 for i1.
	// The plain sext first places the 32-bit source in sub_32 of an undefined
	// 64-bit register; the sext_inreg forms operate on the register directly.
	def : Pat<(i64 (sext GPR32:$src)),
	(SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i8)), (SBFMXri GPR64:$src, 0, 7)>;
	def : Pat<(i64 (sext_inreg GPR64:$src, i1)), (SBFMXri GPR64:$src, 0, 0)>;
	def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
	def : Pat<(i32 (sext_inreg GPR32:$src, i8)), (SBFMWri GPR32:$src, 0, 7)>;
	def : Pat<(i32 (sext_inreg GPR32:$src, i1)), (SBFMWri GPR32:$src, 0, 0)>;

	// A left shift of a sign-extended value is folded into a single SBFM. Both
	// SBFM immediates are computed from the matched shift amount by the
	// i32shift_* / i64shift_* operators (defined outside this section); the
	// second operator is specific to the width being extended (i8, i16 or i32).
	def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
	(SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
	(i64 (i32shift_sext_i8 imm0_31:$imm)))>;
	def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
	(SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_sext_i8 imm0_63:$imm)))>;

	def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
	(SBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
	(i64 (i32shift_sext_i16 imm0_31:$imm)))>;
	def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
	(SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_sext_i16 imm0_63:$imm)))>;

	// Full sext from i32: the source is first placed in sub_32 of an undefined
	// 64-bit register.
	def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
	(SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
	(i64 (i64shift_a imm0_63:$imm)),
	(i64 (i64shift_sext_i32 imm0_63:$imm)))>;

	// sra patterns have an AddedComplexity of 10, so make sure we have a higher
	// AddedComplexity for the following patterns since we want to match sext + sra
	// patterns before we attempt to match a single sra node.
	// Arithmetic right shift of a sign-extended value, folded into one SBFM:
	// the shift amount becomes the first immediate and (width - 1) of the
	// extended value (7, 15 or 31) the second. The shift operand class limits
	// the amount to 0..width-1.
	let AddedComplexity = 20 in {
	// We support all sext + sra combinations which preserve at least one bit of the
	// original value which is to be sign extended. E.g. we support shifts up to
	// bitwidth-1 bits.
	def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
	(SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
	def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
	(SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;

	def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
	(SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
	def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
	(SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;

	def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
	(SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
	(i64 imm0_31:$imm), 31)>;
	} // AddedComplexity = 20

	// To truncate, we can simply extract from a subregister.
	def : Pat<(i32 (trunc GPR64sp:$src)),
	(i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;

	// __builtin_trap() uses the BRK instruction on AArch64.
	def : Pat<(trap), (BRK 1)>;

	// Conversions within AdvSIMD types in the same register size are free.
	// But because we need a consistent lane ordering, in big endian many
	// conversions require one or more REV instructions.
	//
	// Consider a simple memory load followed by a bitconvert then a store.
	// v0 = load v2i32
	// v1 = BITCAST v2i32 v0 to v4i16
	// store v4i16 v1
	//
	// In big endian mode every memory access has an implicit byte swap. LDR and
	// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
	// is, they treat the vector as a sequence of elements to be byte-swapped.
	// The two pairs of instructions are fundamentally incompatible. We've decided
	// to use LD1/ST1 only to simplify compiler implementation.
	//
	// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
	// the original code sequence:
	// v0 = load v2i32
	// v1 = REV v2i32 (implicit)
	// v2 = BITCAST v2i32 v1 to v4i16
	// v3 = REV v4i16 v2 (implicit)
	// store v4i16 v3
	//
	// But this is now broken - the value stored is different to the value loaded
	// due to lane reordering. To fix this, on every BITCAST we must perform two
	// other REVs:
	// v0 = load v2i32
	// v1 = REV v2i32 (implicit)
	// v2 = REV v2i32
	// v3 = BITCAST v2i32 v2 to v4i16
	// v4 = REV v4i16
	// v5 = REV v4i16 v4 (implicit)
	// store v4i16 v5
	//
	// This means an extra two instructions, but actually in most cases the two REV
	// instructions can be combined into one. For example:
	// (REV64_2s (REV64_4h X)) === (REV32_4h X)
	//
	// There is also no 128-bit REV instruction. This must be synthesized with an
	// EXT instruction.
	//
	// Most bitconverts require some sort of conversion. The only exceptions are:
	// a) Identity conversions - vNfX <-> vNiX
	// b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
	//

	// Natural vector casts (64 bit)
	// Every pattern selects an AArch64NvCast between two 64-bit types to the
	// source FPR64 register itself, retyped - no instruction is emitted.
	// Patterns are grouped by source type (v2i32, v4i16, v8i8, f64, v2f32).
	// NOTE(review): the groups do not list the same destination types - e.g. the
	// v4i16 and v8i8 groups have no v2f32 destination and the v2f32 group has no
	// v4f16 destination, and only the f64 group has v1f64 - confirm the omissions
	// are intentional.
	def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;

	def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;

	def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;

	def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1f64 (AArch64NvCast (f64 FPR64:$src))), (v1f64 FPR64:$src)>;

	def : Pat<(v8i8 (AArch64NvCast (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v4i16 (AArch64NvCast (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v2i32 (AArch64NvCast (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2f32 (AArch64NvCast (v2f32 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v1i64 (AArch64NvCast (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;

	// Natural vector casts (128 bit)
	// Each pattern below selects an AArch64NvCast between two 128-bit vector
	// types to the source FPR128 register itself, retyped to the result type.
	// The patterns are grouped by source type.
	// Source type: v4i32.
	def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type: v8i16.
	def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type: v16i8.
	def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type: v2i64.
	def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type: v4f32.
	def : Pat<(v16i8 (AArch64NvCast (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Source type: v2f64.
	def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v8i16 (AArch64NvCast (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;

	// Bitcasts between a 64-bit general-purpose register and 64-bit vector types.
	// Under IsLE every pattern is a plain COPY_TO_REGCLASS between GPR64 and
	// FPR64/V64.
	let Predicates = [IsLE] in {
	def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;

	def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	}
	// Under IsBE the same casts additionally go through a REV64 instruction whose
	// element size matches the vector type (v4f16 uses the v4i16 form and v2f32
	// the v2i32 form).
	let Predicates = [IsBE] in {
	def : Pat<(v8i8 (bitconvert GPR64:$Xn)),
	(REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
	(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
	(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
	(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
	def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
	(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;

	// NOTE(review): in the vector-to-i64 direction the REV64 wraps the value that
	// has already been copied to GPR64, rather than the copy wrapping the REV64
	// of the V64 source -- confirm this operand nesting is intended.
	def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
	(REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
	(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
	(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
	(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
	(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
	}
	// Single-element 64-bit vectors carry no endianness predicate: bitcasts and
	// scalar_to_vector from GPR64 are plain register-class copies, and
	// scalar_to_vector from an f64 in FPR64 reuses the register as v1f64.
	def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
	def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
	(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
	(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;

	// Scalar bitcasts between integer and floating-point registers of the same
	// width are selected as register-class copies (GPR32 <-> FPR32 and
	// GPR64 <-> FPR64).
	def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
	(COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
	def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
	(COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
	def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
	(COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
	def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
	(COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
	// v1f64 -> i64 without an endianness predicate.
	// NOTE(review): the identical pattern also appears inside the IsLE-only group
	// of GPR64/vector bitcasts earlier in this file -- confirm whether both are
	// needed.
	def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
	(COPY_TO_REGCLASS V64:$Vn, GPR64)>;

	// Bitcasts to v1i64 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v1i64.
	let Predicates = [IsLE] in {
	def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
	}
	// IsBE: a REV64 sized to the source element type is emitted.
	let Predicates = [IsBE] in {
	def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
	(v1i64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
	(v1i64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
	(v1i64 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
	(v1i64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
	(v1i64 (REV64v2i32 FPR64:$src))>;
	}
	// Sources with a single 64-bit element need no predicate and no instruction.
	def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
	def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>;

	// Bitcasts to v2i32 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v2i32.
	let Predicates = [IsLE] in {
	def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
	def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
	}
	// IsBE: 64-bit-element sources use REV64v2i32; v4i16 and v8i8 sources use
	// REV32 of the source element size; the v4f16 source uses REV64v4i16.
	let Predicates = [IsBE] in {
	def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
	(v2i32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
	(v2i32 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))),
	(v2i32 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))),
	(v2i32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
	(v2i32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
	(v2i32 (REV64v4i16 FPR64:$src))>;
	}
	// v2f32 has the same element layout, so no predicate or instruction is used.
	def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;

	// Bitcasts to v4i16 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v4i16.
	let Predicates = [IsLE] in {
	def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
	def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
	}
	// IsBE: each cast emits a REV16/REV32/REV64 instruction chosen per source
	// type.
	// NOTE(review): the v4f16 source uses REV32v4i16 here even though source and
	// result both have four 16-bit elements, while the v4i16 -> v4f16 pattern
	// elsewhere in this file uses REV64v4i16 -- confirm which is intended.
	let Predicates = [IsBE] in {
	def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
	(v4i16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
	(v4i16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v8i8 FPR64:$src))),
	(v4i16 (REV16v8i8 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))),
	(v4i16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))),
	(v4i16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
	(v4i16 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
	(v4i16 (REV64v4i16 FPR64:$src))>;
	}

	// Bitcasts to v4f16 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v4f16.
	let Predicates = [IsLE] in {
	def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
	def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
	}
	// IsBE: every source except v8i8 uses REV64v4i16; v8i8 uses REV16v8i8.
	// NOTE(review): the v2i32, v2f32 and v4i16 sources all use REV64v4i16 here,
	// unlike the corresponding casts to v4i16 (REV32v4i16) -- confirm these are
	// the intended reversals.
	let Predicates = [IsBE] in {
	def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v2i32 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))),
	(v4f16 (REV16v8i8 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
	(v4f16 (REV64v4i16 FPR64:$src))>;
	}



	// Bitcasts to v8i8 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v8i8.
	let Predicates = [IsLE] in {
	def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
	def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
	}
	// IsBE: a byte-granular REV is emitted whose width matches the source element
	// size: REV64v8i8 for 64-bit elements, REV32v8i8 for 32-bit elements and
	// REV16v8i8 for 16-bit elements.
	let Predicates = [IsBE] in {
	def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
	(v8i8 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v2i32 FPR64:$src))),
	(v8i8 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v4i16 FPR64:$src))),
	(v8i8 (REV16v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))),
	(v8i8 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))),
	(v8i8 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
	(v8i8 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
	(v8i8 (REV16v8i8 FPR64:$src))>;
	}

	// Bitcasts to scalar f64 from 64-bit vector types in FPR64.
	// IsLE: the source register is reused, retyped as f64.
	let Predicates = [IsLE] in {
	def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
	}
	// IsBE: a REV64 sized to the source element type is emitted.
	let Predicates = [IsBE] in {
	def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
	(f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))),
	(f64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))),
	(f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
	(f64 (REV64v8i8 FPR64:$src))>;
	def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
	(f64 (REV64v4i16 FPR64:$src))>;
	}
	// Single-element 64-bit vector sources need no predicate and no instruction.
	def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
	def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;

	// Bitcasts to v1f64 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v1f64.
	let Predicates = [IsLE] in {
	def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
	}
	// IsBE: a REV64 sized to the source element type is emitted.
	let Predicates = [IsBE] in {
	def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
	(v1f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
	(v1f64 (REV64v4i16 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))),
	(v1f64 (REV64v8i8 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
	(v1f64 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
	(v1f64 (REV64v4i16 FPR64:$src))>;
	}
	// Sources with a single 64-bit element need no predicate and no instruction.
	def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
	def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;

	// Bitcasts to v2f32 from other 64-bit FPR64 types.
	// IsLE: the source register is reused, retyped as v2f32.
	let Predicates = [IsLE] in {
	def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
	def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
	}
	// IsBE: 64-bit-element sources use REV64v2i32; v4i16 and v8i8 sources use
	// REV32 of the source element size; the v4f16 source uses REV64v4i16.
	let Predicates = [IsBE] in {
	def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
	(v2f32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
	(v2f32 (REV32v4i16 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))),
	(v2f32 (REV32v8i8 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
	(v2f32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
	(v2f32 (REV64v2i32 FPR64:$src))>;
	def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
	(v2f32 (REV64v4i16 FPR64:$src))>;
	}
	// v2i32 has the same element layout, so no predicate or instruction is used.
	def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;

	// Bitcasts to f128 from 128-bit vector types in FPR128.
	// IsLE: the source register is reused, retyped as f128.
	let Predicates = [IsLE] in {
	def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
	def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
	}
	// IsBE: every cast ends in EXTv16i8 of a value with itself at immediate 8.
	// For 64-bit-element sources that value is the source register; for narrower
	// element types it is first passed through the REV64 of that element size.
	let Predicates = [IsBE] in {
	def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
	(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
	def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
	(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
	def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src), (i32 8)))>;
	def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
	(f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
	(REV64v16i8 FPR128:$src), (i32 8)))>;
	}

	// Bitcasts to v2f64 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v2f64.
	let Predicates = [IsLE] in {
	def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
	}
	// IsBE: the f128 source uses EXTv16i8 of the register with itself at
	// immediate 8; vector sources use the REV64 of the source element size.
	let Predicates = [IsBE] in {
	def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))),
	(v2f64 (EXTv16i8 FPR128:$src,
	FPR128:$src, (i32 8)))>;
	def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
	(v2f64 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
	(v2f64 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
	(v2f64 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
	(v2f64 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
	(v2f64 (REV64v4i32 FPR128:$src))>;
	}
	// v2i64 has the same element layout, so no predicate or instruction is used.
	def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;

	// Bitcasts to v4f32 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v4f32.
	let Predicates = [IsLE] in {
	def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
	}
	// IsBE: the f128 source uses REV64v4i32 followed by EXTv16i8 at immediate 8;
	// 16-bit and 8-bit element sources use REV32 of the source element size;
	// 64-bit-element sources use REV64v4i32.
	let Predicates = [IsBE] in {
	def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))),
	(v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src), (i32 8)))>;
	def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
	(v4f32 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
	(v4f32 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
	(v4f32 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
	(v4f32 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
	(v4f32 (REV64v4i32 FPR128:$src))>;
	}
	// v4i32 has the same element layout, so no predicate or instruction is used.
	def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;

	// Bitcasts to v2i64 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v2i64.
	let Predicates = [IsLE] in {
	def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
	}
	// IsBE: the f128 source uses EXTv16i8 of the register with itself at
	// immediate 8; vector sources use the REV64 of the source element size.
	let Predicates = [IsBE] in {
	def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
	(v2i64 (EXTv16i8 FPR128:$src,
	FPR128:$src, (i32 8)))>;
	def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
	(v2i64 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
	(v2i64 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
	(v2i64 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
	(v2i64 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
	(v2i64 (REV64v8i16 FPR128:$src))>;
	}
	// v2f64 has the same element layout, so no predicate or instruction is used.
	def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;

	// Bitcasts to v4i32 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v4i32.
	let Predicates = [IsLE] in {
	def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
	}
	// IsBE: the f128 source uses REV64v4i32 followed by EXTv16i8 at immediate 8;
	// 16-bit and 8-bit element sources use REV32 of the source element size;
	// 64-bit-element sources use REV64v4i32.
	let Predicates = [IsBE] in {
	def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
	(v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
	(REV64v4i32 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
	(v4i32 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
	(v4i32 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
	(v4i32 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
	(v4i32 (REV64v4i32 FPR128:$src))>;
	def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
	(v4i32 (REV32v8i16 FPR128:$src))>;
	}
	// v4f32 has the same element layout, so no predicate or instruction is used.
	def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;

	// Bitcasts to v8i16 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v8i16.
	let Predicates = [IsLE] in {
	def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
	}
	// IsBE: the f128 source uses REV64v8i16 followed by EXTv16i8 at immediate 8;
	// the other sources use a REV16/REV32/REV64 chosen per source type.
	// NOTE(review): the v8f16 source uses REV32v8i16 even though source and
	// result both have eight 16-bit elements, while the v8i16 -> v8f16 pattern
	// elsewhere in this file uses REV64v8i16 -- confirm which is intended.
	let Predicates = [IsBE] in {
	def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))),
	(v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
	(v8i16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
	(v8i16 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
	(v8i16 (REV16v16i8 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
	(v8i16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
	(v8i16 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))),
	(v8i16 (REV32v8i16 FPR128:$src))>;
	}

	// Bitcasts to v8f16 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v8f16.
	let Predicates = [IsLE] in {
	def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
	def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
	}
	// IsBE: the f128 source uses REV64v8i16 followed by EXTv16i8 at immediate 8;
	// the other sources use a REV16/REV32/REV64 chosen per source type.
	// NOTE(review): the v8i16 source uses REV64v8i16 even though source and
	// result both have eight 16-bit elements, while the v8f16 -> v8i16 pattern
	// elsewhere in this file uses REV32v8i16 -- confirm which is intended.
	let Predicates = [IsBE] in {
	def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
	(v8f16 (EXTv16i8 (REV64v8i16 FPR128:$src),
	(REV64v8i16 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v8f16 (bitconvert (v2i64 FPR128:$src))),
	(v8f16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))),
	(v8f16 (REV32v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))),
	(v8f16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))),
	(v8f16 (REV16v16i8 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
	(v8f16 (REV64v8i16 FPR128:$src))>;
	def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
	(v8f16 (REV32v8i16 FPR128:$src))>;
	}

	// Bitcasts to v16i8 from other 128-bit FPR128 types.
	// IsLE: the source register is reused, retyped as v16i8.
	let Predicates = [IsLE] in {
	def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
	}
	// IsBE: the f128 source uses REV64v16i8 followed by EXTv16i8 at immediate 8;
	// vector sources use a byte-granular REV whose width matches the source
	// element size (REV64 / REV32 / REV16 on v16i8).
	let Predicates = [IsBE] in {
	def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
	(v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
	(REV64v16i8 FPR128:$src),
	(i32 8)))>;
	def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
	(v16i8 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
	(v16i8 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
	(v16i8 (REV16v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
	(v16i8 (REV64v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
	(v16i8 (REV32v16i8 FPR128:$src))>;
	def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
	(v16i8 (REV16v16i8 FPR128:$src))>;
	}

	// Extracting the 64-bit subvector at index 0 of a 128-bit vector is selected
	// as an EXTRACT_SUBREG of the dsub subregister, for every 64-bit result type.
	def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v8i8 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;
	def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 0))),
	(EXTRACT_SUBREG V128:$Rn, dsub)>;

	// Extracting at index 1 first applies DUPv2i64lane with lane 1 and then takes
	// the dsub subregister of the result.
	// NOTE(review): all four patterns match index (i64 1) regardless of the
	// source element count (16, 8, 4 or 2) -- confirm the unit in which
	// extract_subvector's index is expressed.
	def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
	def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
	def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
	def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
	(EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;

	// A 64-bit subvector insert to the first 128-bit vector position
	// is a subregister copy that needs no instruction.
	// Each pattern matches an insert into an undef vector at index 0 and selects
	// INSERT_SUBREG of the FPR64 source into the dsub subregister of an
	// IMPLICIT_DEF of the corresponding 128-bit type.
	def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
	def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
	(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;

	// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
	// or v2f32.
	// Each pattern matches the sum of lane 0 and lane 1 of the same register.
	// v2i64: integer add of the two lanes -> ADDPv2i64p.
	def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
	(vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
	(i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
	// v2f64: floating-point add of the two lanes -> FADDPv2i64p.
	def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
	(vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
	(f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
	// vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
	// so we match on v4f32 here, not v2f32. This will also catch adding
	// the low two lanes of a true v4f32 vector.
	// FADDPv2i32p is applied to the dsub subregister of the 128-bit source.
	def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
	(vector_extract (v4f32 FPR128:$Rn), (i64 1))),
	(f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;

	// Scalar 64-bit shifts in FPR64 registers.
	// Each int_aarch64_neon_{sshl,ushl,srshl,urshl} intrinsic on i64 operands held
	// in FPR64 is selected to the v1i64 form of the like-named instruction.
	def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
	def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
	(URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;

	// Patterns for nontemporal/no-allocate stores.
	// We have to resort to tricks to turn a single-input store into a store pair,
	// because there is no single-input nontemporal store, only STNP.
	// All of these patterns are restricted to IsLE and carry AddedComplexity = 15.
	let Predicates = [IsLE] in {
	let AddedComplexity = 15 in {
	// 128-bit vector store: the dsub subregister and lane 1 (obtained with
	// CPYi64) of $Rt are stored as a pair with STNPDi.
	class NTStore128Pat<ValueType VT> :
	Pat<(nontemporalstore (VT FPR128:$Rt),
	(am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
	(STNPDi (EXTRACT_SUBREG FPR128:$Rt, dsub),
	(CPYi64 FPR128:$Rt, (i64 1)),
	GPR64sp:$Rn, simm7s8:$offset)>;

	def : NTStore128Pat<v2i64>;
	def : NTStore128Pat<v4i32>;
	def : NTStore128Pat<v8i16>;
	def : NTStore128Pat<v16i8>;

	// 64-bit vector store: the ssub subregister of $Rt and lane 1 (obtained with
	// CPYi32 from $Rt placed in a wider register via SUBREG_TO_REG) are stored as
	// a pair with STNPSi.
	class NTStore64Pat<ValueType VT> :
	Pat<(nontemporalstore (VT FPR64:$Rt),
	(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
	(STNPSi (EXTRACT_SUBREG FPR64:$Rt, ssub),
	(CPYi32 (SUBREG_TO_REG (i64 0), FPR64:$Rt, dsub), (i64 1)),
	GPR64sp:$Rn, simm7s4:$offset)>;

	// FIXME: Shouldn't v1f64 loads/stores be promoted to v1i64?
	def : NTStore64Pat<v1f64>;
	def : NTStore64Pat<v1i64>;
	def : NTStore64Pat<v2i32>;
	def : NTStore64Pat<v4i16>;
	def : NTStore64Pat<v8i8>;

	// 64-bit GPR store: the sub_32 subregister of $Rt and the sub_32 subregister
	// of (UBFMXri $Rt, 32, 63) -- presumably the upper 32 bits -- are stored as a
	// pair with STNPWi.
	def : Pat<(nontemporalstore GPR64:$Rt,
	(am_indexed7s32 GPR64sp:$Rn, simm7s4:$offset)),
	(STNPWi (EXTRACT_SUBREG GPR64:$Rt, sub_32),
	(EXTRACT_SUBREG (UBFMXri GPR64:$Rt, 32, 63), sub_32),
	GPR64sp:$Rn, simm7s4:$offset)>;
	} // AddedComplexity = 15
	} // Predicates = [IsLE]

	// Tail call return handling. These are all compiler pseudo-instructions,
	// so no encoding information or anything like that.
	// Both pseudos take the call destination plus an i32 immediate $FPDiff, have
	// no outputs and no selection pattern of their own, and are flagged as call,
	// terminator, return and barrier instructions that use SP.
	let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
	// Destination given as an i64 immediate operand.
	def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff), []>,
	Sched<[WriteBrReg]>;
	// Destination given in a tcGPR64 register.
	def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>,
	Sched<[WriteBrReg]>;
	}

	// Select AArch64tcret nodes: register destinations to TCRETURNri, global
	// address and external symbol destinations to TCRETURNdi.
	def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
	(TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
	// NOTE(review): the input matches tglobaladdr but the output dag spells the
	// operand as texternalsym -- confirm this is intentional.
	def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
	(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
	def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
	(TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;

	include "AArch64InstrAtomics.td"
	Index: head/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Target/AArch64/AArch64MacroFusion.cpp (revision 322320)
	@@ -1,164 +1,166 @@
	//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	/// \file This file contains the AArch64 implementation of the DAG scheduling
	/// mutation to pair instructions back to back.
	//
	//===----------------------------------------------------------------------===//

	#include "AArch64MacroFusion.h"
	#include "AArch64Subtarget.h"
	#include "llvm/CodeGen/MacroFusion.h"
	#include "llvm/Target/TargetInstrInfo.h"

	using namespace llvm;

	namespace {

	/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused
	/// together. Given SecondMI, when FirstMI is unspecified, then check if
	/// SecondMI may be part of a fused pair at all.
	///
	/// \param TII Target instruction info; downcast to AArch64InstrInfo below.
	/// \param TSI Subtarget info; downcast to AArch64Subtarget below and queried
	/// for which fusion kinds are enabled.
	/// \param FirstMI Candidate first instruction of the pair, or null to act as
	/// a wildcard.
	/// \param SecondMI Candidate second instruction of the pair.
	/// \returns true if the pair (or SecondMI alone, when FirstMI is null)
	/// matches one of the fusion patterns enabled on the subtarget, false
	/// otherwise.
	static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
	const TargetSubtargetInfo &TSI,
	const MachineInstr *FirstMI,
	const MachineInstr &SecondMI) {
	const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
	const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);

	// Assume wildcards for unspecified instrs.
	// A null FirstMI is encoded as INSTRUCTION_LIST_END; every switch on
	// FirstOpcode below has an explicit case for that value.
	unsigned FirstOpcode =
	FirstMI ? FirstMI->getOpcode()
	: static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
	unsigned SecondOpcode = SecondMI.getOpcode();

	// Note: when this fusion is enabled and SecondMI is a Bcc, every path
	// through the switch returns, so the later checks are not reached.
	if (ST.hasArithmeticBccFusion())
	// Fuse CMN, CMP, TST followed by Bcc.
	if (SecondOpcode == AArch64::Bcc)
	switch (FirstOpcode) {
	default:
	return false;
	case AArch64::ADDSWri:
	case AArch64::ADDSWrr:
	case AArch64::ADDSXri:
	case AArch64::ADDSXrr:
	case AArch64::ANDSWri:
	case AArch64::ANDSWrr:
	case AArch64::ANDSXri:
	case AArch64::ANDSXrr:
	case AArch64::SUBSWri:
	case AArch64::SUBSWrr:
	case AArch64::SUBSXri:
	case AArch64::SUBSXrr:
	case AArch64::BICSWrr:
	case AArch64::BICSXrr:
	return true;
	case AArch64::ADDSWrs:
	case AArch64::ADDSXrs:
	case AArch64::ANDSWrs:
	case AArch64::ANDSXrs:
	case AArch64::SUBSWrs:
	case AArch64::SUBSXrs:
	case AArch64::BICSWrs:
	case AArch64::BICSXrs:
	// Shift value can be 0 making these behave like the "rr" variant...
	// FirstMI is non-null here: a null FirstMI maps to INSTRUCTION_LIST_END,
	// which is handled by its own case below.
	return !II.hasShiftedReg(*FirstMI);
	case AArch64::INSTRUCTION_LIST_END:
	// Wildcard first instruction: a Bcc may be the second half of a pair.
	return true;
	}

	// As above, every path through this switch returns once SecondMI is one of
	// the four CBZ/CBNZ opcodes and the fusion is enabled.
	if (ST.hasArithmeticCbzFusion())
	// Fuse ALU operations followed by CBZ/CBNZ.
	if (SecondOpcode == AArch64::CBNZW \|\| SecondOpcode == AArch64::CBNZX \|\|
	SecondOpcode == AArch64::CBZW \|\| SecondOpcode == AArch64::CBZX)
	switch (FirstOpcode) {
	default:
	return false;
	case AArch64::ADDWri:
	case AArch64::ADDWrr:
	case AArch64::ADDXri:
	case AArch64::ADDXrr:
	case AArch64::ANDWri:
	case AArch64::ANDWrr:
	case AArch64::ANDXri:
	case AArch64::ANDXrr:
	case AArch64::EORWri:
	case AArch64::EORWrr:
	case AArch64::EORXri:
	case AArch64::EORXrr:
	case AArch64::ORRWri:
	case AArch64::ORRWrr:
	case AArch64::ORRXri:
	case AArch64::ORRXrr:
	case AArch64::SUBWri:
	case AArch64::SUBWrr:
	case AArch64::SUBXri:
	case AArch64::SUBXrr:
	return true;
	case AArch64::ADDWrs:
	case AArch64::ADDXrs:
	case AArch64::ANDWrs:
	case AArch64::ANDXrs:
	case AArch64::SUBWrs:
	case AArch64::SUBXrs:
	case AArch64::BICWrs:
	case AArch64::BICXrs:
	// Shift value can be 0 making these behave like the "rr" variant...
	// FirstMI is non-null here for the same reason as in the Bcc switch.
	return !II.hasShiftedReg(*FirstMI);
	case AArch64::INSTRUCTION_LIST_END:
	// Wildcard first instruction: CBZ/CBNZ may be the second half of a pair.
	return true;
	}

	// This switch has no default: a SecondOpcode that is not listed falls
	// through to the literal-fusion check below.
	if (ST.hasFuseAES())
	// Fuse AES crypto operations.
	switch(SecondOpcode) {
	// AES encode.
	- case AArch64::AESMCrr :
	+ case AArch64::AESMCrr:
	+ case AArch64::AESMCrrTied:
	return FirstOpcode == AArch64::AESErr \|\|
	FirstOpcode == AArch64::INSTRUCTION_LIST_END;
	// AES decode.
	case AArch64::AESIMCrr:
	+ case AArch64::AESIMCrrTied:
	return FirstOpcode == AArch64::AESDrr \|\|
	FirstOpcode == AArch64::INSTRUCTION_LIST_END;
	}

	// Again no default: unlisted second opcodes reach the final return false.
	// NOTE(review): operand 3 of MOVK is presumably the shift amount (16/32/48
	// selecting the half-word being written) -- confirm against the instruction
	// definitions.
	if (ST.hasFuseLiterals())
	// Fuse literal generation operations.
	switch (SecondOpcode) {
	// PC relative address.
	case AArch64::ADDXri:
	return FirstOpcode == AArch64::ADRP \|\|
	FirstOpcode == AArch64::INSTRUCTION_LIST_END;
	// 32 bit immediate.
	case AArch64::MOVKWi:
	return (FirstOpcode == AArch64::MOVZWi &&
	SecondMI.getOperand(3).getImm() == 16) \|\|
	FirstOpcode == AArch64::INSTRUCTION_LIST_END;
	// Lower and upper half of 64 bit immediate.
	case AArch64::MOVKXi:
	// FirstMI is only dereferenced after FirstOpcode has matched MOVKXi, so
	// it cannot be the null wildcard at that point.
	return FirstOpcode == AArch64::INSTRUCTION_LIST_END \|\|
	(FirstOpcode == AArch64::MOVZXi &&
	SecondMI.getOperand(3).getImm() == 16) \|\|
	(FirstOpcode == AArch64::MOVKXi &&
	FirstMI->getOperand(3).getImm() == 32 &&
	SecondMI.getOperand(3).getImm() == 48);
	}

	// No enabled fusion pattern matched.
	return false;
	}

	} // end namespace


	namespace llvm {

	/// Create the AArch64 macro-fusion DAG mutation by handing the local
	/// shouldScheduleAdjacent predicate to createMacroFusionDAGMutation.
	std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation() {
	  return createMacroFusionDAGMutation(shouldScheduleAdjacent);
	}

	} // end namespace llvm
	Index: head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Target/X86/X86ISelLowering.cpp (revision 322320)
	@@ -1,36705 +1,36712 @@
	//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the interfaces that X86 uses to lower LLVM code into a
	// selection DAG.
	//
	//===----------------------------------------------------------------------===//

	#include "X86ISelLowering.h"
	#include "Utils/X86ShuffleDecode.h"
	#include "X86CallingConv.h"
	#include "X86FrameLowering.h"
	#include "X86InstrBuilder.h"
	#include "X86IntrinsicsInfo.h"
	#include "X86MachineFunctionInfo.h"
	#include "X86ShuffleDecodeConstantPool.h"
	#include "X86TargetMachine.h"
	#include "X86TargetObjectFile.h"
	#include "llvm/ADT/SmallBitVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Analysis/EHPersonalities.h"
	#include "llvm/CodeGen/IntrinsicLowering.h"
	#include "llvm/CodeGen/MachineFrameInfo.h"
	#include "llvm/CodeGen/MachineFunction.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"
	#include "llvm/CodeGen/MachineJumpTableInfo.h"
	#include "llvm/CodeGen/MachineModuleInfo.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"
	#include "llvm/CodeGen/WinEHFuncInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/CallingConv.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/DiagnosticInfo.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/GlobalAlias.h"
	#include "llvm/IR/GlobalVariable.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/Intrinsics.h"
	#include "llvm/MC/MCAsmInfo.h"
	#include "llvm/MC/MCContext.h"
	#include "llvm/MC/MCExpr.h"
	#include "llvm/MC/MCSymbol.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/KnownBits.h"
	#include "llvm/Support/MathExtras.h"
	#include "llvm/Target/TargetLowering.h"
	#include "llvm/Target/TargetOptions.h"
	#include <algorithm>
	#include <bitset>
	#include <cctype>
	#include <numeric>
	using namespace llvm;

	#define DEBUG_TYPE "x86-isel"

	STATISTIC(NumTailCalls, "Number of tail calls");

	// Hidden boolean flag, false by default.
	static cl::opt<bool> ExperimentalVectorWideningLegalization(
	"x86-experimental-vector-widening-legalization", cl::init(false),
	cl::desc("Enable an experimental vector type legalization through widening "
	"rather than promotion."),
	cl::Hidden);

	// Hidden integer flag, 4 by default. Per its description the value is a
	// count of low PC bits that must be zero at a loop header.
	static cl::opt<int> ExperimentalPrefLoopAlignment(
	"x86-experimental-pref-loop-alignment", cl::init(4),
	cl::desc("Sets the preferable loop alignment for experiments "
	"(the last x86-experimental-pref-loop-alignment bits"
	" of the loop header PC will be 0)."),
	cl::Hidden);

	// Hidden boolean flag, true by default.
	static cl::opt<bool> MulConstantOptimization(
	"mul-constant-optimization", cl::init(true),
	cl::desc("Replace 'mul x, Const' with more effective instructions like "
	"SHIFT, LEA, etc."),
	cl::Hidden);

	/// Report an "unsupported" diagnostic for the function currently being
	/// lowered, e.g. when the user tries to return a double without SSE2 enabled
	/// on x86_64. Unlike report_fatal_error this is not fatal, so callers should
	/// attempt to recover without crashing.
	///
	/// \param DAG Selection DAG supplying the machine function and the context
	/// the diagnostic is reported through.
	/// \param dl Location whose debug loc is attached to the diagnostic.
	/// \param Msg Text of the diagnostic.
	static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
	                             const char *Msg) {
	  const auto &Fn = *DAG.getMachineFunction().getFunction();
	  // The diagnostic object is built in place as the argument of diagnose().
	  DAG.getContext()->diagnose(
	      DiagnosticInfoUnsupported(Fn, Msg, dl.getDebugLoc()));
	}

	X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
	const X86Subtarget &STI)
	: TargetLowering(TM), Subtarget(STI) {
	bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
	X86ScalarSSEf64 = Subtarget.hasSSE2();
	X86ScalarSSEf32 = Subtarget.hasSSE1();
	MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

	// Set up the TargetLowering object.

	// X86 is weird. It always uses i8 for shift amounts and setcc results.
	setBooleanContents(ZeroOrOneBooleanContent);
	// X86-SSE is even stranger. It uses -1 or 0 for vector masks.
	setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

	// For 64-bit, since we have so many registers, use the ILP scheduler.
	// For 32-bit, use the register pressure specific scheduling.
	// For Atom, always use ILP scheduling.
	if (Subtarget.isAtom())
	setSchedulingPreference(Sched::ILP);
	else if (Subtarget.is64Bit())
	setSchedulingPreference(Sched::ILP);
	else
	setSchedulingPreference(Sched::RegPressure);
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

	// Bypass expensive divides and use cheaper ones.
	if (TM.getOptLevel() >= CodeGenOpt::Default) {
	if (Subtarget.hasSlowDivide32())
	addBypassSlowDiv(32, 8);
	if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
	addBypassSlowDiv(64, 32);
	}

	if (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()) {
	// Setup Windows compiler runtime calls.
	setLibcallName(RTLIB::SDIV_I64, "_alldiv");
	setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
	setLibcallName(RTLIB::SREM_I64, "_allrem");
	setLibcallName(RTLIB::UREM_I64, "_aullrem");
	setLibcallName(RTLIB::MUL_I64, "_allmul");
	setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
	setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
	}

	if (Subtarget.isTargetDarwin()) {
	// Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
	setUseUnderscoreSetJmp(false);
	setUseUnderscoreLongJmp(false);
	} else if (Subtarget.isTargetWindowsGNU()) {
	// MS runtime is weird: it exports _setjmp, but longjmp!
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(false);
	} else {
	setUseUnderscoreSetJmp(true);
	setUseUnderscoreLongJmp(true);
	}

	// Set up the register classes.
	addRegisterClass(MVT::i8, &X86::GR8RegClass);
	addRegisterClass(MVT::i16, &X86::GR16RegClass);
	addRegisterClass(MVT::i32, &X86::GR32RegClass);
	if (Subtarget.is64Bit())
	addRegisterClass(MVT::i64, &X86::GR64RegClass);

	for (MVT VT : MVT::integer_valuetypes())
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

	// We don't accept any truncstore of integer registers.
	setTruncStoreAction(MVT::i64, MVT::i32, Expand);
	setTruncStoreAction(MVT::i64, MVT::i16, Expand);
	setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i32, MVT::i16, Expand);
	setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
	setTruncStoreAction(MVT::i16, MVT::i8, Expand);

	setTruncStoreAction(MVT::f64, MVT::f32, Expand);

	// SETOEQ and SETUNE require checking two conditions.
	setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
	setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
	setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

	// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
	// operation.
	setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
	// f32/f64 are legal, f80 is custom.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	else
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	} else if (!Subtarget.useSoftFloat()) {
	// We have an algorithm for SSE2->double, and we turn this into a
	// 64-bit FILD followed by conditional FADD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
	// We have an algorithm for SSE2, and we turn this into a 64-bit
	// FILD or VCVTUSI2SS/SD for other targets.
	setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
	}

	// Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// SSE has no i16 to fp conversion, only i32.
	if (X86ScalarSSEf32) {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
	setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
	}

	// Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
	// this operation.
	setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

	if (!Subtarget.useSoftFloat()) {
	// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
	// are Legal, f80 is custom lowered.
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
	setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

	if (X86ScalarSSEf32) {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	// f32 and f64 cases are Legal, f80 case is not
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
	}
	} else {
	setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
	setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
	setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
	}

	// Handle FP_TO_UINT by promoting the destination to a larger signed
	// conversion.
	setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

	if (Subtarget.is64Bit()) {
	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	// FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	} else {
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
	}
	} else if (!Subtarget.useSoftFloat()) {
	// Since AVX is a superset of SSE3, only check for SSE here.
	if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
	// Expand FP_TO_UINT into a select.
	// FIXME: We would like to use a Custom expander here eventually to do
	// the optimal thing for SSE vs. the default expansion in the legalizer.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
	else
	// With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
	// With SSE3 we can use fisttpll to convert to a signed i64; without
	// SSE, we're stuck with a fistpll.
	setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

	setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
	}

	// TODO: when we have SSE, these could be more efficient, by using movd/movq.
	if (!X86ScalarSSEf64) {
	setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
	setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
	// Without SSE, i64->f64 goes through memory.
	setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
	}
	} else if (!Subtarget.is64Bit())
	setOperationAction(ISD::BITCAST , MVT::i64 , Custom);

	// Scalar integer divide and remainder are lowered to use operations that
	// produce two results, to match the available instructions. This exposes
	// the two-result form to trivial CSE, which is able to combine x/y and x%y
	// into a single instruction.
	//
	// Scalar integer multiply-high is also lowered to use two-result
	// operations, to match the available instructions. However, plain multiply
	// (low) operations are left as Legal, as there are single-result
	// instructions for this in x86. Using the two-result multiply instructions
	// when both high and low results are needed must be arranged by dagcombine.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	}

	setOperationAction(ISD::BR_JT , MVT::Other, Expand);
	setOperationAction(ISD::BRCOND , MVT::Other, Custom);
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
	MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::BR_CC, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	}
	if (Subtarget.is64Bit())
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
	setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
	setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

	setOperationAction(ISD::FREM , MVT::f32 , Expand);
	setOperationAction(ISD::FREM , MVT::f64 , Expand);
	setOperationAction(ISD::FREM , MVT::f80 , Expand);
	setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);

	// Promote the i8 variants and force them on up to i32 which has a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	if (!Subtarget.hasBMI()) {
	setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
	}
	}

	if (Subtarget.hasLZCNT()) {
	// When promoting the i8 variants, force them to i32 for a shorter
	// encoding.
	setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
	setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
	} else {
	setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
	if (Subtarget.is64Bit()) {
	setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
	setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
	}
	}

	// Special handling for half-precision floating point conversions.
	// If we don't have F16C support, then lower half float conversions
	// into library calls.
	if (Subtarget.useSoftFloat() \|\|
	(!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
	setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
	}

	// There's never any support for operations beyond MVT::f32.
	setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
	setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
	setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

	setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
	setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
	setTruncStoreAction(MVT::f32, MVT::f16, Expand);
	setTruncStoreAction(MVT::f64, MVT::f16, Expand);
	setTruncStoreAction(MVT::f80, MVT::f16, Expand);

	if (Subtarget.hasPOPCNT()) {
	setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
	} else {
	setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
	setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
	if (Subtarget.is64Bit())
	setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
	}

	setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

	if (!Subtarget.hasMOVBE())
	setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

	// These should be promoted to a larger select which is supported.
	setOperationAction(ISD::SELECT , MVT::i1 , Promote);
	// X86 wants to expand cmov itself.
	for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	}
	setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
	// NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
	// SjLj exception handling but a light-weight setjmp/longjmp replacement to
	// support continuation, user-level threading, and etc.. As a result, no
	// other SjLj exception interfaces are implemented and please don't build
	// your own exception handling based on them.
	// LLVM/Clang supports zero-cost DWARF exception handling.
	setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
	setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
	setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
	if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
	setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

	// Darwin ABI issue.
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::ConstantPool , VT, Custom);
	setOperationAction(ISD::JumpTable , VT, Custom);
	setOperationAction(ISD::GlobalAddress , VT, Custom);
	setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
	setOperationAction(ISD::ExternalSymbol , VT, Custom);
	setOperationAction(ISD::BlockAddress , VT, Custom);
	}

	// 64-bit shl, sra, srl (iff 32-bit x86)
	for (auto VT : { MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	setOperationAction(ISD::SHL_PARTS, VT, Custom);
	setOperationAction(ISD::SRA_PARTS, VT, Custom);
	setOperationAction(ISD::SRL_PARTS, VT, Custom);
	}

	if (Subtarget.hasSSE1())
	setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

	setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);

	// Expand certain atomics
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
	setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
	setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
	}

	if (Subtarget.hasCmpxchg16b()) {
	setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
	}

	// FIXME - use subtarget debug flags
	if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
	!Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
	TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
	setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
	}

	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
	setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

	setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
	setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

	setOperationAction(ISD::TRAP, MVT::Other, Legal);
	setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

	// VASTART needs to be custom lowered to use the VarArgsFrameIndex
	setOperationAction(ISD::VASTART , MVT::Other, Custom);
	setOperationAction(ISD::VAEND , MVT::Other, Expand);
	bool Is64Bit = Subtarget.is64Bit();
	setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
	setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

	setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
	setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

	setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

	// GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
	setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
	setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

	if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
	// f32 and f64 use SSE.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
	: &X86::FR64RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	// Use ANDPD to simulate FABS.
	setOperationAction(ISD::FABS, VT, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG, VT, Custom);

	// Use ANDPD and ORPD to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}

	// Lower this to MOVMSK plus an AND.
	setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
	setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

	// Expand FP immediates into loads from the stack, except for the special
	// cases we handle.
	addLegalFPImmediate(APFloat(+0.0)); // xorpd
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	} else if (UseX87 && X86ScalarSSEf32) {
	// Use SSE for f32, x87 for f64.
	// Set up the FP register classes.
	addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
	: &X86::FR32RegClass);
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);

	// Use ANDPS to simulate FABS.
	setOperationAction(ISD::FABS , MVT::f32, Custom);

	// Use XORP to simulate FNEG.
	setOperationAction(ISD::FNEG , MVT::f32, Custom);

	setOperationAction(ISD::UNDEF, MVT::f64, Expand);

	// Use ANDPS and ORPS to simulate FCOPYSIGN.
	setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

	// We don't support sin/cos/fmod
	setOperationAction(ISD::FSIN , MVT::f32, Expand);
	setOperationAction(ISD::FCOS , MVT::f32, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

	// Special cases we handle for FP constants.
	addLegalFPImmediate(APFloat(+0.0f)); // xorps
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

	if (!TM.Options.UnsafeFPMath) {
	setOperationAction(ISD::FSIN , MVT::f64, Expand);
	setOperationAction(ISD::FCOS , MVT::f64, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
	}
	} else if (UseX87) {
	// f32 and f64 in x87.
	// Set up the FP register classes.
	addRegisterClass(MVT::f64, &X86::RFP64RegClass);
	addRegisterClass(MVT::f32, &X86::RFP32RegClass);

	for (auto VT : { MVT::f32, MVT::f64 }) {
	setOperationAction(ISD::UNDEF, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);

	if (!TM.Options.UnsafeFPMath) {
	setOperationAction(ISD::FSIN , VT, Expand);
	setOperationAction(ISD::FCOS , VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	}
	}
	addLegalFPImmediate(APFloat(+0.0)); // FLD0
	addLegalFPImmediate(APFloat(+1.0)); // FLD1
	addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
	addLegalFPImmediate(APFloat(+0.0f)); // FLD0
	addLegalFPImmediate(APFloat(+1.0f)); // FLD1
	addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
	addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
	}

	// We don't support FMA.
	setOperationAction(ISD::FMA, MVT::f64, Expand);
	setOperationAction(ISD::FMA, MVT::f32, Expand);

	// Long double always uses X87, except f128 in MMX.
	if (UseX87) {
	if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::f128, &X86::FR128RegClass);
	ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
	setOperationAction(ISD::FABS , MVT::f128, Custom);
	setOperationAction(ISD::FNEG , MVT::f128, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
	}

	addRegisterClass(MVT::f80, &X86::RFP80RegClass);
	setOperationAction(ISD::UNDEF, MVT::f80, Expand);
	setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
	{
	APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
	addLegalFPImmediate(TmpFlt); // FLD0
	TmpFlt.changeSign();
	addLegalFPImmediate(TmpFlt); // FLD0/FCHS

	bool ignored;
	APFloat TmpFlt2(+1.0);
	TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
	&ignored);
	addLegalFPImmediate(TmpFlt2); // FLD1
	TmpFlt2.changeSign();
	addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
	}

	if (!TM.Options.UnsafeFPMath) {
	setOperationAction(ISD::FSIN , MVT::f80, Expand);
	setOperationAction(ISD::FCOS , MVT::f80, Expand);
	setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
	}

	setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
	setOperationAction(ISD::FCEIL, MVT::f80, Expand);
	setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
	setOperationAction(ISD::FRINT, MVT::f80, Expand);
	setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
	setOperationAction(ISD::FMA, MVT::f80, Expand);
	}

	// Always use a library call for pow.
	setOperationAction(ISD::FPOW , MVT::f32 , Expand);
	setOperationAction(ISD::FPOW , MVT::f64 , Expand);
	setOperationAction(ISD::FPOW , MVT::f80 , Expand);

	setOperationAction(ISD::FLOG, MVT::f80, Expand);
	setOperationAction(ISD::FLOG2, MVT::f80, Expand);
	setOperationAction(ISD::FLOG10, MVT::f80, Expand);
	setOperationAction(ISD::FEXP, MVT::f80, Expand);
	setOperationAction(ISD::FEXP2, MVT::f80, Expand);
	setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
	setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

	// Some FP actions are always expanded for vector types.
	for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
	MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
	setOperationAction(ISD::FSIN, VT, Expand);
	setOperationAction(ISD::FSINCOS, VT, Expand);
	setOperationAction(ISD::FCOS, VT, Expand);
	setOperationAction(ISD::FREM, VT, Expand);
	setOperationAction(ISD::FCOPYSIGN, VT, Expand);
	setOperationAction(ISD::FPOW, VT, Expand);
	setOperationAction(ISD::FLOG, VT, Expand);
	setOperationAction(ISD::FLOG2, VT, Expand);
	setOperationAction(ISD::FLOG10, VT, Expand);
	setOperationAction(ISD::FEXP, VT, Expand);
	setOperationAction(ISD::FEXP2, VT, Expand);
	}

	// First set operation action for all vector types to either promote
	// (for widening) or expand (for scalarization). Then we will selectively
	// turn on ones that can be effectively codegen'd.
	for (MVT VT : MVT::vector_valuetypes()) {
	setOperationAction(ISD::SDIV, VT, Expand);
	setOperationAction(ISD::UDIV, VT, Expand);
	setOperationAction(ISD::SREM, VT, Expand);
	setOperationAction(ISD::UREM, VT, Expand);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
	setOperationAction(ISD::FMA, VT, Expand);
	setOperationAction(ISD::FFLOOR, VT, Expand);
	setOperationAction(ISD::FCEIL, VT, Expand);
	setOperationAction(ISD::FTRUNC, VT, Expand);
	setOperationAction(ISD::FRINT, VT, Expand);
	setOperationAction(ISD::FNEARBYINT, VT, Expand);
	setOperationAction(ISD::SMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHS, VT, Expand);
	setOperationAction(ISD::UMUL_LOHI, VT, Expand);
	setOperationAction(ISD::MULHU, VT, Expand);
	setOperationAction(ISD::SDIVREM, VT, Expand);
	setOperationAction(ISD::UDIVREM, VT, Expand);
	setOperationAction(ISD::CTPOP, VT, Expand);
	setOperationAction(ISD::CTTZ, VT, Expand);
	setOperationAction(ISD::CTLZ, VT, Expand);
	setOperationAction(ISD::ROTL, VT, Expand);
	setOperationAction(ISD::ROTR, VT, Expand);
	setOperationAction(ISD::BSWAP, VT, Expand);
	setOperationAction(ISD::SETCC, VT, Expand);
	setOperationAction(ISD::FP_TO_UINT, VT, Expand);
	setOperationAction(ISD::FP_TO_SINT, VT, Expand);
	setOperationAction(ISD::UINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SINT_TO_FP, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
	setOperationAction(ISD::TRUNCATE, VT, Expand);
	setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
	setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
	setOperationAction(ISD::ANY_EXTEND, VT, Expand);
	setOperationAction(ISD::SELECT_CC, VT, Expand);
	for (MVT InnerVT : MVT::vector_valuetypes()) {
	setTruncStoreAction(InnerVT, VT, Expand);

	setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
	setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

	// N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
	// types, we have to deal with them whether we ask for Expansion or not.
	// Setting Expand causes its own optimisation problems though, so leave
	// them legal.
	if (VT.getVectorElementType() == MVT::i1)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

	// EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
	// split/scalarized right now.
	if (VT.getVectorElementType() == MVT::f16)
	setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
	}
	}

	// FIXME: In order to prevent SSE instructions being expanded to MMX ones
	// with -msoft-float, disable use of MMX as well.
	if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
	addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
	// No operations on x86mmx supported, everything uses intrinsics.
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
	addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
	setOperationAction(ISD::FABS, MVT::v4f32, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
	setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
	setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
	addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	// FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
	// registers cannot be used even for integer operations.
	addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);
	addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
	: &X86::VR128RegClass);

	setOperationAction(ISD::MUL, MVT::v16i8, Custom);
	setOperationAction(ISD::MUL, MVT::v4i32, Custom);
	setOperationAction(ISD::MUL, MVT::v2i64, Custom);
	setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
	setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
	setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
	setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
	setOperationAction(ISD::MUL, MVT::v8i16, Legal);
	setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
	setOperationAction(ISD::FABS, MVT::v2f64, Custom);
	setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

	setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	}

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// We support custom legalizing of sext and anyext loads for specific
	// memory vector types which we can load as a scalar (or sequence of
	// scalars) and extend in-register to a legal 128-bit vector type. For sext
	// loads these must work with a single scalar load.
	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
	}

	for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);

	if (VT == MVT::v2i64 && !Subtarget.is64Bit())
	continue;

	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	}

	// Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
	}

	// Custom lower v2i64 and v2f64 selects.
	setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

	setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

	// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

	setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

	setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
	setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
	setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

	// In the customized shift lowering, the legal v4i32/v2i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
	setOperationAction(ISD::ABS, MVT::v16i8, Legal);
	setOperationAction(ISD::ABS, MVT::v8i16, Legal);
	setOperationAction(ISD::ABS, MVT::v4i32, Legal);
	setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
	setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
	setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
	for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
	setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
	setOperationAction(ISD::FCEIL, RoundedTy, Legal);
	setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
	setOperationAction(ISD::FRINT, RoundedTy, Legal);
	setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
	}

	setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
	setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
	setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
	setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
	setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
	setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
	setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

	// FIXME: Do we need to handle scalar-to-vector here?
	setOperationAction(ISD::MUL, MVT::v4i32, Legal);

	// We directly match byte blends in the backend as they match the VSELECT
	// condition form.
	setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

	// SSE41 brings specific instructions for doing vector sign extend even in
	// cases where we don't have SRA.
	for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
	}

	for (MVT VT : MVT::integer_vector_valuetypes()) {
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
	setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
	}

	// SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
	}

	// i8 vectors are custom because the source register and source
	// memory operand types are not the same width.
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::ROTL, VT, Custom);

	// XOP can efficiently perform BITREVERSE with VPPERM.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);

	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
	setOperationAction(ISD::BITREVERSE, VT, Custom);
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
	bool HasInt256 = Subtarget.hasInt256();

	addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);
	addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
	: &X86::VR256RegClass);

	for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
	// even though v8i16 is a legal type.
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

	setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

	setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

	// In the customized shift lowering, the legal v8i32/v4i64 cases
	// in AVX2 will be recognized.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	}

	setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

	for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
	setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
	setOperationAction(ISD::ANY_EXTEND, VT, Custom);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
	setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::CTLZ, VT, Custom);
	}

	if (Subtarget.hasAnyFMA()) {
	for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
	MVT::v2f64, MVT::v4f64 })
	setOperationAction(ISD::FMA, VT, Legal);
	}

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
	setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
	}

	setOperationAction(ISD::MUL, MVT::v4i64, Custom);
	setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MUL, MVT::v32i8, Custom);

	setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
	setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

	setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
	setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
	}

	if (HasInt256) {
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

	// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
	// when we have a 256bit-wide blend with immediate.
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

	// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
	for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
	setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
	}
	}

	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	}

	// Extract subvector is special because the value type
	// (result) is 128-bit but the source is 256-bit wide.
	for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
	MVT::v4f32, MVT::v2f64 }) {
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	}

	// Custom lower several nodes for 256-bit types.
	for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
	}

	if (HasInt256)
	setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

	// Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
	setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
	addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
	addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
	addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
	addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

	addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
	addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
	addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

	for (MVT VT : MVT::fp_vector_valuetypes())
	setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
	setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
	}

	for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
	MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
	MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
	setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
	setTruncStoreAction(VT, MaskVT, Custom);
	}

	for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FNEG, VT, Custom);
	setOperationAction(ISD::FABS, VT, Custom);
	setOperationAction(ISD::FMA, VT, Legal);
	setOperationAction(ISD::FCOPYSIGN, VT, Custom);
	}

	setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
	setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
	setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
	setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
	setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

	setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
	setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
	if (Subtarget.hasVLX()){
	setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
	setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
	setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

	setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
	setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
	setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
	} else {
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
	setOperationAction(ISD::MLOAD, VT, Custom);
	setOperationAction(ISD::MSTORE, VT, Custom);
	}
	}
	setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);

	if (Subtarget.hasDQI()) {
	for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
	setOperationAction(ISD::SINT_TO_FP, VT, Legal);
	setOperationAction(ISD::UINT_TO_FP, VT, Legal);
	setOperationAction(ISD::FP_TO_SINT, VT, Legal);
	setOperationAction(ISD::FP_TO_UINT, VT, Legal);
	}
	if (Subtarget.hasVLX()) {
	// Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
	setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
	setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
	}
	}
	if (Subtarget.hasVLX()) {
	setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
	setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
	setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

	// FIXME: These commands are available on SSE/AVX2; add relevant patterns.
	setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
	setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
	}

	setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

	for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::FFLOOR, VT, Legal);
	setOperationAction(ISD::FCEIL, VT, Legal);
	setOperationAction(ISD::FTRUNC, VT, Legal);
	setOperationAction(ISD::FRINT, VT, Legal);
	setOperationAction(ISD::FNEARBYINT, VT, Legal);
	}

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

	// Without BWI we need to use custom lowering to handle MVT::v64i8 input.
	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

	setOperationAction(ISD::MUL, MVT::v8i64, Custom);

	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
	setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
	setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
	setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

	setOperationAction(ISD::MUL, MVT::v16i32, Legal);

	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	setOperationAction(ISD::ABS, MVT::v4i64, Legal);
	setOperationAction(ISD::ABS, MVT::v2i64, Legal);

	for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::TRUNCATE, VT, Custom);

	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);
	}

	for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	}

	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
	MVT::v8i64}) {
	setOperationAction(ISD::ROTL, VT, Custom);
	setOperationAction(ISD::ROTR, VT, Custom);
	}

	// Need to promote to 64-bit even though we have 32-bit masked instructions
	// because the IR optimizers rearrange bitcasts around logic ops leaving
	// too many variations to handle if we don't promote them.
	setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

	if (Subtarget.hasCDI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
	MVT::v4i64, MVT::v8i64}) {
	setOperationAction(ISD::CTLZ, VT, Legal);
	setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
	}
	} // Subtarget.hasCDI()

	if (Subtarget.hasDQI()) {
	// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
	setOperationAction(ISD::MUL, MVT::v2i64, Legal);
	setOperationAction(ISD::MUL, MVT::v4i64, Legal);
	setOperationAction(ISD::MUL, MVT::v8i64, Legal);
	}

	if (Subtarget.hasVPOPCNTDQ()) {
	// VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
	// version of popcntd/q.
	for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
	MVT::v4i32, MVT::v2i64})
	setOperationAction(ISD::CTPOP, VT, Legal);
	}

	// Custom lower several nodes.
	for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
	MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
	setOperationAction(ISD::MGATHER, VT, Custom);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	// Extract subvector is special because the value type
	// (result) is 256-bit but the source is 512-bit wide.
	// 128-bit was made Custom under AVX1.
	for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
	MVT::v8f32, MVT::v4f64 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
	for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
	MVT::v16i1, MVT::v32i1, MVT::v64i1 })
	setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

	for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::MGATHER, VT, Legal);
	setOperationAction(ISD::MSCATTER, VT, Custom);
	}
	for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
	setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
	}
	}// has AVX-512

	if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
	addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
	addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

	addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
	addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

	setOperationAction(ISD::ADD, MVT::v32i1, Custom);
	setOperationAction(ISD::ADD, MVT::v64i1, Custom);
	setOperationAction(ISD::SUB, MVT::v32i1, Custom);
	setOperationAction(ISD::SUB, MVT::v64i1, Custom);
	setOperationAction(ISD::MUL, MVT::v32i1, Custom);
	setOperationAction(ISD::MUL, MVT::v64i1, Custom);

	setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
	setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
	setOperationAction(ISD::MUL, MVT::v32i16, Legal);
	setOperationAction(ISD::MUL, MVT::v64i8, Custom);
	setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
	setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
	setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
	setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
	setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
	setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
	setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
	setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
	setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
	setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
	setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

	setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

	setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
	if (Subtarget.hasVLX()) {
	setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
	setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
	}

	LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
	for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
	setOperationAction(ISD::MLOAD, VT, Action);
	setOperationAction(ISD::MSTORE, VT, Action);
	}

	if (Subtarget.hasCDI()) {
	setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
	setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
	}

	for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Custom);
	setOperationAction(ISD::ABS, VT, Legal);
	setOperationAction(ISD::SRL, VT, Custom);
	setOperationAction(ISD::SHL, VT, Custom);
	setOperationAction(ISD::SRA, VT, Custom);
	setOperationAction(ISD::MLOAD, VT, Legal);
	setOperationAction(ISD::MSTORE, VT, Legal);
	setOperationAction(ISD::CTPOP, VT, Custom);
	setOperationAction(ISD::CTTZ, VT, Custom);
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);

	setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
	setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
	}

	for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
	setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
	if (Subtarget.hasVLX()) {
	// FIXME. This commands are available on SSE/AVX2, add relevant patterns.
	setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
	setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
	}
	}
	}

	if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
	addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
	addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

	for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
	setOperationAction(ISD::ADD, VT, Custom);
	setOperationAction(ISD::SUB, VT, Custom);
	setOperationAction(ISD::MUL, VT, Custom);
	setOperationAction(ISD::VSELECT, VT, Expand);

	setOperationAction(ISD::TRUNCATE, VT, Custom);
	setOperationAction(ISD::SETCC, VT, Custom);
	setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
	setOperationAction(ISD::SELECT, VT, Custom);
	setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
	setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
	}

	setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
	setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
	setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

	for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
	setOperationAction(ISD::SMAX, VT, Legal);
	setOperationAction(ISD::UMAX, VT, Legal);
	setOperationAction(ISD::SMIN, VT, Legal);
	setOperationAction(ISD::UMIN, VT, Legal);
	}
	}

	// We want to custom lower some of our intrinsics.
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
	setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
	if (!Subtarget.is64Bit()) {
	setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
	setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
	}

	// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
	// handle type legalization for these operations here.
	//
	// FIXME: We really should do custom legalization for addition and
	// subtraction on x86-32 once PR3203 is fixed. We really can't do much better
	// than generic legalization for 64-bit multiplication-with-overflow, though.
	for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
	if (VT == MVT::i64 && !Subtarget.is64Bit())
	continue;
	// Add/Sub/Mul with overflow operations are custom lowered.
	setOperationAction(ISD::SADDO, VT, Custom);
	setOperationAction(ISD::UADDO, VT, Custom);
	setOperationAction(ISD::SSUBO, VT, Custom);
	setOperationAction(ISD::USUBO, VT, Custom);
	setOperationAction(ISD::SMULO, VT, Custom);
	setOperationAction(ISD::UMULO, VT, Custom);

	// Support carry in as value rather than glue.
	setOperationAction(ISD::ADDCARRY, VT, Custom);
	setOperationAction(ISD::SUBCARRY, VT, Custom);
	setOperationAction(ISD::SETCCCARRY, VT, Custom);
	}

	if (!Subtarget.is64Bit()) {
	// These libcalls are not available in 32-bit.
	setLibcallName(RTLIB::SHL_I128, nullptr);
	setLibcallName(RTLIB::SRL_I128, nullptr);
	setLibcallName(RTLIB::SRA_I128, nullptr);
	}

	// Combine sin / cos into one node or libcall if possible.
	if (Subtarget.hasSinCos()) {
	setLibcallName(RTLIB::SINCOS_F32, "sincosf");
	setLibcallName(RTLIB::SINCOS_F64, "sincos");
	if (Subtarget.isTargetDarwin()) {
	// For MacOSX, we don't want the normal expansion of a libcall to sincos.
	// We want to issue a libcall to __sincos_stret to avoid memory traffic.
	setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
	setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
	}
	}

	if (Subtarget.isTargetWin64()) {
	setOperationAction(ISD::SDIV, MVT::i128, Custom);
	setOperationAction(ISD::UDIV, MVT::i128, Custom);
	setOperationAction(ISD::SREM, MVT::i128, Custom);
	setOperationAction(ISD::UREM, MVT::i128, Custom);
	setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
	setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
	}

	// On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
	// is. We should promote the value to 64-bits to solve this.
	// This is what the CRT headers do - `fmodf` is an inline header
	// function casting to f64 and calling `fmod`.
	if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() \|\|
	Subtarget.isTargetWindowsItanium()))
	for (ISD::NodeType Op :
	{ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
	ISD::FLOG10, ISD::FPOW, ISD::FSIN})
	if (isOperationExpand(Op, MVT::f32))
	setOperationAction(Op, MVT::f32, Promote);

	// We have target-specific dag combine patterns for the following nodes:
	setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
	setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
	setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
	setTargetDAGCombine(ISD::BITCAST);
	setTargetDAGCombine(ISD::VSELECT);
	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::SRA);
	setTargetDAGCombine(ISD::SRL);
	setTargetDAGCombine(ISD::OR);
	setTargetDAGCombine(ISD::AND);
	setTargetDAGCombine(ISD::ADD);
	setTargetDAGCombine(ISD::FADD);
	setTargetDAGCombine(ISD::FSUB);
	setTargetDAGCombine(ISD::FNEG);
	setTargetDAGCombine(ISD::FMA);
	setTargetDAGCombine(ISD::FMINNUM);
	setTargetDAGCombine(ISD::FMAXNUM);
	setTargetDAGCombine(ISD::SUB);
	setTargetDAGCombine(ISD::LOAD);
	setTargetDAGCombine(ISD::MLOAD);
	setTargetDAGCombine(ISD::STORE);
	setTargetDAGCombine(ISD::MSTORE);
	setTargetDAGCombine(ISD::TRUNCATE);
	setTargetDAGCombine(ISD::ZERO_EXTEND);
	setTargetDAGCombine(ISD::ANY_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND);
	setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
	setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
	setTargetDAGCombine(ISD::SINT_TO_FP);
	setTargetDAGCombine(ISD::UINT_TO_FP);
	setTargetDAGCombine(ISD::SETCC);
	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::XOR);
	setTargetDAGCombine(ISD::MSCATTER);
	setTargetDAGCombine(ISD::MGATHER);

	computeRegisterProperties(Subtarget.getRegisterInfo());

	MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
	MaxStoresPerMemsetOptSize = 8;
	MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
	MaxStoresPerMemcpyOptSize = 4;
	MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
	MaxStoresPerMemmoveOptSize = 4;

	// TODO: These control memcmp expansion in CGP and could be raised higher, but
	// that needs to benchmarked and balanced with the potential use of vector
	// load/store types (PR33329, PR33914).
	MaxLoadsPerMemcmp = 2;
	MaxLoadsPerMemcmpOptSize = 2;

	// Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
	setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

	// An out-of-order CPU can speculatively execute past a predictable branch,
	// but a conditional move could be stalled by an expensive earlier operation.
	PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
	EnableExtLdPromotion = true;
	setPrefFunctionAlignment(4); // 2^4 bytes.

	verifyIntrinsicTables();
	}

	/// Answers whether the stack guard should be materialized through a
	/// load-stack-guard node. So far this has only been implemented for 64-bit
	/// MachO, so every other target gets false.
	bool X86TargetLowering::useLoadStackGuardNode() const {
	  if (!Subtarget.isTargetMachO())
	    return false;
	  return Subtarget.is64Bit();
	}

	/// Chooses how the vector type \p VT should be legalized. When the
	/// experimental widening flag is on, every vector whose element count is not
	/// 1 and whose element type is not i1 is widened; everything else is
	/// delegated to the generic implementation.
	TargetLoweringBase::LegalizeTypeAction
	X86TargetLowering::getPreferredVectorAction(EVT VT) const {
	  if (ExperimentalVectorWideningLegalization) {
	    // The element type is only inspected once the element count has been
	    // checked, exactly as in a short-circuited conjunction.
	    if (VT.getVectorNumElements() != 1) {
	      if (VT.getVectorElementType().getSimpleVT() != MVT::i1)
	        return TypeWidenVector;
	    }
	  }

	  return TargetLoweringBase::getPreferredVectorAction(VT);
	}

	/// Returns the value type produced by a SETCC whose operands have type
	/// \p VT. Non-vector comparisons yield i8. Simple vector types yield a vXi1
	/// mask type when the subtarget features checked below allow it; otherwise
	/// the result is \p VT with its element type changed to integer.
	EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
	LLVMContext& Context,
	EVT VT) const {
	if (!VT.isVector())
	return MVT::i8;

	if (VT.isSimple()) {
	MVT VVT = VT.getSimpleVT();
	const unsigned NumElts = VVT.getVectorNumElements();
	MVT EltVT = VVT.getVectorElementType();
	// 512-bit vectors: with AVX-512, 32/64-bit elements compare into v8i1 or
	// v16i1; with BWI, 8/16-bit elements compare into v32i1 or v64i1. An
	// element count not listed in a switch falls through to the checks below.
	if (VVT.is512BitVector()) {
	if (Subtarget.hasAVX512())
	if (EltVT == MVT::i32 \|\| EltVT == MVT::i64 \|\|
	EltVT == MVT::f32 \|\| EltVT == MVT::f64)
	switch(NumElts) {
	case 8: return MVT::v8i1;
	case 16: return MVT::v16i1;
	}
	if (Subtarget.hasBWI())
	if (EltVT == MVT::i8 \|\| EltVT == MVT::i16)
	switch(NumElts) {
	case 32: return MVT::v32i1;
	case 64: return MVT::v64i1;
	}
	}

	// With both BWI and VLX, any element count maps to an i1 vector of the
	// same length.
	if (Subtarget.hasBWI() && Subtarget.hasVLX())
	return MVT::getVectorVT(MVT::i1, NumElts);

	// If VT is illegal and will be integer-promoted, judge the element width
	// by the type it is transformed to.
	if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
	EVT LegalVT = getTypeToTransformTo(Context, VT);
	EltVT = LegalVT.getVectorElementType().getSimpleVT();
	}

	// VLX without BWI: only elements of at least 32 bits with 2, 4 or 8 lanes
	// get a mask result.
	if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
	switch(NumElts) {
	case 2: return MVT::v2i1;
	case 4: return MVT::v4i1;
	case 8: return MVT::v8i1;
	}
	}

	// No mask type applies (or VT is not a simple type).
	return VT.changeVectorElementTypeToInteger();
	}

	/// Helper for getByValTypeAlignment: walks \p Ty and raises \p MaxAlign to 16
	/// as soon as a 128-bit vector is found inside it. Arrays are inspected
	/// through their element type and structs through each of their members.
	/// \p MaxAlign is updated in place and is never lowered.
	static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
	  // 16 is the largest value this helper ever produces; nothing left to find.
	  if (MaxAlign == 16)
	    return;

	  if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
	    if (VecTy->getBitWidth() == 128)
	      MaxAlign = 16;
	    return;
	  }

	  if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
	    unsigned NestedAlign = 0;
	    getMaxByValAlign(ArrTy->getElementType(), NestedAlign);
	    if (MaxAlign < NestedAlign)
	      MaxAlign = NestedAlign;
	    return;
	  }

	  StructType *StructTy = dyn_cast<StructType>(Ty);
	  if (!StructTy)
	    return;

	  for (auto *MemberTy : StructTy->elements()) {
	    unsigned NestedAlign = 0;
	    getMaxByValAlign(MemberTy, NestedAlign);
	    if (MaxAlign < NestedAlign)
	      MaxAlign = NestedAlign;
	    // Stop scanning members once the ceiling has been reached.
	    if (MaxAlign == 16)
	      return;
	  }
	}

	/// Return the desired alignment for ByVal aggregate function arguments in
	/// the caller parameter area. On 64-bit targets this is the larger of 8 and
	/// the ABI alignment of \p Ty. On 32-bit targets aggregates that contain SSE
	/// vectors are placed at 16-byte boundaries (only when SSE1 is available)
	/// while the rest are at 4-byte boundaries.
	unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
	                                                  const DataLayout &DL) const {
	  if (!Subtarget.is64Bit()) {
	    unsigned Result = 4;
	    if (Subtarget.hasSSE1())
	      getMaxByValAlign(Ty, Result);
	    return Result;
	  }

	  // Max of 8 and alignment of type.
	  const unsigned ABIAlign = DL.getABITypeAlignment(Ty);
	  return ABIAlign > 8 ? ABIAlign : 8;
	}

	/// Returns the target-specific optimal type for the load and store
	/// operations produced when lowering memset, memcpy, and memmove.
	/// If DstAlign is zero, the destination alignment can satisfy any
	/// constraint. Similarly, if SrcAlign is zero there is no need to check it
	/// against an alignment requirement, probably because the source does not
	/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
	/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
	/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it
	/// does not need to be loaded.
	/// It returns EVT::Other if the type should be determined using generic
	/// target-independent logic. Note that the body below always ends by
	/// returning i64 or i32 and never returns Other itself.
	EVT
	X86TargetLowering::getOptimalMemOpType(uint64_t Size,
	unsigned DstAlign, unsigned SrcAlign,
	bool IsMemset, bool ZeroMemset,
	bool MemcpyStrSrc,
	MachineFunction &MF) const {
	const Function *F = MF.getFunction();
	// Vector and FP types are only considered when the function does not carry
	// the NoImplicitFloat attribute.
	if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
	// 16-byte (or wider) accesses: usable when unaligned 16-byte accesses are
	// not slow, or when both alignments are either 0 or at least 16.
	if (Size >= 16 &&
	(!Subtarget.isUnalignedMem16Slow() \|\|
	((DstAlign == 0 \|\| DstAlign >= 16) &&
	(SrcAlign == 0 \|\| SrcAlign >= 16)))) {
	// FIXME: Check if unaligned 32-byte accesses are slow.
	if (Size >= 32 && Subtarget.hasAVX()) {
	// Although this isn't a well-supported type for AVX1, we'll let
	// legalization and shuffle lowering produce the optimal codegen. If we
	// choose an optimal type with a vector element larger than a byte,
	// getMemsetStores() may create an intermediate splat (using an integer
	// multiply) before we splat as a vector.
	return MVT::v32i8;
	}
	if (Subtarget.hasSSE2())
	return MVT::v16i8;
	// TODO: Can SSE1 handle a byte vector?
	if (Subtarget.hasSSE1())
	return MVT::v4f32;
	} else if ((!IsMemset \|\| ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
	!Subtarget.is64Bit() && Subtarget.hasSSE2()) {
	// Do not use f64 to lower memcpy if source is string constant. It's
	// better to use i32 to avoid the loads.
	// Also, do not use f64 to lower memset unless this is a memset of zeros.
	// The gymnastics of splatting a byte value into an XMM register and then
	// only using 8-byte stores (because this is a CPU with slow unaligned
	// 16-byte accesses) makes that a loser.
	return MVT::f64;
	}
	}
	// This is a compromise. If we reach here, unaligned accesses may be slow on
	// this target. However, creating smaller, aligned accesses could be even
	// slower and would certainly be a lot more code.
	if (Subtarget.is64Bit() && Size >= 8)
	return MVT::i64;
	return MVT::i32;
	}

	/// Tells whether \p VT may be used as a memory-operation type. f32 and f64
	/// are only accepted when the matching scalar SSE flag is set; every other
	/// type is accepted.
	bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
	  switch (VT.SimpleTy) {
	  case MVT::f32:
	    return X86ScalarSSEf32;
	  case MVT::f64:
	    return X86ScalarSSEf64;
	  default:
	    return true;
	  }
	}

	/// Misaligned accesses of any size are always allowed, so this always
	/// returns true. When \p Fast is non-null it additionally receives whether
	/// such an access of \p VT's size is expected to be fast.
	bool
	X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
	                                                  unsigned,
	                                                  unsigned,
	                                                  bool *Fast) const {
	  // The caller is not interested in the speed hint.
	  if (!Fast)
	    return true;

	  const unsigned Bits = VT.getSizeInBits();
	  if (Bits == 128) {
	    *Fast = !Subtarget.isUnalignedMem16Slow();
	  } else if (Bits == 256) {
	    *Fast = !Subtarget.isUnalignedMem32Slow();
	  } else {
	    // 8-byte and under are always assumed to be fast.
	    // TODO: What about AVX-512 (512-bit) accesses?
	    *Fast = true;
	  }

	  return true;
	}

	/// Return the entry encoding for a jump table in the current function. The
	/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
	unsigned X86TargetLowering::getJumpTableEncoding() const {
	  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
	  // symbol.
	  const bool EmitGOTOFF = isPositionIndependent() && Subtarget.isPICStyleGOT();
	  if (EmitGOTOFF)
	    return MachineJumpTableInfo::EK_Custom32;

	  // Everything else goes through the normal jump table encoding heuristics.
	  return TargetLowering::getJumpTableEncoding();
	}

	/// Forwards the subtarget's soft-float setting.
	bool X86TargetLowering::useSoftFloat() const {
	return Subtarget.useSoftFloat();
	}

	/// Marks leading integer/pointer libcall arguments in \p Args as passed in
	/// registers. Only applies to 32-bit targets using the C or StdCall calling
	/// convention; the register budget comes from the module's
	/// number-of-register-parameters setting (0 if the function has no module).
	void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
	                                              ArgListTy &Args) const {
	  // Only relabel X86-32 for C / Stdcall CCs.
	  if (Subtarget.is64Bit())
	    return;
	  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
	    return;

	  unsigned RegsLeft = 0;
	  if (auto *M = MF->getFunction()->getParent())
	    RegsLeft = M->getNumberRegisterParameters();

	  // Mark the first N int arguments as having reg
	  for (auto &Arg : Args) {
	    Type *ArgTy = Arg.Ty;
	    if (!ArgTy->isPointerTy() && !ArgTy->isIntegerTy())
	      continue;

	    // Arguments wider than 8 bytes are never placed in registers here.
	    const uint64_t AllocSize = MF->getDataLayout().getTypeAllocSize(ArgTy);
	    if (AllocSize > 8)
	      continue;

	    // Anything above 4 bytes consumes a register pair.
	    const unsigned Needed = AllocSize > 4 ? 2 : 1;
	    if (RegsLeft < Needed)
	      return;

	    RegsLeft -= Needed;
	    Arg.IsInReg = true;
	  }
	}

	/// Builds the expression for one custom-encoded jump table entry: a @GOTOFF
	/// reference to the symbol of \p MBB. Must only be called in
	/// position-independent GOT-style PIC mode (asserted).
	const MCExpr *
	X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
	                                             const MachineBasicBlock *MBB,
	                                             unsigned uid,
	                                             MCContext &Ctx) const {
	  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
	  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
	  // entries.
	  const auto *BlockSym = MBB->getSymbol();
	  return MCSymbolRefExpr::create(BlockSym, MCSymbolRefExpr::VK_GOTOFF, Ctx);
	}

	/// Returns relocation base for the given PIC jumptable: the table itself on
	/// 64-bit targets, a GlobalBaseReg node otherwise.
	SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
	                                                    SelectionDAG &DAG) const {
	  if (Subtarget.is64Bit())
	    return Table;

	  // This doesn't have SDLoc associated with it, but is not really the
	  // same as a Register.
	  return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	                     getPointerTy(DAG.getDataLayout()));
	}

	/// This returns the relocation base for the given PIC jumptable,
	/// the same as getPICJumpTableRelocBase, but as an MCExpr.
	const MCExpr *X86TargetLowering::
	getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
	                             MCContext &Ctx) const {
	  // Without RIP-relative PIC the reference is relative to the PIC base.
	  if (!Subtarget.isPICStyleRIPRel())
	    return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);

	  // X86-64 uses RIP relative addressing based on the jump table label.
	  return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
	}

	/// Returns the representative register class for \p VT paired with a cost
	/// of 1. Scalar integers map to GR64 or GR32 depending on the target's
	/// bitness, x86mmx maps to VR64, and the listed FP/vector types all map to
	/// VR128X. Any other type is handled by the generic implementation.
	std::pair<const TargetRegisterClass *, uint8_t>
	X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
	                                           MVT VT) const {
	  const TargetRegisterClass *RepClass = nullptr;
	  const uint8_t RepCost = 1;

	  switch (VT.SimpleTy) {
	  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
	    if (Subtarget.is64Bit())
	      RepClass = &X86::GR64RegClass;
	    else
	      RepClass = &X86::GR32RegClass;
	    break;
	  case MVT::x86mmx:
	    RepClass = &X86::VR64RegClass;
	    break;
	  case MVT::f32: case MVT::f64:
	  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
	  case MVT::v4f32: case MVT::v2f64:
	  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
	  case MVT::v8f32: case MVT::v4f64:
	  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
	  case MVT::v16f32: case MVT::v8f64:
	    RepClass = &X86::VR128XRegClass;
	    break;
	  default:
	    return TargetLowering::findRepresentativeClass(TRI, VT);
	  }

	  return std::make_pair(RepClass, RepCost);
	}

	/// Returns the address space number used for the segment-relative slots
	/// built in this file: 257 on 64-bit targets that are not using the Kernel
	/// code model, 256 in every other case.
	unsigned X86TargetLowering::getAddressSpace() const {
	  if (!Subtarget.is64Bit())
	    return 256;
	  if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
	    return 256;
	  return 257;
	}

	/// Returns true when \p TargetTriple names glibc, Fuchsia, or Android with a
	/// version that is not lower than 17.
	static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
	return TargetTriple.isOSGlibc() \|\| TargetTriple.isOSFuchsia() \|\|
	(TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
	}

	/// Builds a constant pointer for the slot at \p Offset inside address space
	/// \p AddressSpace: the i32 constant \p Offset converted with inttoptr to a
	/// pointer (in \p AddressSpace) to an i8*.
	static Constant* SegmentOffset(IRBuilder<> &IRB,
	                               unsigned Offset, unsigned AddressSpace) {
	  LLVMContext &Ctx = IRB.getContext();
	  auto *OffsetVal = ConstantInt::get(Type::getInt32Ty(Ctx), Offset);
	  auto *SlotPtrTy = Type::getInt8PtrTy(Ctx)->getPointerTo(AddressSpace);
	  return ConstantExpr::getIntToPtr(OffsetVal, SlotPtrTy);
	}

	/// Returns the IR value that addresses the stack guard. Targets with a
	/// dedicated TLS slot get a segment-relative constant; all others use the
	/// generic implementation.
	Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
	  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
	  // tcbhead_t; use it instead of the usual global variable (see
	  // sysdeps/{i386,x86_64}/nptl/tls.h)
	  if (!hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
	    return TargetLowering::getIRStackGuard(IRB);

	  // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x10, getAddressSpace());

	  // %fs:0x28, unless we're using a Kernel code model, in which case
	  // it's %gs:0x28. gs:0x14 on i386.
	  unsigned SlotOffset = 0x14;
	  if (Subtarget.is64Bit())
	    SlotOffset = 0x28;
	  return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	}

	/// Inserts into \p M the declarations needed for stack protection. For the
	/// MSVC CRT these are the __security_cookie global and the
	/// __security_check_cookie function; targets with a TLS stack guard slot
	/// need nothing; everything else uses the generic implementation.
	void X86TargetLowering::insertSSPDeclarations(Module &M) const {
	  const Triple &TT = Subtarget.getTargetTriple();

	  // MSVC CRT provides functionalities for stack protection.
	  if (TT.isOSMSVCRT()) {
	    LLVMContext &Ctx = M.getContext();

	    // MSVC CRT has a global variable holding security cookie.
	    M.getOrInsertGlobal("__security_cookie", Type::getInt8PtrTy(Ctx));

	    // MSVC CRT has a function to validate security cookie. It takes the
	    // cookie as its single argument, passed in a register under fastcall.
	    auto *CheckFn = cast<Function>(
	        M.getOrInsertFunction("__security_check_cookie",
	                              Type::getVoidTy(Ctx),
	                              Type::getInt8PtrTy(Ctx)));
	    CheckFn->setCallingConv(CallingConv::X86_FastCall);
	    CheckFn->addAttribute(1, Attribute::AttrKind::InReg);
	    return;
	  }

	  // glibc, bionic, and Fuchsia have a special slot for the stack guard, so
	  // only the remaining targets need the generic declarations.
	  if (!hasStackGuardSlotTLS(TT))
	    TargetLowering::insertSSPDeclarations(M);
	}

	/// Returns the stack guard value used by SelectionDAG: the
	/// __security_cookie global for the MSVC CRT, the generic answer otherwise.
	Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return TargetLowering::getSDagStackGuard(M);

	  // MSVC CRT has a global variable holding security cookie.
	  return M.getGlobalVariable("__security_cookie");
	}

	/// Returns the function that validates the stack guard: the
	/// __security_check_cookie function for the MSVC CRT, the generic answer
	/// otherwise.
	Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
	  if (!Subtarget.getTargetTriple().isOSMSVCRT())
	    return TargetLowering::getSSPStackGuardCheck(M);

	  // MSVC CRT has a function to validate security cookie.
	  return M.getFunction("__security_check_cookie");
	}

	/// Returns the location of the SafeStack pointer. Contiki uses the default
	/// location, Android and Fuchsia use fixed segment-relative slots, and every
	/// other target defers to the generic implementation.
	Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
	  if (Subtarget.getTargetTriple().isOSContiki())
	    return getDefaultSafeStackPointerLocation(IRB, false);

	  // Android provides a fixed TLS slot for the SafeStack pointer. See the
	  // definition of TLS_SLOT_SAFESTACK in
	  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
	  if (Subtarget.isTargetAndroid()) {
	    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
	    // %gs:0x24 on i386
	    unsigned SlotOffset = 0x24;
	    if (Subtarget.is64Bit())
	      SlotOffset = 0x48;
	    return SegmentOffset(IRB, SlotOffset, getAddressSpace());
	  }

	  // Fuchsia is similar.
	  // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
	  if (Subtarget.isTargetFuchsia())
	    return SegmentOffset(IRB, 0x18, getAddressSpace());

	  return TargetLowering::getSafeStackPointerLocation(IRB);
	}

	/// Reports whether a cast between the two (distinct) address spaces is a
	/// no-op, which holds exactly when both are numbered below 256.
	bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
	                                            unsigned DestAS) const {
	  assert(SrcAS != DestAS && "Expected different address spaces!");

	  if (SrcAS >= 256)
	    return false;
	  return DestAS < 256;
	}

	//===----------------------------------------------------------------------===//
	// Return Value Calling Convention Implementation
	//===----------------------------------------------------------------------===//

	#include "X86GenCallingConv.inc"

	/// Checks the return values described by \p Outs against the RetCC_X86
	/// return convention and reports the result of that check.
	bool X86TargetLowering::CanLowerReturn(
	    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
	    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
	  SmallVector<CCValAssign, 16> RetLocs;
	  CCState RetState(CallConv, isVarArg, MF, RetLocs, Context);
	  const bool Lowerable = RetState.CheckReturn(Outs, RetCC_X86);
	  return Lowerable;
	}

	/// Returns a zero-terminated list of scratch registers; the calling
	/// convention argument is ignored and the list always contains only R11.
	const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
	// Static storage so the returned pointer stays valid after the call.
	static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
	return ScratchRegs;
	}

	/// Lowers mask values (v*i1) to the register value type \p ValLoc.
	/// v8i1/v16i1 are bitcast to i8/i16 and, if \p ValLoc is i32, any-extended;
	/// v32i1/v64i1 are bitcast directly to i32/i64; every other combination is
	/// sign-extended to \p ValLoc.
	/// \returns DAG node after lowering to register type
	static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
	const SDLoc &Dl, SelectionDAG &DAG) {
	EVT ValVT = ValArg.getValueType();

	if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 \|\| ValLoc == MVT::i32)) \|\|
	(ValVT == MVT::v16i1 && (ValLoc == MVT::i16 \|\| ValLoc == MVT::i32))) {
	// Two stage lowering might be required
	// bitcast: v8i1 -> i8 / v16i1 -> i16
	// anyextend: i8 -> i32 / i16 -> i32
	EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
	SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
	if (ValLoc == MVT::i32)
	ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
	return ValToCopy;
	} else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) \|\|
	(ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
	// One stage lowering is required
	// bitcast: v32i1 -> i32 / v64i1 -> i64
	return DAG.getBitcast(ValLoc, ValArg);
	} else
	// No bitcast shortcut applies: sign-extend the mask to the location type.
	return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
	}

	/// Breaks v64i1 value into two registers and adds the new node to the DAG.
	/// \p Arg must already have type i64 (asserted); it is split into its low
	/// and high i32 halves, which are appended to \p RegsToPass paired with the
	/// registers of \p VA (low half) and \p NextVA (high half). \p Arg is
	/// overwritten with the i64 bitcast of itself. \p Chain is not used in this
	/// body.
	static void Passv64i1ArgInRegs(
	const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
	SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
	CCValAssign &NextVA, const X86Subtarget &Subtarget) {
	// NOTE(review): hasBMI() looks unrelated to AVX-512 masks; presumably an
	// AVX-512 feature query was intended here -- confirm before relying on it.
	assert((Subtarget.hasBWI() \|\| Subtarget.hasBMI()) &&
	"Expected AVX512BW or AVX512BMI target!");
	assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
	assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	"The value should reside in two registers");

	// Before splitting the value we cast it to i64
	Arg = DAG.getBitcast(MVT::i64, Arg);

	// Splitting the value into two i32 types
	SDValue Lo, Hi;
	Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(0, Dl, MVT::i32));
	Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
	DAG.getConstant(1, Dl, MVT::i32));

	// Attach the two i32 types into corresponding registers
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
	RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
	}

	/// Lowers a function return into an X86ISD::RET_FLAG node (X86ISD::IRET for
	/// the X86_INTR calling convention).
	///
	/// The return operands are, in order: the chain, the number of bytes to pop
	/// on return, the returned values/registers, the sret register if one was
	/// recorded, any callee-saved registers handled via copy, and the glue value
	/// when one was produced.
	///
	/// \param Chain    incoming chain.
	/// \param CallConv calling convention of the function being lowered.
	/// \param isVarArg whether the function is variadic.
	/// \param Outs     descriptions of the values being returned.
	/// \param OutVals  the values being returned.
	/// \param dl       location attached to the created nodes.
	/// \param DAG      the DAG that receives the new nodes.
	/// \returns the return node.
	SDValue
	X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
	bool isVarArg,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SDLoc &dl, SelectionDAG &DAG) const {
	MachineFunction &MF = DAG.getMachineFunction();
	X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	// In some cases we need to disable registers from the default CSR list.
	// For example, when they are used for argument passing.
	bool ShouldDisableCalleeSavedRegister =
	CallConv == CallingConv::X86_RegCall \|\|
	MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

	if (CallConv == CallingConv::X86_INTR && !Outs.empty())
	report_fatal_error("X86 interrupts may not return any value");

	// Assign a location to every returned value.
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
	CCInfo.AnalyzeReturn(Outs, RetCC_X86);

	SDValue Flag;
	SmallVector<SDValue, 6> RetOps;
	RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
	// Operand #1 = Bytes To Pop
	RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
	MVT::i32));

	// Copy the result values into the output registers.
	// I walks RVLocs while OutsIndex walks OutVals; the custom v64i1 case below
	// consumes two locations for a single value, so I can run ahead of
	// OutsIndex.
	for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = RVLocs[I];
	assert(VA.isRegLoc() && "Can only return in registers!");

	// Add the register to the CalleeSaveDisableRegs list.
	if (ShouldDisableCalleeSavedRegister)
	MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

	SDValue ValToCopy = OutVals[OutsIndex];
	EVT ValVT = ValToCopy.getValueType();

	// Promote values to the appropriate types.
	if (VA.getLocInfo() == CCValAssign::SExt)
	ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
	else if (VA.getLocInfo() == CCValAssign::ZExt)
	ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
	else if (VA.getLocInfo() == CCValAssign::AExt) {
	// i1 vectors (masks) need the dedicated mask lowering instead of a plain
	// any-extend.
	if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
	ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
	else
	ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
	}
	else if (VA.getLocInfo() == CCValAssign::BCvt)
	ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

	assert(VA.getLocInfo() != CCValAssign::FPExt &&
	"Unexpected FP-extend for return value.");

	// If this is x86-64, and we disabled SSE, we can't return FP values,
	// or SSE or MMX vectors.
	if ((ValVT == MVT::f32 \|\| ValVT == MVT::f64 \|\|
	VA.getLocReg() == X86::XMM0 \|\| VA.getLocReg() == X86::XMM1) &&
	(Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
	errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	} else if (ValVT == MVT::f64 &&
	(Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
	// Likewise we can't return F64 values with SSE1 only. gcc does so, but
	// llvm-gcc has never done it right and no one has noticed, so this
	// should be OK for now.
	errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
	VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	}

	// Returns in ST0/ST1 are handled specially: these are pushed as operands to
	// the RET instruction and handled by the FP Stackifier.
	if (VA.getLocReg() == X86::FP0 \|\|
	VA.getLocReg() == X86::FP1) {
	// If this is a copy from an xmm register to ST(0), use an FPExtend to
	// change the value to the FP stack register class.
	if (isScalarFPTypeInSSEReg(VA.getValVT()))
	ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
	RetOps.push_back(ValToCopy);
	// Don't emit a copytoreg.
	continue;
	}

	// 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
	// which is returned in RAX / RDX.
	if (Subtarget.is64Bit()) {
	if (ValVT == MVT::x86mmx) {
	if (VA.getLocReg() == X86::XMM0 \|\| VA.getLocReg() == X86::XMM1) {
	ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
	ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	ValToCopy);
	// If we don't have SSE2 available, convert to v4f32 so the generated
	// register is legal.
	if (!Subtarget.hasSSE2())
	ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
	}
	}
	}

	// (register, value) pairs to copy for this return value: one entry
	// normally, two when a v64i1 is split across two registers.
	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");

	// Note: ++I advances to the second location, which belongs to the same
	// returned value.
	Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
	Subtarget);

	assert(2 == RegsToPass.size() &&
	"Expecting two registers after Pass64BitArgInRegs");

	// Add the second register to the CalleeSaveDisableRegs list.
	if (ShouldDisableCalleeSavedRegister)
	MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
	} else {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
	}

	// Add nodes to the DAG and add the values into the RetOps list
	for (auto &Reg : RegsToPass) {
	Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
	Flag = Chain.getValue(1);
	RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
	}
	}

	// Swift calling convention does not require we copy the sret argument
	// into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

	// All x86 ABIs require that for returning structs by value we copy
	// the sret argument into %rax/%eax (depending on ABI) for the return.
	// We saved the argument into a virtual register in the entry block,
	// so now we copy the value out and into %rax/%eax.
	//
	// Checking Function.hasStructRetAttr() here is insufficient because the IR
	// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
	// false, then an sret argument may be implicitly inserted in the SelDAG. In
	// either case FuncInfo->setSRetReturnReg() will have been called.
	if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
	// When we have both sret and another return value, we should use the
	// original Chain stored in RetOps[0], instead of the current Chain updated
	// in the above loop. If we only have sret, RetOps[0] equals to Chain.

	// For the case of sret and another return value, we have
	// Chain_0 at the function entry
	// Chain_1 = getCopyToReg(Chain_0) in the above loop
	// If we use Chain_1 in getCopyFromReg, we will have
	// Val = getCopyFromReg(Chain_1)
	// Chain_2 = getCopyToReg(Chain_1, Val) from below

	// getCopyToReg(Chain_0) will be glued together with
	// getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
	// in Unit B, and we will have cyclic dependency between Unit A and Unit B:
	// Data dependency from Unit B to Unit A due to usage of Val in
	// getCopyToReg(Chain_1, Val)
	// Chain dependency from Unit A to Unit B

	// So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
	SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
	getPointerTy(MF.getDataLayout()));

	// RAX on 64-bit targets that are not ILP32, EAX otherwise.
	unsigned RetValReg
	= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
	X86::RAX : X86::EAX;
	Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
	Flag = Chain.getValue(1);

	// RAX/EAX now acts like a return value.
	RetOps.push_back(
	DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

	// Add the returned register to the CalleeSaveDisableRegs list.
	if (ShouldDisableCalleeSavedRegister)
	MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
	}

	// Append the callee-saved registers that are handled via copy (a
	// zero-terminated list, possibly null); only GR64 registers are expected.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	const MCPhysReg *I =
	TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
	if (I) {
	for (; *I; ++I) {
	if (X86::GR64RegClass.contains(*I))
	RetOps.push_back(DAG.getRegister(*I, MVT::i64));
	else
	llvm_unreachable("Unexpected register class in CSRsViaCopy!");
	}
	}

	RetOps[0] = Chain; // Update chain.

	// Add the flag if we have it.
	if (Flag.getNode())
	RetOps.push_back(Flag);

	// Interrupt handlers return with IRET rather than a plain RET.
	X86ISD::NodeType opcode = X86ISD::RET_FLAG;
	if (CallConv == CallingConv::X86_INTR)
	opcode = X86ISD::IRET;
	return DAG.getNode(opcode, dl, MVT::Other, RetOps);
	}

	/// Return true if the single result of \p N feeds only return nodes, so that
	/// the node producing it may be considered for a tail call.
	///
	/// \p N must produce exactly one value with exactly one use. That user must be
	/// either a CopyToReg whose last operand is not glue, or an FP_EXTEND, and
	/// every use of that user must be an X86ISD::RET_FLAG node returning a single
	/// value.
	///
	/// \param N The node whose result is being inspected.
	/// \param [in,out] Chain On success, replaced by the chain operand of the
	///        CopyToReg (left unchanged for the FP_EXTEND case). Untouched on
	///        failure.
	/// \return true if all of the conditions above hold.
	bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
	  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
	    return false;

	  SDValue TCChain = Chain;
	  // N has exactly one use (checked above), so the first user is the only one.
	  SDNode *Copy = *N->use_begin();
	  if (Copy->getOpcode() == ISD::CopyToReg) {
	    // If the copy has a glue operand, we conservatively assume it isn't safe to
	    // perform a tail call.
	    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
	      return false;
	    TCChain = Copy->getOperand(0);
	  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
	    return false;

	  bool HasRet = false;
	  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
	       UI != UE; ++UI) {
	    if (UI->getOpcode() != X86ISD::RET_FLAG)
	      return false;
	    // If we are returning more than one value, we can definitely
	    // not make a tail call see PR19530
	    if (UI->getNumOperands() > 4)
	      return false;
	    // With exactly four operands the last one must be glue; otherwise it is
	    // an additional returned value.
	    if (UI->getNumOperands() == 4 &&
	        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
	      return false;
	    HasRet = true;
	  }

	  // The copy must have at least one (return) user.
	  if (!HasRet)
	    return false;

	  Chain = TCChain;
	  return true;
	}

	/// Return the type a return value of type \p VT should be extended to.
	///
	/// The minimum width is the register type of i8 when \p VT is i1, or when the
	/// target is not Darwin and \p VT is i8 or i16; otherwise it is the register
	/// type of i32. \p VT is returned unchanged when it is already at least that
	/// wide.
	///
	/// \param Context LLVM context used to query the register type.
	/// \param VT The value type being returned.
	/// \param ExtendKind Unused by this implementation.
	EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
	                                           ISD::NodeType ExtendKind) const {
	  MVT ReturnMVT = MVT::i32;

	  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
	  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
	    // The ABI does not require i1, i8 or i16 to be extended.
	    //
	    // On Darwin, there is code in the wild relying on Clang's old behaviour of
	    // always extending i8/i16 return values, so keep doing that for now.
	    // (PR26665).
	    ReturnMVT = MVT::i8;
	  }

	  EVT MinVT = getRegisterType(Context, ReturnMVT);
	  return VT.bitsLT(MinVT) ? MinVT : VT;
	}

	/// Rebuilds a v64i1 mask value from the two 32 bit registers it was split
	/// across.
	/// \param VA Location of the first 32 bit half.
	/// \param NextVA Location of the second 32 bit half.
	/// \param Root The chain every register read is attached to.
	/// \param [in,out] InFlag Optional glue value. When null, each physical
	///                        register is first added as a live-in and the
	///                        resulting virtual register is read. When non-null,
	///                        the physical registers are read directly, the reads
	///                        are glued together starting from *InFlag, and
	///                        *InFlag is updated to the glue of the last read.
	/// \return a v64i1 CONCAT_VECTORS node built from both halves.
	static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
	                                SDValue &Root, SelectionDAG &DAG,
	                                const SDLoc &Dl, const X86Subtarget &Subtarget,
	                                SDValue *InFlag = nullptr) {
	  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
	  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
	  assert(VA.getValVT() == MVT::v64i1 &&
	         "Expecting first location of 64 bit width type");
	  assert(NextVA.getValVT() == VA.getValVT() &&
	         "The locations should have the same type");
	  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
	         "The values should reside in two registers");

	  SDValue LoHalf, HiHalf;

	  if (InFlag) {
	    // Physical registers are available: read them directly and chain the
	    // glue through both reads.
	    LoHalf = DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = LoHalf.getValue(2);
	    HiHalf =
	        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
	    *InFlag = HiHalf.getValue(2);
	  } else {
	    // No glue supplied: go through intermediate virtual registers.
	    MachineFunction &MF = DAG.getMachineFunction();
	    const TargetRegisterClass *GR32 = &X86::GR32RegClass;

	    unsigned LoVReg = MF.addLiveIn(VA.getLocReg(), GR32);
	    LoHalf = DAG.getCopyFromReg(Root, Dl, LoVReg, MVT::i32);

	    unsigned HiVReg = MF.addLiveIn(NextVA.getLocReg(), GR32);
	    HiHalf = DAG.getCopyFromReg(Root, Dl, HiVReg, MVT::i32);
	  }

	  // Reinterpret each i32 as a v32i1 mask, then glue the halves into a v64i1.
	  SDValue LoMask = DAG.getBitcast(MVT::v32i1, LoHalf);
	  SDValue HiMask = DAG.getBitcast(MVT::v32i1, HiHalf);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, LoMask, HiMask);
	}

	/// Converts a value held in a scalar register into a mask vector.
	///
	/// A v1i1 result is produced with SCALAR_TO_VECTOR. A v64i1 result is a plain
	/// bitcast of the (i64) input. v8i1/v16i1/v32i1 results are produced by
	/// truncating the input to i8/i16/i32 and bitcasting that.
	///
	/// \param ValArg The incoming scalar value.
	/// \param ValVT  The mask type to produce.
	/// \param ValLoc The type of the location holding the value; only checked (by
	///               assertion) in the v64i1 case.
	/// \returns a DAG node holding the operand converted to the mask type.
	static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
	                               const EVT &ValLoc, const SDLoc &Dl,
	                               SelectionDAG &DAG) {
	  if (ValVT == MVT::v1i1)
	    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValArg);

	  if (ValVT == MVT::v64i1) {
	    // In 32 bit machine, this case is handled by getv64i1Argument
	    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
	    // On a 64 bit machine the widths already match, so a bitcast suffices.
	    return DAG.getBitcast(ValVT, ValArg);
	  }

	  // Pick the integer type whose width equals the number of mask bits.
	  MVT TruncVT;
	  switch (ValVT.getSimpleVT().SimpleTy) {
	  case MVT::v8i1:
	    TruncVT = MVT::i8;
	    break;
	  case MVT::v16i1:
	    TruncVT = MVT::i16;
	    break;
	  case MVT::v32i1:
	    TruncVT = MVT::i32;
	    break;
	  default:
	    llvm_unreachable("Expecting a vector of i1 types");
	  }

	  SDValue Narrowed = DAG.getNode(ISD::TRUNCATE, Dl, TruncVT, ValArg);
	  return DAG.getBitcast(ValVT, Narrowed);
	}

	/// Lower the result values of a call into the
	/// appropriate copies out of appropriate physical registers.
	///
	/// \param Chain    Incoming chain; the updated chain is returned.
	/// \param InFlag   Glue threaded through every register copy.
	/// \param CallConv Calling convention of the call.
	/// \param isVarArg Whether the callee is variadic.
	/// \param Ins      Description of the values returned by the call.
	/// \param dl       Debug location for the created nodes.
	/// \param DAG      The DAG being built.
	/// \param InVals   Receives one SDValue per lowered result.
	/// \param RegMask  Optional register mask; when non-null, the bits of every
	///                 register used for a result (and of its sub-registers) are
	///                 cleared from it.
	/// \return The chain after all result copies.
	SDValue X86TargetLowering::LowerCallResult(
	    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
	    uint32_t *RegMask) const {

	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  // Assign locations to each value returned by this call.
	  SmallVector<CCValAssign, 16> RVLocs;
	  bool Is64Bit = Subtarget.is64Bit();
	  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
	                 *DAG.getContext());
	  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

	  // Copy all of the result registers out of their specified physreg.
	  // I indexes RVLocs and may advance twice per iteration (split v64i1),
	  // while InsIndex advances once per returned value.
	  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    CCValAssign &VA = RVLocs[I];
	    EVT CopyVT = VA.getLocVT();

	    // In some calling conventions we need to remove the used registers
	    // from the register mask.
	    if (RegMask) {
	      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
	           SubRegs.isValid(); ++SubRegs)
	        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
	    }

	    // If this is x86-64, and we disabled SSE, we can't return FP values
	    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
	        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
	      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
	      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
	    }

	    // If we prefer to use the value in xmm registers, copy it out as f80 and
	    // use a truncate to move it from fp stack reg to xmm reg.
	    bool RoundAfterCopy = false;
	    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
	        isScalarFPTypeInSSEReg(VA.getValVT())) {
	      if (!Subtarget.hasX87())
	        report_fatal_error("X87 register return with X87 disabled");
	      CopyVT = MVT::f80;
	      RoundAfterCopy = (CopyVT != VA.getLocVT());
	    }

	    SDValue Val;
	    if (VA.needsCustom()) {
	      assert(VA.getValVT() == MVT::v64i1 &&
	             "Currently the only custom case is when we split v64i1 to 2 regs");
	      // The value occupies this location and the next one; consume both.
	      Val =
	          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
	    } else {
	      // Result 0 of the copy is the value, 1 the chain and 2 the glue.
	      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
	                  .getValue(1);
	      Val = Chain.getValue(0);
	      InFlag = Chain.getValue(2);
	    }

	    if (RoundAfterCopy)
	      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
	                        // This truncation won't change the value.
	                        DAG.getIntPtrConstant(1, dl));

	    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
	      if (VA.getValVT().isVector() &&
	          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	        // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
	      } else
	        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
	    }

	    InVals.push_back(Val);
	  }

	  return Chain;
	}

	//===----------------------------------------------------------------------===//
	// C & StdCall & Fast Calling Convention implementation
	//===----------------------------------------------------------------------===//
	// The StdCall calling convention seems to be the standard for many Windows
	// API routines. It differs from the C calling convention only slightly: the
	// callee, not the caller, cleans up the stack. Symbols are also decorated
	// in a convention-specific way. It doesn't support any vector arguments.
	// For info on the fast calling convention, see the Fast Calling Convention
	// (tail call) implementation, LowerX86_32FastCCCallTo.

	/// Classification of struct-return semantics, as computed by
	/// callIsStructReturn (for a call) and argsAreStructReturn (for a function).
	enum StructReturnType {
	// There are no arguments, or the first argument is not marked sret.
	NotStructReturn,
	// The first argument is sret and is marked inreg, or the IsMCU flag is set.
	RegStructReturn,
	// The first argument is sret and neither condition above applies.
	StackStructReturn
	};
	/// Determines whether a call uses struct return semantics by inspecting the
	/// flags of its first outgoing argument.
	/// \param Outs  The outgoing arguments of the call.
	/// \param IsMCU When true, an sret argument is always classified as
	///              RegStructReturn.
	static StructReturnType
	callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
	  if (Outs.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Determines whether a function uses struct return semantics by inspecting
	/// the flags of its first incoming argument.
	/// \param Ins   The incoming arguments of the function.
	/// \param IsMCU When true, an sret argument is always classified as
	///              RegStructReturn.
	static StructReturnType
	argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
	  if (Ins.empty())
	    return NotStructReturn;

	  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
	  if (!Flags.isSRet())
	    return NotStructReturn;
	  if (Flags.isInReg() || IsMCU)
	    return RegStructReturn;
	  return StackStructReturn;
	}

	/// Make a copy of an aggregate at address specified by "Src" to address
	/// "Dst" with size and alignment information specified by the specific
	/// parameter attribute. The copy will be passed as a byval function parameter.
	///
	/// The copy is emitted as a non-volatile, always-inline memcpy node that is
	/// not a tail call; the byval size is materialized as an i32 constant.
	/// \return The memcpy node created on \p Chain.
	static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
	                                         SDValue Chain, ISD::ArgFlagsTy Flags,
	                                         SelectionDAG &DAG, const SDLoc &dl) {
	  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

	  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
	                       /*isVolatile*/false, /*AlwaysInline=*/true,
	                       /*isTailCall*/false,
	                       MachinePointerInfo(), MachinePointerInfo());
	}

	/// Return true if the calling convention is one that we can guarantee TCO for.
	/// These are Fast, GHC, X86_RegCall, HiPE and HHVM.
	static bool canGuaranteeTCO(CallingConv::ID CC) {
	  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
	          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
	          CC == CallingConv::HHVM);
	}

	/// Return true if tail call optimization is ever possible for calls using
	/// calling convention \p CC: the C-style conventions, the callee-pop
	/// conventions, and every convention accepted by canGuaranteeTCO.
	static bool mayTailCallThisCC(CallingConv::ID CC) {
	  // C calling conventions.
	  if (CC == CallingConv::C || CC == CallingConv::Win64 ||
	      CC == CallingConv::X86_64_SysV)
	    return true;

	  // Callee pop conventions.
	  if (CC == CallingConv::X86_ThisCall || CC == CallingConv::X86_StdCall ||
	      CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall)
	    return true;

	  // Anything else qualifies only if TCO can be guaranteed for it.
	  return canGuaranteeTCO(CC);
	}

	/// Return true when guaranteed tail call optimization is enabled and \p CC is
	/// a convention it can be guaranteed for, i.e. the function is turned into a
	/// tail call target by changing its ABI.
	static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
	  if (!GuaranteedTailCallOpt)
	    return false;
	  return canGuaranteeTCO(CC);
	}

	/// Return true if the call \p CI may be emitted as a tail call.
	///
	/// The call must be marked as a tail call, the enclosing function must not
	/// have the "disable-tail-calls" attribute set to "true", and the callee's
	/// calling convention must be one accepted by mayTailCallThisCC.
	bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
	  auto Attr =
	      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
	  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
	    return false;

	  ImmutableCallSite CS(CI);
	  CallingConv::ID CalleeCC = CS.getCallingConv();
	  return mayTailCallThisCC(CalleeCC);
	}

	/// Lower an incoming argument that was assigned a memory (stack) location.
	///
	/// For byval arguments the result is the frame index of the fixed stack
	/// object; for all other arguments it is a load from the fixed stack object
	/// (narrowed back to the value type when a mask value was extended in memory).
	///
	/// \param Chain    Chain the created loads are attached to.
	/// \param CallConv Calling convention of the current function.
	/// \param Ins      All incoming arguments of the function.
	/// \param dl       Debug location for the created nodes.
	/// \param DAG      The DAG being built.
	/// \param VA       Location assigned to this argument.
	/// \param MFI      Frame info in which fixed objects are created.
	/// \param i        Index of this argument in \p Ins.
	SDValue
	X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
	                                    const SmallVectorImpl<ISD::InputArg> &Ins,
	                                    const SDLoc &dl, SelectionDAG &DAG,
	                                    const CCValAssign &VA,
	                                    MachineFrameInfo &MFI, unsigned i) const {
	  // Create the nodes corresponding to a load from this parameter slot.
	  ISD::ArgFlagsTy Flags = Ins[i].Flags;
	  bool AlwaysUseMutable = shouldGuaranteeTCO(
	      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
	  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
	  EVT ValVT;
	  MVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // If value is passed by pointer we have address passed instead of the value
	  // itself. No need to extend if the mask value and location share the same
	  // absolute size.
	  bool ExtendedInMem =
	      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
	      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

	  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
	    ValVT = VA.getLocVT();
	  else
	    ValVT = VA.getValVT();

	  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
	  // taken by a return address.
	  int Offset = 0;
	  if (CallConv == CallingConv::X86_INTR) {
	    // X86 interrupts may take one or two arguments.
	    // On the stack there will be no return address as in regular call.
	    // Offset of last argument need to be set to -4/-8 bytes.
	    // Where offset of the first argument out of two, should be set to 0 bytes.
	    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
	    if (Subtarget.is64Bit() && Ins.size() == 2) {
	      // The stack pointer needs to be realigned for 64 bit handlers with error
	      // code, so the argument offset changes by 8 bytes.
	      Offset += 8;
	    }
	  }

	  // FIXME: For now, all byval parameter objects are marked mutable. This can be
	  // changed with more analysis.
	  // In case of tail call optimization mark all arguments mutable. Since they
	  // could be overwritten by lowering of arguments in case of a tail call.
	  if (Flags.isByVal()) {
	    unsigned Bytes = Flags.getByValSize();
	    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
	    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
	    // Adjust SP offset of interrupt parameter.
	    if (CallConv == CallingConv::X86_INTR) {
	      MFI.setObjectOffset(FI, Offset);
	    }
	    return DAG.getFrameIndex(FI, PtrVT);
	  }

	  // This is an argument in memory. We might be able to perform copy elision.
	  if (Flags.isCopyElisionCandidate()) {
	    EVT ArgVT = Ins[i].ArgVT;
	    SDValue PartAddr;
	    if (Ins[i].PartOffset == 0) {
	      // If this is a one-part value or the first part of a multi-part value,
	      // create a stack object for the entire argument value type and return a
	      // load from our portion of it. This assumes that if the first part of an
	      // argument is in memory, the rest will also be in memory.
	      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
	                                     /*Immutable=*/false);
	      PartAddr = DAG.getFrameIndex(FI, PtrVT);
	      return DAG.getLoad(
	          ValVT, dl, Chain, PartAddr,
	          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	    } else {
	      // This is not the first piece of an argument in memory. See if there is
	      // already a fixed stack object including this offset. If so, assume it
	      // was created by the PartOffset == 0 branch above and create a load from
	      // the appropriate offset into it.
	      int64_t PartBegin = VA.getLocMemOffset();
	      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
	      int FI = MFI.getObjectIndexBegin();
	      for (; MFI.isFixedObjectIndex(FI); ++FI) {
	        int64_t ObjBegin = MFI.getObjectOffset(FI);
	        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
	        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
	          break;
	      }
	      // FI is still a fixed object index only if the search above found a
	      // containing object; otherwise fall through to the generic path.
	      if (MFI.isFixedObjectIndex(FI)) {
	        SDValue Addr =
	            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
	                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
	        return DAG.getLoad(
	            ValVT, dl, Chain, Addr,
	            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
	                                              Ins[i].PartOffset));
	      }
	    }
	  }

	  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
	                                 VA.getLocMemOffset(), isImmutable);

	  // Set SExt or ZExt flag.
	  if (VA.getLocInfo() == CCValAssign::ZExt) {
	    MFI.setObjectZExt(FI, true);
	  } else if (VA.getLocInfo() == CCValAssign::SExt) {
	    MFI.setObjectSExt(FI, true);
	  }

	  // Adjust SP offset of interrupt parameter.
	  if (CallConv == CallingConv::X86_INTR) {
	    MFI.setObjectOffset(FI, Offset);
	  }

	  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
	  SDValue Val = DAG.getLoad(
	      ValVT, dl, Chain, FIN,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	  // A mask value that was widened in memory is converted back to its value
	  // type: SCALAR_TO_VECTOR for vectors, TRUNCATE for scalars.
	  return ExtendedInMem
	             ? (VA.getValVT().isVector()
	                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
	                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
	             : Val;
	}

	// FIXME: Get this from tablegen.
	/// Returns the general purpose registers used for passing integer arguments
	/// under \p CallConv on a 64-bit subtarget: RCX, RDX, R8, R9 for Win64
	/// conventions and RDI, RSI, RDX, RCX, R8, R9 otherwise.
	static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());

	  static const MCPhysReg Win64IntArgRegs[] = {
	    X86::RCX, X86::RDX, X86::R8, X86::R9
	  };
	  static const MCPhysReg DefaultIntArgRegs[] = {
	    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
	  };

	  if (!Subtarget.isCallingConvWin64(CallConv))
	    return makeArrayRef(std::begin(DefaultIntArgRegs),
	                        std::end(DefaultIntArgRegs));

	  return makeArrayRef(std::begin(Win64IntArgRegs), std::end(Win64IntArgRegs));
	}

	// FIXME: Get this from tablegen.
	/// Returns the XMM registers that may carry variadic arguments under
	/// \p CallConv on a 64-bit subtarget.
	///
	/// The result is empty for Win64 conventions, and also when soft-float is in
	/// use, the function has the NoImplicitFloat attribute, or SSE1 is
	/// unavailable. Otherwise XMM0 through XMM7 are returned.
	static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
	                                                CallingConv::ID CallConv,
	                                                const X86Subtarget &Subtarget) {
	  assert(Subtarget.is64Bit());
	  if (Subtarget.isCallingConvWin64(CallConv)) {
	    // The XMM registers which might contain var arg parameters are shadowed
	    // in their paired GPR. So we only need to save the GPR to their home
	    // slots.
	    // TODO: __vectorcall will change this.
	    return None;
	  }

	  const Function *Fn = MF.getFunction();
	  bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
	  bool isSoftFloat = Subtarget.useSoftFloat();
	  assert(!(isSoftFloat && NoImplicitFloatOps) &&
	         "SSE register cannot be used when SSE is disabled!");
	  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
	    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
	    // registers.
	    return None;

	  static const MCPhysReg XMMArgRegs64Bit[] = {
	    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	  };
	  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
	}

	#ifndef NDEBUG
	/// Debug-only helper: returns true when the value numbers of \p ArgLocs never
	/// decrease from one location to the next (equal neighbours are allowed).
	static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
	  for (size_t Idx = 1, Count = ArgLocs.size(); Idx < Count; ++Idx) {
	    // A location with a smaller value number than its predecessor breaks
	    // the ordering.
	    if (ArgLocs[Idx].getValNo() < ArgLocs[Idx - 1].getValNo())
	      return false;
	  }
	  return true;
	}
	#endif

	/// Lower the incoming (formal) arguments of the current function.
	///
	/// Assigns a location to every argument, creates the nodes that read each
	/// argument from its register or stack slot, and records per-function
	/// bookkeeping in X86MachineFunctionInfo (sret register, vararg frame
	/// indices and register save area, bytes to pop on return, argument stack
	/// size, forwarded musttail registers).
	///
	/// \param Chain    Incoming chain; the updated chain is returned.
	/// \param CallConv Calling convention of the function.
	/// \param isVarArg Whether the function is variadic.
	/// \param Ins      Description of the incoming arguments.
	/// \param dl       Debug location for the created nodes.
	/// \param DAG      The DAG being built.
	/// \param InVals   Receives one SDValue per lowered argument.
	SDValue X86TargetLowering::LowerFormalArguments(
	    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
	    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
	    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();

	  // Force a frame pointer for an externally visible "main" on Cygwin/MinGW.
	  const Function *Fn = MF.getFunction();
	  if (Fn->hasExternalLinkage() &&
	      Subtarget.isTargetCygMing() &&
	      Fn->getName() == "main")
	    FuncInfo->setForceFramePointer(true);

	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  bool Is64Bit = Subtarget.is64Bit();
	  bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);

	  assert(
	      !(isVarArg && canGuaranteeTCO(CallConv)) &&
	      "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");

	  if (CallConv == CallingConv::X86_INTR) {
	    bool isLegal = Ins.size() == 1 ||
	                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
	                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
	    if (!isLegal)
	      report_fatal_error("X86 interrupts may take one or two arguments");
	  }

	  // Assign locations to all of the incoming arguments.
	  SmallVector<CCValAssign, 16> ArgLocs;
	  CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	  // Allocate shadow area for Win64.
	  if (IsWin64)
	    CCInfo.AllocateStack(32, 8);

	  CCInfo.AnalyzeArguments(Ins, CC_X86);

	  // In vectorcall calling convention a second pass is required for the HVA
	  // types.
	  if (CallingConv::X86_VectorCall == CallConv) {
	    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
	  }

	  // The next loop assumes that the locations are in the same order of the
	  // input arguments.
	  assert(isSortedByValueNo(ArgLocs) &&
	         "Argument Location list must be sorted before lowering");

	  // I indexes ArgLocs and may advance twice per iteration (split v64i1),
	  // while InsIndex advances once per argument.
	  SDValue ArgValue;
	  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
	       ++I, ++InsIndex) {
	    assert(InsIndex < Ins.size() && "Invalid Ins index");
	    CCValAssign &VA = ArgLocs[I];

	    if (VA.isRegLoc()) {
	      EVT RegVT = VA.getLocVT();
	      if (VA.needsCustom()) {
	        assert(
	            VA.getValVT() == MVT::v64i1 &&
	            "Currently the only custom case is when we split v64i1 to 2 regs");

	        // v64i1 values, in regcall calling convention, that are
	        // compiled to 32 bit arch, are split up into two registers.
	        ArgValue =
	            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
	      } else {
	        // Pick the register class matching the location type.
	        const TargetRegisterClass *RC;
	        if (RegVT == MVT::i32)
	          RC = &X86::GR32RegClass;
	        else if (Is64Bit && RegVT == MVT::i64)
	          RC = &X86::GR64RegClass;
	        else if (RegVT == MVT::f32)
	          RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
	        else if (RegVT == MVT::f64)
	          RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
	        else if (RegVT == MVT::f80)
	          RC = &X86::RFP80RegClass;
	        else if (RegVT == MVT::f128)
	          RC = &X86::FR128RegClass;
	        else if (RegVT.is512BitVector())
	          RC = &X86::VR512RegClass;
	        else if (RegVT.is256BitVector())
	          RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
	        else if (RegVT.is128BitVector())
	          RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
	        else if (RegVT == MVT::x86mmx)
	          RC = &X86::VR64RegClass;
	        else if (RegVT == MVT::v1i1)
	          RC = &X86::VK1RegClass;
	        else if (RegVT == MVT::v8i1)
	          RC = &X86::VK8RegClass;
	        else if (RegVT == MVT::v16i1)
	          RC = &X86::VK16RegClass;
	        else if (RegVT == MVT::v32i1)
	          RC = &X86::VK32RegClass;
	        else if (RegVT == MVT::v64i1)
	          RC = &X86::VK64RegClass;
	        else
	          llvm_unreachable("Unknown argument type!");

	        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
	        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
	      }

	      // If this is an 8 or 16-bit value, it is really passed promoted to 32
	      // bits. Insert an assert[sz]ext to capture this, then truncate to the
	      // right size.
	      if (VA.getLocInfo() == CCValAssign::SExt)
	        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	      else if (VA.getLocInfo() == CCValAssign::ZExt)
	        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
	                               DAG.getValueType(VA.getValVT()));
	      else if (VA.getLocInfo() == CCValAssign::BCvt)
	        ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);

	      if (VA.isExtInLoc()) {
	        // Handle MMX values passed in XMM regs.
	        if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
	          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
	        else if (VA.getValVT().isVector() &&
	                 VA.getValVT().getScalarType() == MVT::i1 &&
	                 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
	                  (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
	          // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
	          ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
	        } else
	          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
	      }
	    } else {
	      assert(VA.isMemLoc());
	      ArgValue =
	          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
	    }

	    // If value is passed via pointer - do a load.
	    if (VA.getLocInfo() == CCValAssign::Indirect)
	      ArgValue =
	          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

	    InVals.push_back(ArgValue);
	  }

	  for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
	    // Swift calling convention does not require we copy the sret argument
	    // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
	    if (CallConv == CallingConv::Swift)
	      continue;

	    // All x86 ABIs require that for returning structs by value we copy the
	    // sret argument into %rax/%eax (depending on ABI) for the return. Save
	    // the argument into a virtual register so that we can access it from the
	    // return points.
	    if (Ins[I].Flags.isSRet()) {
	      unsigned Reg = FuncInfo->getSRetReturnReg();
	      if (!Reg) {
	        MVT PtrTy = getPointerTy(DAG.getDataLayout());
	        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
	        FuncInfo->setSRetReturnReg(Reg);
	      }
	      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
	      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
	      // Only the first sret argument is handled.
	      break;
	    }
	  }

	  unsigned StackSize = CCInfo.getNextStackOffset();
	  // Align stack specially for tail calls.
	  if (shouldGuaranteeTCO(CallConv,
	                         MF.getTarget().Options.GuaranteedTailCallOpt))
	    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

	  // If the function takes variable number of arguments, make a frame index for
	  // the start of the first vararg value... for expansion of llvm.va_start. We
	  // can skip this if there are no va_start calls.
	  if (MFI.hasVAStart() &&
	      (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
	                   CallConv != CallingConv::X86_ThisCall))) {
	    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
	  }

	  // Figure out if XMM registers are in use.
	  assert(!(Subtarget.useSoftFloat() &&
	           Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
	         "SSE register cannot be used when SSE is disabled!");

	  // 64-bit calling conventions support varargs and register parameters, so we
	  // have to do extra work to spill them in the prologue.
	  if (Is64Bit && isVarArg && MFI.hasVAStart()) {
	    // Find the first unallocated argument registers.
	    ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
	    ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
	    unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
	    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
	    assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
	           "SSE register cannot be used when SSE is disabled!");

	    // Gather all the live in physical registers.
	    SmallVector<SDValue, 6> LiveGPRs;
	    SmallVector<SDValue, 8> LiveXMMRegs;
	    SDValue ALVal;
	    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
	      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
	      LiveGPRs.push_back(
	          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
	    }
	    if (!ArgXMMs.empty()) {
	      unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	      ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
	      for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
	        unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
	        LiveXMMRegs.push_back(
	            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
	      }
	    }

	    if (IsWin64) {
	      // Get to the caller-allocated home save location. Add 8 to account
	      // for the return address.
	      int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
	      FuncInfo->setRegSaveFrameIndex(
	          MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
	      // Fixup to set vararg frame on shadow area (4 x i64).
	      if (NumIntRegs < 4)
	        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
	    } else {
	      // For X86-64, if there are vararg parameters that are passed via
	      // registers, then we must store them to their spots on the stack so
	      // they may be loaded by dereferencing the result of va_next.
	      FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
	      FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
	      FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
	          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
	    }

	    // Store the integer parameter registers.
	    SmallVector<SDValue, 8> MemOps;
	    SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
	                                      getPointerTy(DAG.getDataLayout()));
	    unsigned Offset = FuncInfo->getVarArgsGPOffset();
	    for (SDValue Val : LiveGPRs) {
	      SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	                                RSFIN, DAG.getIntPtrConstant(Offset, dl));
	      SDValue Store =
	          DAG.getStore(Val.getValue(1), dl, Val, FIN,
	                       MachinePointerInfo::getFixedStack(
	                           DAG.getMachineFunction(),
	                           FuncInfo->getRegSaveFrameIndex(), Offset));
	      MemOps.push_back(Store);
	      Offset += 8;
	    }

	    if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
	      // Now store the XMM (fp + vector) parameter registers.
	      SmallVector<SDValue, 12> SaveXMMOps;
	      SaveXMMOps.push_back(Chain);
	      SaveXMMOps.push_back(ALVal);
	      SaveXMMOps.push_back(DAG.getIntPtrConstant(
	                             FuncInfo->getRegSaveFrameIndex(), dl));
	      SaveXMMOps.push_back(DAG.getIntPtrConstant(
	                             FuncInfo->getVarArgsFPOffset(), dl));
	      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
	                        LiveXMMRegs.end());
	      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
	                                   MVT::Other, SaveXMMOps));
	    }

	    if (!MemOps.empty())
	      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
	  }

	  if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
	    // Find the largest legal vector type.
	    MVT VecVT = MVT::Other;
	    // FIXME: Only some x86_32 calling conventions support AVX512.
	    if (Subtarget.hasAVX512() &&
	        (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
	                     CallConv == CallingConv::Intel_OCL_BI)))
	      VecVT = MVT::v16f32;
	    else if (Subtarget.hasAVX())
	      VecVT = MVT::v8f32;
	    else if (Subtarget.hasSSE2())
	      VecVT = MVT::v4f32;

	    // We forward some GPRs and some vector types.
	    SmallVector<MVT, 2> RegParmTypes;
	    MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
	    RegParmTypes.push_back(IntVT);
	    if (VecVT != MVT::Other)
	      RegParmTypes.push_back(VecVT);

	    // Compute the set of forwarded registers. The rest are scratch.
	    SmallVectorImpl<ForwardedRegister> &Forwards =
	        FuncInfo->getForwardedMustTailRegParms();
	    CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);

	    // Conservatively forward AL on x86_64, since it might be used for varargs.
	    if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
	      unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
	      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
	    }

	    // Copy all forwards from physical to virtual registers.
	    for (ForwardedRegister &F : Forwards) {
	      // FIXME: Can we use a less constrained schedule?
	      SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	      F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
	      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
	    }
	  }

	  // Some CCs need callee pop.
	  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
	    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
	  } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
	    // X86 interrupts must pop the error code (and the alignment padding) if
	    // present.
	    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
	  } else {
	    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
	    // If this is an sret function, the return should pop the hidden pointer.
	    if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	        !Subtarget.getTargetTriple().isOSMSVCRT() &&
	        argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
	      FuncInfo->setBytesToPopOnReturn(4);
	  }

	  if (!Is64Bit) {
	    // RegSaveFrameIndex is X86-64 only.
	    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
	    if (CallConv == CallingConv::X86_FastCall ||
	        CallConv == CallingConv::X86_ThisCall)
	      // fastcc functions can't have varargs.
	      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
	  }

	  FuncInfo->setArgumentStackSize(StackSize);

	  if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
	    EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
	    if (Personality == EHPersonality::CoreCLR) {
	      assert(Is64Bit);
	      // TODO: Add a mechanism to frame lowering that will allow us to indicate
	      // that we'd prefer this slot be allocated towards the bottom of the frame
	      // (i.e. near the stack pointer after allocating the frame). Every
	      // funclet needs a copy of this slot in its (mostly empty) frame, and the
	      // offset from the bottom of this and each funclet's frame must be the
	      // same, so the size of funclets' (mostly empty) frames is dictated by
	      // how far this slot is from the bottom (since they allocate just enough
	      // space to accommodate holding this slot at the correct offset).
	      int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
	      EHInfo->PSPSymFrameIdx = PSPSymFI;
	    }
	  }

	  // For regcall, or functions with "no_caller_saved_registers", remove every
	  // live-in register from the callee-saved set.
	  if (CallConv == CallingConv::X86_RegCall ||
	      Fn->hasFnAttribute("no_caller_saved_registers")) {
	    const MachineRegisterInfo &MRI = MF.getRegInfo();
	    for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
	      MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
	  }

	  return Chain;
	}

	/// Lower a single outgoing call argument that was assigned a memory
	/// location. The destination address is StackPtr plus the offset recorded in
	/// VA. A byval argument is copied there with CreateCopyOfByValArgument; any
	/// other argument is written with a plain store. Returns the node produced
	/// by that copy or store.
	SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
	                                            SDValue Arg, const SDLoc &dl,
	                                            SelectionDAG &DAG,
	                                            const CCValAssign &VA,
	                                            ISD::ArgFlagsTy Flags) const {
	  const unsigned ArgOffset = VA.getLocMemOffset();

	  // Destination address = StackPtr + ArgOffset.
	  SDValue OffsetNode = DAG.getIntPtrConstant(ArgOffset, dl);
	  SDValue DestAddr = DAG.getNode(ISD::ADD, dl,
	                                 getPointerTy(DAG.getDataLayout()), StackPtr,
	                                 OffsetNode);

	  if (!Flags.isByVal()) {
	    return DAG.getStore(
	        Chain, dl, Arg, DestAddr,
	        MachinePointerInfo::getStack(DAG.getMachineFunction(), ArgOffset));
	  }

	  return CreateCopyOfByValArgument(Arg, DestAddr, Chain, Flags, DAG, dl);
	}

	/// Emit a load of the current return address from its frame slot.
	/// The loaded value is handed back through OutRetAddr and the function
	/// returns result #1 of the load node (its output chain). Deciding whether
	/// the load is needed at all is left to the caller; IsTailCall, Is64Bit and
	/// FPDiff are not consulted here.
	SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
	    SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
	    bool Is64Bit, int FPDiff, const SDLoc &dl) const {
	  const EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  // Frame index of the slot that currently holds the return address.
	  SDValue RetAddrSlot = getReturnAddressFrameIndex(DAG);

	  // Read the "old" return address out of that slot.
	  OutRetAddr =
	      DAG.getLoad(PtrVT, dl, Chain, RetAddrSlot, MachinePointerInfo());
	  return OutRetAddr.getValue(1);
	}

	/// Emit a store of the previously loaded return address (RetAddrFrIdx) into
	/// a new fixed stack object located at offset FPDiff - SlotSize. When FPDiff
	/// is zero nothing is emitted and Chain is returned unchanged; otherwise the
	/// store node is returned.
	static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
	                                        SDValue Chain, SDValue RetAddrFrIdx,
	                                        EVT PtrVT, unsigned SlotSize,
	                                        int FPDiff, const SDLoc &dl) {
	  // No frame-pointer delta: the return address stays where it is.
	  if (FPDiff == 0)
	    return Chain;

	  // Create the fixed object that will hold the relocated return address.
	  const int64_t NewSlotOffset = static_cast<int64_t>(FPDiff) - SlotSize;
	  const int NewSlotFI =
	      MF.getFrameInfo().CreateFixedObject(SlotSize, NewSlotOffset, false);
	  SDValue NewSlotAddr = DAG.getFrameIndex(NewSlotFI, PtrVT);

	  // Write the return address into the new slot.
	  return DAG.getStore(
	      Chain, dl, RetAddrFrIdx, NewSlotAddr,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), NewSlotFI));
	}

	/// Build a vector shuffle of V1 and V2 (both of type VT) using the mask
	/// <NumElems, 1, 2, ..., NumElems - 1>, which is the mask shape used for a
	/// movss / movsd / movd style operation of the given width.
	static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
	                       SDValue V2) {
	  const unsigned EltCount = VT.getVectorNumElements();

	  SmallVector<int, 8> ShuffleMask;
	  // Lane 0 uses index EltCount; every remaining lane keeps its own index.
	  ShuffleMask.push_back(EltCount);
	  unsigned Lane = 1;
	  while (Lane != EltCount) {
	    ShuffleMask.push_back(Lane);
	    ++Lane;
	  }

	  return DAG.getVectorShuffle(VT, dl, V1, V2, ShuffleMask);
	}

	/// Lower the outgoing call described by CLI into SelectionDAG nodes.
	///
	/// The function decides whether the call is emitted as a tail call (note that
	/// isTailCall is a reference to CLI.IsTailCall, so the decision is written
	/// back into CLI), assigns argument locations with CC_X86, emits the register
	/// copies and stack stores for the arguments, rewrites the callee operand and
	/// attaches a call-preserved register mask.
	///
	/// For a tail call the result is an X86ISD::TC_RETURN node. Otherwise an
	/// X86ISD::CALL node is emitted, followed by CALLSEQ_END (unless this is a
	/// sibcall), and the value returned is whatever LowerCallResult produces;
	/// InVals is passed through to LowerCallResult.
	SDValue
	X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
	SmallVectorImpl<SDValue> &InVals) const {
	SelectionDAG &DAG = CLI.DAG;
	SDLoc &dl = CLI.DL;
	SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
	SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
	SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
	SDValue Chain = CLI.Chain;
	SDValue Callee = CLI.Callee;
	CallingConv::ID CallConv = CLI.CallConv;
	bool &isTailCall = CLI.IsTailCall;
	bool isVarArg = CLI.IsVarArg;

	MachineFunction &MF = DAG.getMachineFunction();
	bool Is64Bit = Subtarget.is64Bit();
	bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
	StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
	bool IsSibcall = false;
	X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
	auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
	const CallInst *CI =
	CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
	const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
	// The attribute may be present either on the call site or on the callee.
	bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) \|\|
	(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));

	if (CallConv == CallingConv::X86_INTR)
	report_fatal_error("X86 interrupts may not be called directly");

	// The caller's "disable-tail-calls" attribute turns off tail calls here; a
	// musttail call re-enables it below.
	if (Attr.getValueAsString() == "true")
	isTailCall = false;

	if (Subtarget.isPICStyleGOT() &&
	!MF.getTarget().Options.GuaranteedTailCallOpt) {
	// If we are using a GOT, disable tail calls to external symbols with
	// default visibility. Tail calling such a symbol requires using a GOT
	// relocation, which forces early binding of the symbol. This breaks code
	// that requires lazy function symbol resolution. Using musttail or
	// GuaranteedTailCallOpt will override this.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (!G \|\| (!G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility()))
	isTailCall = false;
	}

	bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
	if (IsMustTail) {
	// Force this to be a tail call. The verifier rules are enough to ensure
	// that we can lower this successfully without moving the return address
	// around.
	isTailCall = true;
	} else if (isTailCall) {
	// Check if it's really possible to do a tail call.
	isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
	isVarArg, SR != NotStructReturn,
	MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
	Outs, OutVals, Ins, DAG);

	// Sibcalls are automatically detected tailcalls which do not require
	// ABI changes.
	if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
	IsSibcall = true;

	if (isTailCall)
	++NumTailCalls;
	}

	assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
	"Var args not supported with calling convention fastcc, ghc or hipe");

	// Analyze operands of the call, assigning locations to each operand.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());

	// Allocate shadow area for Win64.
	if (IsWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeArguments(Outs, CC_X86);

	// In vectorcall calling convention a second pass is required for the HVA
	// types.
	if (CallingConv::X86_VectorCall == CallConv) {
	CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
	}

	// Get a count of how many bytes are to be pushed on the stack.
	unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
	if (IsSibcall)
	// This is a sibcall. The memory operands are available in caller's
	// own caller's stack.
	NumBytes = 0;
	else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
	canGuaranteeTCO(CallConv))
	NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

	// FPDiff stays 0 for sibcalls, musttail calls and ordinary calls.
	int FPDiff = 0;
	if (isTailCall && !IsSibcall && !IsMustTail) {
	// Lower arguments at fp - stackoffset + fpdiff.
	unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

	FPDiff = NumBytesCallerPushed - NumBytes;

	// Set the delta of movement of the returnaddr stackslot.
	// Note: the check below records FPDiff only when it is smaller (i.e. more
	// negative) than the delta recorded so far.
	if (FPDiff < X86Info->getTCReturnAddrDelta())
	X86Info->setTCReturnAddrDelta(FPDiff);
	}

	unsigned NumBytesToPush = NumBytes;
	unsigned NumBytesToPop = NumBytes;

	// If we have an inalloca argument, all stack space has already been allocated
	// for us and is right at the top of the stack. We don't support multiple
	// arguments passed in memory when using inalloca.
	if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
	NumBytesToPush = 0;
	if (!ArgLocs.back().isMemLoc())
	report_fatal_error("cannot use inalloca attribute on a register "
	"parameter");
	if (ArgLocs.back().getLocMemOffset() != 0)
	report_fatal_error("any parameter with the inalloca attribute must be "
	"the only memory argument");
	}

	// Sibcalls get no CALLSEQ_START / CALLSEQ_END pair.
	if (!IsSibcall)
	Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
	NumBytes - NumBytesToPush, dl);

	SDValue RetAddrFrIdx;
	// Load return address for tail calls.
	if (isTailCall && FPDiff)
	Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
	Is64Bit, FPDiff, dl);

	SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
	SmallVector<SDValue, 8> MemOpChains;
	SDValue StackPtr;

	// The next loop assumes that the locations are in the same order of the
	// input arguments.
	assert(isSortedByValueNo(ArgLocs) &&
	"Argument Location list must be sorted before lowering");

	// Walk the register/memloc assignments, inserting copies/loads. In the case
	// of tail call optimization, arguments are handled later.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutIndex) {
	assert(OutIndex < Outs.size() && "Invalid Out index");
	// Skip inalloca arguments, they have already been written.
	ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
	if (Flags.isInAlloca())
	continue;

	CCValAssign &VA = ArgLocs[I];
	EVT RegVT = VA.getLocVT();
	SDValue Arg = OutVals[OutIndex];
	bool isByVal = Flags.isByVal();

	// Promote the value if needed.
	switch (VA.getLocInfo()) {
	default: llvm_unreachable("Unknown loc info!");
	case CCValAssign::Full: break;
	case CCValAssign::SExt:
	Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::ZExt:
	Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::AExt:
	if (Arg.getValueType().isVector() &&
	Arg.getValueType().getVectorElementType() == MVT::i1)
	Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
	else if (RegVT.is128BitVector()) {
	// Special case: passing MMX values in XMM registers.
	Arg = DAG.getBitcast(MVT::i64, Arg);
	Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
	Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
	} else
	Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
	break;
	case CCValAssign::BCvt:
	Arg = DAG.getBitcast(RegVT, Arg);
	break;
	case CCValAssign::Indirect: {
	// Store the argument into a stack temporary and pass the temporary's
	// address instead.
	SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
	int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
	Chain = DAG.getStore(
	Chain, dl, Arg, SpillSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
	Arg = SpillSlot;
	break;
	}
	}

	if (VA.needsCustom()) {
	assert(VA.getValVT() == MVT::v64i1 &&
	"Currently the only custom case is when we split v64i1 to 2 regs");
	// Split v64i1 value into two registers. ++I consumes the second location
	// that belongs to this same argument.
	Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
	Subtarget);
	} else if (VA.isRegLoc()) {
	RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
	if (isVarArg && IsWin64) {
	// Win64 ABI requires argument XMM reg to be copied to the corresponding
	// shadow reg if callee is a varargs function.
	unsigned ShadowReg = 0;
	switch (VA.getLocReg()) {
	case X86::XMM0: ShadowReg = X86::RCX; break;
	case X86::XMM1: ShadowReg = X86::RDX; break;
	case X86::XMM2: ShadowReg = X86::R8; break;
	case X86::XMM3: ShadowReg = X86::R9; break;
	}
	if (ShadowReg)
	RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
	}
	} else if (!IsSibcall && (!isTailCall \|\| isByVal)) {
	assert(VA.isMemLoc());
	// The stack pointer copy is created lazily, on the first memory argument.
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
	dl, DAG, VA, Flags));
	}
	}

	if (!MemOpChains.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

	if (Subtarget.isPICStyleGOT()) {
	// ELF / PIC requires GOT in the EBX register before function calls via PLT
	// GOT pointer.
	if (!isTailCall) {
	RegsToPass.push_back(std::make_pair(
	unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
	getPointerTy(DAG.getDataLayout()))));
	} else {
	// If we are tail calling and generating PIC/GOT style code load the
	// address of the callee into ECX. The value in ecx is used as target of
	// the tail jump. This is done to circumvent the ebx/callee-saved problem
	// for tail calls on PIC/GOT architectures. Normally we would just put the
	// address of GOT into ebx and then call target@PLT. But for tail calls
	// ebx would be restored (since ebx is callee saved) before jumping to the
	// target@PLT.

	// Note: The actual moving to ECX is done further down.
	GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
	if (G && !G->getGlobal()->hasLocalLinkage() &&
	G->getGlobal()->hasDefaultVisibility())
	Callee = LowerGlobalAddress(Callee, DAG);
	else if (isa<ExternalSymbolSDNode>(Callee))
	Callee = LowerExternalSymbol(Callee, DAG);
	}
	}

	if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
	// From AMD64 ABI document:
	// For calls that may call functions that use varargs or stdargs
	// (prototype-less calls or calls to functions containing ellipsis (...) in
	// the declaration) %al is used as hidden argument to specify the number
	// of SSE registers used. The contents of %al do not need to match exactly
	// the number of registers, but must be an upper bound on the number of SSE
	// registers used and is in the range 0 - 8 inclusive.

	// Count the number of XMM registers allocated.
	static const MCPhysReg XMMArgRegs[] = {
	X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
	X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
	};
	unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
	assert((Subtarget.hasSSE1() \|\| !NumXMMRegs)
	&& "SSE registers cannot be used when SSE is disabled");

	RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
	DAG.getConstant(NumXMMRegs, dl,
	MVT::i8)));
	}

	// For a varargs musttail call, pass along the registers recorded in the
	// function info's forwarded-musttail-register list.
	if (isVarArg && IsMustTail) {
	const auto &Forwards = X86Info->getForwardedMustTailRegParms();
	for (const auto &F : Forwards) {
	SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
	RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
	}
	}

	// For tail calls lower the arguments to the 'real' stack slots. Sibcalls
	// don't need this because the eligibility check rejects calls that require
	// shuffling arguments passed in memory.
	if (!IsSibcall && isTailCall) {
	// Force all the incoming stack arguments to be loaded from the stack
	// before any new outgoing arguments are stored to the stack, because the
	// outgoing stack slots may alias the incoming argument stack slots, and
	// the alias isn't otherwise explicit. This is slightly more conservative
	// than necessary, because it means that each store effectively depends
	// on every argument instead of just those arguments it would clobber.
	SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

	SmallVector<SDValue, 8> MemOpChains2;
	SDValue FIN;
	int FI = 0;
	for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
	++I, ++OutsIndex) {
	CCValAssign &VA = ArgLocs[I];

	if (VA.isRegLoc()) {
	if (VA.needsCustom()) {
	assert((CallConv == CallingConv::X86_RegCall) &&
	"Expecting custom case only in regcall calling convention");
	// This means that we are in special case where one argument was
	// passed through two register locations - Skip the next location
	++I;
	}

	continue;
	}

	assert(VA.isMemLoc());
	SDValue Arg = OutVals[OutsIndex];
	ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
	// Skip inalloca arguments. They don't require any work.
	if (Flags.isInAlloca())
	continue;
	// Create frame index. The slot size is the location type's size rounded up
	// to whole bytes.
	int32_t Offset = VA.getLocMemOffset()+FPDiff;
	uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
	FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
	FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));

	if (Flags.isByVal()) {
	// Copy relative to framepointer.
	SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
	if (!StackPtr.getNode())
	StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
	getPointerTy(DAG.getDataLayout()));
	Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
	StackPtr, Source);

	MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
	ArgChain,
	Flags, DAG, dl));
	} else {
	// Store relative to framepointer.
	MemOpChains2.push_back(DAG.getStore(
	ArgChain, dl, Arg, FIN,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
	}
	}

	if (!MemOpChains2.empty())
	Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

	// Store the return address to the appropriate stack slot.
	Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
	getPointerTy(DAG.getDataLayout()),
	RegInfo->getSlotSize(), FPDiff, dl);
	}

	// Build a sequence of copy-to-reg nodes chained together with token chain
	// and flag operands which copy the outgoing args into registers.
	SDValue InFlag;
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
	Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
	RegsToPass[i].second, InFlag);
	InFlag = Chain.getValue(1);
	}

	if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
	assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
	// In the 64-bit large code model, we have to make all calls
	// through a register, since the call instruction's 32-bit
	// pc-relative offset may not be large enough to hold the whole
	// address.
	} else if (Callee->getOpcode() == ISD::GlobalAddress) {
	// If the callee is a GlobalAddress node (quite common, every direct call
	// is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
	// it.
	GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);

	// We should use extra load for direct calls to dllimported functions in
	// non-JIT mode.
	const GlobalValue *GV = G->getGlobal();
	if (!GV->hasDLLImportStorageClass()) {
	unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);

	Callee = DAG.getTargetGlobalAddress(
	GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);

	if (OpFlags == X86II::MO_GOTPCREL) {
	// Add a wrapper.
	Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
	getPointerTy(DAG.getDataLayout()), Callee);
	// Add extra indirection
	Callee = DAG.getLoad(
	getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
	MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	}
	}
	} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
	const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
	unsigned char OpFlags =
	Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);

	Callee = DAG.getTargetExternalSymbol(
	S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
	} else if (Subtarget.isTarget64BitILP32() &&
	Callee->getValueType(0) == MVT::i32) {
	// Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
	Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
	}

	// Returns a chain & a flag for retval copy to use.
	SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	SmallVector<SDValue, 8> Ops;

	// For a non-sibcall tail call the call sequence is closed before the
	// TC_RETURN node is built.
	if (!IsSibcall && isTailCall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Operand order: chain, callee, [FPDiff for tail calls], argument registers,
	// register mask, [glue].
	Ops.push_back(Chain);
	Ops.push_back(Callee);

	if (isTailCall)
	Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));

	// Add argument registers to the end of the list so that they are known live
	// into the call.
	for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
	Ops.push_back(DAG.getRegister(RegsToPass[i].first,
	RegsToPass[i].second.getValueType()));

	// Add a register mask operand representing the call-preserved registers.
	// If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
	// set X86_INTR calling convention because it has the same CSR mask
	// (same preserved registers).
	const uint32_t *Mask = RegInfo->getCallPreservedMask(
	MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
	assert(Mask && "Missing call preserved mask for calling convention");

	// If this is an invoke in a 32-bit function using a funclet-based
	// personality, assume the function clobbers all registers. If an exception
	// is thrown, the runtime will not restore CSRs.
	// FIXME: Model this more precisely so that we can register allocate across
	// the normal edge and spill and fill across the exceptional edge.
	if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
	const Function *CallerFn = MF.getFunction();
	EHPersonality Pers =
	CallerFn->hasPersonalityFn()
	? classifyEHPersonality(CallerFn->getPersonalityFn())
	: EHPersonality::Unknown;
	if (isFuncletEHPersonality(Pers))
	Mask = RegInfo->getNoPreservedMask();
	}

	// Define a new register mask from the existing mask. It stays null unless
	// the RegCall / no_caller_saved_registers branch below allocates one, and is
	// later handed to LowerCallResult as-is.
	uint32_t *RegMask = nullptr;

	// In some calling conventions we need to remove the used physical registers
	// from the reg mask.
	if (CallConv == CallingConv::X86_RegCall \|\| HasNCSR) {
	const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	// Allocate a new Reg Mask and copy Mask. The mask holds one bit per
	// register, packed into 32-bit words.
	RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
	unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
	memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);

	// Make sure all sub registers of the argument registers are reset
	// in the RegMask.
	for (auto const &RegPair : RegsToPass)
	for (MCSubRegIterator SubRegs(RegPair.first, TRI, /IncludeSelf=/true);
	SubRegs.isValid(); ++SubRegs)
	RegMask[SubRegs / 32] &= ~(1u << (SubRegs % 32));

	// Create the RegMask Operand according to our updated mask.
	Ops.push_back(DAG.getRegisterMask(RegMask));
	} else {
	// Create the RegMask Operand according to the static mask.
	Ops.push_back(DAG.getRegisterMask(Mask));
	}

	if (InFlag.getNode())
	Ops.push_back(InFlag);

	if (isTailCall) {
	// We used to do:
	//// If this is the first return lowered for this function, add the regs
	//// to the liveout set for the function.
	// This isn't right, although it's probably harmless on x86; liveouts
	// should be computed from returns not tail calls. Consider a void
	// function making a tail call to a function returning int.
	MF.getFrameInfo().setHasTailCall();
	return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
	}

	Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
	InFlag = Chain.getValue(1);

	// Create the CALLSEQ_END node.
	unsigned NumBytesForCalleeToPop;
	if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
	DAG.getTarget().Options.GuaranteedTailCallOpt))
	NumBytesForCalleeToPop = NumBytes; // Callee pops everything
	else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
	!Subtarget.getTargetTriple().isOSMSVCRT() &&
	SR == StackStructReturn)
	// If this is a call to a struct-return function, the callee
	// pops the hidden struct pointer, so we have to push it back.
	// This is common for Darwin/X86, Linux & Mingw32 targets.
	// For MSVC Win32 targets, the caller pops the hidden struct pointer.
	NumBytesForCalleeToPop = 4;
	else
	NumBytesForCalleeToPop = 0; // Callee pops nothing.

	if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
	// No need to reset the stack after the call if the call doesn't return. To
	// make the MI verify, we'll pretend the callee does it for us.
	NumBytesForCalleeToPop = NumBytes;
	}

	// Returns a flag for retval copy to use.
	if (!IsSibcall) {
	Chain = DAG.getCALLSEQ_END(Chain,
	DAG.getIntPtrConstant(NumBytesToPop, dl, true),
	DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
	true),
	InFlag, dl);
	InFlag = Chain.getValue(1);
	}

	// Handle result values, copying them out of physregs into vregs that we
	// return.
	return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
	InVals, RegMask);
	}

	//===----------------------------------------------------------------------===//
	// Fast Calling Convention (tail call) implementation
	//===----------------------------------------------------------------------===//

	// Like stdcall, the callee cleans up the arguments, except that ECX is
	// reserved for storing the tail called function address. Only 2 registers are
	// free for argument passing (inreg). Tail call optimization is performed
	// provided:
	// * tailcallopt is enabled
	// * caller/callee are fastcc
	// On X86_64 architecture with GOT-style position independent code only local
	// (within module) calls are supported at the moment.
	// To keep the stack aligned according to platform abi the function
	// GetAlignedArgumentStackSize ensures that argument delta is always multiples
	// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
	// If a tail called function callee has more arguments than the caller the
	// caller needs to make sure that there is room to move the RETADDR to. This is
	// achieved by reserving an area the size of the argument delta right after the
	// original RETADDR, but before the saved framepointer or the spilled registers
	// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
	// stack layout:
	// arg1
	// arg2
	// RETADDR
	// [ new RETADDR
	// move area ]
	// (possible EBP)
	// ESI
	// EDI
	// local1 ..

	/// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
	/// requirement.
	unsigned
	X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
	SelectionDAG& DAG) const {
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	unsigned StackAlignment = TFI.getStackAlignment();
	uint64_t AlignMask = StackAlignment - 1;
	int64_t Offset = StackSize;
	unsigned SlotSize = RegInfo->getSlotSize();
	if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
	// Number smaller than 12 so just add the difference.
	Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
	} else {
	// Mask out lower bits, add stackalignment once plus the 12 bytes.
	Offset = ((~AlignMask) & Offset) + StackAlignment +
	(StackAlignment-SlotSize);
	}
	return Offset;
	}

	/// Return true if the given stack call argument is already available in the
	/// same position (relatively) of the caller's incoming argument stack.
	///
	/// Arg is the outgoing argument value, Offset the stack offset it is to be
	/// passed at, Flags its argument flags and VA its assigned location. The
	/// function traces Arg back to a frame index and then requires that frame
	/// object to be a fixed object at exactly Offset, of exactly the expected
	/// size, immutable unless the argument is byval, and with matching
	/// zero/sign-extension flags when the location is wider than the value.
	static
	bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
	MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
	const X86InstrInfo *TII, const CCValAssign &VA) {
	// Expected object size in bytes; replaced by the byval size further down for
	// byval arguments.
	unsigned Bytes = Arg.getValueSizeInBits() / 8;

	for (;;) {
	// Look through nodes that don't alter the bits of the incoming value.
	unsigned Op = Arg.getOpcode();
	if (Op == ISD::ZERO_EXTEND \|\| Op == ISD::ANY_EXTEND \|\| Op == ISD::BITCAST) {
	Arg = Arg.getOperand(0);
	continue;
	}
	// A TRUNCATE is looked through only when its input is an AssertZext whose
	// asserted type equals the truncated type.
	if (Op == ISD::TRUNCATE) {
	const SDValue &TruncInput = Arg.getOperand(0);
	if (TruncInput.getOpcode() == ISD::AssertZext &&
	cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
	Arg.getValueType()) {
	Arg = TruncInput.getOperand(0);
	continue;
	}
	}
	break;
	}

	// INT_MAX is a "not found" sentinel; every path that does not return false
	// below is expected to overwrite it (see the assert).
	int FI = INT_MAX;
	if (Arg.getOpcode() == ISD::CopyFromReg) {
	// The value comes from a register: only a virtual register with a known
	// defining instruction can be traced further.
	unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
	if (!TargetRegisterInfo::isVirtualRegister(VR))
	return false;
	MachineInstr *Def = MRI->getVRegDef(VR);
	if (!Def)
	return false;
	if (!Flags.isByVal()) {
	// Non-byval: the definition must be a load from a stack slot, which also
	// yields the frame index.
	if (!TII->isLoadFromStackSlot(*Def, FI))
	return false;
	} else {
	// Byval: the definition must be an LEA whose operand 1 is a frame index.
	unsigned Opcode = Def->getOpcode();
	if ((Opcode == X86::LEA32r \|\| Opcode == X86::LEA64r \|\|
	Opcode == X86::LEA64_32r) &&
	Def->getOperand(1).isFI()) {
	FI = Def->getOperand(1).getIndex();
	Bytes = Flags.getByValSize();
	} else
	return false;
	}
	} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
	if (Flags.isByVal())
	// ByVal argument is passed in as a pointer but it's now being
	// dereferenced. e.g.
	// define @foo(%struct.X* %A) {
	// tail call @bar(%struct.X* byval %A)
	// }
	return false;
	// The load must read directly from a frame index.
	SDValue Ptr = Ld->getBasePtr();
	FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
	if (!FINode)
	return false;
	FI = FINode->getIndex();
	} else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
	// A byval argument given directly as a frame index.
	FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
	FI = FINode->getIndex();
	Bytes = Flags.getByValSize();
	} else
	return false;

	assert(FI != INT_MAX);
	if (!MFI.isFixedObjectIndex(FI))
	return false;

	// The object must sit at exactly the offset the callee expects.
	if (Offset != MFI.getObjectOffset(FI))
	return false;

	+ // If this is not byval, check that the argument stack object is immutable.
	+ // inalloca and argument copy elision can create mutable argument stack
	+ // objects. Byval objects can be mutated, but a byval call intends to pass the
	+ // mutated memory.
	+ if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
	+ return false;
	+
	if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
	// If the argument location is wider than the argument type, check that any
	// extension flags match.
	if (Flags.isZExt() != MFI.isObjectZExt(FI) \|\|
	Flags.isSExt() != MFI.isObjectSExt(FI)) {
	return false;
	}
	}

	// Finally, the stack object must have exactly the expected size.
	return Bytes == MFI.getObjectSize(FI);
	}

	/// Check whether the call is eligible for tail call optimization. Targets
	/// that want to do tail call optimization should implement this function.
	bool X86TargetLowering::IsEligibleForTailCallOptimization(
	SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
	bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
	const SmallVectorImpl<ISD::OutputArg> &Outs,
	const SmallVectorImpl<SDValue> &OutVals,
	const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
	if (!mayTailCallThisCC(CalleeCC))
	return false;

	// If -tailcallopt is specified, make fastcc functions tail-callable.
	MachineFunction &MF = DAG.getMachineFunction();
	const Function *CallerF = MF.getFunction();

	// If the function return type is x86_fp80 and the callee return type is not,
	// then the FP_EXTEND of the call result is not a nop. It's not safe to
	// perform a tailcall optimization here.
	if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
	return false;

	CallingConv::ID CallerCC = CallerF->getCallingConv();
	bool CCMatch = CallerCC == CalleeCC;
	bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
	bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);

	// Win64 functions have extra shadow space for argument homing. Don't do the
	// sibcall if the caller and callee have mismatched expectations for this
	// space.
	if (IsCalleeWin64 != IsCallerWin64)
	return false;

	if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
	if (canGuaranteeTCO(CalleeCC) && CCMatch)
	return true;
	return false;
	}

	// Look for obvious safe cases to perform tail call optimization that do not
	// require ABI changes. This is what gcc calls sibcall.

	// Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
	// emit a special epilogue.
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	if (RegInfo->needsStackRealignment(MF))
	return false;

	// Also avoid sibcall optimization if either caller or callee uses struct
	// return semantics.
	if (isCalleeStructRet \|\| isCallerStructRet)
	return false;

	// Do not sibcall optimize vararg calls unless all arguments are passed via
	// registers.
	LLVMContext &C = *DAG.getContext();
	if (isVarArg && !Outs.empty()) {
	// Optimizing for varargs on Win64 is unlikely to be safe without
	// additional testing.
	if (IsCalleeWin64 \|\| IsCallerWin64)
	return false;

	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
	if (!ArgLocs[i].isRegLoc())
	return false;
	}

	// If the call result is in ST0 / ST1, it needs to be popped off the x87
	// stack. Therefore, if it's not used by the call it is not safe to optimize
	// this into a sibcall.
	bool Unused = false;
	for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
	if (!Ins[i].Used) {
	Unused = true;
	break;
	}
	}
	if (Unused) {
	SmallVector<CCValAssign, 16> RVLocs;
	CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
	CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
	for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
	CCValAssign &VA = RVLocs[i];
	if (VA.getLocReg() == X86::FP0 \|\| VA.getLocReg() == X86::FP1)
	return false;
	}
	}

	// Check that the call results are passed in the same way.
	if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
	RetCC_X86, RetCC_X86))
	return false;
	// The callee has to preserve all registers the caller needs to preserve.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
	if (!CCMatch) {
	const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
	if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
	return false;
	}

	unsigned StackArgsSize = 0;

	// If the callee takes no arguments then go on to check the results of the
	// call.
	if (!Outs.empty()) {
	// Check if stack adjustment is needed. For now, do not do this if any
	// argument is passed on the stack.
	SmallVector<CCValAssign, 16> ArgLocs;
	CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);

	// Allocate shadow area for Win64
	if (IsCalleeWin64)
	CCInfo.AllocateStack(32, 8);

	CCInfo.AnalyzeCallOperands(Outs, CC_X86);
	StackArgsSize = CCInfo.getNextStackOffset();

	if (CCInfo.getNextStackOffset()) {
	// Check if the arguments are already laid out in the right way as
	// the caller's fixed stack objects.
	MachineFrameInfo &MFI = MF.getFrameInfo();
	const MachineRegisterInfo *MRI = &MF.getRegInfo();
	const X86InstrInfo *TII = Subtarget.getInstrInfo();
	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	SDValue Arg = OutVals[i];
	ISD::ArgFlagsTy Flags = Outs[i].Flags;
	if (VA.getLocInfo() == CCValAssign::Indirect)
	return false;
	if (!VA.isRegLoc()) {
	if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
	MFI, MRI, TII, VA))
	return false;
	}
	}
	}

	bool PositionIndependent = isPositionIndependent();
	// If the tailcall address may be in a register, then make sure it's
	// possible to register allocate for it. In 32-bit, the call address can
	// only target EAX, EDX, or ECX since the tail call must be scheduled after
	// callee-saved registers are restored. These happen to be the same
	// registers used to pass 'inreg' arguments so watch out for those.
	if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
	!isa<ExternalSymbolSDNode>(Callee)) \|\|
	PositionIndependent)) {
	unsigned NumInRegs = 0;
	// In PIC we need an extra register to formulate the address computation
	// for the callee.
	unsigned MaxInRegs = PositionIndependent ? 2 : 3;

	for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
	CCValAssign &VA = ArgLocs[i];
	if (!VA.isRegLoc())
	continue;
	unsigned Reg = VA.getLocReg();
	switch (Reg) {
	default: break;
	case X86::EAX: case X86::EDX: case X86::ECX:
	if (++NumInRegs == MaxInRegs)
	return false;
	break;
	}
	}
	}

	const MachineRegisterInfo &MRI = MF.getRegInfo();
	if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
	return false;
	}

	bool CalleeWillPop =
	X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
	MF.getTarget().Options.GuaranteedTailCallOpt);

	if (unsigned BytesToPop =
	MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
	// If we have bytes to pop, the callee must pop them.
	bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
	if (!CalleePopMatches)
	return false;
	} else if (CalleeWillPop && StackArgsSize > 0) {
	// If we don't have bytes to pop, make sure the callee doesn't pop any.
	return false;
	}

	return true;
	}

	/// Builds the X86 FastISel object by handing both arguments straight to
	/// X86::createFastISel and returning whatever it produces.
	FastISel *
	X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
	const TargetLibraryInfo *libInfo) const {
	return X86::createFastISel(funcInfo, libInfo);
	}

	//===----------------------------------------------------------------------===//
	// Other Lowering Hooks
	//===----------------------------------------------------------------------===//

	/// Returns true when Op has exactly one use and its node is a normal load.
	static bool MayFoldLoad(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  return ISD::isNormalLoad(Op.getNode());
	}

	/// Returns true when Op has exactly one use and that single user is a
	/// normal store.
	static bool MayFoldIntoStore(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  // With a single use, the first entry of the use list is the only user.
	  return ISD::isNormalStore(*Op.getNode()->use_begin());
	}

	/// Returns true when Op has exactly one use and that single user is an
	/// ISD::ZERO_EXTEND node.
	static bool MayFoldIntoZeroExtend(SDValue Op) {
	  if (!Op.hasOneUse())
	    return false;
	  const unsigned UserOpcode = Op.getNode()->use_begin()->getOpcode();
	  return UserOpcode == ISD::ZERO_EXTEND;
	}

	/// Returns true if Opcode is one of the X86ISD opcodes enumerated below,
	/// which this file treats as target shuffles. Any other opcode yields false.
	static bool isTargetShuffle(unsigned Opcode) {
	  switch (Opcode) {
	  case X86ISD::BLENDI:
	  case X86ISD::PSHUFB: case X86ISD::PSHUFD:
	  case X86ISD::PSHUFHW: case X86ISD::PSHUFLW:
	  case X86ISD::SHUFP:
	  case X86ISD::INSERTPS:
	  case X86ISD::EXTRQI: case X86ISD::INSERTQI:
	  case X86ISD::PALIGNR:
	  case X86ISD::VSHLDQ: case X86ISD::VSRLDQ:
	  case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS:
	  case X86ISD::MOVLPS: case X86ISD::MOVLPD:
	  case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP:
	  case X86ISD::MOVSS: case X86ISD::MOVSD:
	  case X86ISD::UNPCKL: case X86ISD::UNPCKH:
	  case X86ISD::VBROADCAST:
	  case X86ISD::VPERMILPI: case X86ISD::VPERMILPV:
	  case X86ISD::VPERM2X128:
	  case X86ISD::VPERMIL2:
	  case X86ISD::VPERMI:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIV3:
	  case X86ISD::VZEXT_MOVL:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Returns true if Opcode is one of the variable-mask target shuffles listed
	/// below, or one of the 'faux' shuffles (ISD::AND, X86ISD::ANDNP). Any other
	/// opcode yields false.
	static bool isTargetShuffleVariableMask(unsigned Opcode) {
	  switch (Opcode) {
	  // Target Shuffles.
	  case X86ISD::PSHUFB:
	  case X86ISD::VPERMILPV: case X86ISD::VPERMIL2:
	  case X86ISD::VPPERM:
	  case X86ISD::VPERMV: case X86ISD::VPERMV3: case X86ISD::VPERMIV3:
	  // 'Faux' Target Shuffles.
	  case ISD::AND:
	  case X86ISD::ANDNP:
	    return true;
	  default:
	    return false;
	  }
	}

	/// Returns a frame-index node for the return address slot. The index is
	/// cached in X86MachineFunctionInfo; when the cached value is 0 a fixed
	/// object of one slot at offset -SlotSize is created and recorded first.
	SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();

	  int RAIdx = X86FI->getRAIndex();
	  if (RAIdx == 0) {
	    // No slot recorded yet: create the fixed object and remember it.
	    const unsigned SlotBytes = TRI->getSlotSize();
	    RAIdx = MF.getFrameInfo().CreateFixedObject(SlotBytes,
	                                                -(int64_t)SlotBytes,
	                                                false);
	    X86FI->setRAIndex(RAIdx);
	  }

	  return DAG.getFrameIndex(RAIdx, getPointerTy(DAG.getDataLayout()));
	}

	/// Returns true if Offset is acceptable as a displacement under code model M.
	///
	/// \param Offset the constant displacement to test.
	/// \param M the code model in effect.
	/// \param hasSymbolicDisplacement true if the displacement is combined with a
	///        symbol; only then do the code-model specific limits below apply.
	bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
	                                       bool hasSymbolicDisplacement) {
	  // Offset should fit into 32 bit immediate field.
	  if (!isInt<32>(Offset))
	    return false;

	  // If we don't have a symbolic displacement - we don't have any extra
	  // restrictions.
	  if (!hasSymbolicDisplacement)
	    return true;

	  // FIXME: Some tweaks might be needed for medium code model.
	  if (M != CodeModel::Small && M != CodeModel::Kernel)
	    return false;

	  // For small code model we assume that latest object is 16MB before end of 31
	  // bits boundary. We may also accept pretty large negative constants knowing
	  // that all objects are in the positive half of address space.
	  if (M == CodeModel::Small && Offset < 16 * 1024 * 1024)
	    return true;

	  // For kernel code model we know that all objects reside in the negative half
	  // of 32bits address space. We may not accept negative offsets, since they may
	  // be just off and we may accept pretty large positive ones.
	  if (M == CodeModel::Kernel && Offset >= 0)
	    return true;

	  return false;
	}

	/// Determines whether the callee is required to pop its own arguments.
	/// Callee pop is necessary to support tail calls.
	///
	/// Non-vararg calls for which shouldGuaranteeTCO() holds are always callee
	/// pop. Otherwise only stdcall, fastcall, thiscall and vectorcall are, and
	/// only when not in 64-bit mode.
	bool X86::isCalleePop(CallingConv::ID CallingConv,
	                      bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
	  // If GuaranteeTCO is true, we force some calls to be callee pop so that we
	  // can guarantee TCO.
	  const bool ForcedForTCO =
	      !IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO);
	  if (ForcedForTCO)
	    return true;

	  switch (CallingConv) {
	  case CallingConv::X86_StdCall:
	  case CallingConv::X86_FastCall:
	  case CallingConv::X86_ThisCall:
	  case CallingConv::X86_VectorCall:
	    return !is64Bit;
	  default:
	    return false;
	  }
	}

	/// \brief Return true if the condition is an unsigned comparison operation.
	///
	/// COND_E and COND_NE are reported as unsigned along with B/A/BE/AE; the
	/// signed conditions G/GE/L/LE return false. Any other code is unreachable.
	static bool isX86CCUnsigned(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_G: case X86::COND_GE:
	  case X86::COND_L: case X86::COND_LE:
	    return false;
	  case X86::COND_E: case X86::COND_NE:
	  case X86::COND_B: case X86::COND_A:
	  case X86::COND_BE: case X86::COND_AE:
	    return true;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Maps an integer ISD::CondCode onto the matching X86 condition code.
	/// Condition codes other than the ten handled here are unreachable.
	static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
	  switch (SetCCOpcode) {
	  // Equality.
	  case ISD::SETEQ:  return X86::COND_E;
	  case ISD::SETNE:  return X86::COND_NE;
	  // Signed orderings.
	  case ISD::SETGT:  return X86::COND_G;
	  case ISD::SETGE:  return X86::COND_GE;
	  case ISD::SETLT:  return X86::COND_L;
	  case ISD::SETLE:  return X86::COND_LE;
	  // Unsigned orderings.
	  case ISD::SETULT: return X86::COND_B;
	  case ISD::SETUGT: return X86::COND_A;
	  case ISD::SETULE: return X86::COND_BE;
	  case ISD::SETUGE: return X86::COND_AE;
	  default:
	    llvm_unreachable("Invalid integer condition!");
	  }
	}

	/// Do a one-to-one translation of a ISD::CondCode to the X86-specific
	/// condition code, returning the condition code and the LHS/RHS of the
	/// comparison to make.
	///
	/// LHS and RHS are in/out parameters: the integer path may replace RHS with
	/// a zero constant, and the floating-point path may swap the two operands.
	/// For floating point SETOEQ and SETUNE the function returns
	/// X86::COND_INVALID.
	static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
	bool isFP, SDValue &LHS, SDValue &RHS,
	SelectionDAG &DAG) {
	if (!isFP) {
	if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
	if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
	// X > -1 is rewritten as a comparison of X against 0 that tests for
	// "sign flag clear" (COND_NS).
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_NS;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
	// X < 0: RHS is already 0, so only the sign flag needs testing (COND_S).
	return X86::COND_S;
	}
	if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
	// X < 1 -> X <= 0
	RHS = DAG.getConstant(0, DL, RHS.getValueType());
	return X86::COND_LE;
	}
	}

	// No special constant on the RHS: use the plain integer mapping.
	return TranslateIntegerX86CC(SetCCOpcode);
	}

	// First determine if it is required or is profitable to flip the operands.

	// If LHS is a foldable load, but RHS is not, flip the condition.
	if (ISD::isNON_EXTLoad(LHS.getNode()) &&
	!ISD::isNON_EXTLoad(RHS.getNode())) {
	SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
	std::swap(LHS, RHS);
	}

	// These four conditions are handled by swapping the operands only; the
	// condition code itself is left alone and is mapped through the cases
	// marked "flipped" in the switch below.
	switch (SetCCOpcode) {
	default: break;
	case ISD::SETOLT:
	case ISD::SETOLE:
	case ISD::SETUGT:
	case ISD::SETUGE:
	std::swap(LHS, RHS);
	break;
	}

	// On a floating point condition, the flags are set as follows:
	// ZF | PF | CF | op
	//  0 |  0 |  0 | X > Y
	//  0 |  0 |  1 | X < Y
	//  1 |  0 |  0 | X == Y
	//  1 |  1 |  1 | unordered
	switch (SetCCOpcode) {
	default: llvm_unreachable("Condcode should be pre-legalized away");
	case ISD::SETUEQ:
	case ISD::SETEQ: return X86::COND_E;
	case ISD::SETOLT: // flipped
	case ISD::SETOGT:
	case ISD::SETGT: return X86::COND_A;
	case ISD::SETOLE: // flipped
	case ISD::SETOGE:
	case ISD::SETGE: return X86::COND_AE;
	case ISD::SETUGT: // flipped
	case ISD::SETULT:
	case ISD::SETLT: return X86::COND_B;
	case ISD::SETUGE: // flipped
	case ISD::SETULE:
	case ISD::SETLE: return X86::COND_BE;
	case ISD::SETONE:
	case ISD::SETNE: return X86::COND_NE;
	case ISD::SETUO: return X86::COND_P;
	case ISD::SETO: return X86::COND_NP;
	// Per the table above ZF is set for both "equal" and "unordered", so these
	// two have no single condition code here and are reported as invalid.
	case ISD::SETOEQ:
	case ISD::SETUNE: return X86::COND_INVALID;
	}
	}

	/// Is there a floating point cmov for the specific X86 condition code?
	/// Current x86 isa includes the following FP cmov instructions:
	/// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
	static bool hasFPCMov(unsigned X86CC) {
	  switch (X86CC) {
	  case X86::COND_B:  case X86::COND_BE:
	  case X86::COND_E:  case X86::COND_P:
	  case X86::COND_A:  case X86::COND_AE:
	  case X86::COND_NE: case X86::COND_NP:
	    return true;
	  default:
	    return false;
	  }
	}


	/// Fills Info for the chained intrinsics that access memory and returns
	/// true; returns false if Intrinsic has no entry in the with-chain table or
	/// its entry type is not one of the kinds handled below. Note that the
	/// common Info fields are written before the type is examined, so they are
	/// also set when an unhandled type makes the function return false.
	bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
	                                           const CallInst &I,
	                                           unsigned Intrinsic) const {
	  const IntrinsicData *Data = getIntrinsicWithChain(Intrinsic);
	  if (!Data)
	    return false;

	  // Common defaults; the cases below flip readMem/writeMem as needed.
	  Info.opc = ISD::INTRINSIC_W_CHAIN;
	  Info.readMem = false;
	  Info.writeMem = false;
	  Info.vol = false;
	  Info.offset = 0;

	  switch (Data->Type) {
	  default:
	    return false;

	  case EXPAND_FROM_MEM:
	    // Reads a value of the call's result type through operand 0.
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVT(I.getType());
	    Info.align = 1;
	    Info.readMem = true;
	    return true;

	  case COMPRESS_TO_MEM:
	    // Writes a value of operand 1's type through operand 0.
	    Info.ptrVal = I.getArgOperand(0);
	    Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
	    Info.align = 1;
	    Info.writeMem = true;
	    return true;

	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    Info.ptrVal = I.getArgOperand(0);
	    MVT SrcVT = MVT::getVT(I.getArgOperand(1)->getType());

	    // The stored element type follows the intrinsic kind; the element
	    // count is taken from the source vector.
	    MVT StoredEltVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	    switch (Data->Type) {
	    case TRUNCATE_TO_MEM_VI8:  StoredEltVT = MVT::i8;  break;
	    case TRUNCATE_TO_MEM_VI16: StoredEltVT = MVT::i16; break;
	    case TRUNCATE_TO_MEM_VI32: StoredEltVT = MVT::i32; break;
	    default: break;
	    }

	    Info.memVT = MVT::getVectorVT(StoredEltVT, SrcVT.getVectorNumElements());
	    Info.align = 1;
	    Info.writeMem = true;
	    return true;
	  }
	  }
	}

	/// Returns true if the target can instruction select the
	/// specified FP immediate natively. If false, the legalizer will
	/// materialize the FP immediate as a load from a constant pool.
	///
	/// The check is a bitwise comparison of Imm against every entry of
	/// LegalFPImmediates; VT is not consulted.
	bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
	  for (const auto &LegalImm : LegalFPImmediates)
	    if (Imm.bitwiseIsEqual(LegalImm))
	      return true;
	  return false;
	}

	/// Returns false only for a load whose base pointer is an
	/// X86ISD::WrapperRIP around a global address flagged MO_GOTTPOFF; every
	/// other load may be narrowed. ExtTy and NewVT are not consulted.
	bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
	                                              ISD::LoadExtType ExtTy,
	                                              EVT NewVT) const {
	  // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
	  // relocation target a movq or addq instruction: don't let the load shrink.
	  SDValue Base = cast<LoadSDNode>(Load)->getBasePtr();
	  if (Base.getOpcode() != X86ISD::WrapperRIP)
	    return true;

	  const auto *Global = dyn_cast<GlobalAddressSDNode>(Base.getOperand(0));
	  if (!Global)
	    return true;

	  return Global->getTargetFlags() != X86II::MO_GOTTPOFF;
	}

	/// \brief Returns true if it is beneficial to convert a load of a constant
	/// to just the constant itself.
	///
	/// This implementation accepts every integer type whose primitive size is
	/// between 1 and 64 bits inclusive; Imm itself is not inspected. Ty must be
	/// an integer type (asserted).
	bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
	                                                          Type *Ty) const {
	  assert(Ty->isIntegerTy());

	  unsigned BitSize = Ty->getPrimitiveSizeInBits();
	  if (BitSize == 0 || BitSize > 64)
	    return false;
	  return true;
	}

	/// Returns true if EXTRACT_SUBVECTOR is legal or custom for ResVT and Index
	/// is either 0 or exactly the number of elements in ResVT.
	bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
	                                                unsigned Index) const {
	  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
	    return false;

	  return (Index == 0 || Index == ResVT.getVectorNumElements());
	}

	/// Reports whether cttz is cheap to speculate: true exactly when the
	/// subtarget has BMI.
	bool X86TargetLowering::isCheapToSpeculateCttz() const {
	// Speculate cttz only if we can directly use TZCNT.
	return Subtarget.hasBMI();
	}

	/// Reports whether ctlz is cheap to speculate: true exactly when the
	/// subtarget has LZCNT.
	bool X86TargetLowering::isCheapToSpeculateCtlz() const {
	// Speculate ctlz only if we can directly use LZCNT.
	return Subtarget.hasLZCNT();
	}

	/// Returns the subtarget's hasFastLZCNT() answer unchanged.
	bool X86TargetLowering::isCtlzFast() const {
	return Subtarget.hasFastLZCNT();
	}

	/// Unconditionally returns true; AndI is not inspected.
	bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
	const Instruction &AndI) const {
	return true;
	}

	/// Returns true when the subtarget has BMI and Y's value type is i32 or
	/// i64; false otherwise.
	bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
	  if (!Subtarget.hasBMI())
	    return false;

	  // There are only 32-bit and 64-bit forms for 'andn'.
	  const EVT YVT = Y.getValueType();
	  if (YVT == MVT::i32)
	    return true;
	  return YVT == MVT::i64;
	}

	/// Returns the type to use for a fast NumBits-wide equality comparison:
	/// the integer type of that width when it is legal, otherwise v16i8 for 128
	/// bits or v32i8 for 256 bits when those are legal, otherwise
	/// MVT::INVALID_SIMPLE_VALUE_TYPE.
	MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
	  const MVT ScalarVT = MVT::getIntegerVT(NumBits);
	  if (isTypeLegal(ScalarVT))
	    return ScalarVT;

	  if (NumBits == 128) {
	    // PMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v16i8))
	      return MVT::v16i8;
	  } else if (NumBits == 256) {
	    // VPMOVMSKB can handle this.
	    if (isTypeLegal(MVT::v32i8))
	      return MVT::v32i8;
	  }

	  // TODO: Allow 64-bit type for 32-bit target.
	  // TODO: 512-bit types should be allowed, but make sure that those
	  // cases are handled in combineVectorSizedSetCCEquality().

	  return MVT::INVALID_SIMPLE_VALUE_TYPE;
	}

	/// Returns true if Val is the undef sentinel value or equal to CmpVal.
	static bool isUndefOrEqual(int Val, int CmpVal) {
	  return ((Val == SM_SentinelUndef) || (Val == CmpVal));
	}

	/// Returns true if Val is either the undef or the zero sentinel value.
	static bool isUndefOrZero(int Val) {
	  return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
	}

	/// Return true if each of the Size mask elements starting at position Pos
	/// (i.e. indices Pos .. Pos+Size-1) is the undef sentinel value.
	static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
	  for (unsigned Offset = 0; Offset != Size; ++Offset) {
	    if (Mask[Pos + Offset] != SM_SentinelUndef)
	      return false;
	  }
	  return true;
	}

	/// Return true if Val is undef or if its value falls within the
	/// half-open range [Low, Hi), i.e. Low <= Val < Hi.
	static bool isUndefOrInRange(int Val, int Low, int Hi) {
	  return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
	}

	/// Return true if every element in Mask is undef or has a value in the
	/// half-open range [Low, Hi). An empty mask yields true.
	static bool isUndefOrInRange(ArrayRef<int> Mask,
	                             int Low, int Hi) {
	  for (unsigned Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	    if (!isUndefOrInRange(Mask[Idx], Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if Val is the undef sentinel, the zero sentinel, or a value
	/// within the half-open range [Low, Hi), i.e. Low <= Val < Hi.
	static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
	  return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
	}

	/// Return true if every element in Mask is undef, zero, or has a value in
	/// the half-open range [Low, Hi). An empty mask yields true.
	static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
	  for (unsigned Idx = 0, NumElts = Mask.size(); Idx != NumElts; ++Idx) {
	    if (!isUndefOrZeroOrInRange(Mask[Idx], Low, Hi))
	      return false;
	  }
	  return true;
	}

	/// Return true if, for each of the Size mask elements starting at position
	/// Pos, the element is either undef or equal to the next value of the
	/// sequence Low, Low+1, ..., Low+Size-1.
	static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
	                                       unsigned Pos, unsigned Size, int Low) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, ++Expected) {
	    if (!isUndefOrEqual(Mask[Pos + Offset], Expected))
	      return false;
	  }
	  return true;
	}

	/// Return true if, for each of the Size mask elements starting at position
	/// Pos, the element is undef, zero, or equal to the next value of the
	/// sequence Low, Low+1, ..., Low+Size-1.
	static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                             unsigned Size, int Low) {
	  int Expected = Low;
	  for (unsigned Offset = 0; Offset != Size; ++Offset, ++Expected) {
	    const int M = Mask[Pos + Offset];
	    if (isUndefOrZero(M))
	      continue;
	    if (M != Expected)
	      return false;
	  }
	  return true;
	}

	/// Return true if each of the Size mask elements starting at position Pos
	/// (i.e. indices Pos .. Pos+Size-1) is undef or is zero.
	static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
	                                 unsigned Size) {
	  for (unsigned Offset = 0; Offset != Size; ++Offset) {
	    if (!isUndefOrZero(Mask[Pos + Offset]))
	      return false;
	  }
	  return true;
	}

	/// \brief Helper function to test whether a shuffle mask could be
	/// simplified by widening the elements being shuffled.
	///
	/// Mask is consumed in adjacent pairs; each pair must collapse to a single
	/// wide element (undef, zero, or an aligned source pair). On success the
	/// widened mask, of size Mask.size() / 2, is written to WidenedMask and true
	/// is returned. On failure false is returned and WidenedMask is left in an
	/// unspecified state.
	///
	/// NOTE: This must handle normal vector shuffle masks and target vector
	/// shuffle masks. The latter have the special property of a '-2' representing
	/// a zero-ed lane of a vector.
	static bool canWidenShuffleElements(ArrayRef<int> Mask,
	                                    SmallVectorImpl<int> &WidenedMask) {
	  WidenedMask.assign(Mask.size() / 2, 0);
	  for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
	    int M0 = Mask[i];
	    int M1 = Mask[i + 1];

	    // If both elements are undef, it's trivial.
	    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
	      WidenedMask[i / 2] = SM_SentinelUndef;
	      continue;
	    }

	    // Check for an undef mask and a mask value properly aligned to fit with
	    // a pair of values. If we find such a case, use the non-undef mask's value.
	    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
	      WidenedMask[i / 2] = M1 / 2;
	      continue;
	    }
	    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // When zeroing, we need to spread the zeroing across both lanes to widen.
	    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
	      if (isUndefOrZero(M0) && isUndefOrZero(M1)) {
	        WidenedMask[i / 2] = SM_SentinelZero;
	        continue;
	      }
	      return false;
	    }

	    // Finally check if the two mask values are adjacent and aligned with
	    // a pair.
	    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
	      WidenedMask[i / 2] = M0 / 2;
	      continue;
	    }

	    // Otherwise we can't safely widen the elements used in this shuffle.
	    return false;
	  }
	  assert(WidenedMask.size() == Mask.size() / 2 &&
	         "Incorrect size of mask after widening the elements!");

	  return true;
	}

	/// Helper function to scale a shuffle or target shuffle mask, replacing each
	/// mask index with the scaled sequential indices for an equivalent narrowed
	/// mask. This is the reverse process to canWidenShuffleElements, but can always
	/// succeed.
	///
	/// ScaledMask is overwritten with Mask.size() * Scale entries. Negative
	/// (sentinel) entries are repeated Scale times; a non-negative entry M
	/// becomes Scale*M, Scale*M+1, ..., Scale*M+Scale-1.
	static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
	                             SmallVectorImpl<int> &ScaledMask) {
	  assert(0 < Scale && "Unexpected scaling factor");
	  const int NumElts = Mask.size();
	  ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);

	  for (int Elt = 0; Elt != NumElts; ++Elt) {
	    const int M = Mask[Elt];
	    const int OutBase = Scale * Elt;
	    for (int Sub = 0; Sub != Scale; ++Sub)
	      ScaledMask[OutBase + Sub] = (M < 0) ? M : (Scale * M) + Sub;
	  }
	}

	/// Return true if the EXTRACT_SUBVECTOR node N has a constant index
	/// (operand 1) whose bit offset, computed from the result's scalar size, is a
	/// multiple of vecWidth. vecWidth must be 128 or 256 (asserted).
	static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
	  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
	  if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
	    return false;

	  // The index should be aligned on a vecWidth-bit boundary.
	  uint64_t Index = N->getConstantOperandVal(1);
	  MVT VT = N->getSimpleValueType(0);
	  unsigned ElSize = VT.getScalarSizeInBits();
	  return (Index * ElSize) % vecWidth == 0;
	}

	/// Return true if the INSERT_SUBVECTOR node N has a constant index
	/// (operand 2) whose bit offset, computed from the result's scalar size, is a
	/// multiple of vecWidth. vecWidth must be 128 or 256 (asserted).
	static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
	  assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
	  if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
	    return false;

	  // The index should be aligned on a vecWidth-bit boundary.
	  uint64_t Index = N->getConstantOperandVal(2);
	  MVT VT = N->getSimpleValueType(0);
	  unsigned ElSize = VT.getScalarSizeInBits();
	  return (Index * ElSize) % vecWidth == 0;
	}

	/// Returns isVINSERTIndex(N, 128): whether N's constant insert index lies on
	/// a 128-bit boundary.
	bool X86::isVINSERT128Index(SDNode *N) {
	return isVINSERTIndex(N, 128);
	}

	/// Returns isVINSERTIndex(N, 256): whether N's constant insert index lies on
	/// a 256-bit boundary.
	bool X86::isVINSERT256Index(SDNode *N) {
	return isVINSERTIndex(N, 256);
	}

	/// Returns isVEXTRACTIndex(N, 128): whether N's constant extract index lies
	/// on a 128-bit boundary.
	bool X86::isVEXTRACT128Index(SDNode *N) {
	return isVEXTRACTIndex(N, 128);
	}

	/// Returns isVEXTRACTIndex(N, 256): whether N's constant extract index lies
	/// on a 256-bit boundary.
	bool X86::isVEXTRACT256Index(SDNode *N) {
	return isVEXTRACTIndex(N, 256);
	}

	/// Returns the chunk number selected by the EXTRACT_SUBVECTOR node N: its
	/// constant index (operand 1) divided by the number of source-vector elements
	/// that fit in vecWidth bits. vecWidth must be 128 or 256 and the index must
	/// be a constant (both asserted).
	static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
	  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
	  assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
	         "Illegal extract subvector for VEXTRACT");

	  uint64_t Index = N->getConstantOperandVal(1);
	  // The chunk size is measured in elements of the vector being extracted from.
	  MVT VecVT = N->getOperand(0).getSimpleValueType();
	  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
	  return Index / NumElemsPerChunk;
	}

	/// Returns the chunk number targeted by the INSERT_SUBVECTOR node N: its
	/// constant index (operand 2) divided by the number of result-vector elements
	/// that fit in vecWidth bits. vecWidth must be 128 or 256 and the index must
	/// be a constant (both asserted).
	static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
	  assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
	  assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
	         "Illegal insert subvector for VINSERT");

	  uint64_t Index = N->getConstantOperandVal(2);
	  // The chunk size is measured in elements of the node's result vector.
	  MVT VecVT = N->getSimpleValueType(0);
	  unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
	  return Index / NumElemsPerChunk;
	}

	/// Return the appropriate immediate to extract the specified
	/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
	/// Forwards to getExtractVEXTRACTImmediate with a width of 128.
	unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
	return getExtractVEXTRACTImmediate(N, 128);
	}

	/// Return the appropriate immediate to extract the specified
	/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
	/// Forwards to getExtractVEXTRACTImmediate with a width of 256.
	unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
	return getExtractVEXTRACTImmediate(N, 256);
	}

	/// Return the appropriate immediate to insert at the specified
	/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
	/// Forwards to getInsertVINSERTImmediate with a width of 128.
	unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
	return getInsertVINSERTImmediate(N, 128);
	}

	/// Return the appropriate immediate to insert at the specified
	/// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
	/// Forwards to getInsertVINSERTImmediate with a width of 256.
	unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
	return getInsertVINSERTImmediate(N, 256);
	}

	/// Returns true if Elt is a constant zero or a floating point constant +0.0.
	bool X86::isZeroNode(SDValue Elt) {
	  return isNullConstant(Elt) || isNullFPConstant(Elt);
	}

	// Build a vector of constants from Values, one per element of VT.
	// When IsMask is set, a negative entry produces an UNDEF element instead of
	// a constant.
	// Split 64-bit constants in the 32-bit mode: if i64 is not a legal type and
	// VT has i64 elements, the vector is built as twice as many i32 elements
	// (value followed by a zero, or two UNDEFs) and bitcast back to VT.
	static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
	                              const SDLoc &dl, bool IsMask = false) {
	  const unsigned NumElts = VT.getVectorNumElements();

	  const bool HasLegalI64 = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	  const bool SplitElts =
	      !HasLegalI64 && VT.getVectorElementType() == MVT::i64;
	  const MVT BuildVT =
	      SplitElts ? MVT::getVectorVT(MVT::i32, NumElts * 2) : VT;
	  const MVT BuildEltVT = BuildVT.getVectorElementType();

	  SmallVector<SDValue, 32> Elts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    if (Values[i] < 0 && IsMask) {
	      // Undefined mask entry: both halves are undef when splitting.
	      Elts.push_back(DAG.getUNDEF(BuildEltVT));
	      if (SplitElts)
	        Elts.push_back(DAG.getUNDEF(BuildEltVT));
	      continue;
	    }

	    Elts.push_back(DAG.getConstant(Values[i], dl, BuildEltVT));
	    if (SplitElts)
	      Elts.push_back(DAG.getConstant(0, dl, BuildEltVT));
	  }

	  SDValue Built = DAG.getBuildVector(BuildVT, dl, Elts);
	  return SplitElts ? DAG.getBitcast(VT, Built) : Built;
	}

	/// Builds a constant vector of type VT from per-element bit patterns.
	///
	/// Bits holds one APInt per element and Undefs has one bit per element; a
	/// set bit in Undefs makes that element UNDEF. The two must have the same
	/// length (asserted), and every APInt must be as wide as VT's scalar type
	/// (asserted). The result is always bitcast to VT.
	static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
	MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	assert(Bits.size() == Undefs.getBitWidth() &&
	"Unequal constant and undef arrays");
	SmallVector<SDValue, 32> Ops;
	bool Split = false;

	// When i64 is not a legal type, i64 elements are emitted as pairs of i32.
	MVT ConstVecVT = VT;
	unsigned NumElts = VT.getVectorNumElements();
	bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
	if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
	ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
	Split = true;
	}

	MVT EltVT = ConstVecVT.getVectorElementType();
	for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
	if (Undefs[i]) {
	// An undef element occupies two slots when splitting, one otherwise.
	Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
	continue;
	}
	const APInt &V = Bits[i];
	assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
	if (Split) {
	// Low 32 bits first, then the high 32 bits.
	Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
	Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
	} else if (EltVT == MVT::f32) {
	// Reinterpret the bit pattern as an IEEE single.
	APFloat FV(APFloat::IEEEsingle(), V);
	Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	} else if (EltVT == MVT::f64) {
	// Reinterpret the bit pattern as an IEEE double.
	APFloat FV(APFloat::IEEEdouble(), V);
	Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
	} else {
	Ops.push_back(DAG.getConstant(V, dl, EltVT));
	}
	}

	SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
	return DAG.getBitcast(VT, ConstsNode);
	}

	/// Returns a vector of specified type with all zero elements.
	///
	/// VT must be a 128-, 256- or 512-bit vector, or a vector of i1 (asserted).
	/// The zero is built in one canonical type and bitcast to VT.
	static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
	          VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected vector type");

	  // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
	  // type. This ensures they get CSE'd. But if the integer type is not
	  // available, use a floating-point +0.0 instead.
	  SDValue Vec;
	  if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
	    Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
	  } else if (VT.getVectorElementType() == MVT::i1) {
	    // Mask vectors are built directly in VT.
	    assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
	           "Unexpected vector type");
	    assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
	           "Unexpected vector type");
	    Vec = DAG.getConstant(0, dl, VT);
	  } else {
	    unsigned Num32BitElts = VT.getSizeInBits() / 32;
	    Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
	  }
	  return DAG.getBitcast(VT, Vec);
	}

	/// Extracts the vectorWidth-bit chunk of Vec that contains element IdxVal.
	/// IdxVal is rounded down to the first element of its chunk. A BUILD_VECTOR
	/// input is narrowed to a smaller BUILD_VECTOR; anything else becomes an
	/// EXTRACT_SUBVECTOR node.
	static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
	                                const SDLoc &dl, unsigned vectorWidth) {
	  const EVT VecVT = Vec.getValueType();
	  const EVT EltVT = VecVT.getVectorElementType();

	  // The result has the same element type and 1/NumChunks of the elements.
	  const unsigned NumChunks = VecVT.getSizeInBits() / vectorWidth;
	  const EVT ChunkVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
	                                       VecVT.getVectorNumElements() / NumChunks);

	  // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
	  const unsigned EltsPerChunk = vectorWidth / EltVT.getSizeInBits();
	  assert(isPowerOf2_32(EltsPerChunk) && "Elements per chunk not power of 2");

	  // This is the index of the first element of the vectorWidth-bit chunk
	  // we want. Since EltsPerChunk is a power of 2 just need to clear bits.
	  IdxVal &= ~(EltsPerChunk - 1);

	  // If the input is a buildvector just emit a smaller one.
	  if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
	    return DAG.getBuildVector(
	        ChunkVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, EltsPerChunk));
	  }

	  SDValue ChunkIdx = DAG.getIntPtrConstant(IdxVal, dl);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ChunkVT, Vec, ChunkIdx);
	}

	/// Generate a DAG to grab 128-bits from a vector > 128 bits. This
	/// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
	/// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
	/// instructions or a simple subregister reference. Idx is an index in the
	/// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
	/// lowering EXTRACT_VECTOR_ELT operations easier.
	///
	/// Vec must be a 256-bit or 512-bit vector (asserted).
	static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
	                                   SelectionDAG &DAG, const SDLoc &dl) {
	  assert((Vec.getValueType().is256BitVector() ||
	          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
	  return extractSubVector(Vec, IdxVal, DAG, dl, 128);
	}

	/// Generate a DAG to grab 256-bits from a 512-bit vector.
	/// Vec must be a 512-bit vector (asserted); the work is done by
	/// extractSubVector with a width of 256.
	static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
	SelectionDAG &DAG, const SDLoc &dl) {
	assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
	return extractSubVector(Vec, IdxVal, DAG, dl, 256);
	}

	/// Inserts the subvector Vec into Result at the vectorWidth-bit chunk that
	/// contains element IdxVal, producing an INSERT_SUBVECTOR node of Result's
	/// type. IdxVal is rounded down to the first element of its chunk. If Vec is
	/// undef, Result is returned unchanged. vectorWidth must be 128 or 256
	/// (asserted).
	static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                               SelectionDAG &DAG, const SDLoc &dl,
	                               unsigned vectorWidth) {
	  assert((vectorWidth == 128 || vectorWidth == 256) &&
	         "Unsupported vector width");
	  // Inserting an undef subvector leaves Result as it is.
	  if (Vec.isUndef())
	    return Result;
	  EVT VT = Vec.getValueType();
	  EVT ElVT = VT.getVectorElementType();
	  EVT ResultVT = Result.getValueType();

	  // Insert the relevant vectorWidth bits.
	  unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
	  assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	  // This is the index of the first element of the vectorWidth-bit chunk
	  // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
	  IdxVal &= ~(ElemsPerChunk - 1);

	  SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
	  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
	}

	/// Insert the 128-bit vector \p Vec into the wider vector \p Result.
	///
	/// The emitted node is meant to match an AVX VINSERTF128 / VINSERTI128, an
	/// AVX-512 VINSERTF32x4 / VINSERTI32x4 instruction, or a simple superregister
	/// reference. \p IdxVal is an element index inside the destination 128 bits;
	/// it need not be aligned to a 128-bit boundary, which makes lowering
	/// INSERT_VECTOR_ELT operations easier.
	static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
	  const unsigned ChunkBits = 128;
	  SDValue Combined = insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
	  return Combined;
	}

	/// Insert the 256-bit vector \p Vec into \p Result; \p IdxVal is an element
	/// index inside the destination 256-bit chunk.
	static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
	                                  SelectionDAG &DAG, const SDLoc &dl) {
	  assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
	  const unsigned ChunkBits = 256;
	  SDValue Combined = insertSubVector(Result, Vec, IdxVal, DAG, dl, ChunkBits);
	  return Combined;
	}

	// Return true if the instruction zeroes the unused upper part of the
	// destination and accepts mask. Only the four mask-compare opcodes listed
	// below qualify; every other opcode yields false.
	static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
	  if (Opcode == X86ISD::PCMPEQM)
	    return true;
	  if (Opcode == X86ISD::PCMPGTM)
	    return true;
	  if (Opcode == X86ISD::CMPM)
	    return true;
	  if (Opcode == X86ISD::CMPMU)
	    return true;
	  return false;
	}

	/// Insert i1-subvector to i1-vector.
	///
	/// \p Op is the INSERT_SUBVECTOR node being lowered: operand 0 is the
	/// destination vector, operand 1 the sub-vector and operand 2 the insertion
	/// index. Returns an empty SDValue when the index is not a constant, \p Op
	/// itself when the node can be kept as it is, and otherwise a replacement
	/// built from KSHIFTL / KSHIFTR / OR nodes or a vector shuffle.
	static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {

	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue SubVec = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);

	  if (!isa<ConstantSDNode>(Idx))
	    return SDValue();

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
	    return Op;

	  MVT OpVT = Op.getSimpleValueType();
	  MVT SubVecVT = SubVec.getSimpleValueType();
	  unsigned NumElems = OpVT.getVectorNumElements();
	  unsigned SubVecNumElems = SubVecVT.getVectorNumElements();

	  assert(IdxVal + SubVecNumElems <= NumElems &&
	         IdxVal % SubVecVT.getSizeInBits() == 0 &&
	         "Unexpected index value in INSERT_SUBVECTOR");

	  // There are 3 possible cases:
	  // 1. Subvector should be inserted in the lower part (IdxVal == 0)
	  // 2. Subvector should be inserted in the upper part
	  //    (IdxVal + SubVecNumElems == NumElems)
	  // 3. Subvector should be inserted in the middle (for example v2i1
	  //    to v16i1, index 2)

	  // If this node widens - by concatenating zeroes - the type of the result
	  // of a node with instruction that zeroes all upper (irrelevant) bits of the
	  // output register, mark this node as legal to enable replacing them with
	  // the v8i1 version of the previous instruction during instruction selection.
	  // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
	  // while zeroing all the upper remaining 60 bits of the register. if the
	  // result of such instruction is inserted into an allZeroVector, then we can
	  // safely remove insert_vector (in instruction selection) as the cmp instr
	  // already zeroed the rest of the register.
	  if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
	      (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
	       (SubVec.getOpcode() == ISD::AND &&
	        (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
	         isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
	    return Op;

	  // extend to natively supported kshift
	  MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
	  MVT WideOpVT = OpVT;
	  if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
	    WideOpVT = MinVT;

	  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
	  SDValue Undef = DAG.getUNDEF(WideOpVT);
	  SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                                   Undef, SubVec, ZeroIdx);

	  // Extract sub-vector if require.
	  auto ExtractSubVec = [&](SDValue V) {
	    return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
	                                                OpVT, V, ZeroIdx);
	  };

	  if (Vec.isUndef()) {
	    if (IdxVal != 0) {
	      SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
	      WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
	                               ShiftBits);
	    }
	    return ExtractSubVec(WideSubVec);
	  }

	  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
	    // Shift the sub-vector all the way up and back down to its slot so that
	    // every other bit of the wide register ends up zero.
	    NumElems = WideOpVT.getVectorNumElements();
	    unsigned ShiftLeft = NumElems - SubVecNumElems;
	    unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
	                      DAG.getConstant(ShiftLeft, dl, MVT::i8));
	    Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
	      DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
	    return ExtractSubVec(Vec);
	  }

	  if (IdxVal == 0) {
	    // Zero lower bits of the Vec
	    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	    // Merge them together, SubVec should be zero extended.
	    WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
	                             getZeroVector(WideOpVT, Subtarget, DAG, dl),
	                             SubVec, ZeroIdx);
	    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
	    return ExtractSubVec(Vec);
	  }

	  // Simple case when we put subvector in the upper part
	  if (IdxVal + SubVecNumElems == NumElems) {
	    // Zero upper bits of the Vec
	    WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
	                             DAG.getConstant(IdxVal, dl, MVT::i8));
	    SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
	    Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
	    return ExtractSubVec(Vec);
	  }

	  // Subvector should be inserted in the middle - use shuffle.
	  // The shuffle works on OpVT, so the temporary holding the sub-vector must
	  // be built on an OpVT-typed undef (the WideOpVT undef above has a different
	  // type whenever the operation was widened).
	  WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
	                           DAG.getUNDEF(OpVT), SubVec, ZeroIdx);
	  // The sub-vector sits at lanes [0, SubVecNumElems) of the first shuffle
	  // input, so result lane i inside the insertion window must read lane
	  // i - IdxVal of it. Every other result lane reads lane i of Vec, which is
	  // the second shuffle input (hence the + NumElems).
	  SmallVector<int, 64> Mask;
	  for (unsigned i = 0; i < NumElems; ++i)
	    Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
	                   i - IdxVal : i + NumElems);
	  return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
	}

	/// Build the 256-bit vector \p VT whose lower half is \p V1 and whose upper
	/// half is \p V2, using two 128-bit inserts (VINSERTF128 style).
	///
	/// This exists because creating a CONCAT_VECTOR node of BUILD_VECTORS
	/// returns a larger BUILD_VECTOR while we are trying to lower large
	/// BUILD_VECTORS. \p NumElems is the element count of \p VT; the upper half
	/// is inserted at element NumElems / 2.
	static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  SDValue Wide = DAG.getUNDEF(VT);
	  Wide = insert128BitVector(Wide, V1, 0, DAG, dl);
	  const unsigned UpperHalfIdx = NumElems / 2;
	  Wide = insert128BitVector(Wide, V2, UpperHalfIdx, DAG, dl);
	  return Wide;
	}

	/// Build the vector \p VT whose lower half is the 256-bit vector \p V1 and
	/// whose upper half is the 256-bit vector \p V2. \p NumElems is the element
	/// count of \p VT; the upper half is inserted at element NumElems / 2.
	static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
	                                   unsigned NumElems, SelectionDAG &DAG,
	                                   const SDLoc &dl) {
	  SDValue Wide = DAG.getUNDEF(VT);
	  Wide = insert256BitVector(Wide, V1, 0, DAG, dl);
	  const unsigned UpperHalfIdx = NumElems / 2;
	  Wide = insert256BitVector(Wide, V2, UpperHalfIdx, DAG, dl);
	  return Wide;
	}

	/// Returns a vector of specified type with all bits set.
	/// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
	/// Then bitcast to their original type, ensuring they get CSE'd.
	/// \p VT must be a 128-, 256- or 512-bit vector type.
	static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Expected a 128/256/512-bit vector type");

	  APInt Ones = APInt::getAllOnesValue(32);
	  // Number of i32 lanes needed to cover the full width of VT.
	  unsigned NumElts = VT.getSizeInBits() / 32;
	  SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
	  return DAG.getBitcast(VT, Vec);
	}

	/// Create a node that sign- or zero-extends the low elements of \p In to
	/// the vector type \p VT.
	///
	/// \p Opc must be X86ISD::VSEXT or X86ISD::VZEXT. When both \p VT and the
	/// input are 128-bit vectors the generic *_EXTEND_VECTOR_INREG helpers are
	/// used instead of the target opcode. When both are wider than 128 bits the
	/// input is first narrowed to the part that is actually consumed, but never
	/// below 128 bits.
	static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
	                              SelectionDAG &DAG) {
	  EVT InVT = In.getValueType();
	  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");

	  if (VT.is128BitVector() && InVT.is128BitVector())
	    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
	                                : DAG.getZeroExtendVectorInReg(In, DL, VT);

	  // For 256-bit vectors, we only need the lower (128-bit) input half.
	  // For 512-bit vectors, we only need the lower input half or quarter.
	  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
	    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
	    In = extractSubVector(In, 0, DAG, DL,
	                          std::max(128, (int)VT.getSizeInBits() / Scale));
	  }

	  return DAG.getNode(Opc, DL, VT, In);
	}

	/// Generate unpacklo/unpackhi shuffle mask.
	///
	/// \p Mask must be empty on entry and receives one index per element of
	/// \p VT. The mask is built independently for every 128-bit lane. \p Lo
	/// selects the low half of each lane, otherwise the high half is used. With
	/// \p Unary set all indices refer to the first input; otherwise odd result
	/// elements refer to the second input (index offset by the element count).
	static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
	                                    bool Unary) {
	  assert(Mask.empty() && "Expected an empty shuffle mask vector");
	  const int TotalElts = VT.getVectorNumElements();
	  const int LaneElts = 128 / VT.getScalarSizeInBits();

	  // Offsets that do not depend on the element being produced.
	  const int HalfLaneOffset = Lo ? 0 : LaneElts / 2;
	  const int SecondInputOffset = Unary ? 0 : TotalElts;

	  for (int Slot = 0; Slot != TotalElts; ++Slot) {
	    const int InLane = Slot % LaneElts;
	    const int LaneBase = Slot - InLane;
	    int Index = LaneBase + InLane / 2 + HalfLaneOffset;
	    if (Slot % 2 != 0)
	      Index += SecondInputOffset;
	    Mask.push_back(Index);
	  }
	}

	/// Returns a vector_shuffle node for an unpackl operation.
	/// The mask is the binary (two-input) unpack-low mask for \p VT, applied to
	/// \p V1 and \p V2.
	static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Returns a vector_shuffle node for an unpackh operation.
	/// The mask is the binary (two-input) unpack-high mask for \p VT, applied to
	/// \p V1 and \p V2.
	static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
	                          SDValue V1, SDValue V2) {
	  SmallVector<int, 8> Mask;
	  createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
	  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
	}

	/// Shuffle the low element of \p V2 into element \p Idx of a vector that is
	/// otherwise all zero (\p IsZero set) or undef (\p IsZero clear).
	///
	/// The resulting mask looks like 4,1,2,3 (Idx = 0) or 0,1,2,4 (Idx = 3) for
	/// a four-element type: every lane keeps the zero/undef input except lane
	/// \p Idx, which takes lane 0 of \p V2.
	static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
	                                           bool IsZero,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  MVT VT = V2.getSimpleValueType();
	  SDLoc DL(V2);

	  SDValue Background;
	  if (IsZero)
	    Background = getZeroVector(VT, Subtarget, DAG, DL);
	  else
	    Background = DAG.getUNDEF(VT);

	  const int LaneCount = VT.getVectorNumElements();
	  SmallVector<int, 16> Lanes;
	  for (int Lane = 0; Lane != LaneCount; ++Lane) {
	    // Index LaneCount addresses the low element of the second input (V2).
	    if (Lane == Idx)
	      Lanes.push_back(LaneCount);
	    else
	      Lanes.push_back(Lane);
	  }
	  return DAG.getVectorShuffle(VT, DL, Background, V2, Lanes);
	}

	/// Strip every ISD::BITCAST wrapped around \p V and return the first value
	/// that is not a bitcast. An empty SDValue is returned unchanged.
	static SDValue peekThroughBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode())
	      break;
	    if (V.getOpcode() != ISD::BITCAST)
	      break;
	    V = V.getOperand(0);
	  }
	  return V;
	}

	/// Like peekThroughBitcasts, but only steps through a bitcast while the
	/// value feeding it has a single use.
	static SDValue peekThroughOneUseBitcasts(SDValue V) {
	  for (;;) {
	    if (!V.getNode())
	      break;
	    if (V.getOpcode() != ISD::BITCAST)
	      break;
	    if (!V.getOperand(0).hasOneUse())
	      break;
	    V = V.getOperand(0);
	  }
	  return V;
	}

	/// Return the IR constant that \p Op loads from the constant pool, or
	/// nullptr if \p Op is not such a load.
	///
	/// Bitcasts on \p Op are looked through, and an X86ISD::Wrapper /
	/// X86ISD::WrapperRIP around the load's base pointer is stripped. Machine
	/// constant-pool entries are rejected.
	static const Constant *getTargetConstantFromNode(SDValue Op) {
	  Op = peekThroughBitcasts(Op);

	  auto *Load = dyn_cast<LoadSDNode>(Op);
	  if (!Load)
	    return nullptr;

	  SDValue Ptr = Load->getBasePtr();
	  if (Ptr->getOpcode() == X86ISD::Wrapper ||
	      Ptr->getOpcode() == X86ISD::WrapperRIP)
	    Ptr = Ptr->getOperand(0);

	  auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
	  if (!CNode || CNode->isMachineConstantPoolEntry())
	    return nullptr;

	  return dyn_cast<Constant>(CNode->getConstVal());
	}

	// Extract raw constant bits from constant pools.
	//
	// Looks through bitcasts on Op and, if Op is a recognised constant, returns
	// its bits re-split into elements of EltSizeInBits bits each. Recognised
	// sources are: a build vector of constants, a constant-pool vector load, an
	// X86ISD::VBROADCAST of a constant-pool scalar, and an X86ISD::VZEXT_MOVL of
	// a SCALAR_TO_VECTOR of a constant.
	//
	//   UndefElts          - receives one bit per result element, set when the
	//                        whole element is undef.
	//   EltBits            - must be empty on entry; receives one APInt of
	//                        EltSizeInBits bits per result element.
	//   AllowWholeUndefs   - when false, fail if re-splitting yields an element
	//                        whose bits are all undef.
	//   AllowPartialUndefs - when false, fail if an element is only partly
	//                        undef; when true those undef bits read as zero.
	//
	// When the source already has the requested element size no re-splitting
	// happens, and undef source elements are rejected only if both flags are
	// false. Returns true on success.
	static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
	                                          APInt &UndefElts,
	                                          SmallVectorImpl<APInt> &EltBits,
	                                          bool AllowWholeUndefs = true,
	                                          bool AllowPartialUndefs = true) {
	  assert(EltBits.empty() && "Expected an empty EltBits vector");

	  Op = peekThroughBitcasts(Op);

	  EVT VT = Op.getValueType();
	  unsigned SizeInBits = VT.getSizeInBits();
	  assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
	  unsigned NumElts = SizeInBits / EltSizeInBits;

	  // Bitcast a source array of element bits to the target size.
	  auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
	    unsigned NumSrcElts = UndefSrcElts.getBitWidth();
	    unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
	    assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
	           "Constant bit sizes don't match");

	    // Don't split if we don't allow undef bits.
	    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
	    if (UndefSrcElts.getBoolValue() && !AllowUndefs)
	      return false;

	    // If we're already the right size, don't bother bitcasting.
	    if (NumSrcElts == NumElts) {
	      UndefElts = UndefSrcElts;
	      EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
	      return true;
	    }

	    // Extract all the undef/constant element data and pack into single bitsets.
	    APInt UndefBits(SizeInBits, 0);
	    APInt MaskBits(SizeInBits, 0);

	    for (unsigned i = 0; i != NumSrcElts; ++i) {
	      unsigned BitOffset = i * SrcEltSizeInBits;
	      if (UndefSrcElts[i])
	        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
	      MaskBits.insertBits(SrcEltBits[i], BitOffset);
	    }

	    // Split the undef/constant single bitset data into the target elements.
	    UndefElts = APInt(NumElts, 0);
	    EltBits.resize(NumElts, APInt(EltSizeInBits, 0));

	    for (unsigned i = 0; i != NumElts; ++i) {
	      unsigned BitOffset = i * EltSizeInBits;
	      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);

	      // Only treat an element as UNDEF if all bits are UNDEF.
	      if (UndefEltBits.isAllOnesValue()) {
	        if (!AllowWholeUndefs)
	          return false;
	        UndefElts.setBit(i);
	        continue;
	      }

	      // If only some bits are UNDEF then treat them as zero (or bail if not
	      // supported).
	      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
	        return false;

	      // Store the extracted APInt directly. Going through getZExtValue()
	      // would squeeze the element into a uint64_t, which cannot represent
	      // elements wider than 64 bits.
	      EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset);
	    }
	    return true;
	  };

	  // Collect constant bits and insert into mask/undef bit masks.
	  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
	                                unsigned UndefBitIndex) {
	    if (!Cst)
	      return false;
	    if (isa<UndefValue>(Cst)) {
	      Undefs.setBit(UndefBitIndex);
	      return true;
	    }
	    if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
	      Mask = CInt->getValue();
	      return true;
	    }
	    if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
	      Mask = CFP->getValueAPF().bitcastToAPInt();
	      return true;
	    }
	    return false;
	  };

	  // Extract constant bits from build vector.
	  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
	      const SDValue &Src = Op.getOperand(i);
	      if (Src.isUndef()) {
	        UndefSrcElts.setBit(i);
	        continue;
	      }
	      auto *Cst = cast<ConstantSDNode>(Src);
	      SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
	    }
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from constant pool vector.
	  if (auto *Cst = getTargetConstantFromNode(Op)) {
	    Type *CstTy = Cst->getType();
	    if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
	      return false;

	    unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
	    unsigned NumSrcElts = CstTy->getVectorNumElements();

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
	    for (unsigned i = 0; i != NumSrcElts; ++i)
	      if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
	                               UndefSrcElts, i))
	        return false;

	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  // Extract constant bits from a broadcasted constant pool scalar.
	  if (Op.getOpcode() == X86ISD::VBROADCAST &&
	      EltSizeInBits <= VT.getScalarSizeInBits()) {
	    if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
	      unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
	      unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	      APInt UndefSrcElts(NumSrcElts, 0);
	      SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
	      if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
	        // An undef scalar makes every broadcast element undef.
	        if (UndefSrcElts[0])
	          UndefSrcElts.setBits(0, NumSrcElts);
	        SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
	        return CastBitData(UndefSrcElts, SrcEltBits);
	      }
	    }
	  }

	  // Extract a rematerialized scalar constant insertion.
	  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
	      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
	      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
	    unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
	    unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;

	    APInt UndefSrcElts(NumSrcElts, 0);
	    SmallVector<APInt, 64> SrcEltBits;
	    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
	    // Element 0 is the scalar; all remaining elements are zero.
	    SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
	    SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
	    return CastBitData(UndefSrcElts, SrcEltBits);
	  }

	  return false;
	}

	/// Decode the constant shuffle-mask operand \p MaskNode into raw integer
	/// indices of \p MaskEltSizeInBits bits each and append them to \p RawMask.
	/// Returns false, leaving \p RawMask untouched, when the constant bits
	/// cannot be extracted.
	static bool getTargetShuffleMaskIndices(SDValue MaskNode,
	                                        unsigned MaskEltSizeInBits,
	                                        SmallVectorImpl<uint64_t> &RawMask) {
	  APInt UndefMask;
	  SmallVector<APInt, 64> ConstBits;

	  // Extract the raw target constant bits.
	  // FIXME: We currently don't support UNDEF bits or mask entries.
	  const bool Decoded = getTargetConstantBitsFromNode(
	      MaskNode, MaskEltSizeInBits, UndefMask, ConstBits,
	      /* AllowWholeUndefs */ false, /* AllowPartialUndefs */ false);

	  // Insert the extracted elements into the mask.
	  if (Decoded)
	    for (const APInt &Bits : ConstBits)
	      RawMask.push_back(Bits.getZExtValue());

	  return Decoded;
	}

	/// Calculates the shuffle mask corresponding to the target-specific opcode.
	/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
	/// operands in \p Ops, and returns true.
	/// Sets \p IsUnary to true if only one source is used. Note that this will set
	/// IsUnary for shuffles which use a single input multiple times, and in those
	/// cases it will adjust the mask to only have indices within that single input.
	/// It is an error to call this with non-empty Mask/Ops vectors.
	///
	/// \param N  The target shuffle node to decode; an opcode without a case
	///           below reaches llvm_unreachable.
	/// \param VT The value type the mask is decoded for.
	/// \param AllowSentinelZero  When false, decoding fails if the mask contains
	///           an SM_SentinelZero entry.
	static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
	                                 SmallVectorImpl<SDValue> &Ops,
	                                 SmallVectorImpl<int> &Mask, bool &IsUnary) {
	  unsigned NumElems = VT.getVectorNumElements();
	  SDValue ImmN;

	  assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
	  assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");

	  IsUnary = false;
	  // "Fake unary": a two-input shuffle whose two inputs are the same node.
	  bool IsFakeUnary = false;
	  switch(N->getOpcode()) {
	  case X86ISD::BLENDI:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::SHUFP:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::INSERTPS:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::EXTRQI:
	    // Only decodable when both the length and index operands are constants;
	    // otherwise Mask stays empty and the check after the switch fails.
	    if (isa<ConstantSDNode>(N->getOperand(1)) &&
	        isa<ConstantSDNode>(N->getOperand(2))) {
	      int BitLen = N->getConstantOperandVal(1);
	      int BitIdx = N->getConstantOperandVal(2);
	      DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
	      IsUnary = true;
	    }
	    break;
	  case X86ISD::INSERTQI:
	    if (isa<ConstantSDNode>(N->getOperand(2)) &&
	        isa<ConstantSDNode>(N->getOperand(3))) {
	      int BitLen = N->getConstantOperandVal(2);
	      int BitIdx = N->getConstantOperandVal(3);
	      DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
	      IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    }
	    break;
	  case X86ISD::UNPCKH:
	    DecodeUNPCKHMask(VT, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::UNPCKL:
	    DecodeUNPCKLMask(VT, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::MOVHLPS:
	    DecodeMOVHLPSMask(NumElems, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::MOVLHPS:
	    DecodeMOVLHPSMask(NumElems, Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::PALIGNR:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    // The shuffle operands are recorded in swapped order for PALIGNR.
	    Ops.push_back(N->getOperand(1));
	    Ops.push_back(N->getOperand(0));
	    break;
	  case X86ISD::VSHLDQ:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    ImmN = N->getOperand(N->getNumOperands() - 1);
	    DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VSRLDQ:
	    assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
	    ImmN = N->getOperand(N->getNumOperands() - 1);
	    DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFD:
	  case X86ISD::VPERMILPI:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFHW:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::PSHUFLW:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VZEXT_MOVL:
	    DecodeZeroMoveLowMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::VBROADCAST: {
	    SDValue N0 = N->getOperand(0);
	    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
	    // add the pre-extracted value to the Ops vector.
	    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	        N0.getOperand(0).getValueType() == VT &&
	        N0.getConstantOperandVal(1) == 0)
	      Ops.push_back(N0.getOperand(0));

	    // We only decode broadcasts of same-sized vectors, unless the broadcast
	    // came from an extract from the original width. If we found one, we
	    // pushed it the Ops vector above.
	    if (N0.getValueType() == VT || !Ops.empty()) {
	      DecodeVectorBroadcast(VT, Mask);
	      IsUnary = true;
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMILPV: {
	    IsUnary = true;
	    SDValue MaskNode = N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    SmallVector<uint64_t, 32> RawMask;
	    // Try the raw constant bits first, then fall back to decoding the
	    // constant-pool entry directly.
	    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	      DecodeVPERMILPMask(VT, RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMILPMask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::PSHUFB: {
	    IsUnary = true;
	    SDValue MaskNode = N->getOperand(1);
	    SmallVector<uint64_t, 32> RawMask;
	    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	      DecodePSHUFBMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodePSHUFBMask(C, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMI:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVSS:
	  case X86ISD::MOVSD:
	    DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
	    break;
	  case X86ISD::VPERM2X128:
	    ImmN = N->getOperand(N->getNumOperands()-1);
	    DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    break;
	  case X86ISD::MOVSLDUP:
	    DecodeMOVSLDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVSHDUP:
	    DecodeMOVSHDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVDDUP:
	    DecodeMOVDDUPMask(VT, Mask);
	    IsUnary = true;
	    break;
	  case X86ISD::MOVLHPD:
	  case X86ISD::MOVLPD:
	  case X86ISD::MOVLPS:
	    // Not yet implemented
	    return false;
	  case X86ISD::VPERMIL2: {
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    SDValue MaskNode = N->getOperand(2);
	    SDValue CtrlNode = N->getOperand(3);
	    if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
	      unsigned CtrlImm = CtrlOp->getZExtValue();
	      SmallVector<uint64_t, 32> RawMask;
	      if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	        DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
	        break;
	      }
	      if (auto *C = getTargetConstantFromNode(MaskNode)) {
	        DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
	        break;
	      }
	    }
	    return false;
	  }
	  case X86ISD::VPPERM: {
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
	    SDValue MaskNode = N->getOperand(2);
	    SmallVector<uint64_t, 32> RawMask;
	    if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
	      DecodeVPPERMMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPPERMMask(C, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMV: {
	    IsUnary = true;
	    // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
	    Ops.push_back(N->getOperand(1));
	    SDValue MaskNode = N->getOperand(0);
	    SmallVector<uint64_t, 32> RawMask;
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
	      DecodeVPERMVMask(RawMask, Mask);
	      break;
	    }
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMVMask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMV3: {
	    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
	    // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
	    Ops.push_back(N->getOperand(0));
	    Ops.push_back(N->getOperand(2));
	    SDValue MaskNode = N->getOperand(1);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  case X86ISD::VPERMIV3: {
	    IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
	    // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
	    Ops.push_back(N->getOperand(1));
	    Ops.push_back(N->getOperand(2));
	    SDValue MaskNode = N->getOperand(0);
	    unsigned MaskEltSize = VT.getScalarSizeInBits();
	    if (auto *C = getTargetConstantFromNode(MaskNode)) {
	      DecodeVPERMV3Mask(C, MaskEltSize, Mask);
	      break;
	    }
	    return false;
	  }
	  default: llvm_unreachable("unknown target shuffle node");
	  }

	  // Empty mask indicates the decode failed.
	  if (Mask.empty())
	    return false;

	  // Check if we're getting a shuffle mask with zero'd elements.
	  if (!AllowSentinelZero)
	    if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
	      return false;

	  // If we have a fake unary shuffle, the shuffle mask is spread across two
	  // inputs that are actually the same node. Re-map the mask to always point
	  // into the first input.
	  if (IsFakeUnary)
	    for (int &M : Mask)
	      if (M >= (int)Mask.size())
	        M -= Mask.size();

	  // If we didn't already add operands in the opcode-specific code, default to
	  // adding 1 or 2 operands starting at 0.
	  if (Ops.empty()) {
	    Ops.push_back(N->getOperand(0));
	    if (!IsUnary || IsFakeUnary)
	      Ops.push_back(N->getOperand(1));
	  }

	  return true;
	}

	/// Decode the target shuffle \p N into \p Mask / \p Ops and then refine the
	/// mask using what is known about the shuffle inputs: entries that read an
	/// undef element become SM_SentinelUndef and entries that read an element
	/// known to be zero (not just zeroable) become SM_SentinelZero.
	/// Returns true if the target shuffle mask was decoded.
	static bool setTargetShuffleZeroElements(SDValue N,
	                                         SmallVectorImpl<int> &Mask,
	                                         SmallVectorImpl<SDValue> &Ops) {
	  if (!isTargetShuffle(N.getOpcode()))
	    return false;

	  bool IsUnary;
	  MVT VT = N.getSimpleValueType();
	  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
	    return false;

	  // The two shuffle inputs with bitcasts stripped; a unary shuffle uses the
	  // first input for both slots.
	  SDValue Inputs[2];
	  Inputs[0] = peekThroughBitcasts(Ops[0]);
	  Inputs[1] = IsUnary ? Inputs[0] : peekThroughBitcasts(Ops[1]);

	  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
	         "Illegal split of shuffle value type");
	  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();

	  // Extract known constant input data.
	  APInt UndefSrcElts[2];
	  SmallVector<APInt, 32> SrcEltBits[2];
	  bool IsSrcConstant[2];
	  for (unsigned Src = 0; Src != 2; ++Src)
	    IsSrcConstant[Src] = getTargetConstantBitsFromNode(
	        Inputs[Src], EltSizeInBits, UndefSrcElts[Src], SrcEltBits[Src], true,
	        false);

	  const int NumLanes = Mask.size();
	  for (int Lane = 0; Lane < NumLanes; ++Lane) {
	    const int RawIdx = Mask[Lane];

	    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
	    if (RawIdx < 0)
	      continue;

	    // Determine shuffle input and normalize the mask.
	    const unsigned SrcIdx = RawIdx / NumLanes;
	    SDValue V = Inputs[RawIdx < NumLanes ? 0 : 1];
	    const int SrcLane = RawIdx % NumLanes;

	    // We are referencing an UNDEF input.
	    if (V.isUndef()) {
	      Mask[Lane] = SM_SentinelUndef;
	      continue;
	    }

	    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
	    // TODO: We currently only set UNDEF for integer types - floats use the same
	    // registers as vectors and many of the scalar folded loads rely on the
	    // SCALAR_TO_VECTOR pattern.
	    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) {
	      const unsigned SrcNumElts = V.getValueType().getVectorNumElements();
	      if ((NumLanes % SrcNumElts) == 0) {
	        int Scale = NumLanes / SrcNumElts;
	        int Idx = SrcLane / Scale;
	        if (Idx == 0) {
	          if (X86::isZeroNode(V.getOperand(0)))
	            Mask[Lane] = SM_SentinelZero;
	        } else if (!VT.isFloatingPoint()) {
	          Mask[Lane] = SM_SentinelUndef;
	        }
	        continue;
	      }
	    }

	    // Attempt to extract from the source's constant bits.
	    if (!IsSrcConstant[SrcIdx])
	      continue;
	    if (UndefSrcElts[SrcIdx][SrcLane])
	      Mask[Lane] = SM_SentinelUndef;
	    else if (SrcEltBits[SrcIdx][SrcLane] == 0)
	      Mask[Lane] = SM_SentinelZero;
	  }

	  assert(VT.getVectorNumElements() == Mask.size() &&
	         "Different mask size from vector size!");
	  return true;
	}

	// Attempt to decode ops that could be represented as a shuffle mask.
	// The decoded shuffle mask may contain a different number of elements to the
	// destination value type.
	static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
	SmallVectorImpl<SDValue> &Ops,
	SelectionDAG &DAG) {
	Mask.clear();
	Ops.clear();

	MVT VT = N.getSimpleValueType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumSizeInBits = VT.getSizeInBits();
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
	"Expected byte aligned value types");

	unsigned Opcode = N.getOpcode();
	switch (Opcode) {
	case ISD::AND:
	case X86ISD::ANDNP: {
	// Attempt to decode as a per-byte mask.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	SDValue N0 = N.getOperand(0);
	SDValue N1 = N.getOperand(1);
	bool IsAndN = (X86ISD::ANDNP == Opcode);
	uint64_t ZeroMask = IsAndN ? 255 : 0;
	if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
	return false;
	for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
	if (UndefElts[i]) {
	Mask.push_back(SM_SentinelUndef);
	continue;
	}
	uint64_t ByteBits = EltBits[i].getZExtValue();
	if (ByteBits != 0 && ByteBits != 255)
	return false;
	Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
	}
	Ops.push_back(IsAndN ? N1 : N0);
	return true;
	}
	case ISD::SCALAR_TO_VECTOR: {
	// Match against a scalar_to_vector of an extract from a vector,
	// for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
	SDValue N0 = N.getOperand(0);
	SDValue SrcExtract;

	if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	N0.getOperand(0).getValueType() == VT) {
	SrcExtract = N0;
	} else if (N0.getOpcode() == ISD::AssertZext &&
	N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
	cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
	SrcExtract = N0.getOperand(0);
	assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
	} else if (N0.getOpcode() == ISD::AssertZext &&
	N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
	cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
	SrcExtract = N0.getOperand(0);
	assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
	}

	if (!SrcExtract \|\| !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
	return false;

	SDValue SrcVec = SrcExtract.getOperand(0);
	EVT SrcVT = SrcVec.getValueType();
	unsigned NumSrcElts = SrcVT.getVectorNumElements();
	unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;

	unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
	if (NumSrcElts <= SrcIdx)
	return false;

	Ops.push_back(SrcVec);
	Mask.push_back(SrcIdx);
	Mask.append(NumZeros, SM_SentinelZero);
	Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
	return true;
	}
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: {
	SDValue InVec = N.getOperand(0);
	SDValue InScl = N.getOperand(1);
	uint64_t InIdx = N.getConstantOperandVal(2);
	assert(InIdx < NumElts && "Illegal insertion index");

	// Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
	if (X86::isZeroNode(InScl)) {
	Ops.push_back(InVec);
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
	return true;
	}

	// Attempt to recognise a PINSR(ASSERTZEXT(PEXTR)) shuffle pattern.
	// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
	unsigned ExOp =
	(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
	if (InScl.getOpcode() != ISD::AssertZext \|\|
	InScl.getOperand(0).getOpcode() != ExOp)
	return false;

	SDValue ExVec = InScl.getOperand(0).getOperand(0);
	uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
	assert(ExIdx < NumElts && "Illegal extraction index");
	Ops.push_back(InVec);
	Ops.push_back(ExVec);
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
	return true;
	}
	case X86ISD::PACKSS: {
	// If we know input saturation won't happen we can treat this
	// as a truncation shuffle.
	if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt \|\|
	DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
	return false;

	Ops.push_back(N.getOperand(0));
	Ops.push_back(N.getOperand(1));
	for (unsigned i = 0; i != NumElts; ++i)
	Mask.push_back(i * 2);
	return true;
	}
	case X86ISD::VSHLI:
	case X86ISD::VSRLI: {
	uint64_t ShiftVal = N.getConstantOperandVal(1);
	// Out of range bit shifts are guaranteed to be zero.
	if (NumBitsPerElt <= ShiftVal) {
	Mask.append(NumElts, SM_SentinelZero);
	return true;
	}

	// We can only decode 'whole byte' bit shifts as shuffles.
	if ((ShiftVal % 8) != 0)
	break;

	uint64_t ByteShift = ShiftVal / 8;
	unsigned NumBytes = NumSizeInBits / 8;
	unsigned NumBytesPerElt = NumBitsPerElt / 8;
	Ops.push_back(N.getOperand(0));

	// Clear mask to all zeros and insert the shifted byte indices.
	Mask.append(NumBytes, SM_SentinelZero);

	if (X86ISD::VSHLI == Opcode) {
	for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j] = i + j - ByteShift;
	} else {
	for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
	for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
	Mask[i + j - ByteShift] = i + j;
	}
	return true;
	}
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case X86ISD::VZEXT: {
	// TODO - add support for VPMOVZX with smaller input vector types.
	SDValue Src = N.getOperand(0);
	MVT SrcVT = Src.getSimpleValueType();
	if (NumSizeInBits != SrcVT.getSizeInBits())
	break;
	DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
	Ops.push_back(Src);
	return true;
	}
	}

	return false;
	}

	/// Drops every shuffle input that no mask element refers to and renumbers the
	/// remaining mask indices so that they address the compacted input list.
	static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
	                                              SmallVectorImpl<int> &Mask) {
	  const int Width = Mask.size();
	  SmallVector<SDValue, 16> Kept;
	  for (int InputIdx = 0, NumInputs = Inputs.size(); InputIdx < NumInputs;
	       ++InputIdx) {
	    // Index window this input occupies once the earlier unused inputs have
	    // been removed.
	    const int Begin = Kept.size() * Width;
	    const int End = Begin + Width;

	    bool IsReferenced = false;
	    for (int M : Mask) {
	      if (Begin <= M && M < End) {
	        IsReferenced = true;
	        break;
	      }
	    }

	    if (IsReferenced) {
	      Kept.push_back(Inputs[InputIdx]);
	      continue;
	    }

	    // Nothing reads this input: slide every later index down by one input.
	    for (int &M : Mask)
	      if (M >= Begin)
	        M -= Width;
	  }
	  Inputs = Kept;
	}

	/// Decodes \p Op into a shuffle mask plus its source operands and then prunes
	/// the operands that the mask never references.
	///
	/// setTargetShuffleZeroElements is tried first; getFauxShuffleMask is only
	/// consulted when that fails. On success the inputs and mask are compacted
	/// with resolveTargetShuffleInputsAndMask.
	/// Returns true if the target shuffle mask was decoded.
	static bool resolveTargetShuffleInputs(SDValue Op,
	                                       SmallVectorImpl<SDValue> &Inputs,
	                                       SmallVectorImpl<int> &Mask,
	                                       SelectionDAG &DAG) {
	  const bool Decoded = setTargetShuffleZeroElements(Op, Mask, Inputs) ||
	                       getFauxShuffleMask(Op, Mask, Inputs, DAG);
	  if (Decoded)
	    resolveTargetShuffleInputsAndMask(Inputs, Mask);
	  return Decoded;
	}

	/// Returns the scalar element that will make up the ith
	/// element of the result of the vector shuffle.
	///
	/// \param N     Node producing the vector to look into.
	/// \param Index Element of that vector whose scalar source is wanted.
	/// \param Depth Current recursion depth; the search gives up at depth 6.
	/// \returns the scalar feeding element \p Index, an UNDEF/zero constant when
	/// the shuffle mask says so, or an empty SDValue when it cannot be found.
	static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
	                                   unsigned Depth) {
	  if (Depth == 6)
	    return SDValue(); // Limit search depth.

	  SDValue V = SDValue(N, 0);
	  EVT VT = V.getValueType();
	  unsigned Opcode = V.getOpcode();

	  // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
	  if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
	    int Elt = SV->getMaskElt(Index);

	    if (Elt < 0)
	      return DAG.getUNDEF(VT.getVectorElementType());

	    // Mask values below NumElems pick from operand 0, the rest from operand 1.
	    unsigned NumElems = VT.getVectorNumElements();
	    SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
	                                         : SV->getOperand(1);
	    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth + 1);
	  }

	  // Recurse into target specific vector shuffles to find scalars.
	  if (isTargetShuffle(Opcode)) {
	    MVT ShufVT = V.getSimpleValueType();
	    MVT ShufSVT = ShufVT.getVectorElementType();
	    int NumElems = (int)ShufVT.getVectorNumElements();
	    SmallVector<int, 16> ShuffleMask;
	    SmallVector<SDValue, 16> ShuffleOps;
	    bool IsUnary;

	    if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
	      return SDValue();

	    int Elt = ShuffleMask[Index];
	    if (Elt == SM_SentinelZero)
	      return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
	                                 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
	    if (Elt == SM_SentinelUndef)
	      return DAG.getUNDEF(ShufSVT);

	    assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
	    SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
	    return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
	                               Depth + 1);
	  }

	  // Actual nodes that may contain scalar elements
	  if (Opcode == ISD::BITCAST) {
	    V = V.getOperand(0);
	    EVT SrcVT = V.getValueType();
	    unsigned NumElems = VT.getVectorNumElements();

	    // Only look through bitcasts that keep the element count, so that Index
	    // still names the same lane in the source.
	    if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
	      return SDValue();
	  }

	  if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
	    return (Index == 0) ? V.getOperand(0)
	                        : DAG.getUNDEF(VT.getVectorElementType());

	  if (V.getOpcode() == ISD::BUILD_VECTOR)
	    return V.getOperand(Index);

	  return SDValue();
	}

	/// Custom lower build_vector of v16i8.
	///
	/// \param Op         The build_vector node; its operands are the 16 bytes.
	/// \param NonZeros   Bitmask; bit i set means operand i is treated as non-zero.
	/// \param NumNonZero Number of non-zero operands.
	/// \param NumZero    Non-zero when the build vector contains zero elements.
	/// \returns the lowered vector, or an empty SDValue when more than 8 bytes are
	/// non-zero and SSE4.1 is unavailable.
	static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (NumNonZero > 8 && !Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  SDValue V;
	  bool First = true;

	  // SSE4.1 - use PINSRB to insert each byte directly.
	  if (Subtarget.hasSSE41()) {
	    for (unsigned i = 0; i < 16; ++i) {
	      bool IsNonZero = (NonZeros & (1u << i)) != 0;
	      if (IsNonZero) {
	        // If the build vector contains zeros or our first insertion is not the
	        // first index then insert into zero vector to break any register
	        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
	        if (First) {
	          First = false;
	          if (NumZero || 0 != i)
	            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
	          else {
	            assert(0 == i && "Expected insertion into zero-index");
	            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	            V = DAG.getBitcast(MVT::v16i8, V);
	            continue;
	          }
	        }
	        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
	                        Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	      }
	    }

	    return V;
	  }

	  // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
	  for (unsigned i = 0; i < 16; ++i) {
	    bool ThisIsNonZero = (NonZeros & (1u << i)) != 0;
	    if (ThisIsNonZero && First) {
	      if (NumZero)
	        V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
	      else
	        V = DAG.getUNDEF(MVT::v8i16);
	      First = false;
	    }

	    // Work is done on odd indices, once both bytes of a 16-bit pair are known.
	    if ((i & 1) != 0) {
	      // FIXME: Investigate extending to i32 instead of just i16.
	      // FIXME: Investigate combining the first 4 bytes as a i32 instead.
	      SDValue ThisElt, LastElt;
	      bool LastIsNonZero = (NonZeros & (1u << (i - 1))) != 0;
	      if (LastIsNonZero) {
	        LastElt =
	            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
	      }
	      if (ThisIsNonZero) {
	        // The odd byte forms the high half of the 16-bit pair.
	        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
	        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
	                              DAG.getConstant(8, dl, MVT::i8));
	        if (LastIsNonZero)
	          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
	      } else
	        ThisElt = LastElt;

	      if (ThisElt) {
	        if (1 == i) {
	          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
	                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
	          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	          V = DAG.getBitcast(MVT::v8i16, V);
	        } else {
	          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
	                          DAG.getIntPtrConstant(i / 2, dl));
	        }
	      }
	    }
	  }

	  return DAG.getBitcast(MVT::v16i8, V);
	}

	/// Custom lower build_vector of v8i16.
	///
	/// \param Op         The build_vector node; its operands are the 8 words.
	/// \param NonZeros   Bitmask; bit i set means operand i is treated as non-zero.
	/// \param NumNonZero Number of non-zero operands.
	/// \param NumZero    Non-zero when the build vector contains zero elements.
	/// \returns the lowered vector, or an empty SDValue when more than 4 words are
	/// non-zero and SSE4.1 is unavailable.
	static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
	                                     unsigned NumNonZero, unsigned NumZero,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (NumNonZero > 4 && !Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  SDValue V;
	  bool First = true;
	  for (unsigned i = 0; i < 8; ++i) {
	    bool IsNonZero = (NonZeros & (1u << i)) != 0;
	    if (IsNonZero) {
	      // If the build vector contains zeros or our first insertion is not the
	      // first index then insert into zero vector to break any register
	      // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
	      if (First) {
	        First = false;
	        if (NumZero || 0 != i)
	          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
	        else {
	          assert(0 == i && "Expected insertion into zero-index");
	          V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
	          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
	          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
	          V = DAG.getBitcast(MVT::v8i16, V);
	          continue;
	        }
	      }
	      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
	                      Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	    }
	  }

	  return V;
	}

	/// Custom lower build_vector of v4i32 or v4f32.
	///
	/// Handles build_vectors whose elements are either zeroable (undef or zero)
	/// or EXTRACT_VECTOR_ELT nodes with a constant index from a 128-bit vector.
	/// Such a node is lowered either to a shuffle that blends one source with a
	/// zero vector, or (with SSE4.1) to an X86ISD::INSERTPS node.
	/// \returns the lowered value, or an empty SDValue if neither form applies.
	static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // Find all zeroable elements.
	  std::bitset<4> Zeroable;
	  for (int i = 0; i < 4; ++i) {
	    SDValue Elt = Op->getOperand(i);
	    Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
	  }
	  assert(Zeroable.size() - Zeroable.count() > 1 &&
	         "We expect at least two non-zero elements!");

	  // We only know how to deal with build_vector nodes where elements are either
	  // zeroable or extract_vector_elt with constant index.
	  SDValue FirstNonZero;
	  unsigned FirstNonZeroIdx = 0;
	  for (unsigned i = 0; i < 4; ++i) {
	    if (Zeroable[i])
	      continue;
	    SDValue Elt = Op->getOperand(i);
	    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
	        !isa<ConstantSDNode>(Elt.getOperand(1)))
	      return SDValue();
	    // Make sure that this node is extracting from a 128-bit vector.
	    MVT VT = Elt.getOperand(0).getSimpleValueType();
	    if (!VT.is128BitVector())
	      return SDValue();
	    if (!FirstNonZero.getNode()) {
	      FirstNonZero = Elt;
	      FirstNonZeroIdx = i;
	    }
	  }

	  assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
	  SDValue V1 = FirstNonZero.getOperand(0);
	  MVT VT = V1.getSimpleValueType();

	  // See if this build_vector can be lowered as a blend with zero.
	  SDValue Elt;
	  unsigned EltMaskIdx = 0, EltIdx;
	  int Mask[4];
	  for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
	    if (Zeroable[EltIdx]) {
	      // The zero vector will be on the right hand side.
	      Mask[EltIdx] = EltIdx + 4;
	      continue;
	    }

	    Elt = Op->getOperand(EltIdx);
	    // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
	    EltMaskIdx = Elt.getConstantOperandVal(1);
	    // Stop at the first element that is not an in-place lane of V1; it
	    // becomes the INSERTPS candidate below.
	    if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
	      break;
	    Mask[EltIdx] = EltIdx;
	  }

	  if (EltIdx == 4) {
	    // Let the shuffle legalizer deal with blend operations.
	    SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
	    if (V1.getSimpleValueType() != VT)
	      V1 = DAG.getBitcast(VT, V1);
	    return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
	  }

	  // See if we can lower this build_vector to a INSERTPS.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDValue V2 = Elt.getOperand(0);
	  // If the out-of-place element is the very first non-zero one, V1 has not
	  // really been chosen yet; pick it from the remaining elements.
	  if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
	    V1 = SDValue();

	  bool CanFold = true;
	  for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
	    if (Zeroable[i])
	      continue;

	    SDValue Current = Op->getOperand(i);
	    SDValue SrcVector = Current->getOperand(0);
	    if (!V1.getNode())
	      V1 = SrcVector;
	    CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
	  }

	  if (!CanFold)
	    return SDValue();

	  assert(V1.getNode() && "Expected at least two non-zero elements!");
	  if (V1.getSimpleValueType() != MVT::v4f32)
	    V1 = DAG.getBitcast(MVT::v4f32, V1);
	  if (V2.getSimpleValueType() != MVT::v4f32)
	    V2 = DAG.getBitcast(MVT::v4f32, V2);

	  // Ok, we can emit an INSERTPS instruction.
	  unsigned ZMask = Zeroable.to_ulong();

	  // Immediate layout as built here: source lane in bits 7:6, destination
	  // lane in bits 5:4, zero mask in the low bits.
	  unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
	  assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	  SDLoc DL(Op);
	  SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
	                               DAG.getIntPtrConstant(InsertPSMask, DL));
	  return DAG.getBitcast(VT, Result);
	}

	/// Builds a whole-vector byte shift of \p SrcOp by \p NumBits bits.
	///
	/// The source is viewed as v16i8, shifted with X86ISD::VSHLDQ (when
	/// \p isLeft) or X86ISD::VSRLDQ, and the result is cast back to \p VT.
	/// \p VT must be a 128-bit vector and \p NumBits a multiple of 8.
	static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
	                         SelectionDAG &DAG, const TargetLowering &TLI,
	                         const SDLoc &dl) {
	  assert(VT.is128BitVector() && "Unknown type for VShift");

	  const MVT ByteVecVT = MVT::v16i8;
	  unsigned ShiftOpc = X86ISD::VSRLDQ;
	  if (isLeft)
	    ShiftOpc = X86ISD::VSHLDQ;

	  SDValue Bytes = DAG.getBitcast(ByteVecVT, SrcOp);
	  MVT AmountVT = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	  assert(NumBits % 8 == 0 && "Only support byte sized shifts");
	  SDValue Amount = DAG.getConstant(NumBits / 8, dl, AmountVT);
	  SDValue Shifted = DAG.getNode(ShiftOpc, dl, ByteVecVT, Bytes, Amount);
	  return DAG.getBitcast(VT, Shifted);
	}

	/// Tries to turn a splat of a scalar i32/f32 stack load into a full-width
	/// vector load followed by a splat shuffle.
	///
	/// \param SrcOp The scalar being splatted; only a normal, non-volatile load
	///              from a frame index (optionally plus a constant) is handled.
	/// \param VT    The vector type being built; its byte size is the alignment
	///              required of the widened load.
	/// \returns the shuffle of the widened load, or an empty SDValue if the load
	/// cannot be widened.
	static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
	                                      SelectionDAG &DAG) {

	  // Check if the scalar load can be widened into a vector load. And if
	  // the address is "base + cst" see if the cst can be "absorbed" into
	  // the shuffle mask.
	  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
	    SDValue Ptr = LD->getBasePtr();
	    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
	      return SDValue();
	    EVT PVT = LD->getValueType(0);
	    if (PVT != MVT::i32 && PVT != MVT::f32)
	      return SDValue();

	    int FI = -1;
	    int64_t Offset = 0;
	    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
	      FI = FINode->getIndex();
	    } else if (DAG.isBaseWithConstantOffset(Ptr) &&
	               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
	      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
	      Offset = Ptr.getConstantOperandVal(1);
	      Ptr = Ptr.getOperand(0);
	    } else {
	      return SDValue();
	    }

	    // FIXME: 256-bit vector instructions don't require a strict alignment,
	    // improve this code to support it better.
	    unsigned RequiredAlign = VT.getSizeInBits() / 8;
	    SDValue Chain = LD->getChain();
	    // Make sure the stack object alignment is at least 16 or 32.
	    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
	      if (MFI.isFixedObjectIndex(FI)) {
	        // Can't change the alignment. FIXME: It's possible to compute
	        // the exact stack offset and reference FI + adjust offset instead.
	        // If someone really cares about this. That's the way to implement it.
	        return SDValue();
	      } else {
	        MFI.setObjectAlignment(FI, RequiredAlign);
	      }
	    }

	    // (Offset % 16 or 32) must be multiple of 4. Then address is then
	    // Ptr + (Offset & ~15).
	    if (Offset < 0)
	      return SDValue();
	    if ((Offset % RequiredAlign) & 3)
	      return SDValue();
	    int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
	    if (StartOffset) {
	      SDLoc DL(Ptr);
	      Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
	                        DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
	    }

	    // Lane of the widened load that holds the original scalar (4-byte lanes).
	    int EltNo = (Offset - StartOffset) >> 2;
	    unsigned NumElems = VT.getVectorNumElements();

	    EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
	    SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
	                             LD->getPointerInfo().getWithOffset(StartOffset));

	    // Every result lane reads the same source lane.
	    SmallVector<int, 8> Mask(NumElems, EltNo);

	    return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
	  }

	  return SDValue();
	}

	/// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
	/// elements can be replaced by a single large load which has the same value as
	/// a build_vector or insert_subvector whose loaded operands are 'Elts'.
	///
	/// Example: <load i32 a, load i32 a+4, zero, undef> -> zextload a
	///
	/// \param isAfterLegalize When true, only a legal full-width load is created
	///                        and the load-plus-clearing-shuffle form is skipped.
	/// \returns the replacement value, or an empty SDValue if no pattern matches.
	static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
	                                        const SDLoc &DL, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget,
	                                        bool isAfterLegalize) {
	  unsigned NumElems = Elts.size();

	  int LastLoadedElt = -1;
	  SmallBitVector LoadMask(NumElems, false);
	  SmallBitVector ZeroMask(NumElems, false);
	  SmallBitVector UndefMask(NumElems, false);

	  // For each element in the initializer, see if we've found a load, zero or an
	  // undef.
	  for (unsigned i = 0; i < NumElems; ++i) {
	    SDValue Elt = peekThroughBitcasts(Elts[i]);
	    if (!Elt.getNode())
	      return SDValue();

	    if (Elt.isUndef())
	      UndefMask[i] = true;
	    else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
	      ZeroMask[i] = true;
	    else if (ISD::isNON_EXTLoad(Elt.getNode())) {
	      LoadMask[i] = true;
	      LastLoadedElt = i;
	      // Each loaded element must be the correct fractional portion of the
	      // requested vector load.
	      if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
	        return SDValue();
	    } else
	      return SDValue();
	  }
	  assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
	         "Incomplete element masks");

	  // Handle Special Cases - all undef or undef/zero.
	  if (UndefMask.count() == NumElems)
	    return DAG.getUNDEF(VT);

	  // FIXME: Should we return this as a BUILD_VECTOR instead?
	  if ((ZeroMask | UndefMask).count() == NumElems)
	    return VT.isInteger() ? DAG.getConstant(0, DL, VT)
	                          : DAG.getConstantFP(0.0, DL, VT);

	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  int FirstLoadedElt = LoadMask.find_first();
	  SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
	  LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
	  EVT LDBaseVT = EltBase.getValueType();

	  // Consecutive loads can contain UNDEFS but not ZERO elements.
	  // Consecutive loads with UNDEFs and ZEROs elements require a
	  // an additional shuffle stage to clear the ZERO elements.
	  bool IsConsecutiveLoad = true;
	  bool IsConsecutiveLoadWithZeros = true;
	  for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
	    if (LoadMask[i]) {
	      SDValue Elt = peekThroughBitcasts(Elts[i]);
	      LoadSDNode *LD = cast<LoadSDNode>(Elt);
	      if (!DAG.areNonVolatileConsecutiveLoads(
	              LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
	              i - FirstLoadedElt)) {
	        IsConsecutiveLoad = false;
	        IsConsecutiveLoadWithZeros = false;
	        break;
	      }
	    } else if (ZeroMask[i]) {
	      IsConsecutiveLoad = false;
	    }
	  }

	  // Creates a load of type VT at LDBase's address, reusing its chain, pointer
	  // info, alignment and memory-operand flags.
	  auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
	    auto MMOFlags = LDBase->getMemOperand()->getFlags();
	    assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
	           "Cannot merge volatile loads.");
	    SDValue NewLd =
	        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
	                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
	    DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
	    return NewLd;
	  };

	  // LOAD - all consecutive load/undefs (must start/end with a load).
	  // If we have found an entire vector of loads and undefs, then return a large
	  // load of the entire vector width starting at the base pointer.
	  // If the vector contains zeros, then attempt to shuffle those elements.
	  if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
	      (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
	    assert(LDBase && "Did not find base load for merging consecutive loads");
	    EVT EltVT = LDBase->getValueType(0);
	    // Ensure that the input vector size for the merged loads matches the
	    // cumulative size of the input elements.
	    if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
	      return SDValue();

	    if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
	      return SDValue();

	    // Don't create 256-bit non-temporal aligned loads without AVX2 as these
	    // will lower to regular temporal loads and use the cache.
	    if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
	        VT.is256BitVector() && !Subtarget.hasInt256())
	      return SDValue();

	    if (IsConsecutiveLoad)
	      return CreateLoad(VT, LDBase);

	    // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
	    // vector and a zero vector to clear out the zero elements.
	    if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
	      SmallVector<int, 4> ClearMask(NumElems, -1);
	      for (unsigned i = 0; i < NumElems; ++i) {
	        if (ZeroMask[i])
	          ClearMask[i] = i + NumElems;
	        else if (LoadMask[i])
	          ClearMask[i] = i;
	      }
	      SDValue V = CreateLoad(VT, LDBase);
	      SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
	                                 : DAG.getConstantFP(0.0, DL, VT);
	      return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
	    }
	  }

	  // Bits spanned from the first to the last loaded element.
	  int LoadSize =
	      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();

	  // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
	  if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
	      (LoadSize == 32 || LoadSize == 64) &&
	      ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
	    MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
	                                      : MVT::getIntegerVT(LoadSize);
	    MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
	    if (TLI.isTypeLegal(VecVT)) {
	      SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
	      SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
	      SDValue ResNode =
	          DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
	                                  LDBase->getPointerInfo(),
	                                  LDBase->getAlignment(),
	                                  false /*isVolatile*/, true /*ReadMem*/,
	                                  false /*WriteMem*/);
	      DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
	      return DAG.getBitcast(VT, ResNode);
	    }
	  }

	  return SDValue();
	}

	/// Splits the low \p SplatBitSize bits of \p SplatValue into scalars of
	/// \p VT's element width and returns them as an IR constant vector.
	///
	/// Floating-point element types are rebuilt from their bit patterns as IEEE
	/// single (32-bit) or double (64-bit) values; everything else becomes an
	/// integer constant of the element width.
	static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
	                                   unsigned SplatBitSize, LLVMContext &C) {
	  const unsigned EltBits = VT.getScalarSizeInBits();
	  const unsigned EltCount = SplatBitSize / EltBits;
	  const bool IsFP = VT.isFloatingPoint();

	  SmallVector<Constant *, 32> Elts;
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    APInt Bits = SplatValue.extractBits(EltBits, EltBits * Idx);

	    if (!IsFP) {
	      Elts.push_back(
	          Constant::getIntegerValue(Type::getIntNTy(C, EltBits), Bits));
	      continue;
	    }

	    if (EltBits == 32) {
	      Elts.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Bits)));
	      continue;
	    }

	    assert(EltBits == 64 && "Unsupported floating point scalar size");
	    Elts.push_back(ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Bits)));
	  }
	  return ConstantVector::get(ArrayRef<Constant *>(Elts));
	}

	/// Returns true if any user of \p N is a target shuffle, looking through
	/// BITCAST users to their own users.
	static bool isUseOfShuffle(SDNode *N) {
	  for (auto *U : N->uses()) {
	    if (isTargetShuffle(U->getOpcode()))
	      return true;
	    // Look through bitcasts, but keep scanning the remaining users when this
	    // bitcast does not lead to a shuffle; returning its result directly would
	    // hide any shuffle user that comes later in the use list.
	    if (U->getOpcode() == ISD::BITCAST && isUseOfShuffle(U))
	      return true;
	  }
	  return false;
	}

	/// Attempt to use the vbroadcast instruction to generate a splat value
	/// from a splat BUILD_VECTOR which uses:
	/// a. A single scalar load, or a constant.
	/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
	///
	/// The VBROADCAST node is returned when a pattern is found,
	/// or SDValue() otherwise.
	static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  // VBROADCAST requires AVX.
	  // TODO: Splats could be generated for non-AVX CPUs using SSE
	  // instructions, but there's less potential gain for only 128-bit vectors.
	  if (!Subtarget.hasAVX())
	    return SDValue();

	  MVT VT = BVOp->getSimpleValueType(0);
	  SDLoc dl(BVOp);

	  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
	         "Unsupported vector type for broadcast.");

	  BitVector UndefElements;
	  SDValue Ld = BVOp->getSplatValue(&UndefElements);

	  // We need a splat of a single value to use broadcast, and it doesn't
	  // make any sense if the value is only in one element of the vector.
	  if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
	    APInt SplatValue, Undef;
	    unsigned SplatBitSize;
	    bool HasUndef;
	    // Check if this is a repeated constant pattern suitable for broadcasting.
	    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
	        SplatBitSize > VT.getScalarSizeInBits() &&
	        SplatBitSize < VT.getSizeInBits()) {
	      // Avoid replacing with broadcast when it's a use of a shuffle
	      // instruction to preserve the present custom lowering of shuffles.
	      if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
	        return SDValue();
	      // replace BUILD_VECTOR with broadcast of the repeated constants.
	      // AVX is guaranteed here by the early return at the top.
	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      LLVMContext *Ctx = DAG.getContext();
	      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
	      if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
	          !(SplatBitSize == 64 && Subtarget.is32Bit())) {
	        // Splatted value can fit in one INTEGER constant in constant pool.
	        // Load the constant and broadcast it.
	        MVT CVT = MVT::getIntegerVT(SplatBitSize);
	        Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
	        Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
	        SDValue CP = DAG.getConstantPool(C, PVT);
	        unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	        unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	        Ld = DAG.getLoad(
	            CVT, dl, DAG.getEntryNode(), CP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                     MVT::getVectorVT(CVT, Repeat), Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      } else if (SplatBitSize == 32 || SplatBitSize == 64) {
	        // Splatted value can fit in one FLOAT constant in constant pool.
	        // Load the constant and broadcast it.
	        // AVX have support for 32 and 64 bit broadcast for floats only.
	        // No 64bit integer in 32bit subtarget.
	        MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
	        // Lower the splat via APFloat directly, to avoid any conversion.
	        Constant *C =
	            SplatBitSize == 32
	                ? ConstantFP::get(*Ctx,
	                                  APFloat(APFloat::IEEEsingle(), SplatValue))
	                : ConstantFP::get(*Ctx,
	                                  APFloat(APFloat::IEEEdouble(), SplatValue));
	        SDValue CP = DAG.getConstantPool(C, PVT);
	        unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

	        unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	        Ld = DAG.getLoad(
	            CVT, dl, DAG.getEntryNode(), CP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
	                                     MVT::getVectorVT(CVT, Repeat), Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      } else if (SplatBitSize > 64) {
	        // Load the vector of constants and broadcast it.
	        MVT CVT = VT.getScalarType();
	        Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
	                                           *Ctx);
	        SDValue VCP = DAG.getConstantPool(VecC, PVT);
	        unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
	        unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
	        Ld = DAG.getLoad(
	            MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
	            MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	            Alignment);
	        SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
	        return DAG.getBitcast(VT, Brdcst);
	      }
	    }
	    return SDValue();
	  }

	  bool ConstSplatVal =
	      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);

	  // Make sure that all of the users of a non-constant load are from the
	  // BUILD_VECTOR node.
	  if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
	    return SDValue();

	  unsigned ScalarSize = Ld.getValueSizeInBits();
	  bool IsGE256 = (VT.getSizeInBits() >= 256);

	  // When optimizing for size, generate up to 5 extra bytes for a broadcast
	  // instruction to save 8 or more bytes of constant pool data.
	  // TODO: If multiple splats are generated to load the same constant,
	  // it may be detrimental to overall size. There needs to be a way to detect
	  // that condition to know if this is truly a size win.
	  bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

	  // Handle broadcasting a single constant scalar from the constant pool
	  // into a vector.
	  // On Sandybridge (no AVX2), it is still better to load a constant vector
	  // from the constant pool and not to broadcast it from a scalar.
	  // But override that restriction when optimizing for size.
	  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
	  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
	    EVT CVT = Ld.getValueType();
	    assert(!CVT.isVector() && "Must not broadcast a vector type");

	    // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
	    // For size optimization, also splat v2f64 and v2i64, and for size opt
	    // with AVX2, also splat i8 and i16.
	    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
	    if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
	      const Constant *C = nullptr;
	      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
	        C = CI->getConstantIntValue();
	      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
	        C = CF->getConstantFPValue();

	      assert(C && "Invalid constant type");

	      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	      SDValue CP =
	          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
	      unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
	      Ld = DAG.getLoad(
	          CVT, dl, DAG.getEntryNode(), CP,
	          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	          Alignment);

	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	    }
	  }

	  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

	  // Handle AVX2 in-register broadcasts.
	  if (!IsLoad && Subtarget.hasInt256() &&
	      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The scalar source must be a normal load.
	  if (!IsLoad)
	    return SDValue();

	  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
	      (Subtarget.hasVLX() && ScalarSize == 64))
	    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

	  // The integer check is needed for the 64-bit into 128-bit so it doesn't match
	  // double since there is no vbroadcastsd xmm
	  if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
	    if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
	      return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
	  }

	  // Unsupported broadcast.
	  return SDValue();
	}

	/// \brief Resolve an EXTRACT_VECTOR_ELT with a constant index to the vector
	/// and lane it ultimately reads.
	///
	/// When \p ExtractedFromVec is a vector shuffle whose mask entry for the
	/// extracted lane is undef or selects from the shuffle's first operand,
	/// \p ExtractedFromVec is rewritten to that operand and the mask entry is
	/// returned. Otherwise \p ExtractedFromVec is left alone and the original
	/// index is returned.
	static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
	                                         SDValue ExtIdx) {
	  const int OrigIdx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();

	  auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(ExtractedFromVec);
	  if (!Shuffle)
	    return OrigIdx;

	  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
	  // lowered this:
	  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
	  // to:
	  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
	  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
	  //                           undef)
	  //                       Constant<0>)
	  // In this case the vector is the extract_subvector expression and the index
	  // is 2, as specified by the shuffle.
	  SDValue Source = Shuffle->getOperand(0);
	  MVT SourceVT = Source.getSimpleValueType();
	  assert(SourceVT.getVectorElementType() ==
	         ExtractedFromVec.getSimpleValueType().getVectorElementType());

	  const int Redirected = Shuffle->getMaskElt(OrigIdx);
	  if (!isUndefOrInRange(Redirected, 0, SourceVT.getVectorNumElements()))
	    return OrigIdx;

	  ExtractedFromVec = Source;
	  return Redirected;
	}

	/// Lowers a build_vector whose operands are mostly constant-index extracts
	/// from at most two vectors of the result type into a vector shuffle of those
	/// sources, followed by INSERT_VECTOR_ELT nodes for the few operands that are
	/// not extracts.
	///
	/// Returns an empty SDValue when INSERT_VECTOR_ELT is neither legal nor custom
	/// for the type, an extract index is not constant, a source has a different
	/// type, more than two sources are involved, too many operands would need
	/// inserting, or no operand is an extract at all.
	static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  // Skip if insert_vec_elt is not supported.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
	    return SDValue();

	  SDLoc DL(Op);
	  const unsigned NumElems = Op.getNumOperands();

	  SDValue Src0;
	  SDValue Src1;
	  SmallVector<unsigned, 4> PendingInserts;
	  SmallVector<int, 8> ShufMask(NumElems, -1);

	  for (unsigned Lane = 0; Lane != NumElems; ++Lane) {
	    SDValue Operand = Op.getOperand(Lane);
	    const unsigned Opc = Operand.getOpcode();

	    // Undef lanes keep their -1 mask entry.
	    if (Opc == ISD::UNDEF)
	      continue;

	    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
	      // Quit if more than 1 elements need inserting.
	      if (PendingInserts.size() > 1)
	        return SDValue();

	      PendingInserts.push_back(Lane);
	      continue;
	    }

	    SDValue SrcVec = Operand.getOperand(0);
	    SDValue SrcIdx = Operand.getOperand(1);

	    // Quit if non-constant index.
	    if (!isa<ConstantSDNode>(SrcIdx))
	      return SDValue();
	    int Idx = getUnderlyingExtractedFromVec(SrcVec, SrcIdx);

	    // Quit if extracted from vector of different type.
	    if (SrcVec.getValueType() != VT)
	      return SDValue();

	    // The first source seen becomes the left shuffle operand.
	    if (!Src0.getNode())
	      Src0 = SrcVec;
	    if (SrcVec == Src0) {
	      ShufMask[Lane] = Idx;
	      continue;
	    }

	    // Any other source must be the single right shuffle operand.
	    if (!Src1.getNode())
	      Src1 = SrcVec;
	    if (SrcVec != Src1)
	      // Quit if more than 2 vectors to shuffle
	      return SDValue();
	    ShufMask[Lane] = Idx + NumElems;
	  }

	  if (!Src0.getNode())
	    return SDValue();

	  if (!Src1.getNode())
	    Src1 = DAG.getUNDEF(VT);
	  SDValue Result = DAG.getVectorShuffle(VT, DL, Src0, Src1, ShufMask);

	  for (unsigned Lane : PendingInserts)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Result,
	                         Op.getOperand(Lane), DAG.getIntPtrConstant(Lane, DL));

	  return Result;
	}

	/// Pack a constant BUILD_VECTOR of i1 elements into a scalar integer constant.
	///
	/// Bit N of the result is the low bit of operand N; undef operands contribute
	/// a zero bit. The integer type is as wide as \p Op, but at least 8 bits.
	static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
	  assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
	         Op.getScalarValueSizeInBits() == 1 &&
	         "Can not convert non-constant vector");

	  uint64_t PackedBits = 0;
	  unsigned NumOps = Op.getNumOperands();
	  for (unsigned BitNo = 0; BitNo != NumOps; ++BitNo) {
	    SDValue Elt = Op.getOperand(BitNo);
	    if (Elt.isUndef())
	      continue;
	    uint64_t LowBit = cast<ConstantSDNode>(Elt)->getZExtValue() & 0x1;
	    PackedBits |= LowBit << BitNo;
	  }

	  SDLoc dl(Op);
	  int WidthInBits = std::max((int)Op.getValueSizeInBits(), 8);
	  return DAG.getConstant(PackedBits, dl, MVT::getIntegerVT(WidthInBits));
	}
	// Lower a BUILD_VECTOR whose element type is i1.
	//
	// All-zero and all-one vectors become target constants, fully constant
	// vectors are materialized through an integer immediate, splats become a
	// select between all-ones and all-zeroes, and everything else starts from the
	// constant part and inserts the non-constant elements one at a time.
	SDValue
	X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.getVectorElementType() == MVT::i1) &&
	         "Unexpected type in LowerBUILD_VECTORvXi1!");

	  SDLoc dl(Op);
	  SDNode *Node = Op.getNode();

	  if (ISD::isBuildVectorAllZeros(Node))
	    return DAG.getTargetConstant(0, dl, VT);

	  if (ISD::isBuildVectorAllOnes(Node))
	    return DAG.getTargetConstant(1, dl, VT);

	  if (ISD::isBuildVectorOfConstantSDNodes(Node)) {
	    SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
	    if (Imm.getValueSizeInBits() != VT.getSizeInBits()) {
	      // The immediate is wider than the vector: view it as v8i1 and keep the
	      // low elements.
	      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	                         DAG.getIntPtrConstant(0, dl));
	    }
	    return DAG.getBitcast(VT, Imm);
	  }

	  // At least one element is not a constant. Gather the constant bits, the
	  // positions of the non-constant elements, and whether all defined elements
	  // are the same value.
	  uint64_t ConstBits = 0;
	  SmallVector<unsigned, 16> VariableElts;
	  bool AllSame = true;
	  bool SawConstant = false;
	  int FirstDefined = -1;
	  for (unsigned EltNo = 0, NumOps = Op.getNumOperands(); EltNo != NumOps;
	       ++EltNo) {
	    SDValue Elt = Op.getOperand(EltNo);
	    if (Elt.isUndef())
	      continue;

	    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
	      ConstBits |= (C->getZExtValue() & 0x1) << EltNo;
	      SawConstant = true;
	    } else {
	      VariableElts.push_back(EltNo);
	    }

	    if (FirstDefined < 0)
	      FirstDefined = EltNo;
	    else if (Elt != Op.getOperand(FirstDefined))
	      AllSame = false;
	  }

	  // A splat is " (select i1 splat_elt, all-ones, all-zeroes)".
	  if (AllSame)
	    return DAG.getSelect(dl, VT, Op.getOperand(FirstDefined),
	                         DAG.getConstant(1, dl, VT),
	                         DAG.getConstant(0, dl, VT));

	  // Build the starting vector from the constant elements (or undef when there
	  // are none).
	  SDValue Imm;
	  if (ConstBits != 0) {
	    MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
	    Imm = DAG.getConstant(ConstBits, dl, ImmVT);
	  } else {
	    Imm = SawConstant ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
	  }

	  SDValue Result;
	  if (Imm.getValueSizeInBits() != VT.getSizeInBits()) {
	    SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
	    Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	                         DAG.getIntPtrConstant(0, dl));
	  } else {
	    Result = DAG.getBitcast(VT, Imm);
	  }

	  // Insert the non-constant elements one by one.
	  for (unsigned InsertIdx : VariableElts)
	    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	                         Op.getOperand(InsertIdx),
	                         DAG.getIntPtrConstant(InsertIdx, dl));
	  return Result;
	}

	/// \brief Return true if \p N implements a horizontal binop and return the
	/// operands for the horizontal binop into V0 and V1.
	///
	/// This is a helper function of LowerToHorizontalOp().
	/// This function checks that the build_vector \p N in input implements a
	/// horizontal operation. Parameter \p Opcode defines the kind of horizontal
	/// operation to match.
	/// For example, if \p Opcode is equal to ISD::ADD, then this function
	/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
	/// is equal to ISD::SUB, then this function checks if this is a horizontal
	/// arithmetic sub.
	///
	/// This function only analyzes elements of \p N whose indices are
	/// in range [BaseIdx, LastIdx).
	///
	/// \p V0 and \p V1 are always overwritten. Both start out as UNDEF of the
	/// build_vector's type; \p V0 receives the vector extracted from by the first
	/// half of the analyzed range and \p V1 the vector extracted from by the
	/// second half. Either one stays UNDEF if every element of its half is UNDEF.
	/// When false is returned they may hold partially matched values.
	static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
	SelectionDAG &DAG,
	unsigned BaseIdx, unsigned LastIdx,
	SDValue &V0, SDValue &V1) {
	EVT VT = N->getValueType(0);

	assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
	assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
	"Invalid Vector in input!");

	// Only ADD/FADD may have their two extract operands in swapped order.
	bool IsCommutable = (Opcode == ISD::ADD \|\| Opcode == ISD::FADD);
	bool CanFold = true;
	// Index of the first element of the pair that the next build_vector
	// element is expected to combine; advances by 2 per element.
	unsigned ExpectedVExtractIdx = BaseIdx;
	unsigned NumElts = LastIdx - BaseIdx;
	V0 = DAG.getUNDEF(VT);
	V1 = DAG.getUNDEF(VT);

	// Check if N implements a horizontal binop.
	for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
	SDValue Op = N->getOperand(i + BaseIdx);

	// Skip UNDEFs.
	if (Op->isUndef()) {
	// Update the expected vector extract index.
	// At the midpoint of the range the expected index restarts at BaseIdx.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	ExpectedVExtractIdx += 2;
	continue;
	}

	// Each element must be a binop with the requested opcode and a single use.
	CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

	if (!CanFold)
	break;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
	CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	Op0.getOperand(0) == Op1.getOperand(0) &&
	isa<ConstantSDNode>(Op0.getOperand(1)) &&
	isa<ConstantSDNode>(Op1.getOperand(1)));
	if (!CanFold)
	break;

	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();

	// Elements in the first half of the range bind V0, those in the second
	// half bind V1. The first non-undef element of each half fixes the vector;
	// it must have the same type as the build_vector.
	if (i * 2 < NumElts) {
	if (V0.isUndef()) {
	V0 = Op0.getOperand(0);
	if (V0.getValueType() != VT)
	return false;
	}
	} else {
	if (V1.isUndef()) {
	V1 = Op0.getOperand(0);
	if (V1.getValueType() != VT)
	return false;
	}
	// Entering the second half: the expected index restarts at BaseIdx.
	if (i * 2 == NumElts)
	ExpectedVExtractIdx = BaseIdx;
	}

	// The pair must be (Expected, Expected+1) taken from the vector bound to
	// this half; the swapped order is accepted only for commutable opcodes.
	SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
	if (I0 == ExpectedVExtractIdx)
	CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
	else if (IsCommutable && I1 == ExpectedVExtractIdx) {
	// Try to match the following dag sequence:
	// (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
	CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
	} else
	CanFold = false;

	ExpectedVExtractIdx += 2;
	}

	return CanFold;
	}

	/// \brief Build a 256-bit horizontal add/sub out of two 128-bit horizontal
	/// binops glued together with a concat_vectors.
	///
	/// Helper of LowerToHorizontalOp(). \p V0 and \p V1 must be 256-bit vectors of
	/// the same type. Each is split into its lower and upper 128-bit half and the
	/// halves are fed to two \p X86Opcode nodes.
	///
	/// \p Mode selects how the halves are paired:
	///   - Mode set:
	///       HADD V0_LO, V0_HI
	///       HADD V1_LO, V1_HI
	///   - Mode clear:
	///       HADD V0_LO, V1_LO
	///       HADD V0_HI, V1_HI
	///
	/// If \p isUndefLO is set the lower 128 bits of the result are UNDEF; if
	/// \p isUndefHI is set the upper 128 bits of the result are UNDEF. A half is
	/// also left UNDEF when its inputs are UNDEF.
	static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
	                                     const SDLoc &DL, SelectionDAG &DAG,
	                                     unsigned X86Opcode, bool Mode,
	                                     bool isUndefLO, bool isUndefHI) {
	  MVT VT = V0.getSimpleValueType();
	  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
	         "Invalid nodes in input!");

	  unsigned HalfElts = VT.getVectorNumElements() / 2;
	  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
	  SDValue V0_HI = extract128BitVector(V0, HalfElts, DAG, DL);
	  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
	  SDValue V1_HI = extract128BitVector(V1, HalfElts, DAG, DL);
	  MVT HalfVT = V0_LO.getSimpleValueType();

	  SDValue LO = DAG.getUNDEF(HalfVT);
	  SDValue HI = DAG.getUNDEF(HalfVT);

	  // Operand pairing for the two 128-bit binops, depending on Mode.
	  const SDValue &LoRHS = Mode ? V0_HI : V1_LO;
	  const SDValue &HiLHS = Mode ? V1_LO : V0_HI;

	  // Don't emit a horizontal binop if its result is expected to be UNDEF.
	  bool LoInputsUndef =
	      Mode ? V0->isUndef() : (V0_LO->isUndef() && V1_LO->isUndef());
	  bool HiInputsUndef =
	      Mode ? V1->isUndef() : (V0_HI->isUndef() && V1_HI->isUndef());

	  if (!isUndefLO && !LoInputsUndef)
	    LO = DAG.getNode(X86Opcode, DL, HalfVT, V0_LO, LoRHS);
	  if (!isUndefHI && !HiInputsUndef)
	    HI = DAG.getNode(X86Opcode, DL, HalfVT, HiLHS, V1_HI);

	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
	}

	/// Returns true iff \p BV builds a vector with the result equivalent to
	/// the result of ADDSUB operation.
	/// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
	/// are written to the parameters \p Opnd0 and \p Opnd1.
	/// \p Opnd0 and \p Opnd1 are not modified when false is returned.
	static bool isAddSub(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	SDValue &Opnd0, SDValue &Opnd1) {

	MVT VT = BV->getSimpleValueType(0);
	// Accept only v4f32/v2f64 with SSE3, v8f32/v4f64 with AVX and
	// v16f32/v8f64 with AVX512.
	if ((!Subtarget.hasSSE3() \|\| (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
	(!Subtarget.hasAVX() \|\| (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
	(!Subtarget.hasAVX512() \|\| (VT != MVT::v16f32 && VT != MVT::v8f64)))
	return false;

	unsigned NumElts = VT.getVectorNumElements();
	SDValue InVec0 = DAG.getUNDEF(VT);
	SDValue InVec1 = DAG.getUNDEF(VT);

	// Odd-numbered elements in the input build vector are obtained from
	// adding two floating-point elements (ISD::FADD).
	// Even-numbered elements in the input build vector are obtained from
	// subtracting two floating-point elements (ISD::FSUB).
	unsigned ExpectedOpcode = ISD::FSUB;
	unsigned NextExpectedOpcode = ISD::FADD;
	bool AddFound = false;
	bool SubFound = false;

	for (unsigned i = 0, e = NumElts; i != e; ++i) {
	SDValue Op = BV->getOperand(i);

	// Skip 'undef' values.
	// The expected opcode still alternates across a skipped element.
	unsigned Opcode = Op.getOpcode();
	if (Opcode == ISD::UNDEF) {
	std::swap(ExpectedOpcode, NextExpectedOpcode);
	continue;
	}

	// Early exit if we found an unexpected opcode.
	if (Opcode != ExpectedOpcode)
	return false;

	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);

	// Try to match the following pattern:
	// (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
	// Early exit if we cannot match that sequence.
	if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT \|\|
	!isa<ConstantSDNode>(Op0.getOperand(1)) \|\|
	!isa<ConstantSDNode>(Op1.getOperand(1)) \|\|
	Op0.getOperand(1) != Op1.getOperand(1))
	return false;

	// Element i of the build vector must be computed from element i of both
	// source vectors.
	unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
	if (I0 != i)
	return false;

	// We found a valid add/sub node. Update the information accordingly.
	if (i & 1)
	AddFound = true;
	else
	SubFound = true;

	// Update InVec0 and InVec1.
	// The first matched element fixes the two source vectors; both must have
	// the type of the build vector.
	if (InVec0.isUndef()) {
	InVec0 = Op0.getOperand(0);
	if (InVec0.getSimpleValueType() != VT)
	return false;
	}
	if (InVec1.isUndef()) {
	InVec1 = Op1.getOperand(0);
	if (InVec1.getSimpleValueType() != VT)
	return false;
	}

	// Make sure that operands in input to each add/sub node always
	// come from a same pair of vectors.
	if (InVec0 != Op0.getOperand(0)) {
	// The operands of an FSUB cannot be exchanged.
	if (ExpectedOpcode == ISD::FSUB)
	return false;

	// FADD is commutable. Try to commute the operands
	// and then test again.
	std::swap(Op0, Op1);
	if (InVec0 != Op0.getOperand(0))
	return false;
	}

	if (InVec1 != Op1.getOperand(0))
	return false;

	// Update the pair of expected opcodes.
	std::swap(ExpectedOpcode, NextExpectedOpcode);
	}

	// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
	// At least one FADD element and one FSUB element are also required.
	if (!AddFound \|\| !SubFound \|\| InVec0.isUndef() \|\| InVec1.isUndef())
	return false;

	Opnd0 = InVec0;
	Opnd1 = InVec1;
	return true;
	}

	/// Decide whether a multiply feeding an already recognized ADDSUB(\p Opnd0,
	/// \p Opnd1) idiom can be fused into FMADDSUB(x, y, \p Opnd1).
	///
	/// The operands are rewritten only when true is returned: \p Opnd0 and
	/// \p Opnd1 then hold the two multiply operands and \p Opnd2 holds the former
	/// \p Opnd1.
	///
	/// The caller must already know that some SDNode can potentially be replaced
	/// by X86ISD::ADDSUB on \p Opnd0 and \p Opnd1, and must call this before doing
	/// that replacement. That is why exactly 2 uses of \p Opnd0 are expected.
	/// For example, given the IR
	///   %AB = fmul fast <2 x double> %A, %B
	///   %Sub = fsub fast <2 x double> %AB, %C
	///   %Add = fadd fast <2 x double> %AB, %C
	///   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
	///                           <2 x i32> <i32 0, i32 3>
	/// the def of %Addsub can potentially become
	///   %Addsub = X86ISD::ADDSUB %AB, %C
	/// which in turn can become
	///   %Addsub = FMADDSUB %A, %B, %C.
	///
	/// The check runs before the ADDSUB replacement because that replacement is
	/// sometimes illegal: e.g. 512-bit ADDSUB is not available, while 512-bit
	/// FMADDSUB is.
	static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                       SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
	  // The first ADDSUB operand has to be a multiply with exactly two uses, and
	  // the subtarget needs some form of FMA.
	  if (Opnd0.getOpcode() != ISD::FMUL)
	    return false;
	  if (Opnd0->use_size() != 2)
	    return false;
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // FIXME: These checks must match the similar ones in
	  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
	  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
	  // or MUL + ADDSUB to FMADDSUB.
	  const TargetOptions &Options = DAG.getTarget().Options;
	  if (Options.AllowFPOpFusion != FPOpFusion::Fast && !Options.UnsafeFPMath)
	    return false;

	  // Fusion is allowed: hand back (mul lhs, mul rhs, addend).
	  Opnd2 = Opnd1;
	  Opnd1 = Opnd0.getOperand(1);
	  Opnd0 = Opnd0.getOperand(0);

	  return true;
	}

	/// Fold a build_vector that computes an 'addsub' or 'fmaddsub' pattern into
	/// an X86ISD::ADDSUB or X86ISD::FMADDSUB node. Returns an empty SDValue when
	/// no such node can be produced.
	static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDValue Opnd0, Opnd1;
	  if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
	    return SDValue();

	  MVT VT = BV->getSimpleValueType(0);
	  SDLoc DL(BV);

	  // Prefer the fused form whenever the multiply can be folded in.
	  SDValue Opnd2;
	  bool CanFuse = isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2);
	  if (CanFuse)
	    return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);

	  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
	  // the ADDSUB idiom has been successfully recognized. There are no known
	  // X86 targets with 512-bit ADDSUB instructions!
	  // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
	  // recognition.
	  return VT.is512BitVector()
	             ? SDValue()
	             : DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
	}

	/// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
	static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = BV->getSimpleValueType(0);
	unsigned NumElts = VT.getVectorNumElements();
	unsigned NumUndefsLO = 0;
	unsigned NumUndefsHI = 0;
	unsigned Half = NumElts/2;

	// Count the number of UNDEF operands in the build_vector in input.
	for (unsigned i = 0, e = Half; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsLO++;

	for (unsigned i = Half, e = NumElts; i != e; ++i)
	if (BV->getOperand(i)->isUndef())
	NumUndefsHI++;

	// Early exit if this is either a build_vector of all UNDEFs or all the
	// operands but one are UNDEF.
	if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
	return SDValue();

	SDLoc DL(BV);
	SDValue InVec0, InVec1;
	if ((VT == MVT::v4f32 \|\| VT == MVT::v2f64) && Subtarget.hasSSE3()) {
	// Try to match an SSE3 float HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if ((VT == MVT::v4i32 \|\| VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
	// Try to match an SSSE3 integer HADD/HSUB.
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
	}

	if (!Subtarget.hasAVX())
	return SDValue();

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64)) {
	// Try to match an AVX horizontal add/sub of packed single/double
	// precision floating point values from 256-bit vectors.
	SDValue InVec2, InVec3;
	if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

	if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
	} else if (VT == MVT::v8i32 \|\| VT == MVT::v16i16) {
	// Try to match an AVX2 horizontal add/sub of signed integers.
	SDValue InVec2, InVec3;
	unsigned X86Opcode;
	bool CanFold = true;

	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
	isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
	((InVec0.isUndef() \|\| InVec2.isUndef()) \|\| InVec0 == InVec2) &&
	((InVec1.isUndef() \|\| InVec3.isUndef()) \|\| InVec1 == InVec3))
	X86Opcode = X86ISD::HSUB;
	else
	CanFold = false;

	if (CanFold) {
	// Fold this build_vector into a single horizontal add/sub.
	// Do this only if the target has AVX2.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

	// Do not try to expand this build_vector into a pair of horizontal
	// add/sub if we can emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into a pair of horizontal binop followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
	isUndefLO, isUndefHI);
	}
	}

	if ((VT == MVT::v8f32 \|\| VT == MVT::v4f64 \|\| VT == MVT::v8i32 \|\|
	VT == MVT::v16i16) && Subtarget.hasAVX()) {
	unsigned X86Opcode;
	if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HADD;
	else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::HSUB;
	else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHADD;
	else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
	X86Opcode = X86ISD::FHSUB;
	else
	return SDValue();

	// Don't try to expand this build_vector into a pair of horizontal add/sub
	// if we can simply emit a pair of scalar add/sub.
	if (NumUndefsLO + 1 == Half \|\| NumUndefsHI + 1 == Half)
	return SDValue();

	// Convert this build_vector into two horizontal add/sub followed by
	// a concat vector.
	bool isUndefLO = NumUndefsLO == Half;
	bool isUndefHI = NumUndefsHI == Half;
	return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
	isUndefLO, isUndefHI);
	}

	return SDValue();
	}

	/// Turn a BUILD_VECTOR whose elements are all the same bitwise operation
	/// (AND, XOR or OR) with a constant right-hand operand into that operation
	/// applied to two BUILD_VECTORs: one of the left operands and one of the
	/// constants.
	/// NOTE: This is not meant to grow into a general purpose vectorizer, but the
	/// later legalization + scalarization stages create enough scalar bit
	/// operations to make basic support worthwhile.
	static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
	                                       SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  MVT VT = Op->getSimpleValueType(0);
	  unsigned NumElems = VT.getVectorNumElements();

	  // Every element has to use the opcode of the first one.
	  // TODO: Should we allow UNDEFS and if so how many?
	  unsigned Opcode = Op->getOperand(0).getOpcode();
	  for (unsigned EltNo = 1; EltNo < NumElems; ++EltNo) {
	    if (Op->getOperand(EltNo).getOpcode() != Opcode)
	      return SDValue();
	  }

	  // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
	  if (Opcode != ISD::AND && Opcode != ISD::XOR && Opcode != ISD::OR)
	    return SDValue();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  if (!TLI.isOperationLegalOrPromote(Opcode, VT))
	    return SDValue();

	  SmallVector<SDValue, 4> LeftOperands, RightOperands;
	  for (SDValue Scalar : Op->ops()) {
	    SDValue ScalarRHS = Scalar.getOperand(1);

	    // We expect the canonicalized RHS operand to be the constant.
	    if (!isa<ConstantSDNode>(ScalarRHS))
	      return SDValue();
	    LeftOperands.push_back(Scalar.getOperand(0));
	    RightOperands.push_back(ScalarRHS);
	  }

	  SDValue LHS = DAG.getBuildVector(VT, DL, LeftOperands);
	  SDValue RHS = DAG.getBuildVector(VT, DL, RightOperands);
	  return DAG.getNode(Opcode, DL, VT, LHS, RHS);
	}

	/// Materialize a vector constant without loading it from memory. SSE/AVX only
	/// offer the bare minimum here, so this handles all-zeros and all-ones
	/// vectors; anything else yields an empty SDValue.
	static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDNode *Node = Op.getNode();

	  // Vectors containing all zeros can be matched by pxor and xorps.
	  if (ISD::isBuildVectorAllZeros(Node)) {
	    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
	    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
	    // The i32 vector types are therefore returned unchanged.
	    if (VT != MVT::v4i32 && VT != MVT::v8i32 && VT != MVT::v16i32)
	      return getZeroVector(VT, Subtarget, DAG, DL);

	    return Op;
	  }

	  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
	  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
	  // vpcmpeqd on 256-bit vectors.
	  if (!(Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Node)))
	    return SDValue();

	  if (VT != MVT::v4i32 && VT != MVT::v16i32 &&
	      !(VT == MVT::v8i32 && Subtarget.hasInt256()))
	    return getOnesVector(VT, DAG, DL);

	  return Op;
	}

	SDValue
	X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
	SDLoc dl(Op);

	MVT VT = Op.getSimpleValueType();
	MVT ExtVT = VT.getVectorElementType();
	unsigned NumElems = Op.getNumOperands();

	// Generate vectors for predicate vectors.
	if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
	return LowerBUILD_VECTORvXi1(Op, DAG);

	if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
	return VectorConstant;

	BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
	if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
	return AddSub;
	if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
	return HorizontalOp;
	if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
	return Broadcast;
	if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
	return BitOp;

	unsigned EVTBits = ExtVT.getSizeInBits();

	unsigned NumZero = 0;
	unsigned NumNonZero = 0;
	uint64_t NonZeros = 0;
	bool IsAllConstants = true;
	SmallSet<SDValue, 8> Values;
	for (unsigned i = 0; i < NumElems; ++i) {
	SDValue Elt = Op.getOperand(i);
	if (Elt.isUndef())
	continue;
	Values.insert(Elt);
	if (Elt.getOpcode() != ISD::Constant &&
	Elt.getOpcode() != ISD::ConstantFP)
	IsAllConstants = false;
	if (X86::isZeroNode(Elt))
	NumZero++;
	else {
	assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
	NonZeros \|= ((uint64_t)1 << i);
	NumNonZero++;
	}
	}

	// All undef vector. Return an UNDEF. All zero vectors were handled above.
	if (NumNonZero == 0)
	return DAG.getUNDEF(VT);

	// Special case for single non-zero, non-undef, element.
	if (NumNonZero == 1) {
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);

	// If this is an insertion of an i64 value on x86-32, and if the top bits of
	// the value are obviously zero, truncate the value to i32 and do the
	// insertion that way. Only do this if the value is non-constant or if the
	// value is a constant being inserted into element 0. It is cheaper to do
	// a constant pool load than it is to do a movd + shuffle.
	if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
	(!IsAllConstants \|\| Idx == 0)) {
	if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
	// Handle SSE only.
	assert(VT == MVT::v2i64 && "Expected an SSE value type!");
	MVT VecVT = MVT::v4i32;

	// Truncate the value (which may itself be a constant) to i32, and
	// convert it to a vector with movd (S2V+shuffle to zero extend).
	Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
	return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
	Item, Idx * 2, true, Subtarget, DAG));
	}
	}

	// If we have a constant or non-constant insertion into the low element of
	// a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
	// the rest of the elements. This will be matched as movd/movq/movss/movsd
	// depending on what the source datatype is.
	if (Idx == 0) {
	if (NumZero == 0)
	return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

	if (ExtVT == MVT::i32 \|\| ExtVT == MVT::f32 \|\| ExtVT == MVT::f64 \|\|
	(ExtVT == MVT::i64 && Subtarget.is64Bit())) {
	assert((VT.is128BitVector() \|\| VT.is256BitVector() \|\|
	VT.is512BitVector()) &&
	"Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
	return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}

	// We can't directly insert an i8 or i16 into a vector, so zero extend
	// it to i32 first.
	if (ExtVT == MVT::i16 \|\| ExtVT == MVT::i8) {
	Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
	if (VT.getSizeInBits() >= 256) {
	MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
	if (Subtarget.hasAVX()) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	} else {
	// Without AVX, we need to extend to a 128-bit vector and then
	// insert into the 256-bit vector.
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
	Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
	}
	} else {
	assert(VT.is128BitVector() && "Expected an SSE value type!");
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
	Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
	}
	return DAG.getBitcast(VT, Item);
	}
	}

	// Is it a vector logical left shift?
	if (NumElems == 2 && Idx == 1 &&
	X86::isZeroNode(Op.getOperand(0)) &&
	!X86::isZeroNode(Op.getOperand(1))) {
	unsigned NumBits = VT.getSizeInBits();
	return getVShift(true, VT,
	DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	VT, Op.getOperand(1)),
	NumBits/2, DAG, *this, dl);
	}

	if (IsAllConstants) // Otherwise, it's better to do a constpool load.
	return SDValue();

	// Otherwise, if this is a vector with i32 or f32 elements, and the element
	// is a non-constant being inserted into an element other than the low one,
	// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
	// movd/movss) to move this into the low element, then shuffle it into
	// place.
	if (EVTBits == 32) {
	Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
	return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
	}
	}

	// Splat is obviously ok. Let legalizer expand it to a shuffle.
	if (Values.size() == 1) {
	if (EVTBits == 32) {
	// Instead of a shuffle like this:
	// shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
	// Check if it's possible to issue this instead.
	// shuffle (vload ptr)), undef, <1, 1, 1, 1>
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue Item = Op.getOperand(Idx);
	if (Op.getNode()->isOnlyUserOf(Item.getNode()))
	return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
	}
	return SDValue();
	}

	// A vector full of immediates; various special cases are already
	// handled, so this is best done with a single constant-pool load.
	if (IsAllConstants)
	return SDValue();

	// See if we can use a vector load to get all of the elements.
	if (VT.is128BitVector() \|\| VT.is256BitVector() \|\| VT.is512BitVector()) {
	SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
	if (SDValue LD =
	EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
	return LD;
	}

	// For AVX-length vectors, build the individual 128-bit pieces and use
	// shuffles to put them in place.
	if (VT.is256BitVector() \|\| VT.is512BitVector()) {
	SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);

	EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

	// Build both the lower and upper subvector.
	SDValue Lower =
	DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
	SDValue Upper = DAG.getBuildVector(
	HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));

	// Recreate the wider vector with the lower and upper part.
	if (VT.is256BitVector())
	return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
	}

	// Let legalizer expand 2-wide build_vectors.
	if (EVTBits == 64) {
	if (NumNonZero == 1) {
	// One half is zero or undef.
	unsigned Idx = countTrailingZeros(NonZeros);
	SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
	Op.getOperand(Idx));
	return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
	}
	return SDValue();
	}

	// If element VT is < 32 bits, convert it to inserts into a zero vector.
	if (EVTBits == 8 && NumElems == 16)
	if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	if (EVTBits == 16 && NumElems == 8)
	if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
	DAG, Subtarget))
	return V;

	// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
	if (EVTBits == 32 && NumElems == 4)
	if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
	return V;

	// If element VT is == 32 bits, turn it into a number of shuffles.
	if (NumElems == 4 && NumZero > 0) {
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < 4; ++i) {
	bool isZero = !(NonZeros & (1ULL << i));
	if (isZero)
	Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
	else
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	}

	for (unsigned i = 0; i < 2; ++i) {
	switch ((NonZeros & (0x3 << i2)) >> (i2)) {
	default: break;
	case 0:
	Ops[i] = Ops[i*2]; // Must be a zero vector.
	break;
	case 1:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2+1], Ops[i2]);
	break;
	case 2:
	Ops[i] = getMOVL(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	case 3:
	Ops[i] = getUnpackl(DAG, dl, VT, Ops[i2], Ops[i2+1]);
	break;
	}
	}

	bool Reverse1 = (NonZeros & 0x3) == 2;
	bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
	int MaskVec[] = {
	Reverse1 ? 1 : 0,
	Reverse1 ? 0 : 1,
	static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
	static_cast<int>(Reverse2 ? NumElems : NumElems+1)
	};
	return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
	}

	if (Values.size() > 1 && VT.is128BitVector()) {
	// Check for a build vector from mostly shuffle plus few inserting.
	if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
	return Sh;

	// For SSE 4.1, use insertps to put the high elements into the low element.
	if (Subtarget.hasSSE41()) {
	SDValue Result;
	if (!Op.getOperand(0).isUndef())
	Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
	else
	Result = DAG.getUNDEF(VT);

	for (unsigned i = 1; i < NumElems; ++i) {
	if (Op.getOperand(i).isUndef()) continue;
	Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
	Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
	}
	return Result;
	}

	// Otherwise, expand into a number of unpckl*, start by extending each of
	// our (non-undef) elements to the full vector width with the element in the
	// bottom slot of the vector (which generates no code for SSE).
	SmallVector<SDValue, 8> Ops(NumElems);
	for (unsigned i = 0; i < NumElems; ++i) {
	if (!Op.getOperand(i).isUndef())
	Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
	else
	Ops[i] = DAG.getUNDEF(VT);
	}

	// Next, we iteratively mix elements, e.g. for v4f32:
	// Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
	// : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
	// Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
	for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
	// Generate scaled UNPCKL shuffle mask.
	SmallVector<int, 16> Mask;
	for(unsigned i = 0; i != Scale; ++i)
	Mask.push_back(i);
	for (unsigned i = 0; i != Scale; ++i)
	Mask.push_back(NumElems+i);
	Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

	for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
	Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2i], Ops[(2i)+1], Mask);
	}
	return Ops[0];
	}
	return SDValue();
	}

	/// Lower a 256-bit or 512-bit CONCAT_VECTORS by joining its sub-vectors with
	/// concat128BitVectors / concat256BitVectors.
	///
	/// A 256-bit result is formed from operands 0 and 1. A 512-bit result is
	/// formed either from four operands (paired into two halves first) or
	/// directly from operands 0 and 1.
	static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  const bool Is256 = ResVT.is256BitVector();

	  assert((Is256 || ResVT.is512BitVector()) &&
	         "Value type must be 256-/512-bit wide");

	  unsigned NumElems = ResVT.getVectorNumElements();
	  SDValue Sub0 = Op.getOperand(0);
	  SDValue Sub1 = Op.getOperand(1);

	  if (Is256)
	    return concat128BitVectors(Sub0, Sub1, ResVT, NumElems, DAG, dl);

	  // 512-bit result built from two sub-vectors.
	  if (Op.getNumOperands() != 4)
	    return concat256BitVectors(Sub0, Sub1, ResVT, NumElems, DAG, dl);

	  // 512-bit result built from four sub-vectors: form each half, then join.
	  MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	                                ResVT.getVectorNumElements() / 2);
	  SDValue Sub2 = Op.getOperand(2);
	  SDValue Sub3 = Op.getOperand(3);
	  SDValue LoHalf =
	      concat128BitVectors(Sub0, Sub1, HalfVT, NumElems / 2, DAG, dl);
	  SDValue HiHalf =
	      concat128BitVectors(Sub2, Sub3, HalfVT, NumElems / 2, DAG, dl);
	  return concat256BitVectors(LoHalf, HiHalf, ResVT, NumElems, DAG, dl);
	}

	// Return true if all the operands of the given CONCAT_VECTORS node are zeros
	// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
	static bool isExpandWithZeros(const SDValue &Op) {
	assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
	"Expand with zeros only possible in CONCAT_VECTORS nodes!");

	for (unsigned i = 1; i < Op.getNumOperands(); i++)
	if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
	return false;

	return true;
	}

	/// Look through a chain of "widen with zeros" nodes rooted at the i1-vector
	/// CONCAT_VECTORS \p Op.
	///
	/// Each step must be either an INSERT_SUBVECTOR into an all-zeros vector at
	/// index 0, or a CONCAT_VECTORS whose operands after the first are all zeros.
	/// If the node reached at the bottom of the chain satisfies
	/// isMaskedZeroUpperBitsvXi1, or is an AND with at least one operand that
	/// does, that node is returned. Otherwise an empty SDValue is returned.
	static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
	  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
	         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected node to check for type promotion!");

	  // Peel off zero-extending wrappers until some other opcode is reached.
	  for (;;) {
	    unsigned Opc = Op.getOpcode();

	    if (Opc == ISD::INSERT_SUBVECTOR) {
	      if (!ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) ||
	          Op.getConstantOperandVal(2) != 0)
	        return SDValue();
	      Op = Op.getOperand(1);
	      continue;
	    }

	    if (Opc == ISD::CONCAT_VECTORS) {
	      if (!isExpandWithZeros(Op))
	        return SDValue();
	      Op = Op.getOperand(0);
	      continue;
	    }

	    break;
	  }

	  // Accept the innermost node if it zeroes the upper bits itself, or if it is
	  // the masked ('and') form of such a node.
	  unsigned InnerOpc = Op.getOpcode();
	  if (isMaskedZeroUpperBitsvXi1(InnerOpc))
	    return Op;

	  if (InnerOpc == ISD::AND &&
	      (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
	       isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))
	    return Op;

	  return SDValue();
	}

	/// Lower a CONCAT_VECTORS node whose result is a vector of i1.
	///
	/// The number of operands must be a power of two. The cases are tried in
	/// this order:
	///  1. The node is a zero-extending type promotion recognised by
	///     isTypePromotionOfi1ZeroUpBits: emit a single INSERT_SUBVECTOR of the
	///     promoted node into an all-zeros vector at index 0.
	///  2. More than two operands: return undef if all operands are undef, a
	///     single INSERT_SUBVECTOR if exactly one is defined, otherwise split
	///     the operands into two half-width CONCAT_VECTORS and concatenate those.
	///  3. Exactly two operands: return the node unchanged when the result is at
	///     least 16 bits wide, otherwise express it with INSERT_SUBVECTOR nodes,
	///     using a zero or undef base vector where an operand is zero or undef.
	static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG & DAG) {
	SDLoc dl(Op);
	MVT ResVT = Op.getSimpleValueType();
	unsigned NumOfOperands = Op.getNumOperands();

	assert(isPowerOf2_32(NumOfOperands) &&
	"Unexpected number of operands in CONCAT_VECTORS");

	// If this node promotes - by concatenating zeroes - the type of the result
	// of a node with instruction that zeroes all upper (irrelevant) bits of the
	// output register, mark it as legal and catch the pattern in instruction
	// selection to avoid emitting extra instructions (for zeroing upper bits).
	if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
	// The same i64 zero constant serves as the splat element and as the
	// insertion index.
	SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
	SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
	ZeroC);
	}

	SDValue Undef = DAG.getUNDEF(ResVT);
	if (NumOfOperands > 2) {
	// Specialize the cases when all, or all but one, of the operands are undef.
	unsigned NumOfDefinedOps = 0;
	unsigned OpIdx = 0; // Index of the last defined (non-undef) operand seen.
	for (unsigned i = 0; i < NumOfOperands; i++)
	if (!Op.getOperand(i).isUndef()) {
	NumOfDefinedOps++;
	OpIdx = i;
	}
	if (NumOfDefinedOps == 0)
	return Undef;
	if (NumOfDefinedOps == 1) {
	// Insert the single defined operand at its element offset in the result.
	unsigned SubVecNumElts =
	Op.getOperand(OpIdx).getValueType().getVectorNumElements();
	SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
	Op.getOperand(OpIdx), IdxVal);
	}

	// General case: concatenate the lower and upper halves of the operand
	// list separately, then concatenate the two half-width results.
	MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
	ResVT.getVectorNumElements()/2);
	SmallVector<SDValue, 2> Ops;
	for (unsigned i = 0; i < NumOfOperands/2; i++)
	Ops.push_back(Op.getOperand(i));
	SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
	Ops.clear();
	for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
	Ops.push_back(Op.getOperand(i));
	SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
	}

	// 2 operands
	SDValue V1 = Op.getOperand(0);
	SDValue V2 = Op.getOperand(1);
	unsigned NumElems = ResVT.getVectorNumElements();
	assert(V1.getValueType() == V2.getValueType() &&
	V1.getValueType().getVectorNumElements() == NumElems/2 &&
	"Unexpected operands in CONCAT_VECTORS");

	if (ResVT.getSizeInBits() >= 16)
	return Op; // The operation is legal with KUNPCK

	bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
	bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
	SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
	if (IsZeroV1 && IsZeroV2)
	return ZeroVec;

	// Upper half undef or zero: only V1 needs inserting, at index 0.
	SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
	if (V2.isUndef())
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
	if (IsZeroV2)
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

	// Lower half undef or zero: only V2 needs inserting, at the midpoint.
	SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
	if (V1.isUndef())
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);

	if (IsZeroV1)
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

	// Both halves are live: insert V1 into undef, then V2 on top of that.
	V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
	return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
	}

	/// Entry point for lowering CONCAT_VECTORS.
	///
	/// Vectors of i1 are handed to LowerCONCAT_VECTORSvXi1. Everything else must
	/// be a 256-bit vector with two operands or a 512-bit vector with two or four
	/// operands, and is handed to LowerAVXCONCAT_VECTORS.
	static SDValue LowerCONCAT_VECTORS(SDValue Op,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  const bool IsMaskVector = VT.getVectorElementType() == MVT::i1;

	  if (!IsMaskVector) {
	    assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
	           (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
	                                    Op.getNumOperands() == 4)));

	    // A 256-bit vector is made of two 128-bit pieces; a 512-bit vector is
	    // made of two 256-bit pieces or four 128-bit pieces.
	    return LowerAVXCONCAT_VECTORS(Op, DAG);
	  }

	  return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
	}

	//===----------------------------------------------------------------------===//
	// Vector shuffle lowering
	//
	// This is an experimental code path for lowering vector shuffles on x86. It is
	// designed to handle arbitrary vector shuffles and blends, gracefully
	// degrading performance as necessary. It works hard to recognize idiomatic
	// shuffles and lower them to optimal instruction patterns without leaving
	// a framework that allows reasonably efficient handling of all vector shuffle
	// patterns.
	//===----------------------------------------------------------------------===//

	/// \brief Return true if a single-input shuffle mask moves nothing.
	///
	/// Every entry of \p Mask must be either negative (undef) or equal to its
	/// own position for the mask to count as a no-op. The mask is expected to be
	/// the single-input form used by the X86 shuffle instructions rather than a
	/// fully general ShuffleVectorSDNode mask.
	static bool isNoopShuffleMask(ArrayRef<int> Mask) {
	  int Pos = 0;
	  for (int M : Mask) {
	    assert(M >= -1 && "Out of bound mask element!");
	    const bool StaysPut = M < 0 || M == Pos;
	    if (!StaysPut)
	      return false;
	    ++Pos;
	  }
	  return true;
	}

	/// \brief Return true if any defined mask element reads from a different
	/// 128-bit lane than the one it is written to.
	///
	/// Indices into the second input are folded onto the first (modulo the mask
	/// size) before the source lane is computed. Negative entries are ignored.
	static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
	  const int EltsPerLane = 128 / VT.getScalarSizeInBits();
	  const int NumElts = Mask.size();

	  for (int Pos = 0; Pos < NumElts; ++Pos) {
	    const int M = Mask[Pos];
	    if (M < 0)
	      continue;

	    const int SrcLane = (M % NumElts) / EltsPerLane;
	    const int DstLane = Pos / EltsPerLane;
	    if (SrcLane != DstLane)
	      return true;
	  }
	  return false;
	}

	/// \brief Test whether a shuffle mask performs the same lane-relative
	/// shuffle in every lane of \p LaneSizeInBits bits.
	///
	/// A mask that passes is necessarily not lane-crossing, though it may still
	/// blend in elements from the same lane of the second input.
	///
	/// \p RepeatedMask is reset to one lane's worth of -1 entries and then filled
	/// with the per-lane pattern. Entries taken from the second input are
	/// remapped to the range [LaneSize, 2*LaneSize). Slots that are undef in
	/// every lane stay -1. On a false return \p RepeatedMask may be partially
	/// filled.
	static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                  ArrayRef<int> Mask,
	                                  SmallVectorImpl<int> &RepeatedMask) {
	  const auto EltsPerLane = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(EltsPerLane, -1);

	  const int NumElts = Mask.size();
	  for (int Pos = 0; Pos < NumElts; ++Pos) {
	    const int M = Mask[Pos];
	    assert(M == SM_SentinelUndef || M >= 0);
	    if (M < 0)
	      continue;

	    // An entry that reads from another lane cannot be modelled.
	    if ((M % NumElts) / EltsPerLane != Pos / EltsPerLane)
	      return false;

	    // Reduce the index to a lane-local one; second-input indices are
	    // rebased to start at EltsPerLane instead of NumElts.
	    int LocalM = M % EltsPerLane;
	    if (M >= NumElts)
	      LocalM += EltsPerLane;

	    int &Slot = RepeatedMask[Pos % EltsPerLane];
	    if (Slot < 0) {
	      // First defined entry seen for this slot of the lane.
	      Slot = LocalM;
	    } else if (Slot != LocalM) {
	      // Another lane already fixed a different value for this slot.
	      return false;
	    }
	  }
	  return true;
	}

	/// Convenience wrapper: isRepeatedShuffleMask with a lane width of 128 bits.
	static bool
	is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneBits = 128;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
	}

	/// Convenience wrapper: isRepeatedShuffleMask with a lane width of 256 bits.
	static bool
	is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
	                                SmallVectorImpl<int> &RepeatedMask) {
	  const unsigned LaneBits = 256;
	  return isRepeatedShuffleMask(LaneBits, VT, Mask, RepeatedMask);
	}

	/// Test whether a target shuffle mask performs the same lane-relative shuffle
	/// in every lane of \p LaneSizeInBits bits.
	///
	/// This differs from isRepeatedShuffleMask in that SM_SentinelZero entries
	/// are significant: a slot that is zero in one lane may only be zero or
	/// undef in the others. \p RepeatedMask is reset to one lane of
	/// SM_SentinelUndef and then filled with the per-lane pattern, with
	/// second-input indices remapped to [LaneSize, 2*LaneSize).
	static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
	                                        ArrayRef<int> Mask,
	                                        SmallVectorImpl<int> &RepeatedMask) {
	  const int EltsPerLane = LaneSizeInBits / VT.getScalarSizeInBits();
	  RepeatedMask.assign(EltsPerLane, SM_SentinelUndef);

	  const int NumElts = Mask.size();
	  for (int Pos = 0; Pos < NumElts; ++Pos) {
	    const int M = Mask[Pos];
	    assert(isUndefOrZero(M) || (M >= 0));
	    if (M == SM_SentinelUndef)
	      continue;

	    int &Slot = RepeatedMask[Pos % EltsPerLane];

	    if (M == SM_SentinelZero) {
	      // A zero is only compatible with a slot that is still undef or
	      // already zero.
	      if (!isUndefOrZero(Slot))
	        return false;
	      Slot = SM_SentinelZero;
	      continue;
	    }

	    // An entry that reads from another lane cannot be modelled.
	    if ((M % NumElts) / EltsPerLane != Pos / EltsPerLane)
	      return false;

	    // Reduce the index to a lane-local one; second-input indices are
	    // rebased to start at EltsPerLane instead of NumElts.
	    const int LocalM =
	        M < NumElts ? M % EltsPerLane : M % EltsPerLane + EltsPerLane;

	    if (Slot == SM_SentinelUndef) {
	      // First defined entry seen for this slot of the lane.
	      Slot = LocalM;
	    } else if (Slot != LocalM) {
	      // Conflicts with the value (or zero) recorded by an earlier lane.
	      return false;
	    }
	  }
	  return true;
	}

	/// \brief Check whether a shuffle mask is equivalent to an expected pattern.
	///
	/// This is a fast way to test a shuffle mask against a fixed pattern:
	///
	///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
	///
	/// The two masks must have the same width. An element of \p Mask matches if
	/// it is -1 (undef) or equals the corresponding element of \p ExpectedMask.
	/// A differing element is still accepted when the inputs it selects from are
	/// both BUILD_VECTOR nodes and the two referenced build-vector operands are
	/// the same value.
	static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
	                                ArrayRef<int> ExpectedMask) {
	  if (Mask.size() != ExpectedMask.size())
	    return false;

	  const int Size = Mask.size();

	  // Null when the corresponding input is not a build vector.
	  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
	  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

	  for (int i = 0; i < Size; ++i) {
	    const int M = Mask[i];
	    const int Want = ExpectedMask[i];
	    assert(M >= -1 && "Out of bound mask element!");
	    if (M < 0 || M == Want)
	      continue;

	    // The indices differ; they can still be equivalent if both refer to the
	    // same scalar through build-vector inputs.
	    auto *MaskBV = M < Size ? BV1 : BV2;
	    auto *ExpectedBV = Want < Size ? BV1 : BV2;
	    if (!MaskBV || !ExpectedBV)
	      return false;
	    if (MaskBV->getOperand(M % Size) != ExpectedBV->getOperand(Want % Size))
	      return false;
	  }

	  return true;
	}

	/// Check whether a target shuffle mask matches an explicit pattern.
	///
	/// Both masks must have the same width. A SM_SentinelUndef entry in \p Mask
	/// matches anything. Any other entry, including SM_SentinelZero, must equal
	/// the corresponding entry of \p ExpectedMask. A negative entry other than
	/// the two sentinels never matches.
	static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
	                                      ArrayRef<int> ExpectedMask) {
	  const int Size = Mask.size();
	  if (Size != (int)ExpectedMask.size())
	    return false;

	  for (int i = 0; i < Size; ++i) {
	    const int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;

	    const bool IsUnknownSentinel = M < 0 && M != SM_SentinelZero;
	    if (IsUnknownSentinel || M != ExpectedMask[i])
	      return false;
	  }

	  return true;
	}

	// Combine a general DAG shuffle mask with a per-element zeroable bit mask to
	// form a target shuffle mask: undef entries stay SM_SentinelUndef, entries
	// whose Zeroable bit is set become SM_SentinelZero, and all other entries
	// keep their index.
	static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
	                                                    const APInt &Zeroable) {
	  const int NumElts = Mask.size();
	  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

	  SmallVector<int, 64> TargetMask;
	  TargetMask.reserve(NumElts);
	  for (int i = 0; i < NumElts; ++i) {
	    const int M = Mask[i];
	    if (M == SM_SentinelUndef) {
	      TargetMask.push_back(SM_SentinelUndef);
	      continue;
	    }
	    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
	    TargetMask.push_back(Zeroable[i] ? SM_SentinelZero : M);
	  }
	  return TargetMask;
	}

	// Return true if VT is v8i32 or v8f32 and Mask matches the binary unpack-low
	// or unpack-high pattern generated for v8i16, i.e. the mask is suitable for
	// the AVX vpunpcklwd or vpunpckhwd instructions.
	static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
	  const bool IsV8x32 = (VT == MVT::v8i32 || VT == MVT::v8f32);
	  if (!IsV8x32)
	    return false;

	  SmallVector<int, 8> LoPattern;
	  createUnpackShuffleMask(MVT::v8i16, LoPattern, /* Lo = */ true,
	                          /* Unary = */ false);
	  SmallVector<int, 8> HiPattern;
	  createUnpackShuffleMask(MVT::v8i16, HiPattern, /* Lo = */ false,
	                          /* Unary = */ false);

	  if (isTargetShuffleEquivalent(Mask, LoPattern))
	    return true;
	  return isTargetShuffleEquivalent(Mask, HiPattern);
	}

	/// \brief Encode a 4-element shuffle mask as an 8-bit shuffle immediate.
	///
	/// Each mask element occupies two bits of the result, element 0 in the
	/// lowest two bits. This is the encoding used by most of the PSHUF
	/// instructions, among others.
	///
	/// NB: An undef (negative) element is encoded as its own position, so undef
	/// lanes preserve the input lane. Callers rely on this.
	static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
	  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");

	  unsigned Imm = 0;
	  for (int Lane = 0; Lane < 4; ++Lane) {
	    const int M = Mask[Lane];
	    assert(M >= -1 && M < 4 && "Out of bound mask element!");
	    Imm |= (M < 0 ? Lane : M) << (2 * Lane);
	  }
	  return Imm;
	}

	/// Wrap the immediate computed by getV4X86ShuffleImm for \p Mask in an i8
	/// constant node.
	static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
	                                          SelectionDAG &DAG) {
	  const unsigned Imm = getV4X86ShuffleImm(Mask);
	  return DAG.getConstant(Imm, DL, MVT::i8);
	}

	/// \brief Compute whether each element of a shuffle is zeroable.
	///
	/// A "zeroable" vector shuffle element is one which can be lowered to zero.
	/// Either it is an undef element in the shuffle mask, the element of the input
	/// referenced is undef, or the element of the input referenced is known to be
	/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
	/// as many lanes with this technique as possible to simplify the remaining
	/// shuffle.
	///
	/// \returns an APInt with one bit per mask element; bit i is set when mask
	/// element i is zeroable. Both inputs are inspected after looking through
	/// bitcasts, and beyond the all-zeros check only BUILD_VECTOR inputs are
	/// examined element by element.
	static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
	SDValue V1, SDValue V2) {
	APInt Zeroable(Mask.size(), 0);
	V1 = peekThroughBitcasts(V1);
	V2 = peekThroughBitcasts(V2);

	bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
	bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

	// Width in bits of one shuffle element, derived from the (bitcast-stripped)
	// V1 and the number of mask entries.
	int VectorSizeInBits = V1.getValueSizeInBits();
	int ScalarSizeInBits = VectorSizeInBits / Mask.size();
	assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

	for (int i = 0, Size = Mask.size(); i < Size; ++i) {
	int M = Mask[i];
	// Handle the easy cases.
	if (M < 0 \|\| (M >= 0 && M < Size && V1IsZero) \|\| (M >= Size && V2IsZero)) {
	Zeroable.setBit(i);
	continue;
	}

	// Determine shuffle input and normalize the mask.
	SDValue V = M < Size ? V1 : V2;
	M %= Size;

	// Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
	if (V.getOpcode() != ISD::BUILD_VECTOR)
	continue;

	// If the BUILD_VECTOR has fewer elements than the mask, the bitcasted
	// portion of the (larger) source element must be UNDEF/ZERO.
	if ((Size % V.getNumOperands()) == 0) {
	// Scale = number of shuffle elements covered by one build-vector operand.
	int Scale = Size / V->getNumOperands();
	SDValue Op = V.getOperand(M / Scale);
	if (Op.isUndef() \|\| X86::isZeroNode(Op))
	Zeroable.setBit(i);
	else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
	// Extract the ScalarSizeInBits-wide slice of the integer constant that
	// this shuffle element refers to and test it for zero.
	APInt Val = Cst->getAPIntValue();
	Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	Val = Val.getLoBits(ScalarSizeInBits);
	if (Val == 0)
	Zeroable.setBit(i);
	} else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
	// Same slice test, applied to the bit pattern of the FP constant.
	APInt Val = Cst->getValueAPF().bitcastToAPInt();
	Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
	Val = Val.getLoBits(ScalarSizeInBits);
	if (Val == 0)
	Zeroable.setBit(i);
	}
	continue;
	}

	// If the BUILD_VECTOR has more elements than the mask, all the (smaller)
	// source elements must be UNDEF or ZERO.
	if ((V.getNumOperands() % Size) == 0) {
	// Scale = number of build-vector operands making up one shuffle element.
	int Scale = V->getNumOperands() / Size;
	bool AllZeroable = true;
	for (int j = 0; j < Scale; ++j) {
	SDValue Op = V.getOperand((M * Scale) + j);
	AllZeroable &= (Op.isUndef() \|\| X86::isZeroNode(Op));
	}
	if (AllZeroable)
	Zeroable.setBit(i);
	continue;
	}
	}

	return Zeroable;
	}

	// Check whether the non-zeroable elements of a shuffle mask form one
	// consecutive ascending run.
	//
	// Zeroable has one bit per element of Mask (see
	// computeZeroableShuffleElements). Elements whose bit is set are skipped.
	// The first remaining element fixes the start of the run: if it is index 0
	// the run starts at 0, otherwise the run is required to start at the number
	// of elements of VectorType, and IsZeroSideLeft is set to record which case
	// applied (true for the latter). Each later non-zeroable element must be
	// exactly one more than the previous one.
	//
	// Returns false if any mask element is undef (negative) or breaks the run.
	// IsZeroSideLeft is written only once a non-zeroable element is reached.
	static bool isNonZeroElementsInOrder(const APInt &Zeroable,
	                                     ArrayRef<int> Mask, const EVT &VectorType,
	                                     bool &IsZeroSideLeft) {
	  bool RunStarted = false;
	  int Expected = 0;

	  for (int Pos = 0, End = Mask.size(); Pos < End; ++Pos) {
	    const int M = Mask[Pos];
	    assert(M >= -1 && "Out of bound mask element!");

	    // Undef elements are not accepted at all.
	    if (M < 0)
	      return false;
	    if (Zeroable[Pos])
	      continue;

	    // The first non-zeroable element decides where the run must begin.
	    if (!RunStarted) {
	      RunStarted = true;
	      Expected = M != 0 ? VectorType.getVectorNumElements() : 0;
	      IsZeroSideLeft = Expected != 0;
	    }

	    if (M != Expected)
	      return false;
	    ++Expected;
	  }
	  return true;
	}

	/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
	///
	/// Builds a per-byte control vector for the whole of \p VT: bytes of undef
	/// mask elements are undef, bytes of zeroable elements are 0x80, and all
	/// other bytes hold the lane-local source byte index. Returns an empty
	/// SDValue if the non-zeroable elements draw from both V1 and V2, or if any
	/// of them would cross a 128-bit lane.
	static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
	ArrayRef<int> Mask, SDValue V1,
	SDValue V2,
	const APInt &Zeroable,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Size = Mask.size();
	// Number of elements of VT in one 128-bit lane.
	int LaneSize = 128 / VT.getScalarSizeInBits();
	const int NumBytes = VT.getSizeInBits() / 8;
	const int NumEltBytes = VT.getScalarSizeInBits() / 8;

	assert((Subtarget.hasSSSE3() && VT.is128BitVector()) \|\|
	(Subtarget.hasAVX2() && VT.is256BitVector()) \|\|
	(Subtarget.hasBWI() && VT.is512BitVector()));

	SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
	// Sign bit set in i8 mask means zero element.
	SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

	// The single input all non-zeroable elements are taken from; set on the
	// first such element.
	SDValue V;
	for (int i = 0; i < NumBytes; ++i) {
	// i indexes bytes; i / NumEltBytes is the shuffle element the byte is in.
	int M = Mask[i / NumEltBytes];
	if (M < 0) {
	PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
	continue;
	}
	if (Zeroable[i / NumEltBytes]) {
	PSHUFBMask[i] = ZeroMask;
	continue;
	}

	// We can only use a single input of V1 or V2.
	SDValue SrcV = (M >= Size ? V2 : V1);
	if (V && V != SrcV)
	return SDValue();
	V = SrcV;
	M %= Size;

	// PSHUFB can't cross lanes, ensure this doesn't happen.
	if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
	return SDValue();

	// Convert the element index into a byte index within its lane.
	M = M % LaneSize;
	M = M * NumEltBytes + (i % NumEltBytes);
	PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
	}
	assert(V && "Failed to find a source input");

	// Do the shuffle on the byte-vector view of the input and cast back to VT.
	MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
	return DAG.getBitcast(
	VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
	DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
	}

	// Forward declaration: lowerVectorShuffleToEXPAND calls getMaskNode, whose
	// definition is not in this part of the file.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	const X86Subtarget &Subtarget, SelectionDAG &DAG,
	const SDLoc &dl);

	// Try to lower a shuffle as an X86ISD::EXPAND node.
	//
	// This applies when isNonZeroElementsInOrder accepts the mask. The result is
	// a select, under a vXi1 mask with a bit set for every element that is not
	// zeroable, between the EXPAND of one input and a zero vector. The expanded
	// input is V2 when isNonZeroElementsInOrder reports the zero side as left,
	// and V1 otherwise. Returns an empty SDValue when the mask is rejected.
	static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
	                                          const APInt &Zeroable,
	                                          ArrayRef<int> Mask, SDValue &V1,
	                                          SDValue &V2, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget) {
	  bool IsLeftZeroSide = true;
	  const bool InOrder = isNonZeroElementsInOrder(
	      Zeroable, Mask, V1.getValueType(), IsLeftZeroSide);
	  if (!InOrder)
	    return SDValue();

	  const unsigned NumElts = VT.getVectorNumElements();

	  // Integer form of the write mask: one bit per non-zeroable element, held
	  // in an integer at least 8 bits wide.
	  const unsigned VEXPANDMask = (~Zeroable).getZExtValue();
	  MVT IntegerType = MVT::getIntegerVT(std::max((int)NumElts, 8));
	  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);

	  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
	         "Unexpected number of vector elements");

	  MVT MaskVecVT = MVT::getVectorVT(MVT::i1, NumElts);
	  SDValue VMask = getMaskNode(MaskNode, MaskVecVT, Subtarget, DAG, DL);
	  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);

	  SDValue Source = IsLeftZeroSide ? V2 : V1;
	  SDValue Expanded = DAG.getNode(X86ISD::EXPAND, DL, VT, Source);
	  return DAG.getSelect(DL, VT, VMask, Expanded, ZeroVector);
	}

	// Try to match a target shuffle mask against the unpack-low / unpack-high
	// patterns for VT.
	//
	// On success returns true, stores X86ISD::UNPCKL or X86ISD::UNPCKH in
	// UnpackOpcode, and rewrites V1 / V2 in place to the operands the unpack
	// node should use (possibly undef, a zero vector, V1 duplicated, or the two
	// inputs swapped). On failure returns false and leaves V1, V2 and
	// UnpackOpcode unmodified.
	//
	// The matches are attempted in this order: direct lo/hi match, unary
	// unpack-with-zero, then (binary shuffles only) lo/hi with commuted inputs.
	static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
	unsigned &UnpackOpcode, bool IsUnary,
	ArrayRef<int> TargetMask, SDLoc &DL,
	SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	int NumElts = VT.getVectorNumElements();

	// Classify the even-position (suffix 1) and odd-position (suffix 2) mask
	// elements: are they all undef, or all undef-or-zero? The loop steps by two
	// with a != bound, so it relies on NumElts being even.
	bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
	for (int i = 0; i != NumElts; i += 2) {
	int M1 = TargetMask[i + 0];
	int M2 = TargetMask[i + 1];
	Undef1 &= (SM_SentinelUndef == M1);
	Undef2 &= (SM_SentinelUndef == M2);
	Zero1 &= isUndefOrZero(M1);
	Zero2 &= isUndefOrZero(M2);
	}
	assert(!((Undef1 \|\| Zero1) && (Undef2 \|\| Zero2)) &&
	"Zeroable shuffle detected");

	// Attempt to match the target mask against the unpack lo/hi mask patterns.
	SmallVector<int, 64> Unpckl, Unpckh;
	createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
	if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	UnpackOpcode = X86ISD::UNPCKL;
	// V2 is assigned first because in the unary case it is derived from the
	// original V1.
	V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	return true;
	}

	createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
	if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	UnpackOpcode = X86ISD::UNPCKH;
	V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
	V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
	return true;
	}

	// If an unary shuffle, attempt to match as an unpack lo/hi with zero.
	if (IsUnary && (Zero1 \|\| Zero2)) {
	// Don't bother if we can blend instead.
	if ((Subtarget.hasSSE41() \|\| VT == MVT::v2i64 \|\| VT == MVT::v2f64) &&
	isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
	return false;

	bool MatchLo = true, MatchHi = true;
	for (int i = 0; (i != NumElts) && (MatchLo \|\| MatchHi); ++i) {
	int M = TargetMask[i];

	// Ignore if the input is known to be zero or the index is undef.
	if ((((i & 1) == 0) && Zero1) \|\| (((i & 1) == 1) && Zero2) \|\|
	(M == SM_SentinelUndef))
	continue;

	MatchLo &= (M == Unpckl[i]);
	MatchHi &= (M == Unpckh[i]);
	}

	if (MatchLo \|\| MatchHi) {
	UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
	// V2 must be assigned before V1 is overwritten, since it may take the
	// original V1.
	V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
	return true;
	}
	}

	// If a binary shuffle, commute and try again.
	if (!IsUnary) {
	ShuffleVectorSDNode::commuteMask(Unpckl);
	if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
	UnpackOpcode = X86ISD::UNPCKL;
	std::swap(V1, V2);
	return true;
	}

	ShuffleVectorSDNode::commuteMask(Unpckh);
	if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
	UnpackOpcode = X86ISD::UNPCKH;
	std::swap(V1, V2);
	return true;
	}
	}

	return false;
	}

	// X86 has dedicated unpack instructions that can handle specific blend
	// operations: UNPCKH and UNPCKL.
	//
	// Try to lower a two-input shuffle as a single UNPCKL or UNPCKH node. The
	// mask is compared (via isShuffleEquivalent) against the binary unpack-low
	// and unpack-high patterns for VT, first as given and then with the patterns
	// commuted, in which case the node is built with V2 and V1 swapped. Returns
	// an empty SDValue when no pattern matches.
	static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  // The low pattern must be built with Lo = true and the high pattern with
	  // Lo = false; both are binary (Unary = false).
	  SmallVector<int, 8> Unpckl;
	  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

	  SmallVector<int, 8> Unpckh;
	  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

	  // Commute and try again.
	  ShuffleVectorSDNode::commuteMask(Unpckl);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
	    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

	  ShuffleVectorSDNode::commuteMask(Unpckh);
	  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
	    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

	  return SDValue();
	}

	/// \brief Try to lower a shuffle as a bitwise AND with a constant mask.
	///
	/// This works when every element that is not zeroable stays in its own
	/// position and all such elements come from the same input. The result is
	/// that input ANDed with a build vector holding all-ones in the kept
	/// positions and zero elsewhere. Returns an empty SDValue otherwise,
	/// including when every element is zeroable. Integer vectors only.
	static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  assert(!VT.isFloatingPoint() && "Floating point types are not supported");

	  MVT EltVT = VT.getVectorElementType();
	  SDValue Zero = DAG.getConstant(0, DL, EltVT);
	  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

	  // Start from an all-zero mask and open up the positions that are kept.
	  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
	  SDValue V; // The one input allowed through the mask, once known.
	  const int Size = Mask.size();
	  for (int i = 0; i < Size; ++i) {
	    if (Zeroable[i])
	      continue;

	    const int M = Mask[i];
	    if (M % Size != i)
	      return SDValue(); // Not a blend.

	    SDValue Src = M < Size ? V1 : V2;
	    if (V && V != Src)
	      return SDValue(); // Can only let one input through the mask.
	    V = Src;

	    VMaskOps[i] = AllOnes;
	  }

	  if (!V)
	    return SDValue(); // No non-zeroable elements!

	  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
	  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
	}

	/// \brief Try to lower a blend using AND / ANDNP / OR bit math.
	///
	/// This is a fallback for when first-class blend instructions are
	/// unavailable. Every defined mask element must select position i of V1 or
	/// position i of V2; anything else makes this return an empty SDValue. The
	/// result is (V1 & M) | (~M & V2), where M has all-ones in the positions
	/// taken from V1 (undef positions are treated as V1). Integer vectors only.
	static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            SelectionDAG &DAG) {
	  assert(VT.isInteger() && "Only supports integer vector types!");

	  MVT EltVT = VT.getVectorElementType();
	  SDValue Zero = DAG.getConstant(0, DL, EltVT);
	  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);

	  const int Size = Mask.size();
	  SmallVector<SDValue, 16> MaskOps;
	  for (int i = 0; i < Size; ++i) {
	    const int M = Mask[i];
	    const bool InPlace = M < 0 || M == i || M == i + Size;
	    if (!InPlace)
	      return SDValue(); // Shuffled input!
	    MaskOps.push_back(M < Size ? AllOnes : Zero);
	  }

	  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
	  SDValue KeptV1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);

	  // The ANDNP is performed on a vector of i64, so V2 and the mask are cast
	  // to that type and the result is cast back.
	  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
	  SDValue MaskAsI64 = DAG.getBitcast(MaskVT, V1Mask);
	  SDValue V2AsI64 = DAG.getBitcast(MaskVT, V2);
	  SDValue KeptV2 = DAG.getBitcast(
	      VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT, MaskAsI64, V2AsI64));

	  return DAG.getNode(ISD::OR, DL, VT, KeptV1, KeptV2);
	}

	// Forward declaration: lowerVectorShuffleAsBlend (below) calls this helper
	// with (V2, MaskNode, V1) to build a mask-register based blend.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	SDValue PreservedSrc,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG);

	/// \brief Check whether a target shuffle mask can be lowered as a blend.
	///
	/// Element i qualifies when it is undef, selects lane i of V1, or selects lane
	/// i of V2. An SM_SentinelZero element also qualifies when one of the inputs is
	/// undef or an all-zeros build vector: the entry in \p TargetMask is rewritten
	/// to pick lane i of that input and the matching Force flag is set so that the
	/// caller can substitute a real zero vector.
	///
	/// \param TargetMask   Mask to test. Zero sentinels may be rewritten in place,
	///                     including on a path that later returns false.
	/// \param ForceV1Zero  Set when a zero element was redirected to V1.
	/// \param ForceV2Zero  Set when a zero element was redirected to V2.
	/// \param BlendMask    Bit i is set when element i is taken from V2.
	/// \returns true if every element qualifies, false otherwise.
	static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
	                                      MutableArrayRef<int> TargetMask,
	                                      bool &ForceV1Zero, bool &ForceV2Zero,
	                                      uint64_t &BlendMask) {
	  bool V1IsZeroOrUndef =
	      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
	  bool V2IsZeroOrUndef =
	      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

	  BlendMask = 0;
	  ForceV1Zero = false;
	  ForceV2Zero = false;
	  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

	  // Attempt to generate the binary blend mask. If an input is zero then
	  // we can use any lane.
	  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
	  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
	    int M = TargetMask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    if (M == i)
	      continue;
	    if (M == i + Size) {
	      BlendMask |= 1ull << i;
	      continue;
	    }
	    if (M == SM_SentinelZero) {
	      // V1 is preferred when both inputs could supply the zero.
	      if (V1IsZeroOrUndef) {
	        ForceV1Zero = true;
	        TargetMask[i] = i;
	        continue;
	      }
	      if (V2IsZeroOrUndef) {
	        ForceV2Zero = true;
	        BlendMask |= 1ull << i;
	        TargetMask[i] = i + Size;
	        continue;
	      }
	    }
	    // Either a cross-lane element or a zero that neither input can provide.
	    return false;
	  }
	  return true;
	}

	/// \brief Widen each bit of a blend mask into \p Scale consecutive bits.
	///
	/// Bit i of \p BlendMask, for i in [0, Size), becomes bits
	/// [i * Scale, (i + 1) * Scale) of the result; bits of \p BlendMask at or above
	/// \p Size are ignored. All shifts operate on 64-bit values, so callers are
	/// expected to keep Size * Scale within 64 bits.
	uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
	  uint64_t ScaledMask = 0;
	  for (int i = 0; i != Size; ++i) {
	    if (BlendMask & (1ull << i))
	      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
	  }
	  return ScaledMask;
	}

	/// \brief Try to emit a blend instruction for a shuffle.
	///
	/// This doesn't do any checks for the availability of instructions for blending
	/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
	/// be matched in the backend with the type given. What it does check for is
	/// that the shuffle mask is a blend, or convertible into a blend with zero.
	///
	/// \param Original  Shuffle mask; it is combined with \p Zeroable through
	///                  createTargetShuffleMask before matching.
	/// \returns the lowered node, or an empty SDValue when
	///          matchVectorShuffleAsBlend rejects the mask.
	static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Original,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

	  uint64_t BlendMask = 0;
	  bool ForceV1Zero = false, ForceV2Zero = false;
	  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
	                                 BlendMask))
	    return SDValue();

	  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
	  if (ForceV1Zero)
	    V1 = getZeroVector(VT, Subtarget, DAG, DL);
	  if (ForceV2Zero)
	    V2 = getZeroVector(VT, Subtarget, DAG, DL);

	  switch (VT.SimpleTy) {
	  case MVT::v2f64:
	  case MVT::v4f32:
	  case MVT::v4f64:
	  case MVT::v8f32:
	    // Floating point types use the blend mask directly as the immediate.
	    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
	                       DAG.getConstant(BlendMask, DL, MVT::i8));

	  case MVT::v4i64:
	  case MVT::v8i32:
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    LLVM_FALLTHROUGH;
	  case MVT::v2i64:
	  case MVT::v4i32:
	    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
	    // that instruction.
	    if (Subtarget.hasAVX2()) {
	      // Scale the blend by the number of 32-bit dwords per element.
	      int Scale = VT.getScalarSizeInBits() / 32;
	      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
	      V1 = DAG.getBitcast(BlendVT, V1);
	      V2 = DAG.getBitcast(BlendVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
	                          DAG.getConstant(BlendMask, DL, MVT::i8)));
	    }
	    LLVM_FALLTHROUGH;
	  case MVT::v8i16: {
	    // For integer shuffles we need to expand the mask and cast the inputs to
	    // v8i16s prior to blending.
	    int Scale = 8 / VT.getVectorNumElements();
	    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
	    V1 = DAG.getBitcast(MVT::v8i16, V1);
	    V2 = DAG.getBitcast(MVT::v8i16, V2);
	    return DAG.getBitcast(VT,
	                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
	                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
	  }

	  case MVT::v16i16: {
	    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
	    SmallVector<int, 8> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
	      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
	      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
	      // Rebuild the immediate from the per-lane mask: 8 bits, one per word.
	      BlendMask = 0;
	      for (int i = 0; i < 8; ++i)
	        if (RepeatedMask[i] >= 8)
	          BlendMask |= 1ull << i;
	      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
	                         DAG.getConstant(BlendMask, DL, MVT::i8));
	    }
	    // Not lane-repeated: handled by the byte-blend code below.
	    LLVM_FALLTHROUGH;
	  }
	  case MVT::v16i8:
	  case MVT::v32i8: {
	    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
	           "256-bit byte-blends require AVX2 support!");

	    // With BWI and VLX the blend is expressed through getVectorMaskingNode,
	    // using an integer constant with one bit per element (at least 8 bits).
	    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	      MVT IntegerType =
	          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	    }

	    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
	    if (SDValue Masked =
	            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
	      return Masked;

	    // Scale the blend by the number of bytes per element.
	    int Scale = VT.getScalarSizeInBits() / 8;

	    // This form of blend is always done on bytes. Compute the byte vector
	    // type.
	    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

	    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
	    // mix of LLVM's code generator and the x86 backend. We tell the code
	    // generator that boolean values in the elements of an x86 vector register
	    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
	    // mapping a select to operand #1, and 'false' mapping to operand #2. The
	    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
	    // of the element (the remaining are ignored) and 0 in that high bit would
	    // mean operand #1 while 1 in the high bit would mean operand #2. So while
	    // the LLVM model for boolean values in vector elements gets the relevant
	    // bit set, it is set backwards and over constrained relative to x86's
	    // actual model.
	    SmallVector<SDValue, 32> VSELECTMask;
	    for (int i = 0, Size = Mask.size(); i < Size; ++i)
	      for (int j = 0; j < Scale; ++j)
	        VSELECTMask.push_back(
	            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
	                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
	                                          MVT::i8));

	    V1 = DAG.getBitcast(BlendVT, V1);
	    V2 = DAG.getBitcast(BlendVT, V2);
	    return DAG.getBitcast(
	        VT,
	        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
	                      V1, V2));
	  }
	  case MVT::v16f32:
	  case MVT::v8f64:
	  case MVT::v8i64:
	  case MVT::v16i32:
	  case MVT::v32i16:
	  case MVT::v64i8: {
	    // 512-bit types always go through getVectorMaskingNode.
	    MVT IntegerType =
	        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
	    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
	    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
	  }
	  default:
	    llvm_unreachable("Not a supported integer vector type!");
	  }
	}

	/// \brief Try to lower as a two-input blend followed by a one-input permute.
	///
	/// Each referenced source element is first blended into the lane it occupies
	/// in its own input (index modulo the vector size); a single-input shuffle then
	/// moves the blended elements to their final positions. The lowering fails,
	/// returning an empty SDValue, when both inputs need the same lane.
	static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
	                                                   SDValue V1, SDValue V2,
	                                                   ArrayRef<int> Mask,
	                                                   SelectionDAG &DAG) {
	  // Both masks start fully undef and are filled in while the blend is checked
	  // for viability.
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

	  for (int i = 0, Size = Mask.size(); i != Size; ++i) {
	    const int M = Mask[i];
	    if (M < 0)
	      continue;

	    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

	    // Lane the element occupies inside its own source vector.
	    const int Slot = M % Size;
	    int &Blended = BlendMask[Slot];
	    if (Blended >= 0 && Blended != M)
	      return SDValue(); // Can't blend in the needed input!
	    Blended = M;

	    PermuteMask[i] = Slot;
	  }

	  SDValue Blend = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	  return DAG.getVectorShuffle(VT, DL, Blend, DAG.getUNDEF(VT), PermuteMask);
	}

	/// \brief Decompose a two-input shuffle into per-input shuffles plus a blend.
	///
	/// Each input is shuffled on its own so that its elements already sit in their
	/// final lanes, and the two results are then blended. When both inputs would
	/// need a real shuffle, the blend-then-permute form is tried first.
	static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
	                                                          MVT VT, SDValue V1,
	                                                          SDValue V2,
	                                                          ArrayRef<int> Mask,
	                                                          SelectionDAG &DAG) {
	  // Split the mask into one single-input mask per operand and the blend mask
	  // that recombines them; untouched entries stay undef.
	  SmallVector<int, 32> V1Mask(Mask.size(), -1);
	  SmallVector<int, 32> V2Mask(Mask.size(), -1);
	  SmallVector<int, 32> BlendMask(Mask.size(), -1);
	  for (int i = 0, Size = Mask.size(); i != Size; ++i) {
	    const int M = Mask[i];
	    if (M < 0)
	      continue;

	    if (M < Size) {
	      V1Mask[i] = M;
	      BlendMask[i] = i;
	    } else {
	      V2Mask[i] = M - Size;
	      BlendMask[i] = i + Size;
	    }
	  }

	  // Shuffling the inputs is preferred because such a shuffle may fold with a
	  // load or bring some other benefit. If neither input shuffle is a no-op,
	  // however, that costs twice as many shuffles, so blending first is tried.
	  const bool BothInputsShuffled =
	      !isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask);
	  if (BothInputsShuffled) {
	    SDValue BlendPerm =
	        lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG);
	    if (BlendPerm)
	      return BlendPerm;
	  }

	  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
	  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
	}

	/// \brief Try to match a vector shuffle as a rotation.
	///
	/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
	///
	/// On success \p V1 is replaced by the value supplying the low part and \p V2
	/// by the value supplying the high part of the rotated concatenation; when
	/// only one of the two was seen, both are set to it. On failure \p V1 and
	/// \p V2 are left untouched.
	///
	/// \returns the rotation amount in mask elements, or -1 if the mask is not a
	///          rotation (including the identity rotation).
	static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask) {
	  int NumElts = Mask.size();

	  // We need to detect various ways of spelling a rotation:
	  // [11, 12, 13, 14, 15, 0, 1, 2]
	  // [-1, 12, 13, 14, -1, -1, 1, -1]
	  // [-1, -1, -1, -1, -1, -1, 1, 2]
	  // [ 3, 4, 5, 6, 7, 8, 9, 10]
	  // [-1, 4, 5, 6, -1, -1, 9, -1]
	  // [-1, 4, 5, 6, -1, -1, -1, -1]
	  int Rotation = 0;
	  SDValue Lo, Hi;
	  for (int i = 0; i < NumElts; ++i) {
	    int M = Mask[i];
	    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
	           "Unexpected mask index.");
	    if (M < 0)
	      continue;

	    // Determine where a rotated vector would have started.
	    int StartIdx = i - (M % NumElts);
	    if (StartIdx == 0)
	      // The identity rotation isn't interesting, stop.
	      return -1;

	    // If we found the tail of a vector the rotation must be the missing
	    // front. If we found the head of a vector, it must be how much of the
	    // head.
	    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

	    if (Rotation == 0)
	      Rotation = CandidateRotation;
	    else if (Rotation != CandidateRotation)
	      // The rotations don't match, so we can't match this mask.
	      return -1;

	    // Compute which value this mask is pointing at.
	    SDValue MaskV = M < NumElts ? V1 : V2;

	    // Compute which of the two target values this index should be assigned
	    // to. This reflects whether the high elements are remaining or the low
	    // elements are remaining.
	    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

	    // Either set up this value if we've not encountered it before, or check
	    // that it remains consistent.
	    if (!TargetV)
	      TargetV = MaskV;
	    else if (TargetV != MaskV)
	      // This may be a rotation, but it pulls from the inputs in some
	      // unsupported interleaving.
	      return -1;
	  }

	  // Check that we successfully analyzed the mask, and normalize the results.
	  // A mask with no defined element would reach this point with nothing found.
	  assert(Rotation != 0 && "Failed to locate a viable rotation!");
	  assert((Lo || Hi) && "Failed to find a rotated input vector!");
	  if (!Lo)
	    Lo = Hi;
	  else if (!Hi)
	    Hi = Lo;

	  V1 = Lo;
	  V2 = Hi;

	  return Rotation;
	}

	/// \brief Try to match a vector shuffle as a byte rotation.
	///
	/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
	/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
	/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine only
	/// decides whether the mask is valid to lower in that form; it does not judge
	/// whether either lowering is profitable. It matches shuffles such as:
	///
	/// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
	///
	/// Essentially the operation concatenates V1 and V2, shifts right by some
	/// number of elements, and takes the low elements as the result. Note that
	/// while this is specified as a right shift because x86 is little-endian, it
	/// is a *left rotate* of the vector lanes.
	///
	/// \returns the rotation amount in bytes, or -1 when the mask does not match.
	static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
	                                          ArrayRef<int> Mask) {
	  // Shuffles with zeroed elements are not accepted.
	  for (int M : Mask)
	    if (M == SM_SentinelZero)
	      return -1;

	  // PALIGNR works on 128-bit lanes, so every lane has to repeat one pattern.
	  SmallVector<int, 16> LaneMask;
	  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask))
	    return -1;

	  const int EltRotation = matchVectorShuffleAsRotate(V1, V2, LaneMask);
	  if (EltRotation <= 0)
	    return -1;

	  // PALIGNR rotates bytes, so the element rotation is scaled by the number of
	  // bytes each element occupies within the 16-byte lane.
	  const int EltsPerLane = LaneMask.size();
	  const int BytesPerElt = 16 / EltsPerLane;
	  return EltRotation * BytesPerElt;
	}

	/// \brief Try to lower a vector shuffle as a byte rotation.
	///
	/// Uses matchVectorShuffleAsByteRotate to find the rotation. With SSSE3 the
	/// result is a single X86ISD::PALIGNR on the byte vector type; otherwise the
	/// rotation is built from VSHLDQ/VSRLDQ/OR, which the assertions restrict to
	/// 128-bit vectors. Returns an empty SDValue when the mask is not a rotation.
	static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
	                                              SDValue V1, SDValue V2,
	                                              ArrayRef<int> Mask,
	                                              const X86Subtarget &Subtarget,
	                                              SelectionDAG &DAG) {
	  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

	  // The matcher may swap or duplicate the inputs, so work on copies.
	  SDValue Lo = V1, Hi = V2;
	  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
	  if (ByteRotation <= 0)
	    return SDValue();

	  // Cast the inputs to i8 vector of correct length to match PALIGNR or
	  // PSLLDQ/PSRLDQ.
	  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
	  Lo = DAG.getBitcast(ByteVT, Lo);
	  Hi = DAG.getBitcast(ByteVT, Hi);

	  // SSSE3 targets can use the palignr instruction.
	  if (Subtarget.hasSSSE3()) {
	    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
	           "512-bit PALIGNR requires BWI instructions");
	    return DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
	                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
	  }

	  assert(VT.is128BitVector() &&
	         "Rotate-based lowering only supports 128-bit lowering!");
	  assert(Mask.size() <= 16 &&
	         "Can shuffle at most 16 bytes in a 128-bit vector!");
	  assert(ByteVT == MVT::v16i8 &&
	         "SSE2 rotate lowering only needed for v16i8!");

	  // Default SSE2 implementation: shift Lo left and Hi right by complementary
	  // byte counts and OR the two halves together.
	  int LoByteShift = 16 - ByteRotation;
	  int HiByteShift = ByteRotation;

	  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
	                                DAG.getConstant(LoByteShift, DL, MVT::i8));
	  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
	                                DAG.getConstant(HiByteShift, DL, MVT::i8));
	  return DAG.getBitcast(VT,
	                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
	}

	/// \brief Try to lower a vector shuffle as a dword/qword rotation.
	///
	/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
	/// rotation of the concatenation of two vectors; this routine will
	/// try to generically lower a vector shuffle through such a pattern.
	///
	/// Essentially it concatenates V1 and V2, shifts right by some number of
	/// elements, and takes the low elements as the result. Note that while this is
	/// specified as a right shift because x86 is little-endian, it is a *left
	/// rotate* of the vector lanes.
	///
	/// \returns an X86ISD::VALIGN node, or an empty SDValue when the mask is not
	///          a rotation.
	static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
	                                          SDValue V1, SDValue V2,
	                                          ArrayRef<int> Mask,
	                                          const X86Subtarget &Subtarget,
	                                          SelectionDAG &DAG) {
	  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
	         "Only 32-bit and 64-bit elements are supported!");

	  // 128/256-bit vectors are only supported with VLX.
	  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
	         && "VLX required for 128/256-bit vectors");

	  // The matcher may swap or duplicate the inputs, so work on copies.
	  SDValue Lo = V1, Hi = V2;
	  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
	  if (Rotation <= 0)
	    return SDValue();

	  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
	                     DAG.getConstant(Rotation, DL, MVT::i8));
	}

	/// \brief Try to match a vector shuffle as a bit shift (shifts in zeros).
	///
	/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
	/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
	/// matches elements from one of the input vectors shuffled to the left or
	/// right with zeroable elements 'shifted in'. It handles both the strictly
	/// bit-wise element shifts and the byte shift across an entire 128-bit double
	/// quad word lane.
	///
	/// PSHL : (little-endian) left bit shift.
	/// [ zz, 0, zz, 2 ]
	/// [ -1, 4, zz, -1 ]
	/// PSRL : (little-endian) right bit shift.
	/// [ 1, zz, 3, zz]
	/// [ -1, -1, 7, zz]
	/// PSLLDQ : (little-endian) left byte shift
	/// [ zz, 0, 1, 2, 3, 4, 5, 6]
	/// [ zz, zz, -1, -1, 2, 3, 4, -1]
	/// [ zz, zz, zz, zz, zz, zz, -1, 1]
	/// PSRLDQ : (little-endian) right byte shift
	/// [ 5, 6, 7, zz, zz, zz, zz, zz]
	/// [ -1, 5, 6, 7, zz, zz, zz, zz]
	/// [ 1, 2, -1, -1, -1, -1, zz, zz]
	///
	/// On a match \p ShiftVT receives the vector type to perform the shift in and
	/// \p Opcode the X86ISD shift opcode. \p MaskOffset is added to the expected
	/// source indices (the caller passes 0 for V1 and the mask size for V2).
	///
	/// \returns the shift amount (bits for element shifts, bytes for byte shifts),
	///          or -1 if nothing matched.
	static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
	                                     unsigned ScalarSizeInBits,
	                                     ArrayRef<int> Mask, int MaskOffset,
	                                     const APInt &Zeroable,
	                                     const X86Subtarget &Subtarget) {
	  const int NumElts = Mask.size();
	  const unsigned VecBits = NumElts * ScalarSizeInBits;

	  // True when every position the shift fills with zeros is zeroable.
	  auto ShiftedInLanesAreZeroable = [&](int Shift, int Scale, bool Left) {
	    const int Base = Left ? 0 : Scale - Shift;
	    for (int Group = 0; Group < NumElts; Group += Scale)
	      for (int k = 0; k < Shift; ++k)
	        if (!Zeroable[Group + k + Base])
	          return false;

	    return true;
	  };

	  // Checks that the surviving elements of every Scale-wide group are
	  // sequential (or undef). On success fills in Opcode and ShiftVT and returns
	  // the shift amount; returns -1 otherwise.
	  auto TryMatch = [&](int Shift, int Scale, bool Left) -> int {
	    const unsigned Len = Scale - Shift;
	    for (int Group = 0; Group != NumElts; Group += Scale) {
	      unsigned Pos = Group;
	      unsigned Low = Group;
	      if (Left)
	        Pos += Shift;
	      else
	        Low += Shift;
	      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
	        return -1;
	    }

	    // Groups wider than 64 bits are shifted as whole bytes on an i8 vector.
	    const int WideEltBits = ScalarSizeInBits * Scale;
	    const bool ByteShift = WideEltBits > 64;
	    const int BitAmount = Shift * ScalarSizeInBits;
	    if (ByteShift) {
	      Opcode = Left ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
	      ShiftVT = MVT::getVectorVT(MVT::i8, VecBits / 8);
	      return BitAmount / 8;
	    }

	    // Element shifts round trip through an integer vector whose elements are
	    // as wide as one group.
	    Opcode = Left ? X86ISD::VSHLI : X86ISD::VSRLI;
	    ShiftVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSizeInBits * Scale),
	                               NumElts / Scale);
	    return BitAmount;
	  };

	  // SSE/AVX supports logical shifts up to 64-bit integers, so the element
	  // width is doubled repeatedly up to that limit (and once more for the byte
	  // shift of a 128-bit lane, except for 512-bit vectors without BWI). For each
	  // width every whole-element shift amount is tested in both directions.
	  unsigned MaxWidth = ((VecBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
	  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) {
	    for (int Shift = 1; Shift != Scale; ++Shift) {
	      for (bool Left : {true, false}) {
	        if (!ShiftedInLanesAreZeroable(Shift, Scale, Left))
	          continue;
	        int ShiftAmt = TryMatch(Shift, Scale, Left);
	        if (0 < ShiftAmt)
	          return ShiftAmt;
	      }
	    }
	  }

	  // no match
	  return -1;
	}

	/// \brief Try to lower a vector shuffle as a logical shift of one input.
	///
	/// The mask is matched against V1 first and, if that fails, against V2. On a
	/// match the chosen input is bitcast to the shift type reported by the
	/// matcher, shifted by an i8 constant amount, and cast back to \p VT. Returns
	/// an empty SDValue when neither input matches.
	static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
	                                         SDValue V2, ArrayRef<int> Mask,
	                                         const APInt &Zeroable,
	                                         const X86Subtarget &Subtarget,
	                                         SelectionDAG &DAG) {
	  int Size = Mask.size();
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  const unsigned EltBits = VT.getScalarSizeInBits();
	  MVT ShiftVT;
	  unsigned Opcode;

	  // First attempt: the shifted elements come from V1 (no index offset).
	  SDValue Shifted = V1;
	  int ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask, 0,
	                                           Zeroable, Subtarget);

	  // Second attempt: V2's elements are numbered from Size upwards in the mask.
	  if (ShiftAmt < 0) {
	    Shifted = V2;
	    ShiftAmt = matchVectorShuffleAsShift(ShiftVT, Opcode, EltBits, Mask, Size,
	                                         Zeroable, Subtarget);
	    if (ShiftAmt < 0)
	      return SDValue();
	  }

	  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
	         "Illegal integer vector type");
	  SDValue Amount = DAG.getConstant(ShiftAmt, DL, MVT::i8);
	  Shifted = DAG.getBitcast(ShiftVT, Shifted);
	  Shifted = DAG.getNode(Opcode, DL, ShiftVT, Shifted, Amount);
	  return DAG.getBitcast(VT, Shifted);
	}

	// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
	// Remainder of lower half result is zero and upper half is all undef.
	//
	// On success BitLen and BitIdx receive the extraction length and start index
	// in bits (each masked to 6 bits) and V1 is replaced by the matched source.
	// Returns false, leaving V1 untouched, when the mask does not fit.
	static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
	                                      ArrayRef<int> Mask, uint64_t &BitLen,
	                                      uint64_t &BitIdx, const APInt &Zeroable) {
	  int Size = Mask.size();
	  int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
	  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

	  // Upper half must be undefined.
	  if (!isUndefInRange(Mask, HalfSize, HalfSize))
	    return false;

	  // Determine the extraction length from the part of the
	  // lower half that isn't zeroable.
	  int Len = HalfSize;
	  for (; Len > 0; --Len)
	    if (!Zeroable[Len - 1])
	      break;
	  assert(Len > 0 && "Zeroable shuffle mask");

	  // Attempt to match first Len sequential elements from the lower half.
	  SDValue Src;
	  int Idx = -1;
	  for (int i = 0; i != Len; ++i) {
	    int M = Mask[i];
	    if (M == SM_SentinelUndef)
	      continue;
	    SDValue &V = (M < Size ? V1 : V2);
	    M = M % Size;

	    // The extracted elements must start at a valid index and all mask
	    // elements must be in the lower half.
	    if (i > M || M >= HalfSize)
	      return false;

	    // The first defined element fixes the source and start index; later ones
	    // must agree with both.
	    if (Idx < 0 || (Src == V && Idx == (M - i))) {
	      Src = V;
	      Idx = M - i;
	      continue;
	    }
	    return false;
	  }

	  if (!Src || Idx < 0)
	    return false;

	  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
	  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	  V1 = Src;
	  return true;
	}

	// INSERTQ: Extract lowest Len elements from lower half of second source and
	// insert over first source, starting at Idx.
	// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
	//
	// On success BitLen and BitIdx receive the insertion length and position in
	// bits (each masked to 6 bits), V1 is replaced by the base vector and V2 by
	// the inserted vector. The base is left as an empty SDValue when every base
	// element of the mask is undef. Returns false, leaving V1 and V2 untouched,
	// when no insertion point and length fit the mask.
	static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
	                                        ArrayRef<int> Mask, uint64_t &BitLen,
	                                        uint64_t &BitIdx) {
	  int Size = Mask.size();
	  int HalfSize = Size / 2;
	  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

	  // Upper half must be undefined.
	  if (!isUndefInRange(Mask, HalfSize, HalfSize))
	    return false;

	  for (int Idx = 0; Idx != HalfSize; ++Idx) {
	    SDValue Base;

	    // Attempt to match first source from mask before insertion point.
	    if (isUndefInRange(Mask, 0, Idx)) {
	      /* EMPTY */
	    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
	      Base = V1;
	    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
	      Base = V2;
	    } else {
	      continue;
	    }

	    // Extend the extraction length looking to match both the insertion of
	    // the second source and the remaining elements of the first.
	    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
	      SDValue Insert;
	      int Len = Hi - Idx;

	      // Match insertion.
	      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
	        Insert = V1;
	      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
	        Insert = V2;
	      } else {
	        continue;
	      }

	      // Match the remaining elements of the lower half.
	      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
	        /* EMPTY */
	      } else if ((!Base || (Base == V1)) &&
	                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
	        Base = V1;
	      } else if ((!Base || (Base == V2)) &&
	                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
	                                            Size + Hi)) {
	        Base = V2;
	      } else {
	        continue;
	      }

	      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
	      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
	      V1 = Base;
	      V2 = Insert;
	      return true;
	    }
	  }

	  return false;
	}

	/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
	///
	/// The EXTRQ pattern is tried first, then INSERTQ. For INSERTQ an operand the
	/// matcher left empty is replaced by undef. Returns an empty SDValue when
	/// neither pattern matches.
	static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
	                                           SDValue V2, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           SelectionDAG &DAG) {
	  uint64_t BitLen, BitIdx;

	  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) {
	    SDValue LenOp = DAG.getConstant(BitLen, DL, MVT::i8);
	    SDValue IdxOp = DAG.getConstant(BitIdx, DL, MVT::i8);
	    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, LenOp, IdxOp);
	  }

	  if (!matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
	    return SDValue();

	  // Either operand may have come back empty; stand in an undef vector then.
	  SDValue BaseOp = V1;
	  if (!BaseOp)
	    BaseOp = DAG.getUNDEF(VT);
	  SDValue InsertOp = V2;
	  if (!InsertOp)
	    InsertOp = DAG.getUNDEF(VT);
	  SDValue LenOp = DAG.getConstant(BitLen, DL, MVT::i8);
	  SDValue IdxOp = DAG.getConstant(BitIdx, DL, MVT::i8);
	  return DAG.getNode(X86ISD::INSERTQI, DL, VT, BaseOp, InsertOp, LenOp, IdxOp);
	}

	/// \brief Lower a vector shuffle as a zero or any extension.
	///
	/// Given a specific number of elements, element bit width, and extension
	/// stride, produce either a zero or any extension based on the available
	/// features of the subtarget. The extended elements are consecutive and can
	/// start from an offset element index in the input; to avoid excess shuffling
	/// the offset must either be in the bottom lane or at the start of a higher
	/// lane. All extended elements must be from the same lane.
	///
	/// \param Scale   Extension factor (greater than 1); Scale * element width
	///                must not exceed 64 bits.
	/// \param Offset  Index of the first input element to extend.
	/// \param AnyExt  True when the upper bits of each extended element may hold
	///                anything; false when they must be zero.
	/// \param InputV  The vector whose elements are extended.
	static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
	    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(Scale > 1 && "Need a scale to extend.");
	  int EltBits = VT.getScalarSizeInBits();
	  int NumElements = VT.getVectorNumElements();
	  int NumEltsPerLane = 128 / EltBits;
	  int OffsetLane = Offset / NumEltsPerLane;
	  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
	         "Only 8, 16, and 32 bit elements can be extended.");
	  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
	  assert(0 <= Offset && "Extension offset must be positive.");
	  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
	         "Extension offset must be in the first lane or start an upper lane.");

	  // Check that an index is in same lane as the base offset.
	  auto SafeOffset = [&](int Idx) {
	    return OffsetLane == (Idx / NumEltsPerLane);
	  };

	  // Shift along an input so that the offset base moves to the first element.
	  auto ShuffleOffset = [&](SDValue V) {
	    if (!Offset)
	      return V;

	    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	    for (int i = 0; i * Scale < NumElements; ++i) {
	      int SrcIdx = i + Offset;
	      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
	    }
	    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
	  };

	  // Found a valid zext mask! Try various lowering strategies based on the
	  // input type and available ISA extensions.
	  if (Subtarget.hasSSE41()) {
	    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
	    // PUNPCK will catch this in a later shuffle match.
	    if (Offset && Scale == 2 && VT.is128BitVector())
	      return SDValue();
	    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
	                                 NumElements / Scale);
	    InputV = ShuffleOffset(InputV);
	    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
	    return DAG.getBitcast(VT, InputV);
	  }

	  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

	  // For any extends we can cheat for larger element sizes and use shuffle
	  // instructions that can fold with a load and/or copy.
	  if (AnyExt && EltBits == 32) {
	    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
	                         -1};
	    return DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	                        DAG.getBitcast(MVT::v4i32, InputV),
	                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	  }
	  if (AnyExt && EltBits == 16 && Scale > 2) {
	    int PSHUFDMask[4] = {Offset / 2, -1,
	                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
	    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
	                         DAG.getBitcast(MVT::v4i32, InputV),
	                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	    int PSHUFWMask[4] = {1, -1, -1, -1};
	    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
	    return DAG.getBitcast(
	        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
	                        DAG.getBitcast(MVT::v8i16, InputV),
	                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
	  }

	  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
	  // to 64-bits.
	  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
	    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
	    assert(VT.is128BitVector() && "Unexpected vector width!");

	    int LoIdx = Offset * EltBits;
	    SDValue Lo = DAG.getBitcast(
	        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	                                DAG.getConstant(EltBits, DL, MVT::i8),
	                                DAG.getConstant(LoIdx, DL, MVT::i8)));

	    // A single extraction suffices when the upper half of the result is undef
	    // or the next element lies outside the offset lane.
	    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
	        !SafeOffset(Offset + 1))
	      return DAG.getBitcast(VT, Lo);

	    int HiIdx = (Offset + 1) * EltBits;
	    SDValue Hi = DAG.getBitcast(
	        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
	                                DAG.getConstant(EltBits, DL, MVT::i8),
	                                DAG.getConstant(HiIdx, DL, MVT::i8)));
	    return DAG.getBitcast(VT,
	                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
	  }

	  // If this would require more than 2 unpack instructions to expand, use
	  // pshufb when available. We can only use more than 2 unpack instructions
	  // when zero extending i8 elements which also makes it easier to use pshufb.
	  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
	    assert(NumElements == 16 && "Unexpected byte vector width!");
	    SDValue PSHUFBMask[16];
	    for (int i = 0; i < 16; ++i) {
	      int Idx = Offset + (i / Scale);
	      // Only the first byte of each extended element takes a source index;
	      // every other byte gets the 0x80 control value.
	      PSHUFBMask[i] = DAG.getConstant(
	          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
	    }
	    InputV = DAG.getBitcast(MVT::v16i8, InputV);
	    return DAG.getBitcast(
	        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
	                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
	  }

	  // If we are extending from an offset, ensure we start on a boundary that
	  // we can unpack from.
	  int AlignToUnpack = Offset % (NumElements / Scale);
	  if (AlignToUnpack) {
	    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
	    for (int i = AlignToUnpack; i < NumElements; ++i)
	      ShMask[i - AlignToUnpack] = i;
	    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
	    Offset -= AlignToUnpack;
	  }

	  // Otherwise emit a sequence of unpacks. Each round doubles the element
	  // width and halves the remaining scale.
	  do {
	    unsigned UnpackLoHi = X86ISD::UNPCKL;
	    if (Offset >= (NumElements / 2)) {
	      UnpackLoHi = X86ISD::UNPCKH;
	      Offset -= (NumElements / 2);
	    }

	    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
	    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
	                         : getZeroVector(InputVT, Subtarget, DAG, DL);
	    InputV = DAG.getBitcast(InputVT, InputV);
	    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
	    Scale /= 2;
	    EltBits *= 2;
	    NumElements /= 2;
	  } while (Scale > 1);
	  return DAG.getBitcast(VT, InputV);
	}

	/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
	///
	/// This routine will try to do everything in its power to cleverly lower
	/// a shuffle which happens to match the pattern of a zero extend. It doesn't
	/// check for the profitability of this lowering, it tries to aggressively
	/// match this pattern. It will use all of the micro-architectural details it
	/// can to emit an efficient lowering. It handles both blends with all-zero
	/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
	/// masking out later).
	///
	/// The reason we have dedicated lowering for zext-style shuffles is that they
	/// are both incredibly common and often quite performance sensitive.
	///
	/// Extension scales are tried from the widest (each source element extended
	/// to 64 bits) down to a 2x extension; the first scale whose pattern matches
	/// is lowered through lowerVectorShuffleAsSpecificZeroOrAnyExtend. If no
	/// scale matches and \p VT is 128 bits wide, a last attempt recognises "low
	/// half copied from V1 or V2, upper half zeroable" and emits
	/// X86ISD::VZEXT_MOVL on v2i64.
	///
	/// \returns the lowered value, or an empty SDValue if nothing matched.
	static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const APInt &Zeroable, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	int Bits = VT.getSizeInBits();
	// Number of 128-bit lanes in VT and the number of elements in each lane.
	int NumLanes = Bits / 128;
	int NumElements = VT.getVectorNumElements();
	int NumEltsPerLane = NumElements / NumLanes;
	assert(VT.getScalarSizeInBits() <= 32 &&
	"Exceeds 32-bit integer zero extension limit");
	assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

	// Define a helper function to check a particular ext-scale and lower to it if
	// valid.
	// Scale is the number of result elements covered by one extended element:
	// positions with i % Scale == 0 carry source data, every other position must
	// be undef or zeroable.
	auto Lower = [&](int Scale) -> SDValue {
	// The single input vector all base elements must come from.
	SDValue InputV;
	// Stays true only while every non-base position seen so far is undef.
	bool AnyExt = true;
	// Source index feeding result element 0, fixed by the first base element.
	int Offset = 0;
	// Number of base elements that matched the pattern.
	int Matches = 0;
	for (int i = 0; i < NumElements; ++i) {
	int M = Mask[i];
	if (M < 0)
	continue; // Valid anywhere but doesn't tell us anything.
	if (i % Scale != 0) {
	// Each of the extended elements need to be zeroable.
	if (!Zeroable[i])
	return SDValue();

	// We no longer are in the anyext case.
	AnyExt = false;
	continue;
	}

	// Each of the base elements needs to be consecutive indices into the
	// same input vector.
	SDValue V = M < NumElements ? V1 : V2;
	// Reduce M to an index within the selected input.
	M = M % NumElements;
	if (!InputV) {
	InputV = V;
	Offset = M - (i / Scale);
	} else if (InputV != V)
	return SDValue(); // Flip-flopping inputs.

	// Offset must start in the lowest 128-bit lane or at the start of an
	// upper lane.
	// FIXME: Is it ever worth allowing a negative base offset?
	if (!((0 <= Offset && Offset < NumEltsPerLane) \|\|
	(Offset % NumEltsPerLane) == 0))
	return SDValue();

	// If we are offsetting, all referenced entries must come from the same
	// lane.
	if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
	return SDValue();

	// M was already reduced modulo NumElements above, so the extra modulo here
	// leaves it unchanged.
	if ((M % NumElements) != (Offset + (i / Scale)))
	return SDValue(); // Non-consecutive strided elements.
	Matches++;
	}

	// If we fail to find an input, we have a zero-shuffle which should always
	// have already been handled.
	// FIXME: Maybe handle this here in case during blending we end up with one?
	if (!InputV)
	return SDValue();

	// If we are offsetting, don't extend if we only match a single input, we
	// can always do better by using a basic PSHUF or PUNPCK.
	if (Offset != 0 && Matches < 2)
	return SDValue();

	return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
	DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
	};

	// The widest scale possible for extending is to a 64-bit integer.
	assert(Bits % 64 == 0 &&
	"The number of bits in a vector must be divisible by 64 on x86!");
	int NumExtElements = Bits / 64;

	// Each iteration, try extending the elements half as much, but into twice as
	// many elements.
	// The loop stops before NumExtElements reaches NumElements, so the smallest
	// scale attempted is 2.
	for (; NumExtElements < NumElements; NumExtElements *= 2) {
	assert(NumElements % NumExtElements == 0 &&
	"The input vector size must be divisible by the extended size.");
	if (SDValue V = Lower(NumElements / NumExtElements))
	return V;
	}

	// General extends failed, but 128-bit vectors may be able to use MOVQ.
	if (Bits != 128)
	return SDValue();

	// Returns one of the source operands if the shuffle can be reduced to a
	// MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
	// Returns an empty SDValue otherwise.
	auto CanZExtLowHalf = [&]() {
	// Every element of the upper half must be zeroable.
	for (int i = NumElements / 2; i != NumElements; ++i)
	if (!Zeroable[i])
	return SDValue();
	// The lower half must be an in-order (or undef) copy of V1's low half...
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
	return V1;
	// ...or of V2's low half (mask indices starting at NumElements).
	if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
	return V2;
	return SDValue();
	};

	if (SDValue V = CanZExtLowHalf()) {
	V = DAG.getBitcast(MVT::v2i64, V);
	V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
	return DAG.getBitcast(VT, V);
	}

	// No viable ext lowering found.
	return SDValue();
	}

	/// \brief Try to get a scalar value for a specific element of a vector.
	///
	/// Bitcasts on \p V are looked through first. The lookup succeeds only when
	/// the underlying node is still a vector with the same scalar width as \p V
	/// and is either a BUILD_VECTOR (any \p Idx) or a SCALAR_TO_VECTOR (with
	/// \p Idx == 0), and the chosen operand is exactly as wide as the element
	/// type of \p V.
	///
	/// \returns the operand bitcast to the element type of \p V, or an empty
	/// SDValue when no such scalar is available.
	static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
	                                              SelectionDAG &DAG) {
	  const MVT OrigVT = V.getSimpleValueType();
	  const MVT OrigEltVT = OrigVT.getVectorElementType();

	  SDValue Src = peekThroughBitcasts(V);
	  const MVT SrcVT = Src.getSimpleValueType();

	  // A bitcast that changed the element width means element Idx of the source
	  // no longer corresponds to element Idx of V.
	  if (!SrcVT.isVector())
	    return SDValue();
	  if (SrcVT.getScalarSizeInBits() != OrigVT.getScalarSizeInBits())
	    return SDValue();

	  const unsigned SrcOpc = Src.getOpcode();
	  const bool IsBuildVector = SrcOpc == ISD::BUILD_VECTOR;
	  const bool IsLowScalarToVector =
	      Idx == 0 && SrcOpc == ISD::SCALAR_TO_VECTOR;
	  if (!IsBuildVector && !IsLowScalarToVector)
	    return SDValue();

	  // Only accept an operand that is exactly as wide as the destination element.
	  // FIXME: Add support for scalar truncation where possible.
	  SDValue Scalar = Src.getOperand(Idx);
	  if (OrigEltVT.getSizeInBits() != Scalar.getSimpleValueType().getSizeInBits())
	    return SDValue();

	  return DAG.getBitcast(OrigEltVT, Scalar);
	}

	/// \brief Helper to test for a load that can be folded with x86 shuffles.
	///
	/// The set of usable instructions differs significantly depending on whether
	/// an operand is a load, so callers use this to pick between variants.
	/// Bitcasts on \p V are ignored; the answer is whatever ISD::isNON_EXTLoad
	/// reports for the underlying node.
	static bool isShuffleFoldableLoad(SDValue V) {
	  SDValue Src = peekThroughBitcasts(V);
	  SDNode *Node = Src.getNode();
	  return ISD::isNON_EXTLoad(Node);
	}

	/// \brief Try to lower insertion of a single element into a zero vector.
	///
	/// This is a common pattern that we have especially efficient patterns to lower
	/// across all subtarget feature sets.
	///
	/// The first mask entry that refers to V2 identifies the inserted element.
	/// When every other result element is zeroable the element is moved to the
	/// low position with X86ISD::VZEXT_MOVL and, if needed, shuffled or
	/// byte-shifted to its final position. When V1 is not zeroable only the
	/// floating point "replace element 0" form is handled, using MOVSS/MOVSD.
	///
	/// \returns the lowered value, or an empty SDValue if the pattern does not
	/// apply.
	static SDValue lowerVectorShuffleAsElementInsertion(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, const X86Subtarget &Subtarget,
	    SelectionDAG &DAG) {
	  MVT ExtVT = VT;
	  MVT EltVT = VT.getVectorElementType();

	  // Result position of the first element taken from V2.
	  // NOTE(review): if no mask entry refers to V2 this equals Mask.size() and
	  // the Mask[V2Index] reads below are out of range; callers presumably
	  // guarantee that such an entry exists.
	  int V2Index =
	      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
	      Mask.begin();

	  // V1 can be treated as a zero vector if every result element other than the
	  // inserted one is zeroable.
	  bool IsV1Zeroable = true;
	  for (int i = 0, Size = Mask.size(); i < Size; ++i)
	    if (i != V2Index && !Zeroable[i]) {
	      IsV1Zeroable = false;
	      break;
	    }

	  // Check for a single input from a SCALAR_TO_VECTOR node.
	  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
	  // all the smarts here sunk into that routine. However, the current
	  // lowering of BUILD_VECTOR makes that nearly impossible until the old
	  // vector shuffle lowering is dead.
	  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
	                                               DAG);
	  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
	    // We need to zext the scalar if it is smaller than an i32.
	    V2S = DAG.getBitcast(EltVT, V2S);
	    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
	      // Using zext to expand a narrow element won't work for non-zero
	      // insertions.
	      if (!IsV1Zeroable)
	        return SDValue();

	      // Zero-extend directly to i32. The i32 vector must be as wide as VT so
	      // that the bitcast back to VT below is size-preserving; a fixed v4i32
	      // is only correct for 128-bit vectors.
	      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
	      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
	    }
	    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
	  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
	             EltVT == MVT::i16) {
	    // Either not inserting from the low element of the input or the input
	    // element size is too small to use VZEXT_MOVL to clear the high bits.
	    return SDValue();
	  }

	  if (!IsV1Zeroable) {
	    // If V1 can't be treated as a zero vector we have fewer options to lower
	    // this. We can't support integer vectors or non-zero targets cheaply, and
	    // the V1 elements can't be permuted in any way.
	    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
	    if (!VT.isFloatingPoint() || V2Index != 0)
	      return SDValue();
	    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
	    V1Mask[V2Index] = -1;
	    if (!isNoopShuffleMask(V1Mask))
	      return SDValue();
	    // This is essentially a special case blend operation, but if we have
	    // general purpose blend operations, they are always faster. Bail and let
	    // the rest of the lowering handle these as blends.
	    if (Subtarget.hasSSE41())
	      return SDValue();

	    // Otherwise, use MOVSD or MOVSS.
	    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
	           "Only two types of floating point element types to handle!");
	    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
	                       ExtVT, V1, V2);
	  }

	  // This lowering only works for the low element with floating point vectors.
	  if (VT.isFloatingPoint() && V2Index != 0)
	    return SDValue();

	  // Keep the low element of V2 and zero everything above it.
	  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
	  if (ExtVT != VT)
	    V2 = DAG.getBitcast(VT, V2);

	  if (V2Index != 0) {
	    // If we have 4 or fewer lanes we can cheaply shuffle the element into
	    // the desired position. Otherwise it is more efficient to do a vector
	    // shift left. We know that we can do a vector shift left because all
	    // the inputs are zero.
	    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
	      // Every position reads element 1 (known zero after VZEXT_MOVL) except
	      // the destination, which reads the inserted element 0.
	      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
	      V2Shuffle[V2Index] = 0;
	      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
	    } else {
	      // Shift left by whole bytes: V2Index elements of EltVT.
	      V2 = DAG.getBitcast(MVT::v16i8, V2);
	      V2 = DAG.getNode(
	          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
	          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
	                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
	                              DAG.getDataLayout(), VT)));
	      V2 = DAG.getBitcast(VT, V2);
	    }
	  }
	  return V2;
	}

	/// Try to lower broadcast of a single - truncated - integer element,
	/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
	///
	/// \p BroadcastIdx is expressed in elements of \p VT. The wider source
	/// operand holding that element is picked, shifted right when the element is
	/// not in its low bits, truncated to the element type of \p VT and finally
	/// broadcast with X86ISD::VBROADCAST.
	///
	/// This assumes we have AVX2.
	///
	/// \returns the broadcast, or an empty SDValue if \p V0 does not have wider
	/// integer elements or is not a suitable scalar_to_vector/build_vector.
	static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
	                                                  SDValue V0, int BroadcastIdx,
	                                                  const X86Subtarget &Subtarget,
	                                                  SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX2() &&
	         "We can only lower integer broadcasts with AVX2!");

	  EVT DstEltVT = VT.getVectorElementType();
	  EVT SrcVT = V0.getValueType();

	  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
	  assert(SrcVT.isVector() && "Unexpected non-vector vector-sized value!");

	  EVT SrcEltVT = SrcVT.getVectorElementType();
	  if (!SrcEltVT.isInteger())
	    return SDValue();

	  const unsigned DstEltBits = DstEltVT.getSizeInBits();
	  const unsigned SrcEltBits = SrcEltVT.getSizeInBits();

	  // Nothing is truncated unless the source elements are strictly wider.
	  if (SrcEltBits <= DstEltBits)
	    return SDValue();

	  assert(((SrcEltBits % DstEltBits) == 0) &&
	         "Scalar type sizes must all be powers of 2 on x86!");

	  // Number of destination elements packed into one source element, and the
	  // source element that contains the broadcast element.
	  const unsigned Ratio = SrcEltBits / DstEltBits;
	  const unsigned SrcIdx = BroadcastIdx / Ratio;

	  // Accept any BUILD_VECTOR operand, but only operand 0 of SCALAR_TO_VECTOR.
	  const unsigned SrcOpc = V0.getOpcode();
	  const bool IsBuildVector = SrcOpc == ISD::BUILD_VECTOR;
	  const bool IsLowScalarToVector =
	      SrcOpc == ISD::SCALAR_TO_VECTOR && SrcIdx == 0;
	  if (!IsBuildVector && !IsLowScalarToVector)
	    return SDValue();

	  SDValue Scalar = V0.getOperand(SrcIdx);

	  // If we're extracting non-least-significant bits, shift so we can truncate.
	  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
	  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
	  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
	  const int SubIdx = BroadcastIdx % Ratio;
	  if (SubIdx) {
	    EVT ScalarVT = Scalar.getValueType();
	    SDValue ShiftAmt = DAG.getConstant(SubIdx * DstEltBits, DL, ScalarVT);
	    Scalar = DAG.getNode(ISD::SRL, DL, ScalarVT, Scalar, ShiftAmt);
	  }

	  SDValue Truncated = DAG.getNode(ISD::TRUNCATE, DL, DstEltVT, Scalar);
	  return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Truncated);
	}

	/// \brief Try to lower broadcast of a single element.
	///
	/// For convenience, this code also bundles all of the subtarget feature set
	/// filtering. While a little annoying to re-dispatch on type here, there isn't
	/// a convenient way to factor it out.
	///
	/// The mask must be equivalent to a splat of one V1 element. The routine then
	/// walks up through BITCAST (same element width only), CONCAT_VECTORS and
	/// INSERT_SUBVECTOR nodes to find the narrowest value that contains the
	/// broadcast element, and broadcasts from a scalar, a narrowed load, or the
	/// low element of a 128-bit subvector.
	///
	/// \returns the broadcast bitcast to \p VT, or an empty SDValue if the
	/// subtarget, mask or source do not allow this lowering.
	static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
	                                             SDValue V1, SDValue V2,
	                                             ArrayRef<int> Mask,
	                                             const X86Subtarget &Subtarget,
	                                             SelectionDAG &DAG) {
	  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
	        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
	        (Subtarget.hasAVX2() && VT.isInteger())))
	    return SDValue();

	  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
	  // we can only broadcast from a register with AVX2.
	  unsigned NumElts = Mask.size();
	  unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
	  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

	  // Check that the mask is a broadcast.
	  int BroadcastIdx = -1;
	  for (int i = 0; i != (int)NumElts; ++i) {
	    SmallVector<int, 8> BroadcastMask(NumElts, i);
	    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
	      BroadcastIdx = i;
	      break;
	    }
	  }

	  if (BroadcastIdx < 0)
	    return SDValue();
	  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
	                                            "a sorted mask where the broadcast "
	                                            "comes from V1.");

	  // Go up the chain of (vector) values to find a scalar load that we can
	  // combine with the broadcast.
	  SDValue V = V1;
	  for (;;) {
	    switch (V.getOpcode()) {
	    case ISD::BITCAST: {
	      // Only look through bitcasts that keep the element width, so that
	      // BroadcastIdx keeps addressing the same element.
	      SDValue VSrc = V.getOperand(0);
	      MVT SrcVT = VSrc.getSimpleValueType();
	      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
	        break;
	      V = VSrc;
	      continue;
	    }
	    case ISD::CONCAT_VECTORS: {
	      // Size the operands from the concat itself rather than from the shuffle
	      // mask: after descending through an earlier CONCAT_VECTORS or
	      // INSERT_SUBVECTOR, V may have fewer elements than the mask, and using
	      // Mask.size() would then select the wrong (or a non-existent) operand.
	      int OperandSize =
	          V.getOperand(0).getSimpleValueType().getVectorNumElements();
	      V = V.getOperand(BroadcastIdx / OperandSize);
	      BroadcastIdx %= OperandSize;
	      continue;
	    }
	    case ISD::INSERT_SUBVECTOR: {
	      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
	      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
	      if (!ConstantIdx)
	        break;

	      // Follow the inserted subvector if it covers the broadcast element,
	      // otherwise keep looking at the vector it was inserted into.
	      int BeginIdx = (int)ConstantIdx->getZExtValue();
	      int EndIdx =
	          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
	      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
	        BroadcastIdx -= BeginIdx;
	        V = VInner;
	      } else {
	        V = VOuter;
	      }
	      continue;
	    }
	    }
	    break;
	  }

	  // Check if this is a broadcast of a scalar. We special case lowering
	  // for scalars so that we can more effectively fold with loads.
	  // First, look through bitcast: if the original value has a larger element
	  // type than the shuffle, the broadcast element is in essence truncated.
	  // Make that explicit to ease folding.
	  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
	    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
	            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
	      return TruncBroadcast;

	  MVT BroadcastVT = VT;

	  // Peek through any bitcast (only useful for loads).
	  SDValue BC = peekThroughBitcasts(V);

	  // Also check the simpler case, where we can directly reuse the scalar.
	  if (V.getOpcode() == ISD::BUILD_VECTOR ||
	      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
	    V = V.getOperand(BroadcastIdx);

	    // If we can't broadcast from a register, check that the input is a load.
	    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
	      return SDValue();
	  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
	    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
	      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
	      Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
	    }

	    // If we are broadcasting a load that is only used by the shuffle
	    // then we can reduce the vector load to the broadcasted scalar load.
	    LoadSDNode *Ld = cast<LoadSDNode>(BC);
	    SDValue BaseAddr = Ld->getOperand(1);
	    EVT SVT = BroadcastVT.getScalarType();
	    // Byte offset of the broadcast element within the original load.
	    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
	    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
	    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
	                    DAG.getMachineFunction().getMachineMemOperand(
	                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
	    DAG.makeEquivalentMemoryOrdering(Ld, V);
	  } else if (!BroadcastFromReg) {
	    // We can't broadcast from a vector register.
	    return SDValue();
	  } else if (BroadcastIdx != 0) {
	    // We can only broadcast from the zero-element of a vector register,
	    // but it can be advantageous to broadcast from the zero-element of a
	    // subvector.
	    if (!VT.is256BitVector() && !VT.is512BitVector())
	      return SDValue();

	    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
	    if (VT == MVT::v4f64 || VT == MVT::v4i64)
	      return SDValue();

	    // Only broadcast the zero-element of a 128-bit subvector.
	    unsigned EltSize = VT.getScalarSizeInBits();
	    if (((BroadcastIdx * EltSize) % 128) != 0)
	      return SDValue();

	    // The shuffle input might have been a bitcast we looked through; look at
	    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
	    // later bitcast it to BroadcastVT.
	    MVT SrcVT = V.getSimpleValueType();
	    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
	           "Unexpected vector size");

	    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
	    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
	                    DAG.getIntPtrConstant(BroadcastIdx, DL));
	  }

	  // MOVDDUP takes a vector operand, so wrap a scalar source in a v2f64.
	  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
	    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	                    DAG.getBitcast(MVT::f64, V));

	  // Bitcast back to the same scalar type as BroadcastVT.
	  MVT SrcVT = V.getSimpleValueType();
	  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
	    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
	           "Unexpected vector element size");
	    if (SrcVT.isVector()) {
	      unsigned NumSrcElts = SrcVT.getVectorNumElements();
	      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
	    } else {
	      SrcVT = BroadcastVT.getScalarType();
	    }
	    V = DAG.getBitcast(SrcVT, V);
	  }

	  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
	  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
	    V = DAG.getBitcast(MVT::f64, V);
	    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
	    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
	  }

	  // We only support broadcasting from 128-bit vectors to minimize the
	  // number of patterns we need to deal with in isel. So extract down to
	  // 128-bits.
	  if (SrcVT.getSizeInBits() > 128)
	    V = extract128BitVector(V, 0, DAG, DL);

	  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
	}

	// Decide whether a 4-element shuffle can be performed by one INSERTPS.
	//
	// INSERTPS is only used when the V1 elements are already in their final
	// positions, because otherwise two SHUFPS instructions (which are much smaller
	// to encode than a SHUFPS plus an INSERTPS) always suffice. It is also usable
	// when a single V1 element is out of place and all V2 elements are zeroable.
	//
	// The mask is tried as given and then with its operands commuted. On success
	// V1, V2 and InsertPSMask are overwritten with the operands and immediate for
	// the INSERTPS node and true is returned; on failure they are left untouched
	// and false is returned.
	static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
	                                         unsigned &InsertPSMask,
	                                         const APInt &Zeroable,
	                                         ArrayRef<int> Mask,
	                                         SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  // Try to express the shuffle as "insert one element of VA or VB into VA (or
	  // undef)". VA/VB are local copies; the caller's V1/V2/InsertPSMask are only
	  // written once the match is known to succeed.
	  auto TryMatch = [&](SDValue VA, SDValue VB,
	                      ArrayRef<int> CandidateMask) -> bool {
	    unsigned ZeroBits = 0;     // Result lanes INSERTPS should zero.
	    int DstIdx = -1;           // Lane that receives the inserted element.
	    bool InsertFromVA = false; // Whether the inserted element comes from VA.
	    bool KeepsVAElt = false;   // Whether any VA element stays in place.

	    for (int Lane = 0; Lane != 4; ++Lane) {
	      // Synthesize a zero mask from the zeroable elements (includes undefs).
	      if (Zeroable[Lane]) {
	        ZeroBits |= 1u << Lane;
	        continue;
	      }

	      const int M = CandidateMask[Lane];

	      // A VA element that is already where it belongs.
	      if (M == Lane) {
	        KeepsVAElt = true;
	        continue;
	      }

	      // We can only insert a single non-zeroable element.
	      if (DstIdx >= 0)
	        return false;

	      DstIdx = Lane;
	      InsertFromVA = M < 4;
	    }

	    // Don't bother if we have no (non-zeroable) element for insertion.
	    if (DstIdx < 0)
	      return false;

	    // The source index is relative to the start of the inserted vector, not
	    // the start of the concatenated pair.
	    unsigned SrcIdx = 0;
	    if (InsertFromVA) {
	      // An out-of-place VA element is inserted from VA itself; the original
	      // second operand is not used at all.
	      SrcIdx = CandidateMask[DstIdx];
	      VB = VA;
	    } else {
	      SrcIdx = CandidateMask[DstIdx] - 4;
	    }

	    // If no VA element is used in place, the result consists only of zeroed
	    // lanes and the inserted element, so drop the dependency on VA.
	    if (!KeepsVAElt)
	      VA = DAG.getUNDEF(MVT::v4f32);

	    // Commit the match.
	    V1 = VA;
	    V2 = VB;
	    InsertPSMask = (SrcIdx << 6) | (DstIdx << 4) | ZeroBits;
	    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
	    return true;
	  };

	  if (TryMatch(V1, V2, Mask))
	    return true;

	  // Commute and try again.
	  SmallVector<int, 4> SwappedMask(Mask.begin(), Mask.end());
	  ShuffleVectorSDNode::commuteMask(SwappedMask);
	  return TryMatch(V2, V1, SwappedMask);
	}

	/// Try to lower a v4f32 shuffle as a single X86ISD::INSERTPS node.
	///
	/// The matching is delegated to matchVectorShuffleAsInsertPS, which may
	/// replace the (local copies of the) operands and computes the immediate.
	///
	/// \returns the INSERTPS node, or an empty SDValue if the mask does not match.
	static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
	                                            SDValue V2, ArrayRef<int> Mask,
	                                            const APInt &Zeroable,
	                                            SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

	  // Attempt to match the insertps pattern.
	  unsigned Imm = 0;
	  const bool Matched =
	      matchVectorShuffleAsInsertPS(V1, V2, Imm, Zeroable, Mask, DAG);
	  if (!Matched)
	    return SDValue();

	  // Insert the V2 element into the desired position.
	  SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
	  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, ImmOp);
	}

	/// \brief Try to lower a shuffle as a permute of the inputs followed by an
	/// UNPCK instruction.
	///
	/// This specifically targets cases where we end up with alternating between
	/// the two inputs, and so can permute them into something that feeds a single
	/// UNPCK instruction. Note that this routine only targets integer vectors
	/// because for floating point vectors we have a generalized SHUFPS lowering
	/// strategy that handles everything that doesn't exactly match an unpack,
	/// making this clever lowering unnecessary.
	///
	/// Two strategies are attempted: first permute each input and then unpack (at
	/// every element width from 64 bits down to the element width of \p VT), and,
	/// if all inputs come from the same half, unpack first and permute the result.
	///
	/// \returns the lowered value, or an empty SDValue if neither strategy applies.
	static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
	SDValue V1, SDValue V2,
	ArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(!VT.isFloatingPoint() &&
	"This routine only supports integer vectors.");
	assert(VT.is128BitVector() &&
	"This routine only works on 128-bit vectors.");
	assert(!V2.isUndef() &&
	"This routine should only be used when blending two inputs.");
	assert(Mask.size() >= 2 && "Single element masks are invalid.");

	int Size = Mask.size();

	// Count the mask entries that read the low half / high half of either input
	// (M % Size folds V2 indices onto V1's range). Undef entries are negative;
	// the low count excludes them explicitly, and for the high count their
	// remainder is negative and so never reaches Size / 2.
	int NumLoInputs =
	count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
	int NumHiInputs =
	count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

	// Unpack whichever half supplies more inputs (low on a tie).
	bool UnpackLo = NumLoInputs >= NumHiInputs;

	// Try to permute V1 and V2 so that a single unpack with ScalarSize-bit
	// elements produces the requested shuffle. Scale is the number of VT elements
	// that make up one unpack element. V1/V2 are only modified on the success
	// path.
	auto TryUnpack = [&](int ScalarSize, int Scale) {
	SmallVector<int, 16> V1Mask((unsigned)Size, -1);
	SmallVector<int, 16> V2Mask((unsigned)Size, -1);

	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	// Each element of the unpack contains Scale elements from this mask.
	int UnpackIdx = i / Scale;

	// We only handle the case where V1 feeds the first slots of the unpack.
	// We rely on canonicalization to ensure this is the case.
	if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
	return SDValue();

	// Setup the mask for this input. The indexing is tricky as we have to
	// handle the unpack stride.
	// Even unpack slots are fed by V1 and odd ones by V2; slot UnpackIdx reads
	// unpack element UnpackIdx / 2 of its input, offset into the high half
	// when unpacking high.
	SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
	VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
	Mask[i] % Size;
	}

	// If we will have to shuffle both inputs to use the unpack, check whether
	// we can just unpack first and shuffle the result. If so, skip this unpack.
	if ((NumLoInputs == 0 \|\| NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
	!isNoopShuffleMask(V2Mask))
	return SDValue();

	// Shuffle the inputs into place.
	V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
	V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

	// Cast the inputs to the type we will use to unpack them.
	MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
	V1 = DAG.getBitcast(UnpackVT, V1);
	V2 = DAG.getBitcast(UnpackVT, V2);

	// Unpack the inputs and cast the result back to the desired type.
	return DAG.getBitcast(
	VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	UnpackVT, V1, V2));
	};

	// We try each unpack from the largest to the smallest to try and find one
	// that fits this mask.
	int OrigScalarSize = VT.getScalarSizeInBits();
	for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
	if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
	return Unpack;

	// If none of the unpack-rooted lowerings worked (or were profitable) try an
	// initial unpack.
	if (NumLoInputs == 0 \|\| NumHiInputs == 0) {
	assert((NumLoInputs > 0 \|\| NumHiInputs > 0) &&
	"We have to have some inputs!");
	// First input index of the half that is actually used.
	int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

	// FIXME: We could consider the total complexity of the permute of each
	// possible unpacking. Or at the least we should consider how many
	// half-crossings are created.
	// FIXME: We could consider commuting the unpacks.

	SmallVector<int, 32> PermMask((unsigned)Size, -1);
	for (int i = 0; i < Size; ++i) {
	if (Mask[i] < 0)
	continue;

	assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

	// After the unpack, element k of the used half sits at position 2 * k when
	// it came from V1 and at 2 * k + 1 when it came from V2.
	PermMask[i] =
	2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
	}
	return DAG.getVectorShuffle(
	VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
	DL, VT, V1, V2),
	DAG.getUNDEF(VT), PermMask);
	}

	return SDValue();
	}

	/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
	///
	/// This is the base lowering for all 2 x 64-bit shuffles: floating point
	/// shuffles are fully supported whereas integer ones are not. These
	/// instructions incur a domain crossing penalty on some chips, so integer
	/// vectors should avoid lowering through this routine where possible.
	///
	/// Single-input shuffles become a broadcast, VPERMILPI (with AVX) or SHUFP of
	/// the input with itself. Two-input shuffles try, in order: element
	/// insertion (in both operand orders), MOVLPD/MOVSD, blend (SSE4.1), UNPCK and
	/// finally a two-operand SHUFP.
	static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
	  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	  if (V2.isUndef()) {
	    // Check for being able to broadcast a single element.
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(
	            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // Straight shuffle of a single input vector: use the single input as both
	    // "inputs" of the instruction. Bit i of the immediate selects the high
	    // element for result lane i.
	    unsigned Imm = 0;
	    if (Mask[0] == 1)
	      Imm |= 1u;
	    if (Mask[1] == 1)
	      Imm |= 2u;

	    if (Subtarget.hasAVX()) {
	      // If we have AVX, we can use VPERMILPS which will allow folding a load
	      // into the shuffle.
	      SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, ImmOp);
	    }

	    // An undef result lane does not need to depend on V1 at all.
	    SDValue LoSrc =
	        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1;
	    SDValue HiSrc =
	        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1;
	    SDValue ImmOp = DAG.getConstant(Imm, DL, MVT::i8);
	    return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, LoSrc, HiSrc, ImmOp);
	  }
	  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
	  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

	  // If we have a single input, insert that into V1 if we can do so cheaply.
	  const bool Lane0FromV2 = Mask[0] >= 2;
	  const bool Lane1FromV2 = Mask[1] >= 2;
	  if (Lane0FromV2 != Lane1FromV2) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;

	    // Try inverting the insertion since for v2 masks it is easy to do and we
	    // can't reliably sort the mask one way or the other. XOR with 2 moves an
	    // index to the same element of the other operand.
	    int FlippedMask[2];
	    for (int i = 0; i != 2; ++i)
	      FlippedMask[i] = Mask[i] < 0 ? -1 : (Mask[i] ^ 2);
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, MVT::v2f64, V2, V1, FlippedMask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Try to use one of the special instruction patterns to handle two common
	  // blend patterns if a zero-blend above didn't work.
	  const bool KeepsV2High = isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
	                           isShuffleEquivalent(V1, V2, Mask, {1, 3});
	  if (KeepsV2High) {
	    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) {
	      // We can either use a special instruction to load over the low double
	      // or to move just the low double.
	      const unsigned MoveOpc =
	          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD;
	      SDValue LowVec =
	          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S);
	      return DAG.getNode(MoveOpc, DL, MVT::v2f64, V2, LowVec);
	    }
	  }

	  if (Subtarget.hasSSE41()) {
	    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG))
	      return Blend;
	  }

	  // Use dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpck =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
	    return Unpck;

	  // Generic two-input SHUFP: bit 0 picks the V1 element for lane 0, bit 1 the
	  // V2 element for lane 1.
	  unsigned BlendImm = 0;
	  if (Mask[0] == 1)
	    BlendImm |= 1u;
	  if (Mask[1] - 2 == 1)
	    BlendImm |= 2u;
	  SDValue BlendImmOp = DAG.getConstant(BlendImm, DL, MVT::i8);
	  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, BlendImmOp);
	}

	/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
	///
	/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
	/// the integer unit to minimize domain crossing penalties. However, for blends
	/// it falls back to the floating point shuffle operation with appropriate bit
	/// casting.
	///
	/// Single-input shuffles become a broadcast or a PSHUFD on the v4i32 view.
	/// Two-input shuffles try, in order: merging two PACKUS inputs, shifts,
	/// element insertion (both operand orders), blend (SSE4.1), UNPCK, byte
	/// rotation (SSSE3), a decomposed shuffle+blend (SSE4.1) and finally a v2f64
	/// shuffle of the bitcast inputs.
	static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
	assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Straight shuffle of a single input vector. For everything from SSE2
	// onward this has a single fast instruction with no scary immediates.
	// We have to map the mask as it is actually a v4i32 shuffle instruction.
	V1 = DAG.getBitcast(MVT::v4i32, V1);
	// Each 64-bit index M expands to the 32-bit pair (2*M, 2*M+1); undef (-1)
	// entries are clamped to element 0 by std::max.
	int WidenedMask[4] = {
	std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
	std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
	return DAG.getBitcast(
	MVT::v2i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
	getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
	}
	assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
	assert(Mask[0] < 2 && "We sort V1 to be the first input.");
	assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

	// If we have a blend of two same-type PACKUS operations and the blend aligns
	// with the low and high halves, we can just merge the PACKUS operations.
	// This is particularly important as it lets us merge shuffles that this
	// routine itself creates.
	// Returns the PACKUS node behind any bitcasts, or an empty SDValue.
	auto GetPackNode = [](SDValue V) {
	V = peekThroughBitcasts(V);
	return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
	};
	if (SDValue V1Pack = GetPackNode(V1))
	if (SDValue V2Pack = GetPackNode(V2)) {
	EVT PackVT = V1Pack.getValueType();
	// Pick, for each result half, the PACKUS operand that produced the
	// requested 64-bit half of the corresponding input.
	if (PackVT == V2Pack.getValueType())
	return DAG.getBitcast(MVT::v2i64,
	DAG.getNode(X86ISD::PACKUS, DL, PackVT,
	Mask[0] == 0 ? V1Pack.getOperand(0)
	: V1Pack.getOperand(1),
	Mask[1] == 2 ? V2Pack.getOperand(0)
	: V2Pack.getOperand(1)));
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// When loading a scalar and then shuffling it into a vector we can often do
	// the insertion cheaply.
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Insertion;
	// Try inverting the insertion since for v2 masks it is easy to do and we
	// can't reliably sort the mask one way or the other.
	// XOR with 2 redirects an index to the same element of the other operand;
	// the asserts above rule out undef entries here.
	int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
	if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
	return Insertion;

	// We have different paths for blend lowering, but they all must use the
	// exact same predicate.
	bool IsBlendSupported = Subtarget.hasSSE41();
	if (IsBlendSupported)
	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
	return V;

	// Try to use byte rotation instructions.
	// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
	if (Subtarget.hasSSSE3())
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// If we have direct support for blends, we should lower by decomposing into
	// a permute. That will be faster than the domain cross.
	if (IsBlendSupported)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
	Mask, DAG);

	// We implement this with SHUFPD which is pretty lame because it will likely
	// incur 2 cycles of stall for integer vectors on Nehalem and older chips.
	// However, all the alternatives are still more cycles and newer chips don't
	// have this problem. It would be really nice if x86 had better shuffles here.
	V1 = DAG.getBitcast(MVT::v2f64, V1);
	V2 = DAG.getBitcast(MVT::v2f64, V2);
	return DAG.getBitcast(MVT::v2i64,
	DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
	}

	/// \brief Report whether a 4-element mask can be done by one SHUFPS.
	///
	/// Callers use this to skip more specialized lowerings whenever the plain
	/// shufps lowering is already going to be efficient.
	static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
	  // Only the 128-bit, four-element form of shufps is considered here.
	  assert(Mask.size() == 4 && "Unsupported mask size!");
	  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
	  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
	  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
	  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

	  // A half is a problem only when both of its entries are defined and they
	  // name different inputs (indices below 4 versus indices 4 and above).
	  // Negative (undef) entries never conflict with anything.
	  auto HalfMixesInputs = [](int First, int Second) {
	    if (First < 0 || Second < 0)
	      return false;
	    return (First < 4) != (Second < 4);
	  };

	  const bool LowHalfOK = !HalfMixesInputs(Mask[0], Mask[1]);
	  const bool HighHalfOK = !HalfMixesInputs(Mask[2], Mask[3]);
	  return LowHalfOK && HighHalfOK;
	}

	/// \brief Emit a vector shuffle built out of SHUFPS nodes.
	///
	/// This helper does not judge whether SHUFPS is the best choice for the given
	/// mask; it unconditionally produces a SHUFPS-based sequence (one node, or two
	/// when a preparatory blend of the inputs is required).
	static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
	                                            ArrayRef<int> Mask, SDValue V1,
	                                            SDValue V2, SelectionDAG &DAG) {
	  auto ComesFromV2 = [](int M) { return M >= 4; };

	  // Operands and mask of the final SHUFP node; adjusted below as needed.
	  SDValue LowV = V1;
	  SDValue HighV = V2;
	  int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};

	  const int NumV2Elements = count_if(Mask, ComesFromV2);

	  if (NumV2Elements == 1) {
	    const int V2Index = find_if(Mask, ComesFromV2) - Mask.begin();
	    // Flipping the low bit yields the other slot of the same half.
	    const int NeighborIndex = V2Index ^ 1;
	    const bool V2InLowHalf = V2Index < 2;

	    if (Mask[NeighborIndex] >= 0) {
	      // The lone V2 element shares its half with a V1 element. Start by
	      // blending the two into one vector: the V2 element lands in slot 0 and
	      // the V1 element in slot 2 of the blended value.
	      int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[NeighborIndex], 0};
	      SDValue BlendImm = getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG);
	      V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, BlendImm);

	      // The blended value supplies whichever half held the V2 element.
	      if (V2InLowHalf) {
	        LowV = V2;
	        HighV = V1;
	      } else {
	        HighV = V2;
	      }
	      NewMask[NeighborIndex] = 2; // The V1 element now sits in V2[2].
	      NewMask[V2Index] = 0;       // The V2 element now sits in V2[0].
	    } else {
	      // The neighbor is undef, so the half only needs the V2 element. When
	      // that half is the low one, the operands have to trade places.
	      if (V2InLowHalf)
	        std::swap(LowV, HighV);
	      NewMask[V2Index] -= 4;
	    }
	  } else if (NumV2Elements == 2) {
	    const bool LowHalfAllV1 = Mask[0] < 4 && Mask[1] < 4;
	    const bool HighHalfAllV1 = Mask[2] < 4 && Mask[3] < 4;

	    if (LowHalfAllV1) {
	      // Simple layout: V1 feeds the low lanes and V2 feeds the high lanes.
	      NewMask[2] -= 4;
	      NewMask[3] -= 4;
	    } else if (HighHalfAllV1) {
	      // Mirror image of the layout above. It is accepted here because callers
	      // may detect a SHUFPS pattern without being able to commute the shuffle
	      // into the preferred orientation.
	      NewMask[0] -= 4;
	      NewMask[1] -= 4;
	      HighV = V1;
	      LowV = V2;
	    } else {
	      // Each half draws from both inputs. Gather the four needed elements into
	      // one vector first, then permute that vector into place.
	      const bool LowLeadsWithV1 = Mask[0] < 4;
	      const bool HighLeadsWithV1 = Mask[2] < 4;

	      // Slots 0-1 of the blend take the V1 elements, slots 2-3 the V2 ones.
	      int BlendMask[4] = {LowLeadsWithV1 ? Mask[0] : Mask[1],
	                          HighLeadsWithV1 ? Mask[2] : Mask[3],
	                          (LowLeadsWithV1 ? Mask[1] : Mask[0]) - 4,
	                          (HighLeadsWithV1 ? Mask[3] : Mask[2]) - 4};
	      SDValue BlendImm = getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG);
	      V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, BlendImm);

	      // The final node is a plain permute of the blended vector, expressed by
	      // passing that vector as both operands.
	      LowV = HighV = V1;
	      NewMask[0] = LowLeadsWithV1 ? 0 : 2;
	      NewMask[1] = LowLeadsWithV1 ? 2 : 0;
	      NewMask[2] = HighLeadsWithV1 ? 1 : 3;
	      NewMask[3] = HighLeadsWithV1 ? 3 : 1;
	    }
	  }

	  SDValue FinalImm = getV4X86ShuffleImm8ForMask(NewMask, DL, DAG);
	  return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, FinalImm);
	}

	/// \brief Lower shuffles of four 32-bit floating point lanes.
	///
	/// Only floating point unit instructions are used, which keeps domain
	/// crossing penalties to a minimum; those instructions suffice for every
	/// v4f32 shuffle.
	static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  const MVT VT = MVT::v4f32;
	  const int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	  // Single-input shuffles are fully handled inside this block.
	  if (NumV2Elements == 0) {
	    // A broadcast of one element is tried first.
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                      Subtarget, DAG)) {
	      return Splat;
	    }

	    // With SSE3, the even/odd duplicating moves cover two fixed patterns.
	    if (Subtarget.hasSSE3()) {
	      if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2})) {
	        return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
	      }
	      if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3})) {
	        return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
	      }
	    }

	    SDValue PermuteImm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);

	    // Under AVX, VPERMILPS is preferred since a load can be folded into it.
	    if (Subtarget.hasAVX()) {
	      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1, PermuteImm);
	    }

	    // Without AVX, emulate the one-input permute with SHUFPS by feeding the
	    // same vector to both operands.
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V1, PermuteImm);
	  }

	  // Element insertion is a fast path for single-element blends, but only when
	  // the V2 element goes to lane 0. More complex single-element blends are
	  // left for the custom handling further down, which runs if both this and
	  // BLENDPS fail to match.
	  if (NumV2Elements == 1 && Mask[0] >= 4) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  if (Subtarget.hasSSE41()) {
	    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	      return Blend;
	    }

	    // INSERTPS is used when it can finish the shuffle efficiently.
	    if (SDValue InsertPS =
	            lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) {
	      return InsertPS;
	    }

	    // Blend-then-permute is attempted only for masks that a single SHUFPS
	    // cannot handle.
	    if (!isSingleSHUFPSMask(Mask)) {
	      if (SDValue BlendPerm =
	              lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG)) {
	        return BlendPerm;
	      }
	    }
	  }

	  // Low/high half moves.
	  if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
	    return DAG.getNode(X86ISD::MOVLHPS, DL, VT, V1, V2);
	  }
	  if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) {
	    return DAG.getNode(X86ISD::MOVHLPS, DL, VT, V2, V1);
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpck;
	  }

	  // Everything else goes through the generic SHUFPS strategy.
	  return lowerVectorShuffleWithSHUFPS(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower shuffles of four 32-bit integer lanes.
	///
	/// Integer-domain shuffles are preferred where they are available; blends
	/// fall back on the floating point domain blend instructions.
	static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
	  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	  const MVT VT = MVT::v4i32;

	  // A zero/any-extend lowering is strictly faster than every alternative and
	  // frequently lets memory operands fold into the shuffle, so it goes first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  const int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });

	  // Single-input shuffles are fully handled inside this block.
	  if (NumV2Elements == 0) {
	    // A broadcast of one element is tried first.
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                      Subtarget, DAG)) {
	      return Splat;
	    }

	    // From SSE2 on, PSHUFD does any one-input permute in a single fast
	    // instruction. Masks equivalent to an unpack are rewritten to the
	    // canonical unpack pattern, yet PSHUFD is still emitted rather than
	    // UNPCK, since UNPCK would block folding a load into the instruction or
	    // making a copy.
	    const int UnpackLoMask[] = {0, 0, 1, 1};
	    const int UnpackHiMask[] = {2, 2, 3, 3};
	    if (isShuffleEquivalent(V1, V2, Mask, UnpackLoMask)) {
	      Mask = UnpackLoMask;
	    } else if (isShuffleEquivalent(V1, V2, Mask, UnpackHiMask)) {
	      Mask = UnpackHiMask;
	    }

	    SDValue PermuteImm = getV4X86ShuffleImm8ForMask(Mask, DL, DAG);
	    return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1, PermuteImm);
	  }

	  // Shift instructions.
	  if (SDValue Shift = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask, Zeroable,
	                                                Subtarget, DAG)) {
	    return Shift;
	  }

	  // Single-element blends have dedicated insertion lowerings.
	  if (NumV2Elements == 1) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  // Blend lowering is reached on more than one path below; every path has to
	  // test exactly this predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	      return Blend;
	    }
	  }

	  if (SDValue Masked =
	          lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG)) {
	    return Masked;
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpck = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpck;
	  }

	  // Byte rotation is only tried with SSSE3; before SSSE3 it is more
	  // profitable to use shuffles/unpacks.
	  if (Subtarget.hasSSSE3()) {
	    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                        Subtarget, DAG)) {
	      return Rotate;
	    }
	  }

	  // One SHUFPS is assumed to beat any multi-instruction sequence, even on
	  // CPUs with a domain penalty; a later pass can fix up CPUs that are hurt by
	  // the domain switch. The alternatives below are therefore considered only
	  // when a single SHUFPS is not enough.
	  if (!isSingleSHUFPSMask(Mask)) {
	    // With blend support, decomposing into permutes plus a blend is faster
	    // than crossing domains.
	    if (IsBlendSupported) {
	      return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                        DAG);
	    }

	    // Otherwise try permuting the inputs so that an unpack finishes the job.
	    if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, VT, V1, V2,
	                                                              Mask, DAG)) {
	      return Unpack;
	    }
	  }

	  // Fall back to SHUFPS, which can blend from two vectors. Since SHUFPS is
	  // the eventual instruction anyway, the inputs are built up with SHUFPS too;
	  // that sidesteps the domain shift penalty PSHUFD would incur on Nehalem and
	  // older, and makes no difference on newer chips.
	  SDValue FloatV1 = DAG.getBitcast(MVT::v4f32, V1);
	  SDValue FloatV2 = DAG.getBitcast(MVT::v4f32, V2);
	  SDValue FloatShuffle =
	      DAG.getVectorShuffle(MVT::v4f32, DL, FloatV1, FloatV2, Mask);
	  return DAG.getBitcast(VT, FloatShuffle);
	}

	/// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
	/// shuffle lowering, and the most complex part.
	///
	/// The lowering strategy is to try to form pairs of input lanes which are
	/// targeted at the same half of the final vector, and then use a dword shuffle
	/// to place them onto the right half, and finally unpack the paired lanes into
	/// their final position.
	///
	/// The exact breakdown of how to form these dword pairs and align them on the
	/// correct sides is really tricky. See the comments within the function for
	/// more of the details.
	///
	/// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
	/// lane must shuffle the exact same way. In fact, you must pass a v8 Mask to
	/// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
	/// vector, form the analogous 128-bit 8-element Mask.
	///
	/// \param DL        Debug location attached to every node created here.
	/// \param VT        Vector type with i16 elements (asserted).
	/// \param V         The single input vector; rebound locally as nodes are
	///                  built.
	/// \param Mask      Eight-element shuffle mask (asserted). It is rewritten in
	///                  place while the shuffle is decomposed, so callers must not
	///                  rely on its contents afterwards.
	/// \param Subtarget Only forwarded to the recursive call.
	/// \param DAG       Selection DAG used to create the nodes.
	/// \returns The value of type \p VT produced by the emitted PSHUFLW / PSHUFHW
	///          / PSHUFD sequence.
	static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
	    const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
	  MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);

	  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
	  MutableArrayRef<int> LoMask = Mask.slice(0, 4);
	  MutableArrayRef<int> HiMask = Mask.slice(4, 4);

	  // Collect the sorted, de-duplicated set of defined inputs feeding each half
	  // of the result, then split each set by the half the input comes from.
	  SmallVector<int, 4> LoInputs;
	  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
	  std::sort(LoInputs.begin(), LoInputs.end());
	  LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
	  SmallVector<int, 4> HiInputs;
	  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
	  std::sort(HiInputs.begin(), HiInputs.end());
	  HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
	  int NumLToL =
	      std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
	  int NumHToL = LoInputs.size() - NumLToL;
	  int NumLToH =
	      std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
	  int NumHToH = HiInputs.size() - NumLToH;
	  MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
	  MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
	  MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
	  MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);

	  // If we are splatting two values from one half - one to each half, then
	  // we can shuffle that half so each is splatted to a dword, then splat those
	  // to their respective halves.
	  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
	                        int DOffset) {
	    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
	    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
	    V = DAG.getNode(ShufWOp, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
	    V = DAG.getBitcast(PSHUFDVT, V);
	    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
	    return DAG.getBitcast(VT, V);
	  };

	  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
	    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
	  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
	    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);

	  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
	  // such inputs we can swap two of the dwords across the half mark and end up
	  // with <=2 inputs to each half in each half. Once there, we can fall through
	  // to the generic code below. For example:
	  //
	  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
	  //
	  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
	  // and an existing 2-into-2 on the other half. In this case we may have to
	  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
	  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
	  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
	  // because any other situation (including a 3-into-1 or 1-into-3 in the other
	  // half than the one we target for fixing) will be fixed when we re-enter this
	  // path. We will also combine away any sequence of PSHUFD instructions that
	  // result into a single instruction. Here is an example of the tricky case:
	  //
	  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
	  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
	  //
	  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
	  //
	  // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
	  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
	  //
	  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
	  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
	  //
	  // The result is fine to be handled by the generic logic.
	  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
	                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
	                          int AOffset, int BOffset) {
	    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
	           "Must call this with A having 3 or 1 inputs from the A half.");
	    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
	           "Must call this with B having 1 or 3 inputs from the B half.");
	    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
	           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

	    bool ThreeAInputs = AToAInputs.size() == 3;

	    // Compute the index of dword with only one word among the three inputs in
	    // a half by taking the sum of the half with three inputs and subtracting
	    // the sum of the actual three inputs. The difference is the remaining
	    // slot.
	    int ADWord, BDWord;
	    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
	    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
	    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
	    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
	    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
	    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
	    int TripleNonInputIdx =
	        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
	    TripleDWord = TripleNonInputIdx / 2;

	    // We use xor with one to compute the adjacent DWord to whichever one the
	    // OneInput is in.
	    OneInputDWord = (OneInput / 2) ^ 1;

	    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
	    // and BToA inputs. If there is also such a problem with the BToB and AToB
	    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
	    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
	    // is essential that we don't create a 3<-1 as then we might oscillate.
	    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
	      // Compute how many inputs will be flipped by swapping these DWords. We
	      // need to balance this to ensure we don't form a 3-1 shuffle in the
	      // other half.
	      int NumFlippedAToBInputs =
	          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
	          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
	      int NumFlippedBToBInputs =
	          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
	          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
	      if ((NumFlippedAToBInputs == 1 &&
	           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
	          (NumFlippedBToBInputs == 1 &&
	           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
	        // We choose whether to fix the A half or B half based on whether that
	        // half has zero flipped inputs. At zero, we may not be able to fix it
	        // with that half. We also bias towards fixing the B half because that
	        // will more commonly be the high half, and we have to bias one way.
	        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
	                                                       ArrayRef<int> Inputs) {
	          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
	          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
	          // Determine whether the free index is in the flipped dword or the
	          // unflipped dword based on where the pinned index is. We use this bit
	          // in an xor to conditionally select the adjacent dword.
	          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
	          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	          if (IsFixIdxInput == IsFixFreeIdxInput)
	            FixFreeIdx += 1;
	          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
	          assert(IsFixIdxInput != IsFixFreeIdxInput &&
	                 "We need to be changing the number of flipped inputs!");
	          int PSHUFHalfMask[] = {0, 1, 2, 3};
	          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
	          V = DAG.getNode(
	              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
	              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
	              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

	          // Mirror the word swap in the mask (single pass so the two indices
	          // trade places rather than collapsing onto one).
	          for (int &M : Mask)
	            if (M >= 0 && M == FixIdx)
	              M = FixFreeIdx;
	            else if (M >= 0 && M == FixFreeIdx)
	              M = FixIdx;
	        };
	        if (NumFlippedBToBInputs != 0) {
	          int BPinnedIdx =
	              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
	          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
	        } else {
	          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
	          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
	          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
	        }
	      }
	    }

	    // Swap the two chosen dwords across the half boundary.
	    int PSHUFDMask[] = {0, 1, 2, 3};
	    PSHUFDMask[ADWord] = BDWord;
	    PSHUFDMask[BDWord] = ADWord;
	    V = DAG.getBitcast(
	        VT,
	        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	    // Adjust the mask to match the new locations of A and B.
	    for (int &M : Mask)
	      if (M >= 0 && M / 2 == ADWord)
	        M = 2 * BDWord + M % 2;
	      else if (M >= 0 && M / 2 == BDWord)
	        M = 2 * ADWord + M % 2;

	    // Recurse back into this routine to re-compute state now that this isn't
	    // a 3 and 1 problem.
	    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
	                                                     DAG);
	  };
	  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
	    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
	  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
	    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);

	  // At this point there are at most two inputs to the low and high halves from
	  // each half. That means the inputs can always be grouped into dwords and
	  // those dwords can then be moved to the correct half with a dword shuffle.
	  // We use at most one low and one high word shuffle to collect these paired
	  // inputs into dwords, and finally a dword shuffle to place them.
	  int PSHUFLMask[4] = {-1, -1, -1, -1};
	  int PSHUFHMask[4] = {-1, -1, -1, -1};
	  int PSHUFDMask[4] = {-1, -1, -1, -1};

	  // First fix the masks for all the inputs that are staying in their
	  // original halves. This will then dictate the targets of the cross-half
	  // shuffles.
	  auto fixInPlaceInputs =
	      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
	                    MutableArrayRef<int> SourceHalfMask,
	                    MutableArrayRef<int> HalfMask, int HalfOffset) {
	    if (InPlaceInputs.empty())
	      return;
	    if (InPlaceInputs.size() == 1) {
	      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	          InPlaceInputs[0] - HalfOffset;
	      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
	      return;
	    }
	    if (IncomingInputs.empty()) {
	      // Just fix all of the in place inputs.
	      for (int Input : InPlaceInputs) {
	        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
	        PSHUFDMask[Input / 2] = Input / 2;
	      }
	      return;
	    }

	    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
	    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
	        InPlaceInputs[0] - HalfOffset;
	    // Put the second input next to the first so that they are packed into
	    // a dword. We find the adjacent index by toggling the low bit.
	    int AdjIndex = InPlaceInputs[0] ^ 1;
	    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
	    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
	    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
	  };
	  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
	  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);

	  // Now gather the cross-half inputs and place them into a free dword of
	  // their target half.
	  // FIXME: This operation could almost certainly be simplified dramatically to
	  // look more like the 3-1 fixing operation.
	  auto moveInputsToRightHalf = [&PSHUFDMask](
	      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
	      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
	      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
	      int DestOffset) {
	    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
	      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
	    };
	    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
	                                               int Word) {
	      int LowWord = Word & ~1;
	      int HighWord = Word | 1;
	      return isWordClobbered(SourceHalfMask, LowWord) ||
	             isWordClobbered(SourceHalfMask, HighWord);
	    };

	    if (IncomingInputs.empty())
	      return;

	    if (ExistingInputs.empty()) {
	      // Map any dwords with inputs from them into the right half.
	      for (int Input : IncomingInputs) {
	        // If the source half mask maps over the inputs, turn those into
	        // swaps and use the swapped lane.
	        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
	          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
	            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
	                Input - SourceOffset;
	            // We have to swap the uses in our half mask in one sweep.
	            for (int &M : HalfMask)
	              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
	                M = Input;
	              else if (M == Input)
	                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	          } else {
	            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
	                       Input - SourceOffset &&
	                   "Previous placement doesn't match!");
	          }
	          // Note that this correctly re-maps both when we do a swap and when
	          // we observe the other side of the swap above. We rely on that to
	          // avoid swapping the members of the input list directly.
	          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
	        }

	        // Map the input's dword into the correct half.
	        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
	          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
	        else
	          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
	                     Input / 2 &&
	                 "Previous placement doesn't match!");
	      }

	      // And just directly shift any other-half mask elements to be same-half
	      // as we will have mirrored the dword containing the element into the
	      // same position within that half.
	      for (int &M : HalfMask)
	        if (M >= SourceOffset && M < SourceOffset + 4) {
	          M = M - SourceOffset + DestOffset;
	          assert(M >= 0 && "This should never wrap below zero!");
	        }
	      return;
	    }

	    // Ensure we have the input in a viable dword of its current half. This
	    // is particularly tricky because the original position may be clobbered
	    // by inputs being moved and staying in that half.
	    if (IncomingInputs.size() == 1) {
	      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
	                         SourceOffset;
	        SourceHalfMask[InputFixed - SourceOffset] =
	            IncomingInputs[0] - SourceOffset;
	        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
	                     InputFixed);
	        IncomingInputs[0] = InputFixed;
	      }
	    } else if (IncomingInputs.size() == 2) {
	      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
	          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
	        // We have two non-adjacent or clobbered inputs we need to extract from
	        // the source half. To do this, we need to map them into some adjacent
	        // dword slot in the source mask.
	        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
	                              IncomingInputs[1] - SourceOffset};

	        // If there is a free slot in the source half mask adjacent to one of
	        // the inputs, place the other input in it. We use (Index XOR 1) to
	        // compute an adjacent index.
	        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
	            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
	          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
	          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	          InputsFixed[1] = InputsFixed[0] ^ 1;
	        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
	                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
	          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
	          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
	          InputsFixed[0] = InputsFixed[1] ^ 1;
	        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
	                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
	          // The two inputs are in the same DWord but it is clobbered and the
	          // adjacent DWord isn't used at all. Move both inputs to the free
	          // slot.
	          //
	          // The base of the free dword is computed once, up front. Deriving
	          // the second slot from InputsFixed[0] after it has been overwritten
	          // would flip the dword back and name the original dword instead of
	          // the slot actually written in SourceHalfMask.
	          const int FreeDWordBase = 2 * ((InputsFixed[0] / 2) ^ 1);
	          SourceHalfMask[FreeDWordBase] = InputsFixed[0];
	          SourceHalfMask[FreeDWordBase + 1] = InputsFixed[1];
	          InputsFixed[0] = FreeDWordBase;
	          InputsFixed[1] = FreeDWordBase + 1;
	        } else {
	          // The only way we hit this point is if there is no clobbering
	          // (because there are no off-half inputs to this half) and there is no
	          // free slot adjacent to one of the inputs. In this case, we have to
	          // swap an input with a non-input.
	          for (int i = 0; i < 4; ++i)
	            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
	                   "We can't handle any clobbers here!");
	          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
	                 "Cannot have adjacent inputs here!");

	          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
	          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

	          // We also have to update the final source mask in this case because
	          // it may need to undo the above swap.
	          for (int &M : FinalSourceHalfMask)
	            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
	              M = InputsFixed[1] + SourceOffset;
	            else if (M == InputsFixed[1] + SourceOffset)
	              M = (InputsFixed[0] ^ 1) + SourceOffset;

	          InputsFixed[1] = InputsFixed[0] ^ 1;
	        }

	        // Point everything at the fixed inputs.
	        for (int &M : HalfMask)
	          if (M == IncomingInputs[0])
	            M = InputsFixed[0] + SourceOffset;
	          else if (M == IncomingInputs[1])
	            M = InputsFixed[1] + SourceOffset;

	        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
	        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
	      }
	    } else {
	      llvm_unreachable("Unhandled input size!");
	    }

	    // Now hoist the DWord down to the right half.
	    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
	    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
	    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
	    for (int &M : HalfMask)
	      for (int Input : IncomingInputs)
	        if (M == Input)
	          M = FreeDWord * 2 + Input % 2;
	  };
	  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
	                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
	  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
	                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

	  // Now enact all the shuffles we've computed to move the inputs into their
	  // target half.
	  if (!isNoopShuffleMask(PSHUFLMask))
	    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
	  if (!isNoopShuffleMask(PSHUFHMask))
	    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
	  if (!isNoopShuffleMask(PSHUFDMask))
	    V = DAG.getBitcast(
	        VT,
	        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
	                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

	  // At this point, each half should contain all its inputs, and we can then
	  // just shuffle them into their final position.
	  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
	         "Failed to lift all the high half inputs to the low mask!");
	  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
	         "Failed to lift all the low half inputs to the high mask!");

	  // Do a half shuffle for the low mask.
	  if (!isNoopShuffleMask(LoMask))
	    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

	  // Do a half shuffle with the high mask after shifting its values down.
	  for (int &M : HiMask)
	    if (M >= 0)
	      M -= 4;
	  if (!isNoopShuffleMask(HiMask))
	    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
	                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

	  return V;
	}

	/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
	/// blend if only one input is used.
	///
	/// Every element of \p Mask is expanded into `16 / Mask.size()` byte-control
	/// entries, producing one 16-entry control vector per input:
	///  - an undef mask element yields an undef control byte for both inputs;
	///  - an element taken from V1 (index < Mask.size()) yields its byte index in
	///    the V1 control vector and ZeroMask in the V2 control vector, and the
	///    reverse for an element taken from V2;
	///  - an element flagged in \p Zeroable yields ZeroMask for both inputs.
	///
	/// \param V1InUse set to true iff some V1 control byte is not ZeroMask.
	/// \param V2InUse set to true iff some V2 control byte is not ZeroMask.
	/// \returns the shuffled input (or the OR of both shuffled inputs when both
	/// are in use), bitcast to \p VT. When neither input is in use the
	/// unshuffled V2 is what gets bitcast and returned.
	static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
	    bool &V2InUse) {
	  // Control byte with the top bit set; used here to mark a byte that must not
	  // come from the corresponding input so the two results can be OR'ed.
	  const int ZeroMask = 0x80;

	  SDValue V1Mask[16];
	  SDValue V2Mask[16];
	  V1InUse = false;
	  V2InUse = false;

	  int Size = Mask.size();
	  int Scale = 16 / Size;
	  for (int i = 0; i < 16; ++i) {
	    // Mask element that governs byte i of the result.
	    int M = Mask[i / Scale];
	    if (M < 0) {
	      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
	      continue;
	    }

	    // Start from "zero this byte" for both inputs and only point one of them
	    // at a real source byte when the element is not known to be zero.
	    int V1Idx = ZeroMask;
	    int V2Idx = ZeroMask;
	    if (!Zeroable[i / Scale]) {
	      if (M < Size)
	        V1Idx = M * Scale + i % Scale;
	      else
	        V2Idx = (M - Size) * Scale + i % Scale;
	    }

	    V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
	    V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
	    V1InUse |= (ZeroMask != V1Idx);
	    V2InUse |= (ZeroMask != V2Idx);
	  }

	  // Only emit a PSHUFB for an input that actually contributes bytes.
	  if (V1InUse)
	    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                     DAG.getBitcast(MVT::v16i8, V1),
	                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
	  if (V2InUse)
	    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
	                     DAG.getBitcast(MVT::v16i8, V2),
	                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

	  // If we need shuffled inputs from both, blend the two.
	  SDValue V;
	  if (V1InUse && V2InUse)
	    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
	  else
	    V = V1InUse ? V1 : V2;

	  // Cast the result back to the correct type.
	  return DAG.getBitcast(VT, V);
	}

	/// \brief Generic lowering of 8-lane i16 shuffles.
	///
	/// Tries a fixed sequence of lowering strategies and returns the result of
	/// the first one that succeeds.
	///
	/// A zero/any-extend lowering is attempted first. If no mask element refers
	/// to V2 (index >= 8) the shuffle is treated as single-input: broadcast,
	/// shift, UNPCK and byte-rotate are tried before delegating to the dedicated
	/// general single-input routine.
	///
	/// Otherwise the two-input strategies are tried in this order: shift, SSE4A
	/// extract/insert, single-element insertion (only when exactly one element
	/// comes from V2), blend (SSE4.1 only), bit mask, UNPCK, byte rotate, bit
	/// blend, permute-and-unpack, a PSHUFB-based blend (SSSE3 without SSE4.1),
	/// and finally a decomposition into single-input shuffles plus a blend.
	static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  // A zext is strictly faster than any alternative, so try it before
	  // anything else.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Extended;

	  // Number of result lanes sourced from the second operand.
	  int NumV2Inputs = count_if(Mask, [](int Elt) { return Elt >= 8; });

	  if (NumV2Inputs == 0) {
	    // Single element splat?
	    if (SDValue Splat = lowerVectorShuffleAsBroadcast(
	            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
	      return Splat;

	    // Shift instructions.
	    if (SDValue Shifted = lowerVectorShuffleAsShift(
	            DL, MVT::v8i16, V1, V1, Mask, Zeroable, Subtarget, DAG))
	      return Shifted;

	    // Masks that exactly match a dedicated unpack instruction.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
	      return Unpacked;

	    // Byte rotation instructions.
	    if (SDValue Rotated = lowerVectorShuffleAsByteRotate(
	            DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG))
	      return Rotated;

	    // The general single-input routine edits its mask, so hand it a copy.
	    SmallVector<int, 8> MaskCopy(Mask.begin(), Mask.end());
	    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
	                                                     MaskCopy, Subtarget, DAG);
	  }

	  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
	         "All single-input shuffles should be canonicalized to be V1-input "
	         "shuffles.");

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(
	          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	    return Shifted;

	  // SSE4A extraction / insertion.
	  if (Subtarget.hasSSE4A()) {
	    if (SDValue Res = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
	                                                  Zeroable, DAG))
	      return Res;
	  }

	  // Some single-element blends have special lowerings.
	  if (NumV2Inputs == 1) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Inserted;
	  }

	  // Every blend-related path below must agree on this one predicate.
	  const bool IsBlendSupported = Subtarget.hasSSE41();
	  if (IsBlendSupported) {
	    if (SDValue Blended = lowerVectorShuffleAsBlend(
	            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
	      return Blended;
	  }

	  if (SDValue BitMasked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2,
	                                                      Mask, Zeroable, DAG))
	    return BitMasked;

	  // Masks that exactly match a dedicated unpack instruction.
	  if (SDValue Unpacked =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
	    return Unpacked;

	  // Byte rotation instructions.
	  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(
	          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
	    return Rotated;

	  if (SDValue BitBlended =
	          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
	    return BitBlended;

	  // Permute the inputs so that an unpack instruction finishes the job.
	  if (SDValue PermUnpack = lowerVectorShuffleAsPermuteAndUnpack(
	          DL, MVT::v8i16, V1, V2, Mask, DAG))
	    return PermUnpack;

	  // Without a direct blend, PSHUFB is the better choice when available: it
	  // shuffles and prepares the (inefficient) blend in one go.
	  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
	    // The in-use flags are required by the helper's interface but are not
	    // consulted here.
	    bool V1Used, V2Used;
	    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
	                                              Zeroable, DAG, V1Used, V2Used);
	  }

	  // Last resort: a bit-blend is always possible, so decompose into
	  // single-input permutes followed by a blend.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
	                                                    Mask, DAG);
	}

	/// \brief Check whether a compaction lowering can be done by dropping even
	/// elements and compute how many times even elements must be dropped.
	///
	/// A mask qualifies for a given N when every defined entry i equals
	/// (i << N) modulo the number of addressable source elements, i.e. the
	/// shuffle takes every 2^N-th element. Example shuffle masks:
	///
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
	/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
	/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
	/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
	/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
	/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
	///
	/// Any of these lanes can of course be undef.
	///
	/// This routine only supports N <= 3.
	/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
	/// for larger N.
	///
	/// \param Mask the shuffle mask; negative entries are treated as undef.
	/// \param IsSingleInput when true indices wrap at Mask.size(), otherwise at
	/// twice that.
	/// \returns the smallest viable N in [1, 3], or zero if there is none.
	static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
	                                          bool IsSingleInput) {
	  // Indices are taken modulo the number of elements that can be addressed,
	  // which doubles when a second input is present.
	  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
	  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
	         "We should only be called with masks with a power-of-2 size!");

	  // Power-of-two modulus, so reduction is a bitwise AND.
	  const uint64_t WrapMask = (uint64_t)ShuffleModulus - 1;

	  // Candidates N = 1, 2, 3 are tracked together because partially undef
	  // masks can be consistent with more than one stride.
	  bool ViableForN[3] = {true, true, true};

	  for (int Idx = 0, End = Mask.size(); Idx < End; ++Idx) {
	    // Undef lanes match any pattern.
	    if (Mask[Idx] < 0)
	      continue;

	    const uint64_t Actual = (uint64_t)Mask[Idx];
	    bool AnyStillViable = false;
	    for (unsigned Cand = 0; Cand != array_lengthof(ViableForN); ++Cand) {
	      if (!ViableForN[Cand])
	        continue;

	      // Candidate Cand stands for N = Cand + 1; the lane must hold
	      // (Idx * 2^N) % modulus.
	      const uint64_t N = Cand + 1;
	      const uint64_t Expected = ((uint64_t)Idx << N) & WrapMask;
	      if (Actual == Expected)
	        AnyStillViable = true;
	      else
	        ViableForN[Cand] = false;
	    }

	    // Nothing left to check once every candidate has been ruled out.
	    if (!AnyStillViable)
	      break;
	  }

	  // Report the first surviving candidate.
	  for (unsigned Cand = 0; Cand != array_lengthof(ViableForN); ++Cand) {
	    if (ViableForN[Cand])
	      return Cand + 1;
	  }

	  // No viable power of two.
	  return 0;
	}

	/// \brief Generic lowering of v16i8 shuffles.
	///
	/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
	/// detect any complexity reducing interleaving. If that doesn't help, it uses
	/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
	/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
	/// back together.
	///
	/// The strategies are tried in the order they appear in the body and the
	/// result of the first one that succeeds is returned; the final single-input
	/// fallback always produces a PACKUS node.
	static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
	assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to use a zext lowering.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// See if we can use SSE4A Extraction / Insertion.
	if (Subtarget.hasSSE4A())
	if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return V;

	// Number of result bytes sourced from V2 (mask index >= 16).
	int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

	// For single-input shuffles, there are some nicer lowering tricks we can use.
	if (NumV2Elements == 0) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Check whether we can widen this to an i16 shuffle by duplicating bytes.
	// Notably, this handles splat and partial-splat shuffles more efficiently.
	// However, it only makes sense if the pre-duplication shuffle simplifies
	// things significantly. Currently, this means we need to be able to
	// express the pre-duplication shuffle as an i16 shuffle.
	//
	// FIXME: We should check for other patterns which can be widened into an
	// i16 shuffle as well.
	//
	// The predicate below holds when, in every pair of adjacent result bytes,
	// the two mask entries are equal or at least one of them is undef.
	auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
	for (int i = 0; i < 16; i += 2)
	if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
	return false;

	return true;
	};
	// Returns the lowered node, or an empty SDValue when the widening is not
	// applicable. Note that it overwrites the captured V1 on success.
	auto tryToWidenViaDuplication = [&]() -> SDValue {
	if (!canWidenViaDuplication(Mask))
	return SDValue();
	// Sorted, de-duplicated source byte indices taken from the low half
	// (0-7) and from the high half (8-15) of the input.
	SmallVector<int, 4> LoInputs;
	copy_if(Mask, std::back_inserter(LoInputs),
	[](int M) { return M >= 0 && M < 8; });
	std::sort(LoInputs.begin(), LoInputs.end());
	LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
	LoInputs.end());
	SmallVector<int, 4> HiInputs;
	copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
	std::sort(HiInputs.begin(), HiInputs.end());
	HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
	HiInputs.end());

	// The half with at least as many distinct inputs stays where it is (ties
	// go to the low half); the other half's inputs are moved into it.
	bool TargetLo = LoInputs.size() >= HiInputs.size();
	ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
	ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

	// PreDupI16Shuffle is the i16 shuffle applied before the bytes are
	// duplicated. LaneMap records, for each source byte index, the byte index
	// it occupies after that shuffle.
	int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
	SmallDenseMap<int, int, 8> LaneMap;
	for (int I : InPlaceInputs) {
	PreDupI16Shuffle[I/2] = I/2;
	LaneMap[I] = I;
	}
	// j walks the four i16 slots of the target half looking for free ones.
	int j = TargetLo ? 0 : 4, je = j + 4;
	for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
	// Check if j is already a shuffle of this input. This happens when
	// there are two adjacent bytes after we move the low one.
	if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
	// If we haven't yet mapped the input, search for a slot into which
	// we can map it.
	while (j < je && PreDupI16Shuffle[j] >= 0)
	++j;

	if (j == je)
	// We can't place the inputs into a single half with a simple i16 shuffle, so bail.
	return SDValue();

	// Map this input with the i16 shuffle.
	PreDupI16Shuffle[j] = MovingInputs[i] / 2;
	}

	// Update the lane map based on the mapping we ended up with.
	LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
	}
	V1 = DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

	// Unpack the bytes to form the i16s that will be shuffled into place.
	V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
	MVT::v16i8, V1, V1);

	// Build the final i16 shuffle: each pair of result bytes shares one entry.
	// When the high half was the target the mapped byte positions are rebased
	// by 8 so that they index the i16 elements of the unpacked vector.
	int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0) {
	int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
	assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
	if (PostDupI16Shuffle[i / 2] < 0)
	PostDupI16Shuffle[i / 2] = MappedMask;
	else
	assert(PostDupI16Shuffle[i / 2] == MappedMask &&
	"Conflicting entries in the original shuffle!");
	}
	return DAG.getBitcast(
	MVT::v16i8,
	DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
	DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
	};
	if (SDValue V = tryToWidenViaDuplication())
	return V;
	}

	if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
	Zeroable, DAG))
	return Masked;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
	return V;

	// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
	// with PSHUFB. It is important to do this before we attempt to generate any
	// blends but after all of the single-input lowerings. If the single input
	// lowerings can find an instruction sequence that is faster than a PSHUFB, we
	// want to preserve that and we can DAG combine any longer sequences into
	// a PSHUFB in the end. But once we start blending from multiple inputs,
	// the complexity of DAG combining bad patterns back into PSHUFB is too high,
	// and there are very few patterns that would actually be faster than the
	// PSHUFB approach because of its ability to zero lanes.
	//
	// FIXME: The only exceptions to the above are blends which are exact
	// interleavings with direct instructions supporting them. We currently don't
	// handle those well here.
	if (Subtarget.hasSSSE3()) {
	bool V1InUse = false;
	bool V2InUse = false;

	// The PSHUFB form is built up front; it is what gets returned unless both
	// inputs are in use and one of the alternatives below succeeds.
	SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

	// If both V1 and V2 are in use and we can use a direct blend or an unpack,
	// do so. This avoids using them to handle blends-with-zero which is
	// important as a single pshufb is significantly faster for that.
	if (V1InUse && V2InUse) {
	if (Subtarget.hasSSE41())
	if (SDValue Blend = lowerVectorShuffleAsBlend(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return Blend;

	// We can use an unpack to do the blending rather than an or in some
	// cases. Even though the or may be (very minorly) more efficient, we
	// preference this lowering because there are common cases where part of
	// the complexity of the shuffles goes away when we do the final blend as
	// an unpack.
	// FIXME: It might be worth trying to detect if the unpack-feeding
	// shuffles will both be pshufb, in which case we shouldn't bother with
	// this.
	if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
	DL, MVT::v16i8, V1, V2, Mask, DAG))
	return Unpack;
	}

	return PSHUFB;
	}

	// There are special ways we can lower some single-element blends.
	if (NumV2Elements == 1)
	if (SDValue V = lowerVectorShuffleAsElementInsertion(
	DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return V;

	if (SDValue BitBlend =
	lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
	return BitBlend;

	// Check whether a compaction lowering can be done. This handles shuffles
	// which take every Nth element for some even N. See the helper function for
	// details.
	//
	// We special case these as they can be particularly efficiently handled with
	// the PACKUSB instruction on x86 and they show up in common patterns of
	// rearranging bytes to truncate wide elements.
	bool IsSingleInput = V2.isUndef();
	if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
	// NumEvenDrops is the power of two stride of the elements. Another way of
	// thinking about it is that we need to drop the even elements this many
	// times to get the original input.

	// First we need to zero all the dropped bytes.
	assert(NumEvenDrops <= 3 &&
	"No support for dropping even elements more than 3 times.");
	// We use the mask type to pick which bytes are preserved based on how many
	// elements are dropped.
	MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
	SDValue ByteClearMask = DAG.getBitcast(
	MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
	V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
	if (!IsSingleInput)
	V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

	// Now pack things back together.
	V1 = DAG.getBitcast(MVT::v8i16, V1);
	V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
	SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
	// Each further drop packs the previous result with itself, for a total of
	// NumEvenDrops PACKUS nodes.
	for (int i = 1; i < NumEvenDrops; ++i) {
	Result = DAG.getBitcast(MVT::v8i16, Result);
	Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
	}

	return Result;
	}

	// Handle multi-input cases by blending single-input shuffles.
	if (NumV2Elements > 0)
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
	Mask, DAG);

	// The fallback path for single-input shuffles widens this into two v8i16
	// vectors with unpacks, shuffles those, and then pulls them back together
	// with a pack.
	SDValue V = V1;

	// LoBlendMask holds the mask entries for result bytes 0-7 and HiBlendMask
	// those for result bytes 8-15; undef entries stay at -1.
	std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
	for (int i = 0; i < 16; ++i)
	if (Mask[i] >= 0)
	(i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

	SDValue VLoHalf, VHiHalf;
	// Check if any of the odd lanes in the v16i8 are used. If not, we can mask
	// them out and avoid using UNPCK{L,H} to extract the elements of V as
	// i16s.
	if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
	none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
	// Use a mask to drop the high bytes.
	VLoHalf = DAG.getBitcast(MVT::v8i16, V);
	VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
	DAG.getConstant(0x00FF, DL, MVT::v8i16));

	// This will be a single vector shuffle instead of a blend so nuke VHiHalf.
	VHiHalf = DAG.getUNDEF(MVT::v8i16);

	// Squash the masks to point directly into VLoHalf.
	for (int &M : LoBlendMask)
	if (M >= 0)
	M /= 2;
	for (int &M : HiBlendMask)
	if (M >= 0)
	M /= 2;
	} else {
	// Otherwise just unpack the low half of V into VLoHalf and the high half into
	// VHiHalf so that we can blend them as i16s.
	SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

	VLoHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
	VHiHalf = DAG.getBitcast(
	MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
	}

	// Shuffle the widened halves once per result half, then pack the two v8i16
	// results back into a single v16i8.
	SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
	SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

	return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
	}

	/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
	///
	/// Selects the type-specific lowering routine from the simple type of \p VT
	/// and forwards all remaining arguments to it. Reaching this function with
	/// any type other than the six handled below is a programming error.
	static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  switch (VT.SimpleTy) {
	  default:
	    // Not a type this dispatcher knows about; reported after the switch.
	    break;

	  // Two 64-bit lanes.
	  case MVT::v2i64:
	    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v2f64:
	    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // Four 32-bit lanes.
	  case MVT::v4i32:
	    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4f32:
	    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  // Eight 16-bit and sixteen 8-bit lanes.
	  case MVT::v8i16:
	    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i8:
	    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  }

	  llvm_unreachable("Unimplemented!");
	}

	/// \brief Generic routine to split vector shuffle into half-sized shuffles.
	///
	/// Both inputs are cut into a low and a high half. Each half of the result is
	/// then produced by shuffling/blending those four pieces, and the two
	/// half-width results are concatenated back into a \p VT value. This should
	/// work effectively with all AVX vector shuffle types.
	static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	                                          SDValue V2, ArrayRef<int> Mask,
	                                          SelectionDAG &DAG) {
	  assert(VT.getSizeInBits() >= 256 &&
	         "Only for 256-bit or wider vector shuffles!");
	  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
	  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

	  // Geometry of the full vector and of one half.
	  const int NumElts = VT.getVectorNumElements();
	  const int HalfNumElts = NumElts / 2;
	  const MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts / 2);

	  // Mask entries describing the low and the high half of the result.
	  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
	  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

	  // Produce the (low, high) halves of an input, both bitcast to HalfVT.
	  // Build-vectors are not split: two narrower build-vectors are created from
	  // their operands instead, which helps shuffling with splats and zeros.
	  auto SplitVector = [&](SDValue V) {
	    V = peekThroughBitcasts(V);

	    // After looking through bitcasts the element type/count may differ from
	    // VT, so the split is done in terms of the underlying type.
	    const MVT SrcVT = V.getSimpleValueType();
	    const int SrcNumElts = SrcVT.getVectorNumElements();
	    const int SrcHalfNumElts = SrcNumElts / 2;
	    const MVT SrcHalfVT =
	        MVT::getVectorVT(SrcVT.getVectorElementType(), SrcNumElts / 2);

	    SDValue Lo, Hi;
	    if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
	      SmallVector<SDValue, 16> LoOps, HiOps;
	      for (int Idx = 0; Idx < SrcHalfNumElts; ++Idx) {
	        LoOps.push_back(BV->getOperand(Idx));
	        HiOps.push_back(BV->getOperand(Idx + SrcHalfNumElts));
	      }
	      Lo = DAG.getBuildVector(SrcHalfVT, DL, LoOps);
	      Hi = DAG.getBuildVector(SrcHalfVT, DL, HiOps);
	    } else {
	      Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcHalfVT, V,
	                       DAG.getIntPtrConstant(0, DL));
	      Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SrcHalfVT, V,
	                       DAG.getIntPtrConstant(SrcHalfNumElts, DL));
	    }
	    return std::make_pair(DAG.getBitcast(HalfVT, Lo),
	                          DAG.getBitcast(HalfVT, Hi));
	  };

	  SDValue LoV1, HiV1, LoV2, HiV2;
	  std::tie(LoV1, HiV1) = SplitVector(V1);
	  std::tie(LoV2, HiV2) = SplitVector(V2);

	  // Build one half of the result as an (up to) 4-way blend of the input
	  // halves.
	  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
	    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
	    // V1BlendMask / V2BlendMask select within (LoVn, HiVn); BlendMask then
	    // picks, per lane, between the V1-side and V2-side intermediate results.
	    SmallVector<int, 32> V1BlendMask((unsigned)HalfNumElts, -1);
	    SmallVector<int, 32> V2BlendMask((unsigned)HalfNumElts, -1);
	    SmallVector<int, 32> BlendMask((unsigned)HalfNumElts, -1);
	    for (int Lane = 0; Lane < HalfNumElts; ++Lane) {
	      const int M = HalfMask[Lane];
	      if (M < 0)
	        continue; // Undef lane: leave all three masks at -1.

	      if (M < NumElts) {
	        // Element comes from V1.
	        if (M >= HalfNumElts)
	          UseHiV1 = true;
	        else
	          UseLoV1 = true;
	        V1BlendMask[Lane] = M;
	        BlendMask[Lane] = Lane;
	      } else {
	        // Element comes from V2.
	        if (M >= NumElts + HalfNumElts)
	          UseHiV2 = true;
	        else
	          UseLoV2 = true;
	        V2BlendMask[Lane] = M - NumElts;
	        BlendMask[Lane] = HalfNumElts + Lane;
	      }
	    }

	    // Because the lowering happens after all combining takes place, the
	    // blend masks are combined by hand here so that as few high-level vector
	    // shuffle nodes as possible get created.
	    const bool UsesV1 = UseLoV1 || UseHiV1;
	    const bool UsesV2 = UseLoV2 || UseHiV2;

	    // Cheap cases first: nothing used, or only one of the two inputs used.
	    if (!UsesV1 && !UsesV2)
	      return DAG.getUNDEF(HalfVT);
	    if (!UsesV2)
	      return DAG.getVectorShuffle(HalfVT, DL, LoV1, HiV1, V1BlendMask);
	    if (!UsesV1)
	      return DAG.getVectorShuffle(HalfVT, DL, LoV2, HiV2, V2BlendMask);

	    SDValue V1Blend, V2Blend;
	    if (UseLoV1 && UseHiV1) {
	      V1Blend = DAG.getVectorShuffle(HalfVT, DL, LoV1, HiV1, V1BlendMask);
	    } else {
	      // Only one half of V1 is referenced: use it directly and fold the V1
	      // selection into the final blend mask.
	      V1Blend = UseLoV1 ? LoV1 : HiV1;
	      for (int Lane = 0; Lane < HalfNumElts; ++Lane) {
	        if (BlendMask[Lane] >= 0 && BlendMask[Lane] < HalfNumElts)
	          BlendMask[Lane] = V1BlendMask[Lane] - (UseLoV1 ? 0 : HalfNumElts);
	      }
	    }
	    if (UseLoV2 && UseHiV2) {
	      V2Blend = DAG.getVectorShuffle(HalfVT, DL, LoV2, HiV2, V2BlendMask);
	    } else {
	      // Only one half of V2 is referenced: use it directly and fold the V2
	      // selection into the final blend mask.
	      V2Blend = UseLoV2 ? LoV2 : HiV2;
	      for (int Lane = 0; Lane < HalfNumElts; ++Lane) {
	        if (BlendMask[Lane] >= HalfNumElts)
	          BlendMask[Lane] = V2BlendMask[Lane] + (UseLoV2 ? HalfNumElts : 0);
	      }
	    }
	    return DAG.getVectorShuffle(HalfVT, DL, V1Blend, V2Blend, BlendMask);
	  };

	  SDValue Lo = HalfBlend(LoMask);
	  SDValue Hi = HalfBlend(HiMask);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
	}

	/// \brief Either split a vector in halves or decompose the shuffles and the
	/// blend.
	///
	/// This is provided as a good fallback for many lowerings of non-single-input
	/// shuffles with more than one 128-bit lane. It chooses between splitting the
	/// shuffle into 128-bit components that are stitched back together, and
	/// extracting single-input shuffles whose results are blended:
	///  - if each input contributes at most one distinct element, decompose;
	///  - else if each input contributes elements from at most one 128-bit lane,
	///    split;
	///  - otherwise decompose.
	static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
	                                                SDValue V1, SDValue V2,
	                                                ArrayRef<int> Mask,
	                                                SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
	                          "shuffles as it could then recurse on itself.");
	  const int Size = Mask.size();

	  // A shuffle that is really "broadcast one element of each input, then
	  // blend" is best lowered that way, particularly because broadcasts can
	  // often fold with memory operands.
	  auto IsBlendOfTwoBroadcasts = [&]() -> bool {
	    // Element index seen so far for V1 ([0]) and V2 ([1]); -1 means none yet.
	    int SeenIdx[2] = {-1, -1};
	    for (int M : Mask) {
	      if (M < 0)
	        continue;
	      const bool FromV2 = M >= Size;
	      const int Idx = FromV2 ? M - Size : M;
	      int &Seen = SeenIdx[FromV2 ? 1 : 0];
	      if (Seen < 0)
	        Seen = Idx;
	      else if (Seen != Idx)
	        return false;
	    }
	    return true;
	  };
	  if (IsBlendOfTwoBroadcasts())
	    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
	                                                      DAG);

	  // When everything taken from an input lives in a single 128-bit lane of
	  // that input, splitting decomposes to unusually few instructions, so
	  // prefer it over blending.
	  const int LaneCount = VT.getSizeInBits() / 128;
	  const int LaneSize = Size / LaneCount;
	  // One bit per 128-bit lane of each input, set when the lane is referenced.
	  SmallBitVector LaneInputs[2];
	  LaneInputs[0].resize(LaneCount, false);
	  LaneInputs[1].resize(LaneCount, false);
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    LaneInputs[M / Size][(M % Size) / LaneSize] = true;
	  }
	  const bool V1InOneLane = LaneInputs[0].count() <= 1;
	  const bool V2InOneLane = LaneInputs[1].count() <= 1;
	  if (V1InOneLane && V2InOneLane)
	    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

	  // Fall back to decomposed shuffles and a blend. This requires that the
	  // decomposed single-input shuffles don't end up back in this routine.
	  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
	/// a permutation and blend of those lanes.
	///
	/// This essentially blends the out-of-lane inputs to each lane into the lane
	/// from a permuted copy of the vector. This lowering strategy results in four
	/// instructions in the worst case for a single-input cross lane shuffle which
	/// is lower than any other fully general cross-lane shuffle strategy I'm aware
	/// of. Special cases for each particular shuffle pattern should be handled
	/// prior to trying this lowering.
	///
	/// If elements cross out of at most one of the two lanes, the shuffle is
	/// handed to splitAndLowerVectorShuffle instead. The lane-flip path that
	/// follows only supports single-input shuffles (V2 must be undef).
	static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
	                                                       SDValue V1, SDValue V2,
	                                                       ArrayRef<int> Mask,
	                                                       SelectionDAG &DAG) {
	  // FIXME: This should probably be generalized for 512-bit vectors as well.
	  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
	  int Size = Mask.size();
	  int LaneSize = Size / 2;

	  // If there are only inputs from one 128-bit lane, splitting will in fact be
	  // less expensive. The flags track whether the given lane contains an element
	  // that crosses to another lane.
	  bool LaneCrossing[2] = {false, false};
	  for (int i = 0; i < Size; ++i) {
	    if (Mask[i] < 0)
	      continue;
	    int SrcLane = (Mask[i] % Size) / LaneSize;
	    if (SrcLane != i / LaneSize)
	      LaneCrossing[SrcLane] = true;
	  }
	  if (!LaneCrossing[0] || !LaneCrossing[1])
	    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

	  assert(V2.isUndef() &&
	         "This last part of this routine only works on single input shuffles");

	  // Build a two-input mask over (V1, Flipped): in-lane elements keep their
	  // index into V1, cross-lane elements are redirected to the same in-lane
	  // offset within the destination lane of the second operand.
	  SmallVector<int, 32> FlippedBlendMask(Size);
	  for (int i = 0; i < Size; ++i) {
	    if (Mask[i] < 0) {
	      FlippedBlendMask[i] = -1;
	      continue;
	    }
	    int DstLane = i / LaneSize;
	    bool InLane = (Mask[i] % Size) / LaneSize == DstLane;
	    FlippedBlendMask[i] =
	        InLane ? Mask[i] : Mask[i] % LaneSize + DstLane * LaneSize + Size;
	  }

	  // Flip the vector, and blend the results which should now be in-lane. The
	  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
	  // 5 for the high source. The value 3 selects the high half of source 2 and
	  // the value 2 selects the low half of source 2. We only use source 2 to
	  // allow folding it into a memory operand.
	  unsigned PERMMask = 3 | (2 << 4);
	  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
	                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
	  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
	}

	/// \brief Handle lowering 2-lane 128-bit shuffles.
	///
	/// Strategies are attempted in this order: a blend, a 128-bit subvector
	/// insert (only when neither input is an all-zeros build vector), and finally
	/// a VPERM2X128 node whose immediate can also zero a destination half.
	///
	/// \returns the lowered value, or an empty SDValue when the mask cannot be
	/// widened to 128-bit elements or when the AVX2 single-input insert pattern is
	/// deliberately left to another lowering.
	static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
	                                        SDValue V2, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // Everything below reasons about whole 128-bit halves, so the element mask
	  // must be expressible as a two-entry mask of half selections.
	  SmallVector<int, 4> HalfSel;
	  if (!canWidenShuffleElements(Mask, HalfSel))
	    return SDValue();

	  // TODO: If minimizing size and one of the inputs is a zero vector and the
	  // zero vector has only one use, we could use a VPERM2X128 to save the
	  // instruction bytes needed to explicitly generate the zero vector.

	  // Blends are faster and handle all the non-lane-crossing cases.
	  SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable,
	                                            Subtarget, DAG);
	  if (Blend)
	    return Blend;

	  const bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
	  const bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

	  // A zero input is better served by VPERM2X128 (its immediate provides an
	  // implicit zero), so only look for subvector inserts when no input is zero.
	  if (!(V1IsZero || V2IsZero)) {
	    // Patterns that a single insert of a 128-bit subvector can produce.
	    const bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
	    const bool IsInsert128 =
	        OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5});
	    if (IsInsert128) {
	      // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
	      if (Subtarget.hasAVX2() && V2.isUndef())
	        return SDValue();

	      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
	      // this will likely become vinsertf128 which can't fold a 256-bit memop.
	      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
	        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                     VT.getVectorNumElements() / 2);
	        auto ExtractLow128 = [&](SDValue Src) {
	          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Src,
	                             DAG.getIntPtrConstant(0, DL));
	        };
	        SDValue LoV = ExtractLow128(V1);
	        SDValue HiV = ExtractLow128(OnlyUsesV1 ? V1 : V2);
	        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	      }
	    }
	  }

	  // Otherwise form a 128-bit permutation. The immediate control byte of a
	  // vperm2*128 instruction is laid out as:
	  //   [1:0] - select 128 bits from sources for low half of destination
	  //   [2]   - ignore
	  //   [3]   - zero low half of destination
	  //   [5:4] - select 128 bits from sources for high half of destination
	  //   [6]   - ignore
	  //   [7]   - zero high half of destination

	  // Undef (negative) selections are treated as selection 0.
	  const int LoSel = HalfSel[0] >= 0 ? HalfSel[0] : 0;
	  const int HiSel = HalfSel[1] >= 0 ? HalfSel[1] : 0;

	  // Selections 0/1 name halves of V1 and 2/3 name halves of V2. A selection
	  // that names a zero input is replaced by the "zero this half" bit.
	  auto SelectsZeroInput = [&](int Sel) {
	    return Sel < 2 ? V1IsZero : V2IsZero;
	  };

	  unsigned PermMask = 0;
	  PermMask |= SelectsZeroInput(LoSel) ? 0x08u : unsigned(LoSel);
	  PermMask |= SelectsZeroInput(HiSel) ? 0x80u : unsigned(HiSel) << 4;

	  // The zero inputs are no longer referenced; replace them with undef.
	  if (V1IsZero)
	    V1 = DAG.getUNDEF(VT);
	  if (V2IsZero)
	    V2 = DAG.getUNDEF(VT);

	  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
	                     DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
	/// shuffling each lane.
	///
	/// This succeeds only when every destination 128-bit lane takes all of its
	/// defined elements from a single source lane and the lane-relative element
	/// pattern is the same in every lane. In that case a 64-bit-element shuffle
	/// moves the source lanes into place and a second, non-lane-crossing,
	/// single-input shuffle arranges the elements inside each lane.
	///
	/// FIXME: It might be worthwhile at some point to support this without
	/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
	/// in x86 only floating point has interesting non-repeating shuffles, and even
	/// those are still marginally more expensive.
	///
	/// \returns the lowered shuffle, or an empty SDValue if the mask does not fit.
	static SDValue lowerVectorShuffleByMerging128BitLanes(
	    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

	  const int Size = Mask.size();
	  const int LaneSize = 128 / VT.getScalarSizeInBits();
	  const int NumLanes = Size / LaneSize;
	  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

	  // Per destination lane: the source lane feeding it (-1 while unknown).
	  SmallVector<int, 4> SrcLanes((unsigned)NumLanes, -1);
	  // Per in-lane position: the in-lane source element shared by all lanes.
	  SmallVector<int, 4> SharedInLane((unsigned)LaneSize, -1);

	  for (int Idx = 0; Idx != Size; ++Idx) {
	    const int M = Mask[Idx];
	    if (M < 0)
	      continue;

	    // All defined elements of a destination lane must agree on the source
	    // lane they come from.
	    int &SrcLane = SrcLanes[Idx / LaneSize];
	    const int WantedLane = M / LaneSize;
	    if (SrcLane >= 0 && SrcLane != WantedLane)
	      return SDValue();
	    SrcLane = WantedLane;

	    // The in-lane pattern must repeat identically across lanes.
	    int &InLaneElt = SharedInLane[Idx % LaneSize];
	    const int WantedElt = M % LaneSize;
	    if (InLaneElt >= 0 && InLaneElt != WantedElt)
	      return SDValue();
	    InLaneElt = WantedElt;
	  }

	  // Step 1: move whole lanes into place, working on 64-bit elements so each
	  // 128-bit lane is a pair of mask entries.
	  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
	                                VT.getSizeInBits() / 64);
	  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
	  for (int Lane = 0; Lane != NumLanes; ++Lane) {
	    const int Src = SrcLanes[Lane];
	    if (Src < 0)
	      continue;
	    LaneMask[2 * Lane] = 2 * Src;
	    LaneMask[2 * Lane + 1] = 2 * Src + 1;
	  }

	  V1 = DAG.getBitcast(LaneVT, V1);
	  V2 = DAG.getBitcast(LaneVT, V2);
	  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

	  // Back to the element type the caller asked for.
	  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

	  // Step 2: a single-input shuffle that stays within each lane.
	  SmallVector<int, 8> NewMask((unsigned)Size, -1);
	  for (int Idx = 0; Idx != Size; ++Idx) {
	    if (Mask[Idx] < 0)
	      continue;
	    const int LaneBase = Idx - Idx % LaneSize;
	    NewMask[Idx] = LaneBase + Mask[Idx] % LaneSize;
	  }
	  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
	         "Must not introduce lane crosses at this point!");

	  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
	}

	/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
	/// This allows for fast cases such as subvector extraction/insertion
	/// or shuffling smaller vector types which can lower more efficiently.
	///
	/// \returns the lowered value, or an empty SDValue when neither half of the
	/// mask is undef, when more than two input halves are referenced, or when one
	/// of the profitability checks below rejects the half-width form.
	static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
	                                               SDValue V1, SDValue V2,
	                                               ArrayRef<int> Mask,
	                                               const X86Subtarget &Subtarget,
	                                               SelectionDAG &DAG) {
	  assert(VT.is256BitVector() && "Expected 256-bit vector");

	  const unsigned NumElts = VT.getVectorNumElements();
	  const unsigned HalfNumElts = NumElts / 2;
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);

	  const bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
	  const bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
	  if (!(UndefLower || UndefUpper))
	    return SDValue();

	  // Extract the half of Src that starts at element EltIdx.
	  auto ExtractHalf = [&](SDValue Src, unsigned EltIdx) {
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, Src,
	                       DAG.getIntPtrConstant(EltIdx, DL));
	  };
	  // Place Half at element EltIdx of an otherwise undef full-width vector.
	  auto InsertHalf = [&](SDValue Half, unsigned EltIdx) {
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Half,
	                       DAG.getIntPtrConstant(EltIdx, DL));
	  };

	  // Upper half is undef and lower half is whole upper subvector.
	  // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
	  if (UndefUpper &&
	      isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts))
	    return InsertHalf(ExtractHalf(V1, HalfNumElts), 0);

	  // Lower half is undef and upper half is whole lower subvector.
	  // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
	  if (UndefLower &&
	      isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0))
	    return InsertHalf(ExtractHalf(V1, 0), HalfNumElts);

	  // If the shuffle only uses two of the four halves of the input operands,
	  // then extract them and perform the 'half' shuffle at half width.
	  // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
	  int HalfIdx1 = -1, HalfIdx2 = -1;
	  SmallVector<int, 8> HalfMask(HalfNumElts);
	  const unsigned Offset = UndefLower ? HalfNumElts : 0;
	  for (unsigned Pos = 0; Pos != HalfNumElts; ++Pos) {
	    const int M = Mask[Pos + Offset];
	    if (M < 0) {
	      // Sentinels are carried over unchanged.
	      HalfMask[Pos] = M;
	      continue;
	    }

	    // Which of the 4 half vectors this element is from:
	    // 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
	    const int HalfIdx = M / HalfNumElts;
	    // Element index inside that half vector.
	    const int HalfElt = M % HalfNumElts;

	    // Up to two distinct halves may be referenced; the first one seen becomes
	    // operand 1 of the half shuffle and the second becomes operand 2.
	    if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
	      HalfIdx1 = HalfIdx;
	      HalfMask[Pos] = HalfElt;
	    } else if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
	      HalfIdx2 = HalfIdx;
	      HalfMask[Pos] = HalfElt + HalfNumElts;
	    } else {
	      // Too many half vectors referenced.
	      return SDValue();
	    }
	  }
	  assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");

	  // Only shuffle the halves of the inputs when useful.
	  auto IsLowerHalf = [](int H) { return H == 0 || H == 2; };
	  auto IsUpperHalf = [](int H) { return H == 1 || H == 3; };
	  const int NumLowerHalves = IsLowerHalf(HalfIdx1) + IsLowerHalf(HalfIdx2);
	  const int NumUpperHalves = IsUpperHalf(HalfIdx1) + IsUpperHalf(HalfIdx2);

	  // uuuuXXXX - don't extract uppers just to insert again.
	  if (UndefLower && NumUpperHalves != 0)
	    return SDValue();

	  // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
	  if (UndefUpper && NumUpperHalves == 2)
	    return SDValue();

	  // AVX2 - XXXXuuuu - always extract lowers.
	  if (Subtarget.hasAVX2() && (!UndefUpper || NumUpperHalves != 0)) {
	    // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
	    if (VT == MVT::v4f64 || VT == MVT::v4i64)
	      return SDValue();
	    // AVX2 supports variable 32-bit element cross-lane shuffles.
	    // XXXXuuuu - don't extract lowers and uppers.
	    if ((VT == MVT::v8f32 || VT == MVT::v8i32) && UndefUpper &&
	        NumLowerHalves != 0 && NumUpperHalves != 0)
	      return SDValue();
	  }

	  // Materialise one of the (up to two) referenced halves; an unused operand
	  // slot becomes undef.
	  auto GetHalfVector = [&](int HalfIdx) {
	    if (HalfIdx < 0)
	      return DAG.getUNDEF(HalfVT);
	    return ExtractHalf(HalfIdx < 2 ? V1 : V2, (HalfIdx % 2) * HalfNumElts);
	  };

	  SDValue Half1 = GetHalfVector(HalfIdx1);
	  SDValue Half2 = GetHalfVector(HalfIdx2);
	  SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
	  return InsertHalf(V, Offset);
	}

	/// \brief Test whether the specified input (0 or 1) is in-place blended by the
	/// given mask.
	///
	/// An input is "in place" when every mask element that reads from it reads the
	/// element at its own position, so that input needs no permutation. Negative
	/// (sentinel) mask entries are ignored.
	///
	/// \param Input which shuffle operand to test: 0 or 1.
	/// \param Mask  the shuffle mask; indices >= Mask.size() refer to input 1.
	/// \returns true if no element of \p Input has to move.
	static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
	  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
	  const int NumElts = Mask.size();
	  for (int Slot = 0; Slot != NumElts; ++Slot) {
	    const int M = Mask[Slot];
	    if (M < 0)
	      continue; // Sentinel entry, places no constraint.
	    if (M / NumElts != Input)
	      continue; // Reads from the other input.
	    if (M % NumElts != Slot)
	      return false; // Element of Input would have to move.
	  }
	  return true;
	}

	/// Handle case where shuffle sources are coming from the same 128-bit lane and
	/// every lane can be represented as the same repeating mask - allowing us to
	/// shuffle the sources with the repeating shuffle and then permute the result
	/// to the destination lanes.
	///
	/// The result is always expressed as two generic vector shuffles: a first
	/// shuffle of V1/V2 followed by a single-input shuffle of that result. On AVX2
	/// a broadcast form is tried first (shuffle the wanted elements into the
	/// lowest positions, then repeat them across the vector).
	///
	/// \returns the lowered value, or an empty SDValue when the mask does not
	/// cross 128-bit lanes, is already a repeated lane mask, or does not match
	/// the pattern.
	static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
	const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
	const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	int NumElts = VT.getVectorNumElements();
	int NumLanes = VT.getSizeInBits() / 128;
	int NumLaneElts = NumElts / NumLanes;

	// On AVX2 we may be able to just shuffle the lowest elements and then
	// broadcast the result.
	if (Subtarget.hasAVX2()) {
	for (unsigned BroadcastSize : {16, 32, 64}) {
	// Only broadcast widths strictly wider than one element are considered.
	if (BroadcastSize <= VT.getScalarSizeInBits())
	continue;
	int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();

	// Attempt to match a repeating pattern every NumBroadcastElts,
	// accounting for UNDEFs but only references the lowest 128-bit
	// lane of the inputs.
	auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j) {
	int M = Mask[i + j];
	if (M < 0)
	continue;
	int &R = RepeatMask[j];
	// Reject any element that is not in lane 0 of its input.
	if (0 != ((M % NumElts) / NumLaneElts))
	return false;
	// Reject a conflict with what an earlier group put in this slot.
	if (0 <= R && R != M)
	return false;
	R = M;
	}
	return true;
	};

	// Only the first NumBroadcastElts entries are ever written; the rest stay
	// undef (-1).
	SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
	if (!FindRepeatingBroadcastMask(RepeatMask))
	continue;

	// Shuffle the (lowest) repeated elements in place for broadcast.
	SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);

	// Shuffle the actual broadcast.
	SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumBroadcastElts)
	for (int j = 0; j != NumBroadcastElts; ++j)
	BroadcastMask[i + j] = j;
	return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
	BroadcastMask);
	}
	}

	// Bail if the shuffle mask doesn't cross 128-bit lanes.
	if (!is128BitLaneCrossingShuffleMask(VT, Mask))
	return SDValue();

	// Bail if we already have a repeated lane shuffle mask.
	SmallVector<int, 8> RepeatedShuffleMask;
	if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
	return SDValue();

	// On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
	// (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
	int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
	int NumSubLanes = NumLanes * SubLaneScale;
	int NumSubLaneElts = NumLaneElts / SubLaneScale;

	// Check that all the sources are coming from the same lane and see if we can
	// form a repeating shuffle mask (local to each sub-lane). At the same time,
	// determine the source sub-lane for each destination sub-lane.
	int TopSrcSubLane = -1;
	SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
	// One candidate repeated mask per sub-lane position within a lane; only the
	// first SubLaneScale entries are consulted.
	SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
	SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};

	for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
	// Extract the sub-lane mask, check that it all comes from the same lane
	// and normalize the mask entries to come from the first lane.
	int SrcLane = -1;
	SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
	if (M < 0)
	continue;
	int Lane = (M % NumElts) / NumLaneElts;
	if ((0 <= SrcLane) && (SrcLane != Lane))
	return SDValue();
	SrcLane = Lane;
	// Rebase onto lane 0, keeping the NumElts offset that marks elements
	// taken from the second input.
	int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
	SubLaneMask[Elt] = LocalM;
	}

	// Whole sub-lane is UNDEF.
	if (SrcLane < 0)
	continue;

	// Attempt to match against the candidate repeated sub-lane masks.
	for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
	// Two masks are compatible when every position defined in both agrees.
	auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
	for (int i = 0; i != NumSubLaneElts; ++i) {
	if (M1[i] < 0 \|\| M2[i] < 0)
	continue;
	if (M1[i] != M2[i])
	return false;
	}
	return true;
	};

	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
	if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
	continue;

	// Merge the sub-lane mask into the matching repeated sub-lane mask.
	for (int i = 0; i != NumSubLaneElts; ++i) {
	int M = SubLaneMask[i];
	if (M < 0)
	continue;
	assert((RepeatedSubLaneMask[i] < 0 \|\| RepeatedSubLaneMask[i] == M) &&
	"Unexpected mask element");
	RepeatedSubLaneMask[i] = M;
	}

	// Track the top most source sub-lane - by setting the remaining to UNDEF
	// we can greatly simplify shuffle matching.
	// The source sub-lane index is derived from the candidate mask that
	// matched (SubLane), combined with the source lane.
	int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
	TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
	Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
	break;
	}

	// Bail if we failed to find a matching repeated sub-lane mask.
	if (Dst2SrcSubLanes[DstSubLane] < 0)
	return SDValue();
	}
	assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
	"Unexpected source lane");

	// Create a repeating shuffle mask for the entire vector.
	// Sub-lanes above TopSrcSubLane are left undef.
	SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
	for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
	int Lane = SubLane / SubLaneScale;
	auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
	for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
	int M = RepeatedSubLaneMask[Elt];
	if (M < 0)
	continue;
	int Idx = (SubLane * NumSubLaneElts) + Elt;
	// Undo the lane-0 normalization for the lane this sub-lane lives in.
	RepeatedMask[Idx] = M + (Lane * NumLaneElts);
	}
	}
	SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);

	// Shuffle each source sub-lane to its destination.
	SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
	for (int i = 0; i != NumElts; i += NumSubLaneElts) {
	int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
	if (SrcSubLane < 0)
	continue;
	for (int j = 0; j != NumSubLaneElts; ++j)
	SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
	}

	return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
	SubLaneMask);
	}

	/// Test whether \p Mask can be implemented by a (V)SHUFPD-style shuffle of
	/// 64-bit elements and compute its immediate.
	///
	/// Each result element i must come from the element pair starting at the even
	/// index below i, taken from V1 for even i and from V2 for odd i, or the
	/// same with the two inputs exchanged.
	///
	/// \param VT         vector type; 2, 4 or 8 elements of 64 bits each.
	/// \param V1,V2      shuffle inputs; swapped in place when only the commuted
	///                   form matches.
	/// \param ShuffleImm receives bit i = low bit of Mask[i] for every defined
	///                   element. It is written even when the function returns
	///                   false.
	/// \param Mask       the shuffle mask.
	/// \returns true if the mask (possibly after commuting) matches.
	static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
	                                         unsigned &ShuffleImm,
	                                         ArrayRef<int> Mask) {
	  int NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarSizeInBits() == 64 &&
	         (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
	         "Unexpected data type for VSHUFPD");

	  // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
	  // Mask for V4F64; 0/1, 4/5, 2/3, 6/7..
	  ShuffleImm = 0;
	  bool MatchesDirect = true;
	  bool MatchesCommuted = true;
	  for (int Idx = 0; Idx != NumElts; ++Idx) {
	    const int M = Mask[Idx];
	    if (M == SM_SentinelUndef)
	      continue;
	    // Any other sentinel cannot be expressed.
	    if (M < 0)
	      return false;

	    // First index of the two-element window this slot may read from, for the
	    // operands as given and for the operands exchanged.
	    const int Parity = Idx & 1;
	    const int DirectLo = (Idx & 6) + NumElts * Parity;
	    const int CommutedLo = (Idx & 0xe) + NumElts * (Parity ^ 1);

	    if (!(DirectLo <= M && M <= DirectLo + 1))
	      MatchesDirect = false;
	    if (!(CommutedLo <= M && M <= CommutedLo + 1))
	      MatchesCommuted = false;

	    ShuffleImm |= (M % 2) << Idx;
	  }

	  if (MatchesDirect)
	    return true;
	  if (!MatchesCommuted)
	    return false;

	  std::swap(V1, V2);
	  return true;
	}

	/// Lower a v2f64/v4f64/v8f64 shuffle to a single X86ISD::SHUFP node when the
	/// mask matches, commuting the inputs if that is what makes it match.
	///
	/// \returns the SHUFP node, or an empty SDValue if the mask does not match.
	static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
	                                            ArrayRef<int> Mask, SDValue V1,
	                                            SDValue V2, SelectionDAG &DAG) {
	  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
	         "Unexpected data type for VSHUFPD");

	  // The matcher may swap V1 and V2 (our local copies) and fills in Imm8.
	  unsigned Imm8 = 0;
	  if (matchVectorShuffleWithSHUFPD(VT, V1, V2, Imm8, Mask)) {
	    SDValue ImmOp = DAG.getConstant(Imm8, DL, MVT::i8);
	    return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, ImmOp);
	  }

	  return SDValue();
	}

	/// Lower a shuffle to a variable permute whose control vector is the shuffle
	/// mask materialised as a constant vector of same-width integers.
	///
	/// Emits X86ISD::VPERMV (operands: mask, V1) when V2 is undef, otherwise
	/// X86ISD::VPERMV3 (operands: V1, mask, V2).
	static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
	                                           ArrayRef<int> Mask, SDValue V1,
	                                           SDValue V2, SelectionDAG &DAG) {
	  // Integer vector type with the same element width and count as VT.
	  MVT IdxVecVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
	                                  VT.getVectorNumElements());
	  SDValue IdxVec = getConstVector(Mask, IdxVecVT, DAG, DL, true);

	  if (!V2.isUndef())
	    return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, IdxVec, V2);

	  return DAG.getNode(X86ISD::VPERMV, DL, VT, IdxVec, V1);
	}

	/// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the attempts is significant.
	/// Both inputs must already have type v4f64 and \p Mask must have 4 entries.
	static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

	// Try the whole-128-bit-lane lowering first.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	// Single-input shuffles: every path inside this block returns.
	if (V2.isUndef()) {
	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Broadcast;

	// Use low duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);

	if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
	// Non-half-crossing single input shuffles can be lowered with an
	// interleaved permutation.
	// Bit i of the immediate is set when result element i takes the odd
	// (second) element of its own 128-bit lane.
	unsigned VPERMILPMask = (Mask[0] == 1) \| ((Mask[1] == 1) << 1) \|
	((Mask[2] == 3) << 2) \| ((Mask[3] == 3) << 3);
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
	DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	}

	// With AVX2 we have direct support for this permutation.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
	DAG);
	}

	// From here on V2 is a real second input.

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check if the blend happens to exactly fit that of SHUFPD.
	if (SDValue Op =
	lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
	return Op;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, if we have AVX2 and either input is already in place,
	// we will be able to shuffle the other input, even across lanes, in a single
	// instruction, so skip this pattern.
	if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) \|\|
	isShuffleMaskInputInPlace(1, Mask))))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v4 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 4-lane 64-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v4i64 shuffling.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the attempts is significant.
	/// Both inputs must already have type v4i64 and \p Mask must have 4 entries.
	static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
	assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");

	// Try the whole-128-bit-lane lowering first.
	if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// Single-input shuffles: every path inside this block returns.
	if (V2.isUndef()) {
	// When the shuffle is mirrored between the 128-bit lanes of the unit, we
	// can use lower latency instructions that will operate on both lanes.
	SmallVector<int, 2> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
	// Scale the 64-bit element mask by 2 so it indexes 32-bit elements, then
	// perform the shuffle as a PSHUFD on the v8i32 view of the input.
	SmallVector<int, 4> PSHUFDMask;
	scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
	return DAG.getBitcast(
	MVT::v4i64,
	DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
	DAG.getBitcast(MVT::v8i32, V1),
	getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
	}

	// AVX2 provides a direct instruction for permuting a single input across
	// lanes.
	return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
	}

	// From here on V2 is a real second input.

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or VEXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use PALIGNR.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
	return V;

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle. However, since we have AVX2, if either input is already in place
	// we will be able to shuffle the other input, even across lanes, in a single
	// instruction, so skip this pattern.
	if (!isShuffleMaskInputInPlace(0, Mask) &&
	!isShuffleMaskInputInPlace(1, Mask))
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
	Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
	///
	/// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
	/// isn't available.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the attempts is significant.
	/// Both inputs must already have type v8f32 and \p Mask must have 8 entries.
	static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane, we have many more
	// options to efficiently lower the shuffle.
	// Every path inside this block returns.
	SmallVector<int, 4> RepeatedMask;
	if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
	assert(RepeatedMask.size() == 4 &&
	"Repeated masks must be half the mask width!");

	// Use even/odd duplicate instructions for masks that match their pattern.
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
	return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
	if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
	return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);

	// Single input: one immediate-controlled in-lane permute is enough.
	if (V2.isUndef())
	return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
	return V;

	// Otherwise, fall back to a SHUFPS sequence. Here it is important that we
	// have already handled any direct blends.
	return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
	}

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If we have a single input shuffle with different shuffle patterns in the
	// two 128-bit lanes use the variable mask to VPERMILPS.
	// Every path inside this block returns.
	if (V2.isUndef()) {
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
	return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);

	// Lane-crossing single-input shuffle: use a variable permute on AVX2.
	if (Subtarget.hasAVX2())
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);

	// Otherwise, fall back.
	return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
	DAG);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
	return Result;
	// If we have VLX support, we can use VEXPAND.
	if (Subtarget.hasVLX())
	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code using vpunpcklwd and
	// vpunpckhwd instrs than vblend.
	if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
	if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG))
	return V;

	// If we have AVX2 then we always want to lower with a blend because at v8 we
	// can fully permute the elements.
	if (Subtarget.hasAVX2())
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
	Mask, DAG);

	// Otherwise fall back on generic lowering.
	return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
	}

	/// \brief Handle lowering of 8-lane 32-bit integer shuffles.
	///
	/// This routine is only called when we have AVX2 and thus a reasonable
	/// instruction set for v8i32 shuffling.
	///
	/// The lowering strategies below are tried in order and the first one that
	/// produces a value wins, so the order of the attempts is significant.
	/// Both inputs must already have type v8i32 and \p Mask must have 8 entries.
	static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	const APInt &Zeroable,
	SDValue V1, SDValue V2,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
	assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
	assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");

	// Whenever we can lower this as a zext, that instruction is strictly faster
	// than any alternative. It also allows us to fold memory operands into the
	// shuffle in many cases.
	if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
	DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
	return ZExt;

	// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
	// since after split we get a more efficient code than vblend by using
	// vpunpcklwd and vpunpckhwd instrs.
	if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
	!Subtarget.hasAVX512())
	if (SDValue V =
	lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
	return V;

	if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Blend;

	// Check for being able to broadcast a single element.
	if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
	Mask, Subtarget, DAG))
	return Broadcast;

	// If the shuffle mask is repeated in each 128-bit lane we can use more
	// efficient instructions that mirror the shuffles across the two 128-bit
	// lanes.
	// The result is remembered because the SHUFPS attempt further down needs it
	// again.
	SmallVector<int, 4> RepeatedMask;
	bool Is128BitLaneRepeatedShuffle =
	is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
	if (Is128BitLaneRepeatedShuffle) {
	assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
	if (V2.isUndef())
	return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
	getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));

	// Use dedicated unpack instructions for masks that match their pattern.
	if (SDValue V =
	lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
	return V;
	}

	// Try to use shift instructions.
	if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
	Zeroable, Subtarget, DAG))
	return Shift;

	// If we have VLX support, we can use VALIGN or EXPAND.
	if (Subtarget.hasVLX()) {
	if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
	Mask, Subtarget, DAG))
	return Rotate;

	if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
	V1, V2, DAG, Subtarget))
	return V;
	}

	// Try to use byte rotation instructions.
	if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Rotate;

	// Try to create an in-lane repeating shuffle mask and then shuffle the
	// results into the target lanes.
	if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return V;

	// If the shuffle patterns aren't repeated but it is a single input, directly
	// generate a cross-lane VPERMD instruction.
	if (V2.isUndef()) {
	SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
	return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
	}

	// Assume that a single SHUFPS is faster than an alternative sequence of
	// multiple instructions (even if the CPU has a domain penalty).
	// If some CPU is harmed by the domain switch, we can fix it in a later pass.
	if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
	// SHUFPS is a floating point shuffle, so operate on v8f32 views of the
	// inputs and cast the result back.
	SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
	SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
	SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
	CastV1, CastV2, DAG);
	return DAG.getBitcast(MVT::v8i32, ShufPS);
	}

	// Try to simplify this by merging 128-bit lanes to enable a lane-based
	// shuffle.
	if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
	DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
	return Result;

	// Otherwise fall back on generic blend lowering.
	return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
	Mask, DAG);
	}

	/// \brief Lower a shuffle of sixteen 16-bit integer lanes (v16i16).
	///
	/// The routine asserts that AVX2 is available. It walks a fixed, ordered list
	/// of lowering strategies and returns the result of the first one that
	/// produces a node; the final fallback always produces a value.
	static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");

	  const MVT VT = MVT::v16i16;

	  // A zero/any-extension is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so it is tried first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // Broadcast of a single element.
	  if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG)) {
	    return Splat;
	  }

	  // Plain blend of the two inputs.
	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Byte rotation instructions.
	  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                       Subtarget, DAG)) {
	    return Rotated;
	  }

	  // Build an in-lane repeating shuffle and then permute its results into the
	  // target lanes.
	  if (SDValue LanePermuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return LanePermuted;
	  }

	  if (V2.isUndef()) {
	    // i16 elements have no generalized cross-lane shuffle operation.
	    if (is128BitLaneCrossingShuffleMask(VT, Mask)) {
	      return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask,
	                                                     DAG);
	    }

	    // For a single input whose mask repeats per 128-bit lane, the repeated
	    // mask is handed to the v8i16 single-input lowering, which is invoked
	    // here with the v16i16 type.
	    SmallVector<int, 8> PerLaneMask;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, PerLaneMask)) {
	      return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, PerLaneMask,
	                                                       Subtarget, DAG);
	    }
	  }

	  if (SDValue ByteShuffled = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG)) {
	    return ByteShuffled;
	  }

	  // With both BWI and VLX the shuffle can be lowered to VPERMW.
	  if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	  }

	  // Merging 128-bit lanes may expose a lane-based shuffle.
	  if (SDValue Merged = lowerVectorShuffleByMerging128BitLanes(
	          DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return Merged;
	  }

	  // Nothing matched: use the generic split-or-blend lowering.
	  return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Lower a shuffle of thirty-two 8-bit integer lanes (v32i8).
	///
	/// The routine asserts that AVX2 is available. Lowering strategies are tried
	/// in a fixed order and the first one that yields a node wins; the final
	/// fallback always produces a value.
	static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");

	  const MVT VT = MVT::v32i8;

	  // A zero/any-extension is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so it is tried first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // Broadcast of a single element.
	  if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG)) {
	    return Splat;
	  }

	  // Plain blend of the two inputs.
	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Byte rotation instructions.
	  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                       Subtarget, DAG)) {
	    return Rotated;
	  }

	  // Build an in-lane repeating shuffle and then permute its results into the
	  // target lanes.
	  if (SDValue LanePermuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return LanePermuted;
	  }

	  // i8 elements have no generalized cross-lane shuffle operation, so a
	  // single-input lane-crossing mask goes through lane-permute-and-blend.
	  const bool SingleInputCrossesLanes =
	      V2.isUndef() && is128BitLaneCrossingShuffleMask(VT, Mask);
	  if (SingleInputCrossesLanes) {
	    return lowerVectorShuffleAsLanePermuteAndBlend(DL, VT, V1, V2, Mask, DAG);
	  }

	  if (SDValue ByteShuffled = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG)) {
	    return ByteShuffled;
	  }

	  // Merging 128-bit lanes may expose a lane-based shuffle.
	  if (SDValue Merged = lowerVectorShuffleByMerging128BitLanes(
	          DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return Merged;
	  }

	  // Nothing matched: use the generic split-or-blend lowering.
	  return lowerVectorShuffleAsSplitOrBlend(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Entry point for lowering 256-bit x86 vector shuffles.
	///
	/// After a few type-independent attempts (single-element insertion, undef
	/// half), integer shuffles on subtargets without AVX2 are either split into
	/// 128-bit pieces or re-expressed in the floating point domain. Everything
	/// else is dispatched to the routine dedicated to the concrete vector type.
	static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  // Count how many mask entries read from V2.
	  int NumElts = VT.getVectorNumElements();
	  auto ReadsFromV2 = [NumElts](int M) { return M >= NumElts; };
	  int NumV2Elements = count_if(Mask, ReadsFromV2);

	  // A single V2 element that lands in element zero may be insertable into V1
	  // cheaply.
	  if (NumV2Elements == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  // Special cases where the lower or upper half of the result is UNDEF.
	  if (SDValue HalfUndef = lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask,
	                                                          Subtarget, DAG)) {
	    return HalfUndef;
	  }

	  // AVX1 and AVX2 differ sharply: AVX1 has essentially no ability to
	  // manipulate a 256-bit vector with integer types. Checking once here spares
	  // the per-type routines most subtarget queries. Since floating point types
	  // would be used eventually anyway, cast to them right away.
	  const bool IntegerWithoutAVX2 = VT.isInteger() && !Subtarget.hasAVX2();
	  if (IntegerWithoutAVX2) {
	    const int EltBits = VT.getScalarSizeInBits();

	    if (EltBits < 32) {
	      // No floating point type of this width exists. Try bit operations for
	      // masking/blending, otherwise decompose into 128-bit vectors.
	      if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask,
	                                                       Zeroable, DAG)) {
	        return Masked;
	      }
	      if (SDValue BitBlended =
	              lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) {
	        return BitBlended;
	      }
	      return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	    }

	    // Redo the shuffle on the same-shaped floating point type and cast back.
	    MVT FloatVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltBits),
	                                   VT.getVectorNumElements());
	    SDValue FloatV1 = DAG.getBitcast(FloatVT, V1);
	    SDValue FloatV2 = DAG.getBitcast(FloatVT, V2);
	    SDValue FloatShuffle =
	        DAG.getVectorShuffle(FloatVT, DL, FloatV1, FloatV2, Mask);
	    return DAG.getBitcast(VT, FloatShuffle);
	  }

	  // Dispatch on the concrete 256-bit vector type.
	  switch (VT.SimpleTy) {
	  case MVT::v4f64:
	    return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v4i64:
	    return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8f32:
	    return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i32:
	    return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i16:
	    return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i8:
	    return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 256-bit x86 vector type!");
	  }
	}

	/// \brief Try to lower a 512-bit shuffle of 64-bit elements as a shuffle of
	/// whole 128-bit chunks.
	///
	/// Three forms are recognized, in this order:
	///  - the low 256 bits of V1 followed by the low 256 bits of V1 or V2, built
	///    with two subvector extracts and a concat;
	///  - V1 with the lowest 128 bits of V2 inserted into exactly one chunk;
	///  - a general 128-bit chunk shuffle emitted as X86ISD::SHUF128.
	///
	/// \returns the lowered node, or an empty SDValue when the mask cannot be
	/// widened to 128-bit granularity or when the two chunks that make up one
	/// 256-bit half of the result come from different inputs.
	static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
	                                        ArrayRef<int> Mask, SDValue V1,
	                                        SDValue V2, SelectionDAG &DAG) {
	  assert(VT.getScalarSizeInBits() == 64 &&
	         "Unexpected element type size for 128bit shuffle.");

	  // Handling a 256-bit vector would require VLX, and most probably
	  // lowerV2X128VectorShuffle() is the better solution for that case.
	  assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");

	  SmallVector<int, 4> WidenedMask;
	  if (!canWidenShuffleElements(Mask, WidenedMask))
	    return SDValue();

	  // Check for patterns which can be matched with a single insert of a 256-bit
	  // subvector.
	  bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
	                                        {0, 1, 2, 3, 0, 1, 2, 3});
	  if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
	                                        {0, 1, 2, 3, 8, 9, 10, 11})) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
	    SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
	                              DAG.getIntPtrConstant(0, DL));
	    SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
	                              OnlyUsesV1 ? V1 : V2,
	                              DAG.getIntPtrConstant(0, DL));
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
	  }

	  assert(WidenedMask.size() == 4 && "Unexpected widened mask size!");

	  // See if this is an insertion of the lower 128-bits of V2 into V1.
	  bool IsInsert = true;
	  int V2Index = -1;
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    // Make sure all V1 subvectors are in place.
	    if (WidenedMask[i] < 4) {
	      if (WidenedMask[i] != i) {
	        IsInsert = false;
	        break;
	      }
	    } else {
	      // Make sure we only have a single V2 index and it's the lowest 128-bits.
	      if (V2Index >= 0 || WidenedMask[i] != 4) {
	        IsInsert = false;
	        break;
	      }
	      V2Index = i;
	    }
	  }
	  if (IsInsert && V2Index >= 0) {
	    MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
	    SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
	                                 DAG.getIntPtrConstant(0, DL));
	    // V2Index counts 128-bit chunks; the insert position is given in 64-bit
	    // elements, hence the factor of two.
	    return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
	  }

	  // Try to lower to vshuf64x2/vshuf32x4.
	  SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
	  unsigned PermMask = 0;
	  // Ensure that both chunks of each 256-bit result half come from the same Op.
	  for (int i = 0; i < 4; ++i) {
	    assert(WidenedMask[i] >= -1);
	    if (WidenedMask[i] < 0)
	      continue;

	    SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
	    unsigned OpIndex = i / 2;
	    if (Ops[OpIndex].isUndef())
	      Ops[OpIndex] = Op;
	    else if (Ops[OpIndex] != Op)
	      return SDValue();

	    // Convert the 128-bit shuffle mask selection values into 128-bit selection
	    // bits defined by a vshuf64x2 instruction's immediate control byte.
	    PermMask |= (WidenedMask[i] % 4) << (i * 2);
	  }

	  return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
	                     DAG.getConstant(PermMask, DL, MVT::i8));
	}

	/// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
	///
	/// Single-input shuffles are matched first (MOVDDUP, an in-lane VPERMILPI,
	/// or a VPERMI when the mask repeats per 256-bit lane). After that the
	/// remaining strategies are tried in order, ending with a PERMV lowering that
	/// serves as the unconditional fallback.
	static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  if (V2.isUndef()) {
	    // Use low duplicate instructions for masks that match their pattern.
	    if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
	      return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);

	    if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
	      // Non-half-crossing single input shuffles can be lowered with an
	      // interleaved permutation. Bit i of the immediate is set when result
	      // element i takes the odd (upper) element of its own 128-bit lane.
	      unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
	                              ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
	                              ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
	                              ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
	      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
	                         DAG.getConstant(VPERMILPMask, DL, MVT::i8));
	    }

	    // A mask that repeats in both 256-bit halves maps onto VPERMI with a
	    // 4-element immediate.
	    SmallVector<int, 4> RepeatedMask;
	    if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
	      return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	  }

	  // Try a shuffle of whole 128-bit chunks.
	  if (SDValue Shuf128 =
	          lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Shuf128;

	  // Use dedicated unpack instructions for masks that match their pattern.
	  if (SDValue Unpck =
	          lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Unpck;

	  // Check if the blend happens to exactly fit that of SHUFPD.
	  if (SDValue Op =
	          lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
	    return Op;

	  if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
	                                             V2, DAG, Subtarget))
	    return V;

	  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
	                                                Zeroable, Subtarget, DAG))
	    return Blend;

	  // Nothing matched: use the PERMV lowering.
	  return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of sixteen 32-bit floating point lanes (v16f32).
	///
	/// Masks that repeat in every 128-bit lane are lowered with per-lane
	/// instructions; all other masks go to EXPAND and then to PERMV.
	static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  const MVT VT = MVT::v16f32;

	  SmallVector<int, 4> RepeatedMask;
	  const bool RepeatsPerLane =
	      is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask);

	  if (!RepeatsPerLane) {
	    // If we have AVX512F support, we can use VEXPAND.
	    if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask,
	                                                      V1, V2, DAG, Subtarget)) {
	      return Expanded;
	    }
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	  }

	  // The mask repeats in each 128-bit lane, which opens up many more
	  // efficient lowerings.
	  assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

	  // Even/odd duplicate patterns have dedicated instructions.
	  if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2})) {
	    return DAG.getNode(X86ISD::MOVSLDUP, DL, VT, V1);
	  }
	  if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3})) {
	    return DAG.getNode(X86ISD::MOVSHDUP, DL, VT, V1);
	  }

	  // Single input: one in-lane permute with an immediate.
	  if (V2.isUndef()) {
	    return DAG.getNode(X86ISD::VPERMILPI, DL, VT, V1,
	                       getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // Otherwise, fall back to a SHUFPS sequence.
	  return lowerVectorShuffleWithSHUFPS(DL, VT, RepeatedMask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of eight 64-bit integer lanes (v8i64).
	///
	/// A 128-bit chunk shuffle is attempted first, then single-input repeated
	/// masks, then the remaining strategies in a fixed order. PERMV is the
	/// unconditional fallback.
	static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
	  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

	  const MVT VT = MVT::v8i64;

	  if (SDValue ChunkShuffle = lowerV4X128VectorShuffle(DL, VT, Mask, V1, V2, DAG)) {
	    return ChunkShuffle;
	  }

	  if (V2.isUndef()) {
	    // A shuffle mirrored between the 128-bit lanes can use lower latency
	    // instructions that operate on all four 128-bit lanes: widen the
	    // 2-element lane mask to four 32-bit entries and emit PSHUFD on v16i32.
	    SmallVector<int, 2> LaneMask128;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, LaneMask128)) {
	      SmallVector<int, 4> DwordMask;
	      scaleShuffleMask(2, LaneMask128, DwordMask);
	      return DAG.getBitcast(
	          VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
	                          DAG.getBitcast(MVT::v16i32, V1),
	                          getV4X86ShuffleImm8ForMask(DwordMask, DL, DAG)));
	    }

	    // A mask repeated in both 256-bit halves maps onto VPERMI.
	    SmallVector<int, 4> LaneMask256;
	    if (is256BitLaneRepeatedShuffleMask(VT, Mask, LaneMask256)) {
	      return DAG.getNode(X86ISD::VPERMI, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(LaneMask256, DL, DAG));
	    }
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // VALIGN.
	  if (SDValue Aligned = lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask,
	                                                   Subtarget, DAG)) {
	    return Aligned;
	  }

	  // PALIGNR.
	  if (SDValue ByteAligned = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                           Subtarget, DAG)) {
	    return ByteAligned;
	  }

	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask, V1,
	                                                    V2, DAG, Subtarget)) {
	    return Expanded;
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of sixteen 32-bit integer lanes (v16i32).
	///
	/// Strategies are tried in a fixed order; PERMV is the unconditional
	/// fallback.
	static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
	  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

	  const MVT VT = MVT::v16i32;

	  // A zero/any-extension is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so it is tried first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // A mask repeated in each 128-bit lane allows more efficient instructions
	  // that mirror the shuffle across the four 128-bit lanes.
	  SmallVector<int, 4> RepeatedMask;
	  const bool RepeatsPerLane =
	      is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask);
	  if (RepeatsPerLane) {
	    assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");

	    if (V2.isUndef()) {
	      return DAG.getNode(X86ISD::PSHUFD, DL, VT, V1,
	                         getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
	    }

	    // Masks that match an unpack pattern get the dedicated instruction.
	    if (SDValue Unpacked =
	            lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	      return Unpacked;
	    }
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // VALIGN.
	  if (SDValue Aligned = lowerVectorShuffleAsRotate(DL, VT, V1, V2, Mask,
	                                                   Subtarget, DAG)) {
	    return Aligned;
	  }

	  // Byte rotation is only attempted when BWI is available.
	  if (Subtarget.hasBWI()) {
	    if (SDValue ByteAligned = lowerVectorShuffleAsByteRotate(
	            DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	      return ByteAligned;
	    }
	  }

	  // Assume that a single SHUFPS is faster than using a permv shuffle.
	  // If some CPU is harmed by the domain switch, we can fix it in a later pass.
	  if (RepeatsPerLane && isSingleSHUFPSMask(RepeatedMask)) {
	    SDValue FloatV1 = DAG.getBitcast(MVT::v16f32, V1);
	    SDValue FloatV2 = DAG.getBitcast(MVT::v16f32, V2);
	    SDValue FloatShuffle = lowerVectorShuffleWithSHUFPS(
	        DL, MVT::v16f32, RepeatedMask, FloatV1, FloatV2, DAG);
	    return DAG.getBitcast(VT, FloatShuffle);
	  }

	  // If we have AVX512F support, we can use VEXPAND.
	  if (SDValue Expanded = lowerVectorShuffleToEXPAND(DL, VT, Zeroable, Mask,
	                                                    V1, V2, DAG, Subtarget)) {
	    return Expanded;
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of thirty-two 16-bit integer lanes (v32i16).
	///
	/// The routine asserts that AVX-512 BWI is available. Strategies are tried
	/// in a fixed order; PERMV is the unconditional fallback.
	static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        const APInt &Zeroable,
	                                        SDValue V1, SDValue V2,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
	  assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");

	  const MVT VT = MVT::v32i16;

	  // A zero/any-extension is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so it is tried first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Byte rotation instructions.
	  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                       Subtarget, DAG)) {
	    return Rotated;
	  }

	  if (V2.isUndef()) {
	    // For a single input whose mask repeats per 128-bit lane, the repeated
	    // mask is handed to the v8i16 single-input lowering, which is invoked
	    // here with the v32i16 type.
	    SmallVector<int, 8> PerLaneMask;
	    if (is128BitLaneRepeatedShuffleMask(VT, Mask, PerLaneMask)) {
	      return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V1, PerLaneMask,
	                                                       Subtarget, DAG);
	    }
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	}

	/// \brief Lower a shuffle of sixty-four 8-bit integer lanes (v64i8).
	///
	/// The routine asserts that AVX-512 BWI is available. Strategies are tried
	/// in a fixed order; when none applies the shuffle is split and each part
	/// is lowered separately.
	static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                       const APInt &Zeroable,
	                                       SDValue V1, SDValue V2,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	  assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
	  assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
	  assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");

	  const MVT VT = MVT::v64i8;

	  // A zero/any-extension is strictly faster than every alternative and often
	  // lets a memory operand fold into the shuffle, so it is tried first.
	  if (SDValue Extended = lowerVectorShuffleAsZeroOrAnyExtend(
	          DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	    return Extended;
	  }

	  // Masks that match an unpack pattern get the dedicated instruction.
	  if (SDValue Unpacked = lowerVectorShuffleWithUNPCK(DL, VT, Mask, V1, V2, DAG)) {
	    return Unpacked;
	  }

	  // Shift instructions.
	  if (SDValue Shifted = lowerVectorShuffleAsShift(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Shifted;
	  }

	  // Byte rotation instructions.
	  if (SDValue Rotated = lowerVectorShuffleAsByteRotate(DL, VT, V1, V2, Mask,
	                                                       Subtarget, DAG)) {
	    return Rotated;
	  }

	  if (SDValue ByteShuffled = lowerVectorShuffleWithPSHUFB(
	          DL, VT, Mask, V1, V2, Zeroable, Subtarget, DAG)) {
	    return ByteShuffled;
	  }

	  // VBMI can use VPERMV/VPERMV3 byte shuffles.
	  if (Subtarget.hasVBMI()) {
	    return lowerVectorShuffleWithPERMV(DL, VT, Mask, V1, V2, DAG);
	  }

	  // Build an in-lane repeating shuffle and then permute its results into the
	  // target lanes.
	  if (SDValue LanePermuted = lowerShuffleAsRepeatedMaskAndLanePermute(
	          DL, VT, V1, V2, Mask, Subtarget, DAG)) {
	    return LanePermuted;
	  }

	  if (SDValue Blended = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
	                                                  Zeroable, Subtarget, DAG)) {
	    return Blended;
	  }

	  // FIXME: Implement direct support for this type!
	  return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
	}

	/// \brief Entry point for lowering 512-bit x86 vector shuffles.
	///
	/// The routine asserts that AVX-512 is available. It tries a single-element
	/// insertion and a broadcast, then dispatches to the routine dedicated to
	/// the concrete vector type.
	static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                        MVT VT, SDValue V1, SDValue V2,
	                                        const APInt &Zeroable,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower 512-bit vectors w/ basic ISA!");

	  // Count how many mask entries read from V2.
	  int NumElts = Mask.size();
	  auto ReadsFromV2 = [NumElts](int M) { return M >= NumElts; };
	  int NumV2Elements = count_if(Mask, ReadsFromV2);

	  // A single V2 element that lands in element zero may be insertable into V1
	  // cheaply.
	  if (NumV2Elements == 1 && Mask[0] >= NumElts) {
	    if (SDValue Inserted = lowerVectorShuffleAsElementInsertion(
	            DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
	      return Inserted;
	    }
	  }

	  // Broadcast of a single element.
	  if (SDValue Splat = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask,
	                                                    Subtarget, DAG)) {
	    return Splat;
	  }

	  // Dispatch on the element type. Each per-type routine may assume that the
	  // ISA extensions required for its element type are available.
	  switch (VT.SimpleTy) {
	  case MVT::v8f64:
	    return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16f32:
	    return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v8i64:
	    return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v16i32:
	    return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v32i16:
	    return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
	  case MVT::v64i8:
	    return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

	  default:
	    llvm_unreachable("Not a valid 512-bit x86 vector type!");
	  }
	}

	// Lower vXi1 vector shuffles.
	// There is no dedicated AVX-512 instruction that shuffles mask vectors.
	// The only way to shuffle the bits is to sign-extend the mask vector to a
	// SIMD vector, shuffle that, and then convert it back to a mask.
	//
	// V1 and V2 are the vXi1 inputs (V2 may be undef) and VT is the vXi1 result
	// type. All-zeros and all-ones build vectors are materialized directly in
	// the extended type instead of being sign-extended.
	static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
	                                      MVT VT, SDValue V1, SDValue V2,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX512() &&
	         "Cannot lower vXi1 vector shuffles without AVX-512!");

	  // Pick the SIMD type in which the shuffle is actually performed.
	  MVT ExtVT;
	  switch (VT.SimpleTy) {
	  default:
	    llvm_unreachable("Expected a vector of i1 elements");
	  case MVT::v2i1:
	    ExtVT = MVT::v2i64;
	    break;
	  case MVT::v4i1:
	    ExtVT = MVT::v4i32;
	    break;
	  case MVT::v8i1:
	    ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
	    break;
	  case MVT::v16i1:
	    ExtVT = MVT::v16i32;
	    break;
	  case MVT::v32i1:
	    ExtVT = MVT::v32i16;
	    break;
	  case MVT::v64i1:
	    ExtVT = MVT::v64i8;
	    break;
	  }

	  // Extend V1; constant all-zeros/all-ones inputs are rebuilt in ExtVT.
	  if (ISD::isBuildVectorAllZeros(V1.getNode()))
	    V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
	  else if (ISD::isBuildVectorAllOnes(V1.getNode()))
	    V1 = getOnesVector(ExtVT, DAG, DL);
	  else
	    V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);

	  // Extend V2 the same way, keeping an undef input undef.
	  if (V2.isUndef())
	    V2 = DAG.getUNDEF(ExtVT);
	  else if (ISD::isBuildVectorAllZeros(V2.getNode()))
	    V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
	  else if (ISD::isBuildVectorAllOnes(V2.getNode()))
	    V2 = getOnesVector(ExtVT, DAG, DL);
	  else
	    V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);

	  SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);

	  // Since i1 was sign extended we can use X86ISD::CVT2MASK. That is done with
	  // BWI for masks of 32 or more elements and with DQI for narrower masks.
	  int NumElems = VT.getVectorNumElements();
	  if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
	      (Subtarget.hasDQI() && (NumElems < 32)))
	    return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);

	  // Otherwise truncate the shuffled vector back to the mask type.
	  return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
	}

	/// Decide whether a shuffle's operands should be swapped to bring its mask
	/// into canonical form.
	///
	/// Returns true when V2 supplies more elements than V1. When both supply the
	/// same number, a chain of tie-breaks is applied: uses in the low half of
	/// the result, then the sum of the result positions, then the number of odd
	/// result positions. Negative mask entries are ignored throughout.
	static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
	  const int NumElements = Mask.size();

	  // Count the mask entries that read from each input.
	  int NumV1Elements = 0, NumV2Elements = 0;
	  for (int M : Mask) {
	    if (M < 0)
	      continue;
	    if (M < NumElements)
	      ++NumV1Elements;
	    else
	      ++NumV2Elements;
	  }

	  // Commute so that more elements come from V1 than from V2. This allows
	  // patterns to be matched purely on the V1 count, without handling the
	  // symmetric cases.
	  if (NumV2Elements > NumV1Elements)
	    return true;

	  assert(NumV1Elements > 0 && "No V1 indices");

	  if (NumV2Elements == 0)
	    return false;

	  // The tie-breaks below only apply when both inputs contribute equally.
	  if (NumV1Elements != NumV2Elements)
	    return false;

	  // Tie-break 1: minimize the uses of V2 in the low half of the vector.
	  int LowHalfV1 = 0, LowHalfV2 = 0;
	  for (int Pos = 0, Half = NumElements / 2; Pos < Half; ++Pos) {
	    const int M = Mask[Pos];
	    if (M >= NumElements)
	      ++LowHalfV2;
	    else if (M >= 0)
	      ++LowHalfV1;
	  }
	  if (LowHalfV2 > LowHalfV1)
	    return true;
	  if (LowHalfV2 != LowHalfV1)
	    return false;

	  // Tie-break 2: the sum of the result positions fed by V1 must not exceed
	  // the sum of those fed by V2.
	  int PosSumV1 = 0, PosSumV2 = 0;
	  for (int Pos = 0, Size = Mask.size(); Pos < Size; ++Pos) {
	    if (Mask[Pos] >= NumElements)
	      PosSumV2 += Pos;
	    else if (Mask[Pos] >= 0)
	      PosSumV1 += Pos;
	  }
	  if (PosSumV2 < PosSumV1)
	    return true;
	  if (PosSumV2 != PosSumV1)
	    return false;

	  // Tie-break 3: commute when V2 feeds fewer odd result positions than V1.
	  int OddPosV1 = 0, OddPosV2 = 0;
	  for (int Pos = 0, Size = Mask.size(); Pos < Size; ++Pos) {
	    if (Mask[Pos] >= NumElements)
	      OddPosV2 += Pos % 2;
	    else if (Mask[Pos] >= 0)
	      OddPosV1 += Pos % 2;
	  }
	  return OddPosV2 < OddPosV1;
	}

	/// \brief Top-level lowering for x86 vector shuffles.
	///
	/// This handles decomposition, canonicalization, and lowering of all x86
	/// vector shuffles. Most of the specific lowering strategies are encapsulated
	/// above in helper routines. The canonicalization attempts to widen shuffles
	/// to involve fewer lanes of wider elements, consolidate symmetric patterns
	/// s.t. only one of the two inputs needs to be tested, etc.
	///
	/// \param Op        the shuffle node to lower (cast to ShuffleVectorSDNode).
	/// \param Subtarget the X86 subtarget, forwarded to the per-width routines.
	/// \param DAG       the DAG in which replacement nodes are created.
	/// \returns the lowered value; several canonicalization steps return a new
	///          generic shuffle rather than a final target node.
	static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG) {
	  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
	  ArrayRef<int> Mask = SVOp->getMask();
	  SDValue V1 = Op.getOperand(0);
	  SDValue V2 = Op.getOperand(1);
	  MVT VT = Op.getSimpleValueType();
	  int NumElements = VT.getVectorNumElements();
	  SDLoc DL(Op);
	  bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);

	  assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
	         "Can't lower MMX shuffles");

	  bool V1IsUndef = V1.isUndef();
	  bool V2IsUndef = V2.isUndef();
	  if (V1IsUndef && V2IsUndef)
	    return DAG.getUNDEF(VT);

	  // When we create a shuffle node we put the UNDEF node to second operand,
	  // but in some cases the first operand may be transformed to UNDEF.
	  // In this case we should just commute the node.
	  if (V1IsUndef)
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // Check for non-undef masks pointing at an undef vector and make the masks
	  // undef as well. This makes it easier to match the shuffle based solely on
	  // the mask.
	  if (V2IsUndef) {
	    for (int M : Mask) {
	      if (M < NumElements)
	        continue;
	      // At least one lane reads the undef V2: rebuild the whole mask with
	      // every such lane marked undef (-1).
	      SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
	      for (int &NewM : NewMask)
	        if (NewM >= NumElements)
	          NewM = -1;
	      return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
	    }
	  }

	  // Check for illegal shuffle mask element index values.
	  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
	  assert(llvm::all_of(Mask,
	                      [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
	         "Out of bounds shuffle index");

	  // We actually see shuffles that are entirely re-arrangements of a set of
	  // zero inputs. This mostly happens while decomposing complex shuffles into
	  // simple ones. Directly lower these as a buildvector of zeros.
	  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
	  if (Zeroable.isAllOnesValue())
	    return getZeroVector(VT, Subtarget, DAG, DL);

	  // Try to collapse shuffles into using a vector type with fewer elements but
	  // wider element types. We cap this to not form integers or floating point
	  // elements wider than 64 bits, but it might be interesting to form i128
	  // integers to handle flipping the low and high halves of AVX 256-bit vectors.
	  SmallVector<int, 16> WidenedMask;
	  if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
	      canWidenShuffleElements(Mask, WidenedMask)) {
	    MVT NewEltVT = VT.isFloatingPoint()
	                       ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
	                       : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
	    MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
	    // Make sure that the new vector type is legal. For example, v2f64 isn't
	    // legal on SSE1.
	    if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
	      V1 = DAG.getBitcast(NewVT, V1);
	      V2 = DAG.getBitcast(NewVT, V2);
	      return DAG.getBitcast(
	          VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
	    }
	  }

	  // Commute the shuffle if it will improve canonicalization.
	  if (canonicalizeShuffleMaskWithCommute(Mask))
	    return DAG.getCommutedVectorShuffle(*SVOp);

	  // For each vector width, delegate to a specialized lowering routine.
	  if (VT.is128BitVector())
	    return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (VT.is256BitVector())
	    return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (VT.is512BitVector())
	    return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
	                                    DAG);

	  if (Is1BitVector)
	    return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);

	  llvm_unreachable("Unimplemented!");
	}

	/// \brief Try to lower a VSELECT instruction to a vector shuffle.
	///
	/// Only applies when the condition operand is a build-vector of constants.
	/// Each result lane then picks lane i of the first data operand when its
	/// condition constant is non-null, lane i of the second data operand when it
	/// is null, and is left undefined when the condition element is not a
	/// ConstantSDNode.
	///
	/// \returns the equivalent generic shuffle, or a null SDValue when the
	///          condition is not a constant build-vector.
	static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
	                                           const X86Subtarget &Subtarget,
	                                           SelectionDAG &DAG) {
	  SDValue Cond = Op.getOperand(0);
	  SDValue LHS = Op.getOperand(1);
	  SDValue RHS = Op.getOperand(2);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
	    return SDValue();
	  auto *CondBV = cast<BuildVectorSDNode>(Cond);

	  // Only non-legal VSELECTs reach this lowering, convert those into generic
	  // shuffles and re-use the shuffle lowering path for blends.
	  SmallVector<int, 32> Mask;
	  const int Size = VT.getVectorNumElements();
	  for (int i = 0; i < Size; ++i) {
	    SDValue CondElt = CondBV->getOperand(i);
	    int Lane = -1;
	    if (isa<ConstantSDNode>(CondElt)) {
	      // Indices in [Size, 2*Size) address the second shuffle input (RHS).
	      Lane = i;
	      if (isNullConstant(CondElt))
	        Lane += Size;
	    }
	    Mask.push_back(Lane);
	  }
	  return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
	}

	/// Custom lowering for VSELECT.
	///
	/// \returns Op itself when the node can be kept as is, a replacement node
	///          (blend shuffle or mask-based select), or a null SDValue when the
	///          node is left to other handling.
	SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
	  // A vselect where all conditions and data are constants can be optimized into
	  // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
	  const bool CondIsConst =
	      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode());
	  if (CondIsConst &&
	      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
	      ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
	    return SDValue();

	  // If this VSELECT has a vector of i1 as a mask, it will be directly matched
	  // with patterns on the mask registers on AVX-512.
	  if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
	    return Op;

	  // Try to lower this to a blend-style vector shuffle. This can handle all
	  // constant condition cases.
	  SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG);
	  if (BlendOp)
	    return BlendOp;

	  // Variable blends are only legal from SSE4.1 onward.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
	  // into an i1 condition so that we can use the mask-based 512-bit blend
	  // instructions.
	  if (VT.getSizeInBits() == 512) {
	    SDValue Cond = Op.getOperand(0);
	    // The vNi1 condition case should be handled above as it can be trivially
	    // lowered.
	    assert(Cond.getValueType().getScalarSizeInBits() ==
	               VT.getScalarSizeInBits() &&
	           "Should have a size-matched integer condition!");
	    // Build a mask by testing the condition against itself (tests for zero).
	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
	    // Now return a new VSELECT using the mask.
	    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
	  }

	  // Only some types will be legal on some subtargets. If we can emit a legal
	  // VSELECT-matching blend, return Op, but if we need to expand, return a
	  // null value.
	  switch (VT.SimpleTy) {
	  case MVT::v32i8:
	    // The byte blends for AVX vectors were introduced only in AVX2.
	    return Subtarget.hasAVX2() ? Op : SDValue();

	  case MVT::v8i16:
	  case MVT::v16i16:
	    // AVX-512 BWI and VLX features support VSELECT with i16 elements.
	    // FIXME: Without them we should custom lower this by fixing the
	    // condition and using i8 blends.
	    return (Subtarget.hasBWI() && Subtarget.hasVLX()) ? Op : SDValue();

	  default:
	    // Most of the vector types have blends past SSE4.1.
	    return Op;
	  }
	}

	/// Lower EXTRACT_VECTOR_ELT from a 128-bit vector using the dedicated
	/// extraction paths this function's callers guard with an SSE4.1 check.
	///
	/// \param Op  the EXTRACT_VECTOR_ELT node; operand 0 is the vector, operand 1
	///            the index.
	/// \param DAG the DAG in which replacement nodes are created.
	/// \returns a lowered value, Op itself when it can be matched as is, or a
	///          null SDValue when this routine does not handle the node.
	static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
	    return SDValue();

	  if (VT.getSizeInBits() == 8) {
	    // PEXTRB produces an i32; record that the upper bits are zero, then
	    // truncate back to the 8-bit result type.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
	                                 DAG.getValueType(VT));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
	  }

	  if (VT == MVT::f32) {
	    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
	    // the result back to FR32 register. It's only worth matching if the
	    // result has a single use which is a store or a bitcast to i32. And in
	    // the case of a store, it's not worth it if the index is a constant 0,
	    // because a MOVSSmr can be used instead, which is smaller and faster.
	    if (!Op.hasOneUse())
	      return SDValue();
	    // use_begin() yields an iterator; dereference it to get the single user.
	    SDNode *User = *Op.getNode()->use_begin();
	    if ((User->getOpcode() != ISD::STORE ||
	         isNullConstant(Op.getOperand(1))) &&
	        (User->getOpcode() != ISD::BITCAST ||
	         User->getValueType(0) != MVT::i32))
	      return SDValue();
	    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                  DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
	                                  Op.getOperand(1));
	    return DAG.getBitcast(MVT::f32, Extract);
	  }

	  if (VT == MVT::i32 || VT == MVT::i64) {
	    // ExtractPS/pextrq works with constant index.
	    if (isa<ConstantSDNode>(Op.getOperand(1)))
	      return Op;
	  }

	  return SDValue();
	}

	/// Extract one bit from mask vector, like v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// \param Op  the EXTRACT_VECTOR_ELT node; operand 0 is the i1 vector,
	///            operand 1 the index.
	/// \param DAG the DAG in which replacement nodes are created.
	/// \returns the lowered extraction.
	SDValue
	X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Vec = Op.getOperand(0);
	  SDLoc dl(Vec);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);
	  MVT EltVT = Op.getSimpleValueType();

	  assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
	         "Unexpected vector type in ExtractBitFromMaskVector");

	  // variable index can't be handled in mask registers,
	  // extend vector to VR512/128
	  if (!isa<ConstantSDNode>(Idx)) {
	    unsigned NumElts = VecVT.getVectorNumElements();
	    // Extending v8i1/v16i1 to 512-bit get better performance on KNL
	    // than extending to 128/256bit.
	    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
	    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
	    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
	    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	                              ExtVT.getVectorElementType(), Ext, Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
	      (VecVT.getVectorNumElements() < 8)) {
	    // Use kshiftlw/rw instruction.
	    VecVT = MVT::v16i1;
	    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
	                      DAG.getUNDEF(VecVT),
	                      Vec,
	                      DAG.getIntPtrConstant(0, dl));
	  }
	  // Shift the wanted bit up to the top position (skipped when it is already
	  // there), then shift it down to position 0 and extract element 0.
	  unsigned MaxShift = VecVT.getVectorNumElements() - 1;
	  if (MaxShift - IdxVal)
	    Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
	                      DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
	  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
	                    DAG.getConstant(MaxShift, dl, MVT::i8));
	  return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom lowering for EXTRACT_VECTOR_ELT.
	///
	/// i1 vectors are forwarded to ExtractBitFromMaskVector. Variable indices are
	/// not handled here (null SDValue). For constant indices, 256/512-bit sources
	/// are first narrowed to the containing 128-bit chunk, and 128-bit sources
	/// are handled per element size.
	///
	/// \returns a lowered value, Op itself when it can be matched as is, or a
	///          null SDValue when this routine does not handle the node.
	SDValue
	X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  MVT VecVT = Vec.getSimpleValueType();
	  SDValue Idx = Op.getOperand(1);

	  if (VecVT.getVectorElementType() == MVT::i1)
	    return ExtractBitFromMaskVector(Op, DAG);

	  if (!isa<ConstantSDNode>(Idx)) {
	    // Its more profitable to go through memory (1 cycles throughput)
	    // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
	    // IACA tool was used to get performance estimation
	    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
	    //
	    // example : extractelement <16 x i8> %a, i32 %i
	    //
	    // Block Throughput: 3.00 Cycles
	    // Throughput Bottleneck: Port5
	    //
	    // | Num Of | Ports pressure in cycles | |
	    // | Uops | 0 - DV | 5 | 6 | 7 | |
	    // ---------------------------------------------
	    // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
	    // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
	    // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
	    // Total Num Of Uops: 4
	    //
	    //
	    // Block Throughput: 1.00 Cycles
	    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
	    //
	    // | | Ports pressure in cycles | |
	    // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
	    // ---------------------------------------------------------
	    // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
	    // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
	    // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
	    // Total Num Of Uops: 4

	    return SDValue();
	  }

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();

	  // If this is a 256-bit or 512-bit vector, first extract the 128-bit vector
	  // and then extract the element from the 128-bit vector.
	  if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
	    // Get the 128-bit vector.
	    Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
	    MVT EltVT = VecVT.getVectorElementType();

	    unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");

	    // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
	    // this can be done with a mask.
	    IdxVal &= ElemsPerChunk - 1;
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
	                       DAG.getConstant(IdxVal, dl, MVT::i32));
	  }

	  assert(VecVT.is128BitVector() && "Unexpected vector length");

	  MVT VT = Op.getSimpleValueType();

	  if (VT.getSizeInBits() == 16) {
	    // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
	    // we're going to zero extend the register or fold the store (SSE41 only).
	    if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
	        !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
	      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
	                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                     DAG.getBitcast(MVT::v4i32, Vec), Idx));

	    // Transform it so it match pextrw which produces a 32-bit result.
	    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
	                                  Op.getOperand(0), Op.getOperand(1));
	    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
	                                 DAG.getValueType(VT));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
	  }

	  if (Subtarget.hasSSE41())
	    if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
	      return Res;

	  // TODO: We only extract a single element from v16i8, we can probably afford
	  // to be more aggressive here before using the default approach of spilling to
	  // stack.
	  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
	    // Extract either the lowest i32 or any i16, and extract the sub-byte.
	    int DWordIdx = IdxVal / 4;
	    if (DWordIdx == 0) {
	      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
	                                DAG.getBitcast(MVT::v4i32, Vec),
	                                DAG.getIntPtrConstant(DWordIdx, dl));
	      int ShiftVal = (IdxVal % 4) * 8;
	      if (ShiftVal != 0)
	        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
	                          DAG.getConstant(ShiftVal, dl, MVT::i32));
	      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	    }

	    int WordIdx = IdxVal / 2;
	    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
	                              DAG.getBitcast(MVT::v8i16, Vec),
	                              DAG.getIntPtrConstant(WordIdx, dl));
	    int ShiftVal = (IdxVal % 2) * 8;
	    if (ShiftVal != 0)
	      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
	                        DAG.getConstant(ShiftVal, dl, MVT::i16));
	    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
	  }

	  if (VT.getSizeInBits() == 32) {
	    if (IdxVal == 0)
	      return Op;

	    // SHUFPS the element to the lowest double word, then movss.
	    int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (VT.getSizeInBits() == 64) {
	    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
	    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
	    // to match extract_elt for f64.
	    if (IdxVal == 0)
	      return Op;

	    // UNPCKHPD the element to the lowest double word, then movsd.
	    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
	    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
	    int Mask[2] = { 1, -1 };
	    Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  return SDValue();
	}

	/// Insert a single bit into a mask vector such as v16i1 or v8i1.
	/// AVX-512 feature.
	///
	/// \param Op  the INSERT_VECTOR_ELT node: operand 0 is the mask vector,
	///            operand 1 the element to insert, operand 2 the index.
	/// \param DAG the DAG in which replacement nodes are created.
	/// \returns the lowered insertion.
	SDValue
	X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Vec = Op.getOperand(0);
	  SDValue Elt = Op.getOperand(1);
	  SDValue Idx = Op.getOperand(2);
	  MVT VecVT = Vec.getSimpleValueType();

	  if (!isa<ConstantSDNode>(Idx)) {
	    // Variable index: widen both the vector and the element, do the insert
	    // on the widened type, then truncate back to the mask type.
	    const bool IsV8 = (VecVT == MVT::v8i1);
	    MVT ExtVecVT = IsV8 ? MVT::v8i64 : MVT::v16i32;
	    MVT ExtEltVT = IsV8 ? MVT::i64 : MVT::i32;
	    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
	                                DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
	                                DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
	    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
	  }

	  // Emit a KSHIFTL/KSHIFTR of \p V by the constant \p Amt on the mask type.
	  auto ShiftMask = [&](unsigned Opc, SDValue V, unsigned Amt) {
	    return DAG.getNode(Opc, dl, VecVT, V, DAG.getConstant(Amt, dl, MVT::i8));
	  };

	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
	  unsigned NumElems = VecVT.getVectorNumElements();

	  // Inserting into an undef vector: just move the bit into place.
	  if (Vec.isUndef()) {
	    if (IdxVal)
	      EltInVec = ShiftMask(X86ISD::KSHIFTL, EltInVec, IdxVal);
	    return EltInVec;
	  }

	  const unsigned LastIdx = NumElems - 1;

	  // Insertion of one bit into first position
	  if (IdxVal == 0) {
	    // Clean top bits of vector.
	    EltInVec = ShiftMask(X86ISD::KSHIFTL, EltInVec, LastIdx);
	    EltInVec = ShiftMask(X86ISD::KSHIFTR, EltInVec, LastIdx);
	    // Clean the first bit in source vector.
	    Vec = ShiftMask(X86ISD::KSHIFTR, Vec, 1);
	    Vec = ShiftMask(X86ISD::KSHIFTL, Vec, 1);

	    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	  }

	  // Insertion of one bit into last position
	  if (IdxVal == LastIdx) {
	    // Move the bit to the last position inside the vector.
	    EltInVec = ShiftMask(X86ISD::KSHIFTL, EltInVec, IdxVal);
	    // Clean the last bit in the source vector.
	    Vec = ShiftMask(X86ISD::KSHIFTL, Vec, 1);
	    Vec = ShiftMask(X86ISD::KSHIFTR, Vec, 1);

	    return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
	  }

	  // Use shuffle to insert element: lane IdxVal reads element 0 of EltInVec
	  // (mask index NumElems), every other lane keeps its own value from Vec.
	  SmallVector<int, 64> MaskVec;
	  for (unsigned i = 0; i != NumElems; ++i)
	    MaskVec.push_back((i == IdxVal) ? NumElems : i);

	  return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
	}

	/// Custom lowering for INSERT_VECTOR_ELT.
	///
	/// i1 vectors are forwarded to InsertBitToMaskVector. Variable indices are
	/// not handled here (null SDValue). For constant indices, 256/512-bit vectors
	/// are handled by operating on the containing 128-bit chunk, and 128-bit
	/// vectors by element type.
	///
	/// \returns a lowered value, Op itself when it can be matched as is, or a
	///          null SDValue when this routine does not handle the node.
	SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElts = VT.getVectorNumElements();

	  if (EltVT == MVT::i1)
	    return InsertBitToMaskVector(Op, DAG);

	  SDLoc dl(Op);
	  SDValue N0 = Op.getOperand(0);
	  SDValue N1 = Op.getOperand(1);
	  SDValue N2 = Op.getOperand(2);
	  if (!isa<ConstantSDNode>(N2))
	    return SDValue();
	  auto *N2C = cast<ConstantSDNode>(N2);
	  unsigned IdxVal = N2C->getZExtValue();

	  bool IsZeroElt = X86::isZeroNode(N1);
	  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);

	  // If we are inserting a element, see if we can do this more efficiently with
	  // a blend shuffle with a rematerializable vector than a costly integer
	  // insertion.
	  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
	      16 <= EltVT.getSizeInBits()) {
	    SmallVector<int, 8> BlendMask;
	    for (unsigned i = 0; i != NumElts; ++i)
	      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
	    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
	                                  : DAG.getConstant(-1, dl, VT);
	    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
	  }

	  // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
	  // into that, and then insert the subvector back into the result.
	  if (VT.is256BitVector() || VT.is512BitVector()) {
	    // With a 256-bit vector, we can insert into the zero element efficiently
	    // using a blend if we have AVX or AVX2 and the right data type.
	    if (VT.is256BitVector() && IdxVal == 0) {
	      // TODO: It is worthwhile to cast integer to floating point and back
	      // and incur a domain crossing penalty if that's what we'll end up
	      // doing anyway after extracting to a 128-bit vector.
	      if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
	          (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
	        SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
	        N2 = DAG.getIntPtrConstant(1, dl);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
	      }
	    }

	    // Get the desired 128-bit vector chunk.
	    SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);

	    // Insert the element into the desired chunk.
	    unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
	    assert(isPowerOf2_32(NumEltsIn128));
	    // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
	    unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);

	    V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
	                    DAG.getConstant(IdxIn128, dl, MVT::i32));

	    // Insert the changed part back into the bigger vector
	    return insert128BitVector(N0, V, IdxVal, DAG, dl);
	  }
	  assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");

	  // Transform it so it match pinsr{b,w} which expects a GR32 as its second
	  // argument. SSE41 required for pinsrb.
	  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    unsigned Opc;
	    if (VT == MVT::v8i16) {
	      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
	      Opc = X86ISD::PINSRW;
	    } else {
	      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
	      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
	      Opc = X86ISD::PINSRB;
	    }

	    if (N1.getValueType() != MVT::i32)
	      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
	    if (N2.getValueType() != MVT::i32)
	      N2 = DAG.getIntPtrConstant(IdxVal, dl);
	    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
	  }

	  if (Subtarget.hasSSE41()) {
	    if (EltVT == MVT::f32) {
	      // Bits [7:6] of the constant are the source select. This will always be
	      // zero here. The DAG Combiner may combine an extract_elt index into
	      // these bits. For example (insert (extract, 3), 2) could be matched by
	      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
	      // Bits [5:4] of the constant are the destination select. This is the
	      // value of the incoming immediate.
	      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
	      // combine either bitwise AND or insert of float 0.0 to set these bits.

	      bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
	      if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
	        // If this is an insertion of 32-bits into the low 32-bits of
	        // a vector, we prefer to generate a blend with immediate rather
	        // than an insertps. Blends are simpler operations in hardware and so
	        // will always have equal or better performance than insertps.
	        // But if optimizing for size and there's a load folding opportunity,
	        // generate insertps because blendps does not have a 32-bit memory
	        // operand form.
	        N2 = DAG.getIntPtrConstant(1, dl);
	        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
	      }
	      N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
	      // Create this as a scalar to vector..
	      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
	      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
	    }

	    // PINSR* works with constant index.
	    if (EltVT == MVT::i32 || EltVT == MVT::i64)
	      return Op;
	  }

	  return SDValue();
	}

	/// Custom lowering for SCALAR_TO_VECTOR.
	///
	/// \param Op        the SCALAR_TO_VECTOR node; operand 0 is the scalar.
	/// \param Subtarget the X86 subtarget, used when building a zero vector.
	/// \param DAG       the DAG in which replacement nodes are created.
	/// \returns the lowered value, or Op itself for v4i32.
	static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT OpVT = Op.getSimpleValueType();
	  SDValue Scalar = Op.getOperand(0);

	  // It's always cheaper to replace a xor+movd with xorps and simplifies further
	  // combines.
	  if (X86::isZeroNode(Scalar))
	    return getZeroVector(OpVT, Subtarget, DAG, dl);

	  // Results that are not 128 bits wide are built in two steps: place the
	  // scalar in a 128-bit vector of the same element type, then insert that at
	  // position 0 of an undef vector of the full width.
	  if (!OpVT.is128BitVector()) {
	    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
	    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
	                                 OpVT.getVectorNumElements() / SizeFactor);

	    SDValue Narrow = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Scalar);
	    return insert128BitVector(DAG.getUNDEF(OpVT), Narrow, 0, DAG, dl);
	  }
	  assert(OpVT.is128BitVector() && "Expected an SSE type!");

	  // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
	  if (OpVT == MVT::v4i32)
	    return Op;

	  // Other 128-bit types: any-extend the scalar to i32, form a v4i32, and
	  // bitcast to the requested type.
	  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
	  SDValue AsV4I32 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt);
	  return DAG.getBitcast(OpVT, AsV4I32);
	}

	// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
	// a simple subregister reference or explicit instructions to grab
	// upper bits of a vector.
	//
	// Operand 0 is the source vector (asserted to be 256 or 512 bits wide) and
	// operand 1 the constant start index. Returns a smaller BUILD_VECTOR when the
	// source is itself a BUILD_VECTOR, otherwise Op unchanged.
	static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");

	  SDLoc dl(Op);
	  SDValue In = Op.getOperand(0);
	  SDValue Idx = Op.getOperand(1);
	  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	  MVT ResVT = Op.getSimpleValueType();

	  assert((In.getSimpleValueType().is256BitVector() ||
	          In.getSimpleValueType().is512BitVector()) &&
	         "Can only extract from 256-bit or 512-bit vectors");

	  // If the input is a buildvector just emit a smaller one, reusing the
	  // ElemsPerChunk operands that start at IdxVal.
	  unsigned ElemsPerChunk = ResVT.getVectorNumElements();
	  if (In.getOpcode() == ISD::BUILD_VECTOR)
	    return DAG.getBuildVector(
	        ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));

	  // Everything else is legal.
	  return Op;
	}

	// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
	// simple superregister reference or explicit instructions to insert
	// the upper bits of a vector.
	//
	// Only vectors with i1 elements are expected here (asserted below); the
	// actual lowering is delegated to insert1BitVector.
	static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

	return insert1BitVector(Op, DAG, Subtarget);
	}

	// Returns the appropriate wrapper opcode for a global reference.
	//
	// GV may be null. X86ISD::WrapperRIP is returned only for RIP-relative PIC
	// with the small or kernel code model, and never for absolute symbol
	// references; everything else gets X86ISD::Wrapper.
	unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
	  // References to absolute symbols are never PC-relative.
	  if (GV && GV->isAbsoluteSymbolRef())
	    return X86ISD::Wrapper;

	  CodeModel::Model M = getTargetMachine().getCodeModel();
	  if (Subtarget.isPICStyleRIPRel() &&
	      (M == CodeModel::Small || M == CodeModel::Kernel))
	    return X86ISD::WrapperRIP;

	  return X86ISD::Wrapper;
	}

	// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
	// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
	// one of the above mentioned nodes. It has to be wrapped because otherwise
	// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
	// be used to form addressing mode. These wrapped nodes will be selected
	// into MOV32ri.
	//
	// Lowers a ConstantPool node to its wrapped target node, adding the global
	// base register when the reference classification yields a non-zero flag.
	SDValue
	X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
	  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetCP = DAG.getTargetConstantPool(
	      CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
	  SDLoc DL(CP);
	  SDValue Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetCP);
	  if (!OpFlag)
	    return Result;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Result);
	}

	// Lowers a JumpTable node to its wrapped target node, adding the global base
	// register when the reference classification yields a non-zero flag.
	SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
	  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
	  SDLoc DL(JT);
	  SDValue Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetJT);
	  if (!OpFlag)
	    return Result;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  return DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Result);
	}

	// Lowers an ExternalSymbol node to its wrapped target node. The global base
	// register is added for position-independent non-64-bit code, and a load
	// from the GOT is emitted when the reference flag denotes a stub reference.
	SDValue
	X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
	  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();

	  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	  // global base reg.
	  MachineFunction &MF = DAG.getMachineFunction();
	  const Module *Mod = MF.getFunction()->getParent();
	  unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);

	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetSym = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);

	  SDLoc DL(Op);
	  SDValue Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, TargetSym);

	  // With PIC, the address is actually $g + Offset.
	  if (isPositionIndependent() && !Subtarget.is64Bit()) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	    Result = DAG.getNode(ISD::ADD, DL, PtrVT, PICBase, Result);
	  }

	  // For symbols that require a load from a stub to get the address, emit the
	  // load.
	  if (isGlobalStubReference(OpFlag))
	    Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
	                         MachinePointerInfo::getGOT(MF));

	  return Result;
	}

	// Lowers a BlockAddress node to its wrapped target node, adding the global
	// base register when the reference is relative to the PIC base.
	SDValue
	X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
	  // Create the TargetBlockAddressAddress node.
	  unsigned char OpFlags = Subtarget.classifyBlockAddressReference();
	  auto *BAN = cast<BlockAddressSDNode>(Op);
	  const BlockAddress *BA = BAN->getBlockAddress();
	  int64_t Offset = BAN->getOffset();
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue TargetBA = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
	  SDValue Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, TargetBA);
	  if (!isGlobalRelativeToPICBase(OpFlags))
	    return Result;

	  // With PIC, the address is actually $g + Offset.
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	  return DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Result);
	}

	// Builds the address of GV plus Offset. The offset is folded into the
	// target node only for flag-less references whose offset suits the code
	// model; otherwise it is added explicitly at the end.
	SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
	                                              const SDLoc &dl, int64_t Offset,
	                                              SelectionDAG &DAG) const {
	  // Create the TargetGlobalAddress node, folding in the constant
	  // offset if it is legal.
	  unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
	  CodeModel::Model M = DAG.getTarget().getCodeModel();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  const bool FoldOffset = OpFlags == X86II::MO_NO_FLAG &&
	                          X86::isOffsetSuitableForCodeModel(Offset, M);
	  SDValue Result;
	  if (!FoldOffset) {
	    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
	  } else {
	    // A direct static reference to a global.
	    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
	    Offset = 0;
	  }

	  Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);

	  // With PIC, the address is actually $g + Offset.
	  if (isGlobalRelativeToPICBase(OpFlags)) {
	    SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT);
	    Result = DAG.getNode(ISD::ADD, dl, PtrVT, PICBase, Result);
	  }

	  // For globals that require a load from a stub to get the address, emit the
	  // load.
	  if (isGlobalStubReference(OpFlags))
	    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

	  // If there was a non-zero offset that we didn't fold, create an explicit
	  // addition for it.
	  if (Offset == 0)
	    return Result;
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Result,
	                     DAG.getConstant(Offset, dl, PtrVT));
	}

	// Node-based entry point: unpacks the global and offset from a GlobalAddress
	// node and forwards to the GlobalValue-based overload.
	SDValue
	X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
	  auto *GA = cast<GlobalAddressSDNode>(Op);
	  return LowerGlobalAddress(GA->getGlobal(), SDLoc(Op), GA->getOffset(), DAG);
	}

	/// Emit a TLSADDR node (TLSBASEADDR when \p LocalDynamic is true) for \p GA
	/// on \p Chain and return a CopyFromReg of \p ReturnReg glued to it.
	///
	/// \p InFlag, when non-null, supplies an incoming glue value that is appended
	/// to the node's operands. \p OperandFlags is attached to the
	/// TargetGlobalAddress operand.
	static SDValue
	GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
	           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
	           unsigned char OperandFlags, bool LocalDynamic = false) {
	  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDLoc dl(GA);
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);

	  X86ISD::NodeType CallType = X86ISD::TLSADDR;
	  if (LocalDynamic)
	    CallType = X86ISD::TLSBASEADDR;

	  // Operands are (chain, address[, glue]).
	  SmallVector<SDValue, 3> CallOps;
	  CallOps.push_back(Chain);
	  CallOps.push_back(TGA);
	  if (InFlag)
	    CallOps.push_back(*InFlag);
	  Chain = DAG.getNode(CallType, dl, NodeTys, CallOps);

	  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
	  MFI.setAdjustsStack(true);
	  MFI.setHasCalls(true);

	  // Result 1 of the node is its outgoing glue.
	  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Chain.getValue(1));
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit.
	// The global base register is copied into EBX and the resulting glue is
	// handed to GetTLSADDR; the address is read back from EAX.
	static SDValue
	LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDLoc dl(GA); // ? function entry point might be better
	  SDValue PICBase = DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT);
	  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, PICBase,
	                                   SDValue());
	  SDValue Glue = Chain.getValue(1);

	  return GetTLSADDR(DAG, Chain, GA, &Glue, PtrVT, X86::EAX, X86II::MO_TLSGD);
	}

	// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit.
	// No incoming glue is needed here; the address is read back from RAX.
	static SDValue
	LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                const EVT PtrVT) {
	  SDValue EntryChain = DAG.getEntryNode();
	  SDValue *const NoGlue = nullptr;
	  return GetTLSADDR(DAG, EntryChain, GA, NoGlue, PtrVT, X86::RAX,
	                    X86II::MO_TLSGD);
	}

	/// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
	///
	/// The start address of this module's TLS block is obtained through
	/// GetTLSADDR with LocalDynamic set (which selects X86ISD::TLSBASEADDR), and
	/// the variable's x@dtpoff offset is then added to that base.
	///
	/// \param GA      the thread-local global being addressed.
	/// \param PtrVT   pointer value type used for every node created here.
	/// \param is64Bit selects the RAX/MO_TLSLD form; otherwise the global base
	///                register is first copied into EBX and EAX/MO_TLSLDM is used.
	/// \returns the ADD of the dtpoff offset and the module base.
	static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
	                                           SelectionDAG &DAG,
	                                           const EVT PtrVT,
	                                           bool is64Bit) {
	  SDLoc dl(GA);

	  // Get the start address of the TLS block for this module.
	  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
	      .getInfo<X86MachineFunctionInfo>();
	  MFI->incNumLocalDynamicTLSAccesses();

	  SDValue Base;
	  if (is64Bit) {
	    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
	                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
	  } else {
	    SDValue InFlag;
	    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
	        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
	    InFlag = Chain.getValue(1);
	    Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
	                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
	  }

	  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
	  // of Base.

	  // Build x@dtpoff.
	  unsigned char OperandFlags = X86II::MO_DTPOFF;
	  unsigned WrapperKind = X86ISD::Wrapper;
	  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                           GA->getValueType(0),
	                                           GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  // Add x@dtpoff with the base.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
	}

	// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
	//
	// The thread pointer is loaded from address 0 in address space 256 (32-bit,
	// %gs:0) or 257 (64-bit, %fs:0). The variable's offset is a wrapped
	// TargetGlobalAddress whose operand flag depends on the model, the bitness
	// and (for 32-bit initial exec) whether the code is PIC. For initial exec the
	// offset is additionally loaded from the GOT. The returned value is
	// ThreadPointer + Offset. Any model other than LocalExec/InitialExec is
	// unreachable here.
	static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
	                                   const EVT PtrVT, TLSModel::Model model,
	                                   bool is64Bit, bool isPIC) {
	  SDLoc dl(GA);

	  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
	  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
	                                                         is64Bit ? 257 : 256));

	  SDValue ThreadPointer =
	      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
	                  MachinePointerInfo(Ptr));

	  unsigned char OperandFlags = 0;
	  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
	  // initialexec.
	  unsigned WrapperKind = X86ISD::Wrapper;
	  if (model == TLSModel::LocalExec) {
	    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
	  } else if (model == TLSModel::InitialExec) {
	    if (is64Bit) {
	      OperandFlags = X86II::MO_GOTTPOFF;
	      WrapperKind = X86ISD::WrapperRIP;
	    } else {
	      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
	    }
	  } else {
	    llvm_unreachable("Unexpected model");
	  }

	  // emit "addl x@ntpoff,%eax" (local exec)
	  // or "addl x@indntpoff,%eax" (initial exec)
	  // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
	  SDValue TGA =
	      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
	                                 GA->getOffset(), OperandFlags);
	  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

	  if (model == TLSModel::InitialExec) {
	    // 32-bit PIC addresses the GOT slot relative to the global base register.
	    if (isPIC && !is64Bit) {
	      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);
	    }

	    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
	                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
	  }

	  // The address of the thread local variable is the add of the thread
	  // pointer with the offset of the variable.
	  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
	}

	/// Custom lowering for ISD::GlobalTLSAddress.
	///
	/// Dispatch order:
	///  - emulated TLS, when the target options request it;
	///  - ELF targets: one of the general-dynamic / local-dynamic /
	///    initial-exec / local-exec helpers, chosen by the target's TLS model for
	///    the global;
	///  - Darwin: a single model built around an X86ISD::TLSCALL wrapped in a call
	///    sequence, with the result read from RAX/EAX;
	///  - Windows (MSVC, Itanium, GNU environments): the implicit TLS scheme that
	///    indexes the thread's TLS array and adds the variable's section-relative
	///    offset.
	/// Any other target is unreachable.
	SDValue
	X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

	  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

	  if (DAG.getTarget().Options.EmulatedTLS)
	    return LowerToTLSEmulatedModel(GA, DAG);

	  const GlobalValue *GV = GA->getGlobal();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  bool PositionIndependent = isPositionIndependent();

	  if (Subtarget.isTargetELF()) {
	    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
	    switch (model) {
	      case TLSModel::GeneralDynamic:
	        if (Subtarget.is64Bit())
	          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
	        return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
	      case TLSModel::LocalDynamic:
	        return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
	                                           Subtarget.is64Bit());
	      case TLSModel::InitialExec:
	      case TLSModel::LocalExec:
	        return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
	                                   PositionIndependent);
	    }
	    llvm_unreachable("Unknown TLS model.");
	  }

	  if (Subtarget.isTargetDarwin()) {
	    // Darwin only has one model of TLS. Lower to that.
	    unsigned char OpFlag = 0;
	    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
	                           X86ISD::WrapperRIP : X86ISD::Wrapper;

	    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
	    // global base reg.
	    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
	    if (PIC32)
	      OpFlag = X86II::MO_TLVP_PIC_BASE;
	    else
	      OpFlag = X86II::MO_TLVP;
	    SDLoc DL(Op);
	    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
	                                                GA->getValueType(0),
	                                                GA->getOffset(), OpFlag);
	    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

	    // With PIC32, the address is actually $g + Offset.
	    if (PIC32)
	      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
	                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
	                           Offset);

	    // Lowering the machine isd will make sure everything is in the right
	    // location.
	    SDValue Chain = DAG.getEntryNode();
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
	    SDValue Args[] = { Chain, Offset };
	    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
	    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
	                               DAG.getIntPtrConstant(0, DL, true),
	                               Chain.getValue(1), DL);

	    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
	    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	    MFI.setAdjustsStack(true);

	    // And our return value (tls address) is in the standard call return value
	    // location.
	    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
	  }

	  if (Subtarget.isTargetKnownWindowsMSVC() ||
	      Subtarget.isTargetWindowsItanium() ||
	      Subtarget.isTargetWindowsGNU()) {
	    // Just use the implicit TLS architecture
	    // Need to generate something similar to:
	    //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
	    //                                  ; from TEB
	    //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
	    //   mov     rcx, qword [rdx+rcx*8]
	    //   mov     eax, .tls$:tlsvar
	    //   [rax+rcx] contains the address
	    // Windows 64bit: gs:0x58
	    // Windows 32bit: fs:__tls_array

	    SDLoc dl(GA);
	    SDValue Chain = DAG.getEntryNode();

	    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
	    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
	    // use its literal value of 0x2C.
	    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
	                                            ? Type::getInt8PtrTy(*DAG.getContext(),
	                                                                 256)
	                                            : Type::getInt32PtrTy(*DAG.getContext(),
	                                                                  257));

	    SDValue TlsArray = Subtarget.is64Bit()
	                           ? DAG.getIntPtrConstant(0x58, dl)
	                           : (Subtarget.isTargetWindowsGNU()
	                                  ? DAG.getIntPtrConstant(0x2C, dl)
	                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

	    SDValue ThreadPointer =
	        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

	    SDValue res;
	    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
	      // Local-exec globals skip the _tls_index lookup and use the first
	      // entry of the TLS array.
	      res = ThreadPointer;
	    } else {
	      // Load the _tls_index variable
	      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
	      if (Subtarget.is64Bit())
	        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
	                             MachinePointerInfo(), MVT::i32);
	      else
	        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

	      // Scale the index by the pointer size to get a byte offset into the
	      // TLS array.
	      auto &DL = DAG.getDataLayout();
	      SDValue Scale =
	          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
	      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

	      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
	    }

	    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

	    // Get the offset of start of .tls section
	    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
	                                             GA->getValueType(0),
	                                             GA->getOffset(), X86II::MO_SECREL);
	    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

	    // The address of the thread local variable is the add of the thread
	    // pointer with the offset of the variable.
	    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
	  }

	  llvm_unreachable("TLS not implemented for this target.");
	}

	/// Lower SHL_PARTS / SRA_PARTS / SRL_PARTS: the node takes a value split into
	/// (lo, hi) halves plus a shift amount and produces the shifted (lo, hi) pair.
	///
	/// A double-precision shift (SHLD/SHRD) and a plain shift are both computed;
	/// a CMOV keyed on the "amount >= part width" bit then picks the correct
	/// value for each half.
	static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
	  const unsigned Opcode = Op.getOpcode();
	  const bool IsSHL = Opcode == ISD::SHL_PARTS;
	  const bool IsSRA = Opcode == ISD::SRA_PARTS;

	  MVT VT = Op.getSimpleValueType();
	  unsigned VTBits = VT.getSizeInBits();
	  SDLoc dl(Op);
	  SDValue LoPart = Op.getOperand(0);
	  SDValue HiPart = Op.getOperand(1);
	  SDValue Amt = Op.getOperand(2);

	  // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
	  // generic ISD nodes haven't. Mask the amount for the generic shift; the AND
	  // is optimized away during isel.
	  SDValue MaskedAmt = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
	                                  DAG.getConstant(VTBits - 1, dl, MVT::i8));

	  // Value that fills a half which is shifted out completely: the sign
	  // replicated across the part for SRA, zero otherwise.
	  SDValue Fill;
	  if (IsSRA)
	    Fill = DAG.getNode(ISD::SRA, dl, VT, HiPart,
	                       DAG.getConstant(VTBits - 1, dl, MVT::i8));
	  else
	    Fill = DAG.getConstant(0, dl, VT);

	  SDValue Funnel, Plain;
	  if (IsSHL) {
	    Funnel = DAG.getNode(X86ISD::SHLD, dl, VT, HiPart, LoPart, Amt);
	    Plain = DAG.getNode(ISD::SHL, dl, VT, LoPart, MaskedAmt);
	  } else {
	    Funnel = DAG.getNode(X86ISD::SHRD, dl, VT, LoPart, HiPart, Amt);
	    Plain = DAG.getNode(IsSRA ? ISD::SRA : ISD::SRL, dl, VT, HiPart,
	                        MaskedAmt);
	  }

	  // If the shift amount is larger or equal than the width of a part we can't
	  // rely on the results of shld/shrd. Test that bit of the amount and select
	  // the appropriate values for large shift amounts.
	  SDValue WideBit = DAG.getNode(ISD::AND, dl, MVT::i8, Amt,
	                                DAG.getConstant(VTBits, dl, MVT::i8));
	  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32, WideBit,
	                             DAG.getConstant(0, dl, MVT::i8));
	  SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);

	  SDValue NearOps[4] = { Funnel, Plain, CC, Cond };
	  SDValue FarOps[4] = { Plain, Fill, CC, Cond };
	  SDValue Near = DAG.getNode(X86ISD::CMOV, dl, VT, NearOps);
	  SDValue Far = DAG.getNode(X86ISD::CMOV, dl, VT, FarOps);

	  // For a left shift the funnel result belongs to the high half; for right
	  // shifts it belongs to the low half.
	  SDValue Lo = IsSHL ? Far : Near;
	  SDValue Hi = IsSHL ? Near : Far;

	  SDValue Parts[2] = { Lo, Hi };
	  return DAG.getMergeValues(Parts, dl);
	}

	/// Custom lowering for ISD::SINT_TO_FP.
	///
	/// Vector sources: v2i32 -> v2f64 goes through CVTSI2P on a widened v4i32;
	/// i1-element vectors are sign-extended and converted again; anything else
	/// returns an empty SDValue. Scalar sources either return \p Op unchanged
	/// (cases that are really legal) or are spilled to a fresh stack slot and
	/// converted with BuildFILD.
	SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDLoc dl(Op);
	  SDValue Src = Op.getOperand(0);
	  const MVT SrcVT = Src.getSimpleValueType();
	  const MVT VT = Op.getSimpleValueType();

	  if (SrcVT.isVector()) {
	    if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
	      // Pad the source to v4i32 with undef lanes before converting.
	      SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
	                                    DAG.getUNDEF(SrcVT));
	      return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Widened);
	    }

	    if (SrcVT.getVectorElementType() != MVT::i1)
	      return SDValue();

	    // Mask vectors: a legal v2i1 is extended to v2i64, every other mask to
	    // an i32 vector with the same element count.
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    MVT WideVT =
	        (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
	            ? MVT(MVT::v2i64)
	            : MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
	    SDValue Extended = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, Src);
	    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), Extended);
	  }

	  assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
	         "Unknown SINT_TO_FP to lower!");

	  const bool ResultInSSE = isScalarFPTypeInSSEReg(Op.getValueType());
	  const bool SrcIsI64 = SrcVT == MVT::i64;

	  // These are really Legal; return the operand so the caller accepts it as
	  // Legal.
	  if (ResultInSSE &&
	      (SrcVT == MVT::i32 || (SrcIsI64 && Subtarget.is64Bit())))
	    return Op;

	  SDValue ValueToStore = Op.getOperand(0);
	  if (SrcIsI64 && ResultInSSE && !Subtarget.is64Bit()) {
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	  }

	  // Spill the integer to a dedicated stack slot and FILD it from there.
	  MachineFunction &MF = DAG.getMachineFunction();
	  const unsigned SlotBytes = SrcVT.getSizeInBits() / 8;
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  const int SlotFI =
	      MF.getFrameInfo().CreateStackObject(SlotBytes, SlotBytes, false);
	  SDValue StackSlot = DAG.getFrameIndex(SlotFI, PtrVT);
	  SDValue Chain = DAG.getStore(
	      DAG.getEntryNode(), dl, ValueToStore, StackSlot,
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SlotFI));
	  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
	}

	/// Build an x87 integer load (FILD) of an integer of type \p SrcVT from
	/// \p StackSlot, chained after \p Chain, producing a value of Op's type.
	///
	/// \p StackSlot is either a frame index or a load node; in the latter case the
	/// load's memory operand and its address operand (operand 1) are reused.
	/// When Op's type lives in an SSE register the FILD_FLAG result is stored to a
	/// fresh stack slot with FST and reloaded, and that load is returned.
	SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
	SDValue StackSlot,
	SelectionDAG &DAG) const {
	// Build the FILD
	SDLoc DL(Op);
	SDVTList Tys;
	bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
	// The SSE path produces f64 plus a glue result so the FST below can be
	// glued to the FILD; otherwise the FILD yields Op's type directly.
	if (useSSE)
	Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
	else
	Tys = DAG.getVTList(Op.getValueType(), MVT::Other);

	unsigned ByteSize = SrcVT.getSizeInBits()/8;

	// Pick the memory operand describing the integer being loaded.
	FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
	MachineMemOperand *MMO;
	if (FI) {
	// Frame index: describe a ByteSize-wide, ByteSize-aligned fixed-stack load.
	int SSFI = FI->getIndex();
	MMO = DAG.getMachineFunction().getMachineMemOperand(
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	MachineMemOperand::MOLoad, ByteSize, ByteSize);
	} else {
	// Not a frame index: StackSlot must be a load (cast<> asserts otherwise).
	// Reuse its memory operand and load from its address operand instead.
	MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
	StackSlot = StackSlot.getOperand(1);
	}
	SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
	SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
	X86ISD::FILD, DL,
	Tys, Ops, SrcVT, MMO);

	if (useSSE) {
	// Results of FILD_FLAG: 0 = value, 1 = chain, 2 = glue.
	Chain = Result.getValue(1);
	SDValue InFlag = Result.getValue(2);

	// FIXME: Currently the FST is flagged to the FILD_FLAG. This
	// shouldn't be necessary except that RFP cannot be live across
	// multiple blocks. When stackifier is fixed, they can be uncoupled.
	MachineFunction &MF = DAG.getMachineFunction();
	unsigned SSFISize = Op.getValueSizeInBits()/8;
	int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
	auto PtrVT = getPointerTy(MF.getDataLayout());
	// Note: StackSlot, Ops and MMO below shadow the outer declarations and
	// refer to the new slot sized for Op's result type.
	SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	Tys = DAG.getVTList(MVT::Other);
	SDValue Ops[] = {
	Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
	};
	MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	MachineMemOperand::MOStore, SSFISize, SSFISize);

	// Store the x87 value to the new slot, then reload it as Op's type.
	Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
	Ops, Op.getValueType(), MMO);
	Result = DAG.getLoad(
	Op.getValueType(), DL, Chain, StackSlot,
	MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
	}

	return Result;
	}

	/// 64-bit unsigned integer to double expansion.
	///
	/// The two 32-bit halves of the input are interleaved with the exponent words
	/// 0x43300000 / 0x45300000 so that each half becomes the mantissa of a double
	/// biased by 2^52 or 2^52 * 2^32 (constants per the sketch below). Subtracting
	/// those biases and summing the two lanes gives the result, which is extracted
	/// from element 0.
	SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  // This algorithm is not obvious. Here it is what we're trying to output:
	  /*
	     movq       %rax,  %xmm0
	     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
	     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
	     #ifdef __SSE3__
	       haddpd   %xmm0, %xmm0
	     #else
	       pshufd   $0x4e, %xmm0, %xmm1
	       addpd    %xmm1, %xmm0
	     #endif
	  */

	  SDLoc dl(Op);
	  LLVMContext *Context = DAG.getContext();

	  // Build some magic constants.
	  static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
	  Constant *C0 = ConstantDataVector::get(*Context, CV0);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());
	  SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);

	  SmallVector<Constant*,2> CV1;
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4330000000000000ULL))));
	  CV1.push_back(
	    ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
	                                      APInt(64, 0x4530000000000000ULL))));
	  Constant *C1 = ConstantVector::get(CV1);
	  SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);

	  // Load the 64-bit value into an XMM register.
	  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
	                            Op.getOperand(0));
	  SDValue CLod0 =
	      DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  // Interleave the input's low lanes with the exponent words (punpckldq).
	  SDValue Unpck1 =
	      getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);

	  // The second constant load is chained after the first one.
	  SDValue CLod1 =
	      DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
	                  MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
	                  /* Alignment = */ 16);
	  SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
	  SDValue Result;

	  if (Subtarget.hasSSE3()) {
	    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
	    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
	  } else {
	    // Swap the two f64 lanes (as a v4i32 shuffle) and add to the original.
	    SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
	    SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
	    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
	                         DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
	  }

	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// 32-bit unsigned integer to floating-point expansion.
	///
	/// The zero-extended input is OR'ed into the bit pattern of the f64 bias
	/// constant 0x4330000000000000; subtracting that bias again leaves the input
	/// as an f64, which is then rounded or extended to the destination type.
	SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc dl(Op);

	  // FP constant used both as the OR pattern and as the subtrahend.
	  SDValue BiasFP = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
	                                     MVT::f64);

	  // Place the 32-bit input in lane 0 of a v4i32 ...
	  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
	                              Op.getOperand(0));

	  // ... and zero out the upper parts of the register.
	  SDValue ZeroedVec =
	      getShuffleVectorZeroOrUndef(InVec, 0, true, Subtarget, DAG);

	  // Reinterpret the low 64 bits as an f64 scalar.
	  SDValue InAsF64 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                                DAG.getBitcast(MVT::v2f64, ZeroedVec),
	                                DAG.getIntPtrConstant(0, dl));

	  // OR the input bits with the bias bits (done on v2i64).
	  SDValue OrVec = DAG.getNode(
	      ISD::OR, dl, MVT::v2i64,
	      DAG.getBitcast(MVT::v2i64,
	                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, InAsF64)),
	      DAG.getBitcast(MVT::v2i64,
	                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, BiasFP)));
	  SDValue Biased =
	      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	                  DAG.getBitcast(MVT::v2f64, OrVec), DAG.getIntPtrConstant(0, dl));

	  // Subtract the bias.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Converted = DAG.getNode(ISD::FSUB, dl, MVT::f64, Biased, BiasFP);

	  // Adapt the f64 result to the requested destination type.
	  const MVT DestVT = Op.getSimpleValueType();
	  if (DestVT.bitsLT(MVT::f64))
	    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Converted,
	                       DAG.getIntPtrConstant(0, dl));
	  if (DestVT.bitsGT(MVT::f64))
	    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Converted);

	  // Destination is f64 already; no conversion needed.
	  return Converted;
	}

	/// Lower UINT_TO_FP from v2i32 to v2f64; any other result type yields an
	/// empty SDValue.
	///
	/// With AVX-512 a single CVTUI2P is emitted. Otherwise each element is split
	/// into 16-bit halves, both halves are converted with the signed CVTSI2P, and
	/// the result is hi * 2^16 + lo.
	static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget, SDLoc &DL) {
	  if (Op.getSimpleValueType() != MVT::v2f64)
	    return SDValue();

	  SDValue Src = Op.getOperand(0);
	  assert(Src.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");

	  // Legalize to v4i32 type.
	  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                             DAG.getUNDEF(MVT::v2i32));

	  if (Subtarget.hasAVX512())
	    return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, Wide);

	  // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
	  // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
	  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::v4i32);
	  SDValue Low16Mask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);

	  // Two to the power of half-word-size.
	  SDValue Scale = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);

	  // High half moved down, low half with its upper bits cleared.
	  SDValue HiInt = DAG.getNode(ISD::SRL, DL, MVT::v4i32, Wide, ShiftBy16);
	  SDValue LoInt = DAG.getNode(ISD::AND, DL, MVT::v4i32, Wide, Low16Mask);

	  SDValue HiFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HiInt);
	  SDValue HiScaled = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, HiFP, Scale);
	  SDValue LoFP = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LoInt);

	  // Add the two halves.
	  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, HiScaled, LoFP);
	}

	/// Lower UINT_TO_FP for v4i32 -> v4f32 and v8i32 -> v8f32.
	///
	/// Returns an empty SDValue when unsafe-fp-math is enabled or when the result
	/// type is not the matching float vector type. Otherwise the low and high
	/// 16-bit halves of every element are merged into the float bit patterns
	/// 0x4b000000 and 0x53000000 respectively, the combined bias is removed from
	/// the high part, and the two parts are added.
	static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // The algorithm is the following:
	  // #ifdef __SSE4_1__
	  //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	  //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	  //                                 (uint4) 0x53000000, 0xaa);
	  // #else
	  //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	  //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	  // #endif
	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  //     return (float4) lo + fhi;

	  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
	  // reassociate the two FADDs, and if we do that, the algorithm fails
	  // spectacularly (PR24512).
	  // FIXME: If we ever have some kind of Machine FMF, this should be marked
	  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
	  // there's also the MachineCombiner reassociations happening on Machine IR.
	  if (DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  SDLoc DL(Op);
	  SDValue V = Op->getOperand(0);
	  MVT VecIntVT = V.getSimpleValueType();
	  bool Is128 = VecIntVT == MVT::v4i32;
	  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
	  // If we convert to something else than the supported type, e.g., to v4f64,
	  // abort early.
	  if (VecFloatVT != Op->getSimpleValueType(0))
	    return SDValue();

	  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
	         "Unsupported custom type");

	  // In the #ifdef/#else code, we have in common:
	  // - The vector of constants:
	  // -- 0x4b000000
	  // -- 0x53000000
	  // - A shift:
	  // -- v >> 16

	  // Create the splat vector for 0x4b000000.
	  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
	  // Create the splat vector for 0x53000000.
	  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

	  // Create the right shift.
	  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
	  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);

	  SDValue Low, High;
	  if (Subtarget.hasSSE41()) {
	    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
	    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
	    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
	    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
	    // Low will be bitcasted right away, so do not bother bitcasting back to its
	    // original type.
	    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
	                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
	    //                                 (uint4) 0x53000000, 0xaa);
	    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
	    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
	    // High will be bitcasted right away, so do not bother bitcasting back to
	    // its original type.
	    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
	                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
	  } else {
	    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
	    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
	    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
	    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

	    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
	    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
	  }

	  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
	  SDValue VecCstFAdd = DAG.getConstantFP(
	      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);

	  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
	  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue FHigh =
	      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
	  //     return (float4) lo + fhi;
	  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
	  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
	}

	/// Custom lowering of UINT_TO_FP for vector sources.
	///
	/// Narrow element types are zero-extended to a wider integer vector and
	/// converted again; v2i32 and v4i32/v8i32 are delegated to their dedicated
	/// helpers. Any other source type is unreachable.
	SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDValue N0 = Op.getOperand(0);
	  MVT SrcVT = N0.getSimpleValueType();
	  SDLoc dl(Op);

	  // Zero-extend the source to WideVT, then convert with ConvOpc.
	  auto ConvertZExt = [&](unsigned ConvOpc, MVT WideVT) {
	    SDValue Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, N0);
	    return DAG.getNode(ConvOpc, dl, Op.getValueType(), Wide);
	  };

	  // Mask vectors: v2i1 widens to v2i64, everything else to an i32 vector.
	  if (SrcVT.getVectorElementType() == MVT::i1) {
	    MVT WideVT =
	        SrcVT == MVT::v2i1
	            ? MVT(MVT::v2i64)
	            : MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
	    return ConvertZExt(ISD::UINT_TO_FP, WideVT);
	  }

	  switch (SrcVT.SimpleTy) {
	  case MVT::v2i32:
	    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
	  case MVT::v4i32:
	  case MVT::v8i32:
	    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
	  case MVT::v4i8:
	  case MVT::v4i16:
	  case MVT::v8i8:
	  case MVT::v8i16:
	    // After zero-extension to i32 the sign bit is clear, so the signed
	    // conversion gives the unsigned result.
	    return ConvertZExt(
	        ISD::SINT_TO_FP,
	        MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()));
	  case MVT::v16i8:
	  case MVT::v16i16:
	    assert(Subtarget.hasAVX512());
	    return ConvertZExt(ISD::UINT_TO_FP, MVT::v16i32);
	  default:
	    llvm_unreachable("Custom UINT_TO_FP is not supported!");
	  }
	}

	/// Custom lowering for ISD::UINT_TO_FP.
	///
	/// In order:
	///  - a source whose sign bit is known zero becomes SINT_TO_FP;
	///  - vector results go to lowerUINT_TO_FP_vec;
	///  - with AVX-512, i32 (and i64 in 64-bit mode) to an SSE-register FP type is
	///    left as-is (returns \p Op);
	///  - i64 -> f64 and i32 sources use the SSE expansions when X86ScalarSSEf64;
	///  - i64 -> f32 on 64-bit targets returns an empty SDValue;
	///  - otherwise the value is stored to a 64-bit stack temporary and loaded
	///    with FILD. For an i64 source the FILD result is corrected in f80 by
	///    adding a fudge constant selected by the input's sign bit, then rounded
	///    to the destination type.
	SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
	  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
	  // the optimization here.
	  if (DAG.SignBitIsZero(N0))
	    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

	  if (Op.getSimpleValueType().isVector())
	    return lowerUINT_TO_FP_vec(Op, DAG);

	  MVT SrcVT = N0.getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
	      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
	    // Conversions from unsigned i32 to f32/f64 are legal,
	    // using VCVTUSI2SS/SD.  Same for i64 in 64-bit mode.
	    return Op;
	  }

	  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i64(Op, DAG);
	  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
	    return LowerUINT_TO_FP_i32(Op, DAG);
	  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
	    return SDValue();

	  // Make a 64-bit buffer, and use it to build an FILD.
	  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
	  if (SrcVT == MVT::i32) {
	    // Store the value in the low word and zero in the high word, so the
	    // 64-bit FILD sees the zero-extended input.
	    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
	    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
	                                  StackSlot, MachinePointerInfo());
	    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
	                                  OffsetSlot, MachinePointerInfo());
	    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
	    return Fild;
	  }

	  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
	  SDValue ValueToStore = Op.getOperand(0);
	  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
	    // Bitcasting to f64 here allows us to do a single 64-bit store from
	    // an SSE register, avoiding the store forwarding penalty that would come
	    // with two 32-bit stores.
	    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
	  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
	                               MachinePointerInfo());
	  // For i64 source, we need to add the appropriate power of 2 if the input
	  // was negative.  This is the same as the optimization in
	  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
	  // we must be careful to do the computation in x87 extended precision, not
	  // in SSE. (The generic code can't know it's OK to do this, or how to.)
	  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
	  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
	      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
	      MachineMemOperand::MOLoad, 8, 8);

	  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
	  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
	  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
	                                         MVT::i64, MMO);

	  // f32 bit pattern of the fudge factor added when the sign bit was set.
	  APInt FF(32, 0x5F800000ULL);

	  // Check whether the sign bit is set.
	  SDValue SignSet = DAG.getSetCC(
	      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
	      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

	  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
	  SDValue FudgePtr = DAG.getConstantPool(
	      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

	  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
	  SDValue Zero = DAG.getIntPtrConstant(0, dl);
	  SDValue Four = DAG.getIntPtrConstant(4, dl);
	  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
	  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

	  // Load the value out, extending it from f32 to f80.
	  // FIXME: Avoid the extend by constructing the right constant pool?
	  SDValue Fudge = DAG.getExtLoad(
	      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
	      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
	      /* Alignment = */ 4);
	  // Extend everything to 80 bits to force it to be done on x87.
	  // TODO: Are there any fast-math-flags to propagate here?
	  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
	  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
	                     DAG.getIntPtrConstant(0, dl));
	}

	// Lower a scalar FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) node.
	//
	// If the given operation is legal, or has an fp128 or f16 source (which
	// needs to be promoted to f32), just return an <SDValue(), SDValue()> pair.
	// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
	// to i16, i32 or i64, and we lower it to a legal sequence.
	// If lowered to the final integer result we return a <result, SDValue()> pair.
	// Otherwise we lower it to a sequence ending with a FIST, return a
	// <FIST, StackSlot> pair, and the caller is responsible for loading
	// the final integer result from StackSlot.
	//
	// IsReplace is only consulted on the unsigned-i64 fixup path for 32-bit
	// targets: when set the two i32 halves are joined with BUILD_PAIR, otherwise
	// they are returned through getMergeValues.
	std::pair<SDValue,SDValue>
	X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
	                                   bool IsSigned, bool IsReplace) const {
	  SDLoc DL(Op);

	  EVT DstTy = Op.getValueType();
	  EVT TheVT = Op.getOperand(0).getValueType();
	  auto PtrVT = getPointerTy(DAG.getDataLayout());

	  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
	    // f16 must be promoted before using the lowering in this routine.
	    // fp128 does not use this lowering.
	    return std::make_pair(SDValue(), SDValue());
	  }

	  // If using FIST to compute an unsigned i64, we'll need some fixup
	  // to handle values above the maximum signed i64. A FIST is always
	  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
	  bool UnsignedFixup = !IsSigned &&
	                       DstTy == MVT::i64 &&
	                       (!Subtarget.is64Bit() ||
	                        !isScalarFPTypeInSSEReg(TheVT));

	  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
	    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
	    // The low 32 bits of the fist result will have the correct uint32 result.
	    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
	    DstTy = MVT::i64;
	  }

	  assert(DstTy.getSimpleVT() <= MVT::i64 &&
	         DstTy.getSimpleVT() >= MVT::i16 &&
	         "Unknown FP_TO_INT to lower!");

	  // These are really Legal.
	  if (DstTy == MVT::i32 &&
	      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	    return std::make_pair(SDValue(), SDValue());
	  if (Subtarget.is64Bit() &&
	      DstTy == MVT::i64 &&
	      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
	    return std::make_pair(SDValue(), SDValue());

	  // We lower FP->int64 into FISTP64 followed by a load from a temporary
	  // stack slot.
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned MemSize = DstTy.getSizeInBits()/8;
	  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

	  // Pick the store-to-memory conversion node matching the result width.
	  unsigned Opc;
	  switch (DstTy.getSimpleVT().SimpleTy) {
	  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
	  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
	  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
	  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
	  }

	  SDValue Chain = DAG.getEntryNode();
	  SDValue Value = Op.getOperand(0);
	  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

	  if (UnsignedFixup) {
	    //
	    // Conversion to unsigned i64 is implemented with a select,
	    // depending on whether the source value fits in the range
	    // of a signed i64. Let Thresh be the FP equivalent of
	    // 0x8000000000000000ULL.
	    //
	    // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
	    // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
	    // Fist-to-mem64 FistSrc
	    // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
	    // to XOR'ing the high 32 bits with Adjust.
	    //
	    // Being a power of 2, Thresh is exactly representable in all FP formats.
	    // For X87 we'd like to use the smallest FP type for this constant, but
	    // for DAG type consistency we have to match the FP operand type.

	    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
	    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
	    bool LosesInfo = false;
	    if (TheVT == MVT::f64)
	      // The rounding mode is irrelevant as the conversion should be exact.
	      Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
	                              &LosesInfo);
	    else if (TheVT == MVT::f80)
	      Status = Thresh.convert(APFloat::x87DoubleExtended(),
	                              APFloat::rmNearestTiesToEven, &LosesInfo);

	    assert(Status == APFloat::opOK && !LosesInfo &&
	           "FP conversion should have been exact");

	    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

	    // A single (Value < Thresh) comparison drives both selects below; Value
	    // is only overwritten after the comparison and the FSUB have been built.
	    EVT CmpVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                   TheVT);
	    SDValue Cmp = DAG.getSetCC(DL, CmpVT, Value, ThreshVal, ISD::SETLT);
	    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
	                           DAG.getConstant(0, DL, MVT::i32),
	                           DAG.getConstant(0x80000000, DL, MVT::i32));
	    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
	    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
	  }

	  // FIXME This causes a redundant load/store if the SSE-class value is already
	  // in memory, such as if it is on the callstack.
	  if (isScalarFPTypeInSSEReg(TheVT)) {
	    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
	    // Spill the SSE value to the stack slot and reload it with an X86ISD::FLD
	    // so the FIST below has its operand available.
	    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
	                         MachinePointerInfo::getFixedStack(MF, SSFI));
	    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
	    SDValue Ops[] = {
	      Chain, StackSlot, DAG.getValueType(TheVT)
	    };

	    MachineMemOperand *MMO =
	        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                                MachineMemOperand::MOLoad, MemSize, MemSize);
	    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
	    Chain = Value.getValue(1);
	    // Use a fresh stack object for the integer result.
	    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
	    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
	  }

	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                              MachineMemOperand::MOStore, MemSize, MemSize);

	  if (UnsignedFixup) {

	    // Insert the FIST, load its result as two i32's,
	    // and XOR the high i32 with Adjust.

	    SDValue FistOps[] = { Chain, Value, StackSlot };
	    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	                                           FistOps, DstTy, MMO);

	    SDValue Low32 =
	        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
	    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

	    SDValue High32 =
	        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
	    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

	    if (Subtarget.is64Bit()) {
	      // Join High32 and Low32 into a 64-bit result.
	      // (High32 << 32) | Low32
	      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
	      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
	      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
	                           DAG.getConstant(32, DL, MVT::i8));
	      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
	      return std::make_pair(Result, SDValue());
	    }

	    SDValue ResultOps[] = { Low32, High32 };

	    SDValue pair = IsReplace
	      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
	      : DAG.getMergeValues(ResultOps, DL);
	    return std::make_pair(pair, SDValue());
	  } else {
	    // Build the FP_TO_INT*_IN_MEM
	    SDValue Ops[] = { Chain, Value, StackSlot };
	    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
	                                           Ops, DstTy, MMO);
	    return std::make_pair(FIST, StackSlot);
	  }
	}

	/// Lower a vector extend node \p Op for AVX targets.
	///
	/// Returns a plain ISD::ZERO_EXTEND for 512-bit results or i1 source elements,
	/// an empty SDValue for type pairs other than v16i8->v16i16, v8i16->v8i32 and
	/// v4i32->v4i64, an X86ISD::VZEXT when the subtarget has Int256, and otherwise
	/// an unpack-low/unpack-high pair concatenated into the wide result.
	static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc dl(Op);

	  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
	    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);

	  // Optimize vectors in AVX mode:
	  //
	  // v8i16 -> v8i32
	  // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
	  // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
	  // Concat upper and lower parts.
	  //
	  // v4i32 -> v4i64
	  // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
	  // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
	  // Concat upper and lower parts.
	  //

	  // Only the three doubling extends listed below are handled here.
	  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
	      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
	      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

	  // Interleave the input with zeros for ZERO_EXTEND; for any other opcode the
	  // upper halves are left undefined.
	  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
	  SDValue Undef = DAG.getUNDEF(InVT);
	  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
	  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
	  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);

	  // Half-width result type: same element type as VT, half the element count.
	  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
	                             VT.getVectorNumElements()/2);

	  OpLo = DAG.getBitcast(HVT, OpLo);
	  OpHi = DAG.getBitcast(HVT, OpHi);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	/// Lower a vector ZERO_EXTEND for AVX-512.
	///
	/// 512-bit results with non-i1 source elements become X86ISD::VZEXT when the
	/// result has 8 or 16 elements or the subtarget has BWI. Other non-i1 sources
	/// yield an empty SDValue. i1 sources are lowered as a select between
	/// constant 1 and 0, widened to 512 bits and truncated back with
	/// X86ISD::VTRUNC when the result is narrower than 512 bits and VLX is
	/// unavailable.
	static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
	                   const X86Subtarget &Subtarget, SelectionDAG &DAG) {
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  SDLoc DL(Op);
	  unsigned NumElts = VT.getVectorNumElements();

	  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
	      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
	    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);

	  if (InVT.getVectorElementType() != MVT::i1)
	    return SDValue();

	  // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
	  MVT ExtVT = VT;
	  if (!VT.is512BitVector() && !Subtarget.hasVLX())
	    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

	  SDValue One =
	      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
	  SDValue Zero =
	      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);

	  // Each mask bit picks 1 or 0 for its lane.
	  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
	  if (VT == ExtVT)
	    return SelectedVal;
	  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
	}

	/// Custom-lower an ANY_EXTEND node. The only strategy tried is
	/// LowerAVXExtend, and only when the subtarget reports hasFp256(); in every
	/// other case an empty SDValue is returned.
	static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  if (!Subtarget.hasFp256())
	    return SDValue();

	  // LowerAVXExtend hands back an empty SDValue for type pairs it does not
	  // handle, so its result can be forwarded unchanged.
	  SDValue Lowered = LowerAVXExtend(Op, DAG, Subtarget);
	  return Lowered ? Lowered : SDValue();
	}

	/// Custom-lower a ZERO_EXTEND node.
	///
	/// 512-bit results and i1 source elements are forwarded to
	/// LowerZERO_EXTEND_AVX512. Otherwise LowerAVXExtend is tried when the
	/// subtarget reports hasFp256(). An empty SDValue is returned when neither
	/// path produces a node.
	static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT SVT = In.getSimpleValueType();

	  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
	    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);

	  if (Subtarget.hasFp256())
	    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
	      return Res;

	  // A 128-bit -> 256-bit extend with matching element counts is not expected
	  // to fall through to here.
	  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
	         VT.getVectorNumElements() != SVT.getVectorNumElements());
	  return SDValue();
	}

	/// Helper to recursively truncate vector elements in half with PACKSS.
	/// It makes use of the fact that vector comparison results will be all-zeros
	/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
	/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
	/// within each 128-bit lane.
	///
	/// Returns \p In unchanged when it already has type \p DstVT, and an empty
	/// SDValue when the subtarget lacks SSE2, has AVX512, or the source/result
	/// sizes are not multiples of 256/128 bits respectively.
	static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
	                                               const SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  EVT SrcVT = In.getValueType();

	  // No truncation required, we might get here due to recursive calls.
	  if (SrcVT == DstVT)
	    return In;

	  // We only support vector truncation to 128bits or greater from a
	  // 256bits or greater source.
	  if ((DstVT.getSizeInBits() % 128) != 0)
	    return SDValue();
	  if ((SrcVT.getSizeInBits() % 256) != 0)
	    return SDValue();

	  unsigned NumElems = SrcVT.getVectorNumElements();
	  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
	  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");

	  // Scalar type with half the source element width: the element type
	  // produced by one packing stage.
	  EVT PackedSVT =
	      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);

	  // Extract lower/upper subvectors.
	  unsigned NumSubElts = NumElems / 2;
	  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
	  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
	  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

	  // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
	  if (SrcVT.is256BitVector()) {
	    Lo = DAG.getBitcast(MVT::v8i16, Lo);
	    Hi = DAG.getBitcast(MVT::v8i16, Hi);
	    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
	    return DAG.getBitcast(DstVT, Res);
	  }

	  // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
	  // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
	  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
	    Lo = DAG.getBitcast(MVT::v16i16, Lo);
	    Hi = DAG.getBitcast(MVT::v16i16, Hi);
	    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);

	    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
	    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
	    Res = DAG.getBitcast(MVT::v4i64, Res);
	    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

	    if (DstVT.is256BitVector())
	      return DAG.getBitcast(DstVT, Res);

	    // If 512bit -> 128bit truncate another stage.
	    EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
	    Res = DAG.getBitcast(PackedVT, Res);
	    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
	  }

	  // Recursively pack lower/upper subvectors, concat result and pack again.
	  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
	  EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
	  Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
	  Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);

	  PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
	  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
	  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
	}

	/// Lower a vector TRUNCATE whose result element type is i1.
	///
	/// The low bit of every source element is shifted into the sign-bit position
	/// and then turned into a mask: X86ISD::CVT2MASK for 8/16-bit elements when
	/// BWI is available, X86ISD::TESTM otherwise (after sign-extending 8/16-bit
	/// elements to a 512-bit vector).
	static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
	                                  const X86Subtarget &Subtarget) {

	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();

	  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

	  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
	  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
	  if (InVT.getScalarSizeInBits() <= 16) {
	    if (Subtarget.hasBWI()) {
	      // legal, will go to VPMOVB2M, VPMOVW2M
	      // Shift packed bytes not supported natively, bitcast to word
	      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
	      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
	                                      DAG.getBitcast(ExtVT, In),
	                                      DAG.getConstant(ShiftInx, DL, ExtVT));
	      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
	      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
	    }
	    // Use TESTD/Q, extended vector to packed dword/qword.
	    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
	           "Unexpected vector type.");
	    unsigned NumElts = InVT.getVectorNumElements();
	    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
	    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
	    InVT = ExtVT;
	    // The shift amount must track the widened element size.
	    ShiftInx = InVT.getScalarSizeInBits() - 1;
	  }

	  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
	                                  DAG.getConstant(ShiftInx, DL, InVT));
	  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
	}

	/// Custom-lower an ISD::TRUNCATE node.
	///
	/// Handles, in order: scalar truncation to i1, vector truncation to i1
	/// elements, AVX-512 VTRUNC, PACKSS-based truncation of all-sign-bit inputs,
	/// the v4i64->v4i32 and v8i32->v8i16 special cases, and finally a generic
	/// 256-bit -> 128-bit shuffle. Returns an empty SDValue when none applies.
	SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  MVT InVT = In.getSimpleValueType();

	  if (VT == MVT::i1) {
	    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
	           "Invalid scalar TRUNCATE operation");
	    if (InVT.getSizeInBits() >= 32)
	      return SDValue();
	    // Narrower sources are first widened to i32 and truncated from there.
	    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
	    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
	  }
	  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
	         "Invalid TRUNCATE operation");

	  if (VT.getVectorElementType() == MVT::i1)
	    return LowerTruncateVecI1(Op, DAG, Subtarget);

	  // vpmovqb/w/d, vpmovdb/w, vpmovwb
	  if (Subtarget.hasAVX512()) {
	    // word to byte only under BWI
	    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
	      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
	                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
	    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
	  }

	  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
	  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
	    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
	      return V;

	  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
	    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
	    if (Subtarget.hasInt256()) {
	      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
	      In = DAG.getBitcast(MVT::v8i32, In);
	      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
	      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
	                         DAG.getIntPtrConstant(0, DL));
	    }

	    // Without AVX2: split into two v2i64 halves and pick the even i32 lanes.
	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(0, DL));
	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                               DAG.getIntPtrConstant(2, DL));
	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
	    static const int ShufMask[] = {0, 2, 4, 6};
	    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
	  }

	  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
	    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
	    if (Subtarget.hasInt256()) {
	      In = DAG.getBitcast(MVT::v32i8, In);

	      // The PSHUFB mask:
	      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
	                                      -1, -1, -1, -1, -1, -1, -1, -1,
	                                      16, 17, 20, 21, 24, 25, 28, 29,
	                                      -1, -1, -1, -1, -1, -1, -1, -1 };
	      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
	      In = DAG.getBitcast(MVT::v4i64, In);

	      // Gather the two populated 64-bit chunks into the low 128 bits.
	      static const int ShufMask2[] = {0, 2, -1, -1};
	      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
	      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
	                       DAG.getIntPtrConstant(0, DL));
	      return DAG.getBitcast(VT, In);
	    }

	    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(0, DL));

	    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
	                               DAG.getIntPtrConstant(4, DL));

	    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
	    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

	    // The PSHUFB mask:
	    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
	                                    -1, -1, -1, -1, -1, -1, -1, -1};

	    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
	    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

	    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
	    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

	    // The MOVLHPS Mask:
	    static const int ShufMask2[] = {0, 1, 4, 5};
	    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
	    return DAG.getBitcast(MVT::v8i16, res);
	  }

	  // Handle truncation of V256 to V128 using shuffles.
	  if (!VT.is128BitVector() || !InVT.is256BitVector())
	    return SDValue();

	  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

	  unsigned NumElems = VT.getVectorNumElements();
	  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

	  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
	  // Prepare truncation shuffle mask
	  for (unsigned i = 0; i != NumElems; ++i)
	    MaskVec[i] = i * 2;
	  In = DAG.getBitcast(NVT, In);
	  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Custom-lower FP_TO_SINT / FP_TO_UINT.
	///
	/// For vectors only v2f32 -> v2i64 is handled (by widening the source to
	/// v4f32); any other vector case returns an empty SDValue. Scalars are
	/// delegated to FP_TO_INTHelper: \p Op itself is returned when the helper
	/// produces nothing, the result is loaded from the stack slot when the helper
	/// returns one, and otherwise the helper's first value is the result.
	SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
	  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
	  MVT VT = Op.getSimpleValueType();

	  if (VT.isVector()) {
	    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	    SDValue Src = Op.getOperand(0);
	    SDLoc dl(Op);
	    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
	      // Pad the source with an undef upper half so it becomes a v4f32.
	      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
	                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	                                     DAG.getUNDEF(MVT::v2f32)));
	    }

	    return SDValue();
	  }

	  assert(!VT.isVector());

	  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
	    IsSigned, /*IsReplace=*/ false);
	  SDValue FIST = Vals.first, StackSlot = Vals.second;
	  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
	  if (!FIST.getNode())
	    return Op;

	  if (StackSlot.getNode())
	    // Load the result.
	    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

	  // The node is the result.
	  return FIST;
	}

	/// Custom-lower FP_EXTEND from v2f32: the source is padded with an undef
	/// v2f32 to form a v4f32, which is then fed to X86ISD::VFPEXT producing the
	/// node's result type.
	static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT ResVT = Op.getSimpleValueType();
	  SDValue Src = Op.getOperand(0);
	  MVT SrcVT = Src.getSimpleValueType();

	  assert(SrcVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

	  // Widen first, then extend the widened vector.
	  SDValue Widened = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	                                Src, DAG.getUNDEF(SrcVT));
	  return DAG.getNode(X86ISD::VFPEXT, dl, ResVT, Widened);
	}

	/// The only differences between FABS and FNEG are the mask and the logic op.
	/// FNEG also has a folding opportunity for FNEG(FABS(x)).
	///
	/// FABS is lowered to FAND with a 0x7f... mask, FNEG to FXOR with a 0x80...
	/// mask, and FNEG(FABS(x)) to FOR of x with the 0x80... mask. Scalar f32/f64
	/// operands are moved into a 128-bit vector for the logic op and extracted
	/// again afterwards. A FABS that has an FNEG user is returned unchanged.
	static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
	  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
	         "Wrong opcode for lowering FABS or FNEG.");

	  bool IsFABS = (Op.getOpcode() == ISD::FABS);

	  // If this is a FABS and it has an FNEG user, bail out to fold the combination
	  // into an FNABS. We'll lower the FABS after that if it is still in use.
	  if (IsFABS)
	    for (SDNode *User : Op->uses())
	      if (User->getOpcode() == ISD::FNEG)
	        return Op;

	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  bool IsF128 = (VT == MVT::f128);

	  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
	  // decide if we should generate a 16-byte constant mask when we only need 4 or
	  // 8 bytes for the scalar case.

	  MVT LogicVT;
	  MVT EltVT;

	  if (VT.isVector()) {
	    LogicVT = VT;
	    EltVT = VT.getVectorElementType();
	  } else if (IsF128) {
	    // SSE instructions are used for optimized f128 logical operations.
	    LogicVT = MVT::f128;
	    EltVT = VT;
	  } else {
	    // There are no scalar bitwise logical SSE/AVX instructions, so we
	    // generate a 16-byte vector constant and logic op even for the scalar case.
	    // Using a 16-byte mask allows folding the load of the mask with
	    // the logic op, so it can save (~4 bytes) on code size.
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
	    EltVT = VT;
	  }

	  unsigned EltBits = EltVT.getSizeInBits();
	  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
	  APInt MaskElt =
	    IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
	  const fltSemantics &Sem =
	      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
	          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
	  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

	  SDValue Op0 = Op.getOperand(0);
	  // FNEG(FABS(x)) only needs the sign bit set on x, so operate on x directly.
	  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
	  unsigned LogicOp =
	    IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
	  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

	  if (VT.isVector() || IsF128)
	    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

	  // For the scalar case extend to a 128-bit vector, perform the logic op,
	  // and extract the scalar result back out.
	  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
	  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
	                     DAG.getIntPtrConstant(0, dl));
	}

	/// Custom-lower FCOPYSIGN as (Mag & ~SignMask) | (Sign & SignMask).
	///
	/// The sign operand is first extended or rounded to the result type. Scalar
	/// f32/f64 values are processed inside a 128-bit vector and extracted again
	/// at the end; f128 and real vector types use their own type for the logic
	/// ops. A constant magnitude has its sign cleared at compile time instead of
	/// through an FAND.
	static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue Mag = Op.getOperand(0);
	  SDValue Sign = Op.getOperand(1);
	  SDLoc dl(Op);

	  // If the sign operand is smaller, extend it first.
	  MVT VT = Op.getSimpleValueType();
	  if (Sign.getSimpleValueType().bitsLT(VT))
	    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

	  // And if it is bigger, shrink it first.
	  if (Sign.getSimpleValueType().bitsGT(VT))
	    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));

	  // At this point the operands and the result should have the same
	  // type, and that won't be f80 since that is not custom lowered.
	  bool IsF128 = (VT == MVT::f128);
	  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
	          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
	          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
	         "Unexpected type in LowerFCOPYSIGN");

	  MVT EltVT = VT.getScalarType();
	  const fltSemantics &Sem =
	      EltVT == MVT::f64 ? APFloat::IEEEdouble()
	                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

	  // Perform all scalar logic operations as 16-byte vectors because there are no
	  // scalar FP logic instructions in SSE.
	  // TODO: This isn't necessary. If we used scalar types, we might avoid some
	  // unnecessary splats, but we might miss load folding opportunities. Should
	  // this decision be based on OptimizeForSize?
	  bool IsFakeVector = !VT.isVector() && !IsF128;
	  MVT LogicVT = VT;
	  if (IsFakeVector)
	    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

	  // The mask constants are automatically splatted for vector types.
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();
	  SDValue SignMask = DAG.getConstantFP(
	      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
	  SDValue MagMask = DAG.getConstantFP(
	      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

	  // First, clear all bits but the sign bit from the second operand (sign).
	  if (IsFakeVector)
	    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
	  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

	  // Next, clear the sign bit from the first operand (magnitude).
	  // TODO: If we had general constant folding for FP logic ops, this check
	  // wouldn't be necessary.
	  SDValue MagBits;
	  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
	    APFloat APF = Op0CN->getValueAPF();
	    APF.clearSign();
	    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
	  } else {
	    // If the magnitude operand wasn't a constant, we need to AND out the sign.
	    if (IsFakeVector)
	      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
	    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
	  }

	  // OR the magnitude value with the sign bit.
	  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
	  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
	                                          DAG.getIntPtrConstant(0, dl));
	}

	/// Custom-lower FGETSIGN for f32/f64: the scalar is placed in a 128-bit
	/// vector, X86ISD::MOVMSK gathers the sign bits into an i32, and the value is
	/// converted to the result type and masked down to bit 0.
	static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
	  SDValue N0 = Op.getOperand(0);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  MVT OpVT = N0.getSimpleValueType();
	  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
	         "Unexpected type for FGETSIGN");

	  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
	  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
	  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
	  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
	  Res = DAG.getZExtOrTrunc(Res, dl, VT);
	  // Only lane 0 holds the scalar, so keep just its bit.
	  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
	  return Res;
	}

	// Check whether an OR'd tree is PTEST-able.
	//
	// The tree rooted at Op must consist solely of ORs whose leaves are
	// EXTRACT_VECTOR_ELT nodes with constant indices, taken from 128-bit or
	// 256-bit vectors of one common type, and every element of every source
	// vector must be extracted. On success the source vectors are OR'd together
	// and an X86ISD::PTEST of the combined value against itself is returned;
	// otherwise an empty SDValue is returned.
	static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  if (!Op->hasOneUse())
	    return SDValue();

	  SDNode *N = Op.getNode();
	  SDLoc DL(N);

	  SmallVector<SDValue, 8> Opnds;
	  // Maps each source vector to a bitmask of the element indices extracted
	  // from it. 64 bits are used so that shifting by the element index (up to
	  // 32 elements for v32i8) stays well defined.
	  DenseMap<SDValue, uint64_t> VecInMap;
	  SmallVector<SDValue, 8> VecIns;
	  EVT VT = MVT::Other;

	  // Recognize a special case where a vector is casted into wide integer to
	  // test all 0s.
	  Opnds.push_back(N->getOperand(0));
	  Opnds.push_back(N->getOperand(1));

	  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
	    // Copy the operand out of the worklist: the push_backs below may
	    // reallocate Opnds, which would invalidate a pointer or iterator into it.
	    SDValue Cur = Opnds[Slot];
	    // BFS traverse all OR'd operands.
	    if (Cur.getOpcode() == ISD::OR) {
	      Opnds.push_back(Cur.getOperand(0));
	      Opnds.push_back(Cur.getOperand(1));
	      // Re-evaluate the number of nodes to be traversed.
	      e += 2; // 2 more nodes (LHS and RHS) are pushed.
	      continue;
	    }

	    // Quit if a non-EXTRACT_VECTOR_ELT
	    if (Cur.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    // Quit if without a constant index.
	    auto *IdxC = dyn_cast<ConstantSDNode>(Cur.getOperand(1));
	    if (!IdxC)
	      return SDValue();

	    SDValue ExtractedFromVec = Cur.getOperand(0);
	    DenseMap<SDValue, uint64_t>::iterator M = VecInMap.find(ExtractedFromVec);
	    if (M == VecInMap.end()) {
	      VT = ExtractedFromVec.getValueType();
	      // Quit if not 128/256-bit vector.
	      if (!VT.is128BitVector() && !VT.is256BitVector())
	        return SDValue();
	      // Quit if the element count does not fit the 64-bit tracking mask.
	      if (VT.getVectorNumElements() >= 64)
	        return SDValue();
	      // Quit if not the same type.
	      if (VecInMap.begin() != VecInMap.end() &&
	          VT != VecInMap.begin()->first.getValueType())
	        return SDValue();
	      M = VecInMap.insert(std::make_pair(ExtractedFromVec, uint64_t(0))).first;
	      VecIns.push_back(ExtractedFromVec);
	    }

	    // Quit on an out-of-range index; it cannot be recorded in the mask.
	    uint64_t EltIdx = IdxC->getZExtValue();
	    if (EltIdx >= ExtractedFromVec.getValueType().getVectorNumElements())
	      return SDValue();
	    M->second |= uint64_t(1) << EltIdx;
	  }

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Not extracted from 128-/256-bit vector.");

	  // One bit per vector element; the element count was checked to be < 64.
	  uint64_t FullMask = (uint64_t(1) << VT.getVectorNumElements()) - 1;

	  for (DenseMap<SDValue, uint64_t>::const_iterator
	         I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
	    // Quit if not all elements are used.
	    if (I->second != FullMask)
	      return SDValue();
	  }

	  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;

	  // Cast all vectors into TestVT for PTEST.
	  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
	    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

	  // If more than one full vector is evaluated, OR them first before PTEST.
	  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
	    // Each iteration will OR 2 nodes and append the result until there is only
	    // 1 node left, i.e. the final OR'd value of all vectors.
	    SDValue LHS = VecIns[Slot];
	    SDValue RHS = VecIns[Slot + 1];
	    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
	  }

	  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
	}

	/// \brief return true if \c Op has a use that doesn't just read flags.
	///
	/// A use counts as flags-only when the user is a BRCOND, a SETCC, or a
	/// SELECT that takes the value as its condition (operand 0). A single-use
	/// TRUNCATE user is looked through to its own user first.
	static bool hasNonFlagsUse(SDValue Op) {
	  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
	       ++UI) {
	    SDNode *User = *UI;
	    unsigned UOpNo = UI.getOperandNo();
	    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
	      // Look pass truncate.
	      // The operand number must be read before User is replaced.
	      UOpNo = User->use_begin().getOperandNo();
	      User = *User->use_begin();
	    }

	    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
	        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
	      return true;
	  }
	  return false;
	}

	// Emit KTEST instruction for bit vectors on AVX-512
	//
	// Only applies when Op is a BITCAST whose source is a vector of i1 with a
	// supported width: 8 or 16 bits with DQI, 32 or 64 bits with BWI. Returns an
	// X86ISD::KTEST of the mask against itself, or an empty SDValue otherwise.
	static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
	                         const X86Subtarget &Subtarget) {
	  if (Op.getOpcode() == ISD::BITCAST) {
	    auto hasKTEST = [&](MVT VT) {
	      unsigned SizeInBits = VT.getSizeInBits();
	      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
	        (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
	    };
	    SDValue Op0 = Op.getOperand(0);
	    MVT Op0VT = Op0.getValueType().getSimpleVT();
	    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
	        hasKTEST(Op0VT))
	      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
	  }
	  return SDValue();
	}

	/// Emit nodes that will be selected as "test Op0,Op0", or something
	/// equivalent.
	SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
	SelectionDAG &DAG) const {
	if (Op.getValueType() == MVT::i1) {
	SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
	DAG.getConstant(0, dl, MVT::i8));
	}
	// CF and OF aren't always set the way we want. Determine which
	// of these we need.
	bool NeedCF = false;
	bool NeedOF = false;
	switch (X86CC) {
	default: break;
	case X86::COND_A: case X86::COND_AE:
	case X86::COND_B: case X86::COND_BE:
	NeedCF = true;
	break;
	case X86::COND_G: case X86::COND_GE:
	case X86::COND_L: case X86::COND_LE:
	case X86::COND_O: case X86::COND_NO: {
	// Check if we really need to set the
	// Overflow flag. If NoSignedWrap is present
	// that is not actually needed.
	switch (Op->getOpcode()) {
	case ISD::ADD:
	case ISD::SUB:
	case ISD::MUL:
	case ISD::SHL:
	if (Op.getNode()->getFlags().hasNoSignedWrap())
	break;
	LLVM_FALLTHROUGH;
	default:
	NeedOF = true;
	break;
	}
	break;
	}
	}
	// See if we can use the EFLAGS value from the operand instead of
	// doing a separate TEST. TEST always sets OF and CF to 0, so unless
	// we prove that the arithmetic won't overflow, we can't use OF or CF.
	if (Op.getResNo() != 0 \|\| NeedOF \|\| NeedCF) {
	// Emit KTEST for bit vectors
	if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	return Node;
	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	unsigned Opcode = 0;
	unsigned NumOperands = 0;

	// Truncate operations may prevent the merge of the SETCC instruction
	// and the arithmetic instruction before it. Attempt to truncate the operands
	// of the arithmetic instruction and use a reduced bit-width instruction.
	bool NeedTruncation = false;
	SDValue ArithOp = Op;
	if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
	SDValue Arith = Op->getOperand(0);
	// Both the trunc and the arithmetic op need to have one user each.
	if (Arith->hasOneUse())
	switch (Arith.getOpcode()) {
	default: break;
	case ISD::ADD:
	case ISD::SUB:
	case ISD::AND:
	case ISD::OR:
	case ISD::XOR: {
	NeedTruncation = true;
	ArithOp = Arith;
	}
	}
	}

	// Sometimes flags can be set either with an AND or with an SRL/SHL
	// instruction. SRL/SHL variant should be preferred for masks longer than this
	// number of bits.
	const int ShiftToAndMaxMaskWidth = 32;
	const bool ZeroCheck = (X86CC == X86::COND_E \|\| X86CC == X86::COND_NE);

	// NOTICE: In the code below we use ArithOp to hold the arithmetic operation
	// which may be the result of a CAST. We use the variable 'Op', which is the
	// non-casted variable when we check for possible users.
	switch (ArithOp.getOpcode()) {
	case ISD::ADD:
	// Due to an isel shortcoming, be conservative if this add is likely to be
	// selected as part of a load-modify-store instruction. When the root node
	// in a match is a store, isel doesn't know how to remap non-chain non-flag
	// uses of other nodes in the match, such as the ADD in this case. This
	// leads to the ADD being left around and reselected, with the result being
	// two adds in the output. Alas, even if none our users are stores, that
	// doesn't prove we're O.K. Ergo, if we have any parents that aren't
	// CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
	// climbing the DAG back to the root, and it doesn't seem to be worth the
	// effort.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() != ISD::CopyToReg &&
	UI->getOpcode() != ISD::SETCC &&
	UI->getOpcode() != ISD::STORE)
	goto default_case;

	if (ConstantSDNode *C =
	dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
	// An add of one will be selected as an INC.
	if (C->isOne() && !Subtarget.slowIncDec()) {
	Opcode = X86ISD::INC;
	NumOperands = 1;
	break;
	}

	// An add of negative one (subtract of one) will be selected as a DEC.
	if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
	Opcode = X86ISD::DEC;
	NumOperands = 1;
	break;
	}
	}

	// Otherwise use a regular EFLAGS-setting add.
	Opcode = X86ISD::ADD;
	NumOperands = 2;
	break;
	case ISD::SHL:
	case ISD::SRL:
	// If we have a constant logical shift that's only used in a comparison
	// against zero turn it into an equivalent AND. This allows turning it into
	// a TEST instruction later.
	if (ZeroCheck && Op->hasOneUse() &&
	isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
	EVT VT = Op.getValueType();
	unsigned BitWidth = VT.getSizeInBits();
	unsigned ShAmt = Op->getConstantOperandVal(1);
	if (ShAmt >= BitWidth) // Avoid undefined shifts.
	break;
	APInt Mask = ArithOp.getOpcode() == ISD::SRL
	? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
	: APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
	if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break;
	Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
	DAG.getConstant(Mask, dl, VT));
	}
	break;

	case ISD::AND:
	// If the primary 'and' result isn't used, don't bother using X86ISD::AND,
	// because a TEST instruction will be better. However, AND should be
	// preferred if the instruction can be combined into ANDN.
	if (!hasNonFlagsUse(Op)) {
	SDValue Op0 = ArithOp->getOperand(0);
	SDValue Op1 = ArithOp->getOperand(1);
	EVT VT = ArithOp.getValueType();
	bool isAndn = isBitwiseNot(Op0) \|\| isBitwiseNot(Op1);
	bool isLegalAndnType = VT == MVT::i32 \|\| VT == MVT::i64;
	bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

	// If we cannot select an ANDN instruction, check if we can replace
	// AND+IMM64 with a shift before giving up. This is possible for masks
	// like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
	if (!isProperAndn) {
	if (!ZeroCheck)
	break;

	assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
	auto *CN = dyn_cast<ConstantSDNode>(Op1);
	if (!CN)
	break;

	const APInt &Mask = CN->getAPIntValue();
	if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
	break; // Prefer TEST instruction.

	unsigned BitWidth = Mask.getBitWidth();
	unsigned LeadingOnes = Mask.countLeadingOnes();
	unsigned TrailingZeros = Mask.countTrailingZeros();

	if (LeadingOnes + TrailingZeros == BitWidth) {
	assert(TrailingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
	break;
	}

	unsigned LeadingZeros = Mask.countLeadingZeros();
	unsigned TrailingOnes = Mask.countTrailingOnes();

	if (LeadingZeros + TrailingOnes == BitWidth) {
	assert(LeadingZeros < VT.getSizeInBits() &&
	"Shift amount should be less than the type width");
	MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
	SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
	Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
	break;
	}

	break;
	}
	}
	LLVM_FALLTHROUGH;
	case ISD::SUB:
	case ISD::OR:
	case ISD::XOR:
	// Due to the ISEL shortcoming noted above, be conservative if this op is
	// likely to be selected as part of a load-modify-store instruction.
	for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
	UE = Op.getNode()->use_end(); UI != UE; ++UI)
	if (UI->getOpcode() == ISD::STORE)
	goto default_case;

	// Otherwise use a regular EFLAGS-setting instruction.
	switch (ArithOp.getOpcode()) {
	default: llvm_unreachable("unexpected operator!");
	case ISD::SUB: Opcode = X86ISD::SUB; break;
	case ISD::XOR: Opcode = X86ISD::XOR; break;
	case ISD::AND: Opcode = X86ISD::AND; break;
	case ISD::OR: {
	if (!NeedTruncation && ZeroCheck) {
	if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
	return EFLAGS;
	}
	Opcode = X86ISD::OR;
	break;
	}
	}

	NumOperands = 2;
	break;
	case X86ISD::ADD:
	case X86ISD::SUB:
	case X86ISD::INC:
	case X86ISD::DEC:
	case X86ISD::OR:
	case X86ISD::XOR:
	case X86ISD::AND:
	return SDValue(Op.getNode(), 1);
	default:
	default_case:
	break;
	}

	// If we found that truncation is beneficial, perform the truncation and
	// update 'Op'.
	if (NeedTruncation) {
	EVT VT = Op.getValueType();
	SDValue WideVal = Op->getOperand(0);
	EVT WideVT = WideVal.getValueType();
	unsigned ConvertedOp = 0;
	// Use a target machine opcode to prevent further DAGCombine
	// optimizations that may separate the arithmetic operations
	// from the setcc node.
	switch (WideVal.getOpcode()) {
	default: break;
	case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
	case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
	case ISD::AND: ConvertedOp = X86ISD::AND; break;
	case ISD::OR: ConvertedOp = X86ISD::OR; break;
	case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
	}

	if (ConvertedOp) {
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
	SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
	SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
	Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
	}
	}
	}

	if (Opcode == 0) {
	// Emit KTEST for bit vectors
	if (auto Node = EmitKTEST(Op, DAG, Subtarget))
	return Node;

	// Emit a CMP with 0, which is the TEST pattern.
	return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
	DAG.getConstant(0, dl, Op.getValueType()));
	}
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

	SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
	DAG.ReplaceAllUsesWith(Op, New);
	return SDValue(New.getNode(), 1);
	}

	/// Build the node whose EFLAGS result corresponds to "cmp Op0, Op1" (or an
	/// equivalent form).
	///
	/// A comparison against a null constant is forwarded to EmitTest. For the
	/// scalar integer types i8/i16/i32/i64 an X86ISD::SUB is emitted and its flag
	/// result (result number 1) is returned; every other type gets X86ISD::CMP.
	SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) const {
	  if (isNullConstant(Op1))
	    return EmitTest(Op0, X86CC, dl, DAG);

	  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
	         "Unexpected comparison operation for MVT::i1 operands");

	  EVT CmpVT = Op0.getValueType();
	  const bool IsScalarInt = CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
	                           CmpVT == MVT::i32 || CmpVT == MVT::i64;

	  // Anything that is not a plain scalar integer uses a regular CMP node.
	  if (!IsScalarInt)
	    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);

	  // 16-bit immediates are to be avoided, so a 16-bit compare involving a
	  // constant is widened to i32 -- unless optimizing for minimum size or
	  // targeting Atom. The extension kind follows the signedness of X86CC.
	  const bool PromoteToI32 =
	      (CmpVT == MVT::i16 &&
	       (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
	      !DAG.getMachineFunction().getFunction()->optForMinSize() &&
	      !Subtarget.isAtom();
	  if (PromoteToI32) {
	    unsigned ExtOpc = ISD::SIGN_EXTEND;
	    if (isX86CCUnsigned(X86CC))
	      ExtOpc = ISD::ZERO_EXTEND;
	    Op0 = DAG.getNode(ExtOpc, dl, MVT::i32, Op0);
	    Op1 = DAG.getNode(ExtOpc, dl, MVT::i32, Op1);
	  }

	  // Emit SUB rather than CMP so that CSE can merge it with a real SUB of the
	  // same operands; only the flags result is handed back.
	  SDVTList SubVTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
	  SDValue FlagSub = DAG.getNode(X86ISD::SUB, dl, SubVTs, Op0, Op1);
	  return SDValue(FlagSub.getNode(), 1);
	}

	/// Rewrite a floating-point X86ISD::CMP for subtargets without CMOV.
	///
	/// \p Cmp is returned untouched when the subtarget has CMOV, when it is not
	/// an X86ISD::CMP, or when either operand is not floating point. Otherwise
	/// the result is wrapped in a sequence that moves the comparison outcome
	/// from FPSW into EFLAGS.
	SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
	                                                 SelectionDAG &DAG) const {
	  // Subtargets without FUCOMI support need floating-point compares
	  // converted; everything else passes through.
	  if (Subtarget.hasCMov())
	    return Cmp;
	  if (Cmp.getOpcode() != X86ISD::CMP)
	    return Cmp;
	  if (!Cmp.getOperand(0).getValueType().isFloatingPoint() ||
	      !Cmp.getOperand(1).getValueType().isFloatingPoint())
	    return Cmp;

	  // Isel will pick FUCOM (result in FPSW) rather than FUCOMI (result in
	  // EFLAGS), so build the transfer sequence explicitly:
	  //   (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
	  SDLoc dl(Cmp);
	  SDValue Cmp16 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
	  SDValue StatusWord = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, Cmp16);
	  SDValue ShiftBy8 = DAG.getConstant(8, dl, MVT::i8);
	  SDValue HighByte16 =
	      DAG.getNode(ISD::SRL, dl, MVT::i16, StatusWord, ShiftBy8);
	  SDValue HighByte = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, HighByte16);

	  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
	  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
	  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, HighByte);
	}

	/// Decide whether SQRT should be kept instead of being replaced by an RSQRT
	/// based sequence.
	///
	/// Returns false if an X86ISD::FRSQRT node for the same input already exists
	/// in \p DAG; otherwise reports the subtarget's fast-FSQRT setting for the
	/// vector or scalar case, depending on the type of \p Op.
	bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
	  EVT OpVT = Op.getValueType();

	  // Never mix SQRT and RSQRT instructions on the same input.
	  const bool HasExistingRsqrt =
	      DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(OpVT), Op) != nullptr;
	  if (HasExistingRsqrt)
	    return false;

	  return OpVT.isVector() ? Subtarget.hasFastVectorFSQRT()
	                         : Subtarget.hasFastScalarFSQRT();
	}

	/// Produce a reciprocal-square-root estimate node for \p Op, if supported.
	///
	/// The minimum architected relative accuracy is 2^-12, so one Newton-Raphson
	/// step is needed for a good float result (24 bits of precision).
	///
	/// \param RefinementSteps set to 1 when the caller left it Unspecified.
	/// \param UseOneConstNR   set to false whenever an estimate is produced.
	/// \returns an X86ISD::FRSQRT node, or an empty SDValue for unsupported
	///          type/subtarget combinations. \p Enabled and \p Reciprocal are
	///          not consulted here.
	SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
	                                           SelectionDAG &DAG, int Enabled,
	                                           int &RefinementSteps,
	                                           bool &UseOneConstNR,
	                                           bool Reciprocal) const {
	  EVT OpVT = Op.getValueType();

	  // SSE1 provides rsqrtss and rsqrtps; AVX adds the 256-bit rsqrtps.
	  // TODO: Add support for AVX512 (v16f32).
	  // f64 is deliberately left out: before FMA, a double-precision rsqrt
	  // estimate with refinement takes at least 16 instructions (convert to
	  // single, rsqrtss, convert back to double, 3 refinement steps of at least
	  // 13 insts). An 'rsqrtsd' added to the ISA alongside FMA could make this a
	  // throughput win.
	  const bool HasEstimate = (OpVT == MVT::f32 && Subtarget.hasSSE1()) ||
	                           (OpVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	                           (OpVT == MVT::v8f32 && Subtarget.hasAVX());
	  if (!HasEstimate)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;
	  UseOneConstNR = false;

	  return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), OpVT, Op);
	}

	/// Produce a reciprocal estimate node for \p Op, if supported and enabled.
	///
	/// The minimum architected relative accuracy is 2^-12, so one Newton-Raphson
	/// step is needed for a good float result (24 bits of precision).
	///
	/// \param Enabled         scalar f32 estimates are skipped when this is
	///                        ReciprocalEstimate::Unspecified.
	/// \param RefinementSteps set to 1 when the caller left it Unspecified.
	/// \returns an X86ISD::FRCP node, or an empty SDValue when no estimate is
	///          generated.
	SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
	                                            int Enabled,
	                                            int &RefinementSteps) const {
	  EVT OpVT = Op.getValueType();

	  // SSE1 provides rcpss and rcpps; AVX adds the 256-bit rcpps.
	  // TODO: Add support for AVX512 (v16f32).
	  // f64 is deliberately left out: before FMA, a double-precision reciprocal
	  // estimate with refinement takes 15 instructions (convert to single,
	  // rcpss, convert back to double, 3 refinement steps = 12 insts). An
	  // 'rcpsd' added to the ISA alongside FMA could make this a throughput win.
	  const bool HasEstimate = (OpVT == MVT::f32 && Subtarget.hasSSE1()) ||
	                           (OpVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	                           (OpVT == MVT::v8f32 && Subtarget.hasAVX());
	  if (!HasEstimate)
	    return SDValue();

	  // Vector division gets estimate codegen with one refinement step by
	  // default. Scalar division estimates stay off unless requested because
	  // they break too much real-world code; these defaults are intended to
	  // match GCC behavior.
	  const bool ScalarLeftUnspecified =
	      OpVT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified;
	  if (ScalarLeftUnspecified)
	    return SDValue();

	  if (RefinementSteps == ReciprocalEstimate::Unspecified)
	    RefinementSteps = 1;

	  return DAG.getNode(X86ISD::FRCP, SDLoc(Op), OpVT, Op);
	}

	/// Minimum number of divisions sharing one divisor before they are converted
	/// to multiplications by a reciprocal.
	///
	/// The conversion still costs one division (to form the reciprocal) plus one
	/// multiply per original division, so with a threshold of two it only pays
	/// off when a division costs at least twice as much as a multiplication.
	/// This may need adjusting for CPUs where that does not hold.
	unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
	  const unsigned MinSharedDivisorUses = 2;
	  return MinSharedDivisorUses;
	}

	/// Build an X86ISD::SETCC node of type i8 that evaluates condition \p Cond
	/// on the flags value \p EFLAGS.
	static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
	                        SelectionDAG &DAG) {
	  // The condition code travels as an i8 constant operand.
	  SDValue CondOperand = DAG.getConstant(Cond, dl, MVT::i8);
	  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CondOperand, EFLAGS);
	}

	/// Create a BT (Bit Test) node that tests bit \p BitNo of \p Src, wrapped in
	/// a SETCC chosen from the equal/not-equal condition code \p CC.
	///
	/// ISD::SETEQ maps to X86::COND_AE; any other \p CC maps to X86::COND_B.
	static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
	                                   const SDLoc &dl, SelectionDAG &DAG) {
	  // There is no i8 BT instruction, and the i16 encoding is larger than the
	  // i32 one, so both narrow types are any-extended to i32. The shift amount
	  // is in-range-or-undefined, which makes testing the widened value safe.
	  EVT OrigSrcVT = Src.getValueType();
	  if (OrigSrcVT == MVT::i8 || OrigSrcVT == MVT::i16)
	    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

	  // Prefer the shorter 32-bit encoding over the 64-bit one. The 32-bit form
	  // reduces BitNo modulo 32 and the 64-bit form modulo 64, so this is only
	  // valid when the bit of BitNo with value 32 is known to be zero.
	  if (Src.getValueType() == MVT::i64) {
	    APInt Bit32Mask(BitNo.getValueSizeInBits(), 32);
	    if (DAG.MaskedValueIsZero(BitNo, Bit32Mask))
	      Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
	  }

	  // BT ignores high bits of the index (like shifts do), so an any-extend is
	  // enough to make the index type agree with the source type.
	  EVT FinalSrcVT = Src.getValueType();
	  if (FinalSrcVT != BitNo.getValueType())
	    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, FinalSrcVT, BitNo);

	  SDValue BitTest = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);

	  X86::CondCode X86Cond = X86::COND_B;
	  if (CC == ISD::SETEQ)
	    X86Cond = X86::COND_AE;
	  return getSETCC(X86Cond, BitTest, dl, DAG);
	}

	/// The result of \p And is compared against zero; turn it into a BT node
	/// when it has one of the recognized single-bit shapes:
	///   - (and (shl 1, N), X)            -> bt X, N
	///   - (and (srl X, N), 1)            -> bt X, N
	///   - (and X, C), C a power of two that does not fit in 32 unsigned bits
	///                                    -> bt X, log2(C)
	/// Returns an empty SDValue when none of these match.
	static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
	                            const SDLoc &dl, SelectionDAG &DAG) {
	  SDValue First = And.getOperand(0);
	  SDValue Second = And.getOperand(1);

	  // Look through a truncate on either operand.
	  if (First.getOpcode() == ISD::TRUNCATE)
	    First = First.getOperand(0);
	  if (Second.getOpcode() == ISD::TRUNCATE)
	    Second = Second.getOperand(0);

	  // Put a shift-left, if any, in the first position.
	  if (Second.getOpcode() == ISD::SHL)
	    std::swap(First, Second);

	  SDValue BitSrc, BitIdx;
	  if (First.getOpcode() == ISD::SHL) {
	    // Only a shifted constant one selects a single bit.
	    if (!isOneConstant(First.getOperand(0)))
	      return SDValue();

	    // If a truncate was skipped, it must only drop bits known to be zero.
	    const unsigned ShlWidth = First.getValueSizeInBits();
	    const unsigned AndWidth = And.getValueSizeInBits();
	    if (ShlWidth > AndWidth) {
	      KnownBits ShlKnown;
	      DAG.computeKnownBits(First, ShlKnown);
	      if (ShlKnown.countMinLeadingZeros() < ShlWidth - AndWidth)
	        return SDValue();
	    }

	    BitSrc = Second;
	    BitIdx = First.getOperand(1);
	  } else if (Second.getOpcode() == ISD::Constant) {
	    const uint64_t MaskVal = cast<ConstantSDNode>(Second)->getZExtValue();

	    // (srl X, N) & 1 tests bit N of X.
	    if (MaskVal == 1 && First.getOpcode() == ISD::SRL) {
	      BitSrc = First.getOperand(0);
	      BitIdx = First.getOperand(1);
	    }

	    // Use BT if the immediate can't be encoded in a TEST instruction.
	    if (!isUInt<32>(MaskVal) && isPowerOf2_64(MaskVal)) {
	      BitSrc = First;
	      BitIdx = DAG.getConstant(Log2_64_Ceil(MaskVal), dl,
	                               BitSrc.getValueType());
	    }
	  }

	  if (!BitSrc.getNode())
	    return SDValue();
	  return getBitTestCondition(BitSrc, BitIdx, CC, dl, DAG);
	}

	// Convert (truncate (srl X, N) to i1) to (bt X, N).
	// Returns an empty SDValue when the truncate's operand is not an SRL.
	static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
	                                 const SDLoc &dl, SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
	         "Expected TRUNCATE to i1 node");

	  SDValue Shifted = Op.getOperand(0);
	  if (Shifted.getOpcode() != ISD::SRL)
	    return SDValue();

	  // The shifted value is the BT source and the shift amount is the bit index.
	  SDValue Source = Shifted.getOperand(0);
	  SDValue BitIndex = Shifted.getOperand(1);
	  return getBitTestCondition(Source, BitIndex, CC, dl, DAG);
	}

	/// The result of an 'and' or a 'trunc to i1' is compared against zero.
	/// Dispatch to the matching BT lowering helper; any other node yields an
	/// empty SDValue.
	SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
	                                     const SDLoc &dl, SelectionDAG &DAG) const {
	  switch (Op.getOpcode()) {
	  case ISD::AND:
	    return LowerAndToBT(Op, CC, dl, DAG);
	  case ISD::TRUNCATE:
	    // Only truncates producing i1 are candidates.
	    if (Op.getValueType() == MVT::i1)
	      return LowerTruncateToBT(Op, CC, dl, DAG);
	    break;
	  default:
	    break;
	  }
	  return SDValue();
	}

	/// Map an ISD::CondCode onto the immediate used by SSE floating-point mask
	/// compares.
	///
	/// Predicates with no direct encoding are expressed through their mirrored
	/// form, in which case \p Op0 and \p Op1 are swapped in place. SETUEQ and
	/// SETONE have no single encoding and yield the out-of-range value 8.
	static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
	                              SDValue &Op1) {
	  // SSE condition code mapping:
	  //  0 - EQ
	  //  1 - LT
	  //  2 - LE
	  //  3 - UNORD
	  //  4 - NEQ
	  //  5 - NLT
	  //  6 - NLE
	  //  7 - ORD
	  unsigned Imm;
	  bool SwapOperands = false;

	  switch (SetCCOpcode) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");

	  case ISD::SETOEQ:
	  case ISD::SETEQ:
	    Imm = 0;
	    break;

	  // GT is LT with the operands exchanged.
	  case ISD::SETOGT:
	  case ISD::SETGT:
	    SwapOperands = true;
	    Imm = 1;
	    break;
	  case ISD::SETLT:
	  case ISD::SETOLT:
	    Imm = 1;
	    break;

	  // GE is LE with the operands exchanged.
	  case ISD::SETOGE:
	  case ISD::SETGE:
	    SwapOperands = true;
	    Imm = 2;
	    break;
	  case ISD::SETLE:
	  case ISD::SETOLE:
	    Imm = 2;
	    break;

	  case ISD::SETUO:
	    Imm = 3;
	    break;

	  case ISD::SETUNE:
	  case ISD::SETNE:
	    Imm = 4;
	    break;

	  // ULE is UGE (NLT) with the operands exchanged.
	  case ISD::SETULE:
	    SwapOperands = true;
	    Imm = 5;
	    break;
	  case ISD::SETUGE:
	    Imm = 5;
	    break;

	  // ULT is UGT (NLE) with the operands exchanged.
	  case ISD::SETULT:
	    SwapOperands = true;
	    Imm = 6;
	    break;
	  case ISD::SETUGT:
	    Imm = 6;
	    break;

	  case ISD::SETO:
	    Imm = 7;
	    break;

	  // No single SSE predicate exists for these two.
	  case ISD::SETUEQ:
	  case ISD::SETONE:
	    Imm = 8;
	    break;
	  }

	  if (SwapOperands)
	    std::swap(Op0, Op1);

	  return Imm;
	}

	/// Split a 256-bit integer vector SETCC into two 128-bit SETCCs on the low
	/// and high halves of the operands, then concatenate the two results back
	/// into the original 256-bit type.
	static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
	  MVT FullVT = Op.getSimpleValueType();

	  assert(FullVT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
	         "Unsupported value type for operation");

	  const unsigned NumElts = FullVT.getVectorNumElements();
	  const unsigned HalfNumElts = NumElts / 2;
	  SDLoc dl(Op);
	  SDValue CondCode = Op.getOperand(2);

	  // Low and high 128-bit halves of the left-hand operand.
	  SDValue Lhs = Op.getOperand(0);
	  SDValue LhsLo = extract128BitVector(Lhs, 0, DAG, dl);
	  SDValue LhsHi = extract128BitVector(Lhs, HalfNumElts, DAG, dl);

	  // Low and high 128-bit halves of the right-hand operand.
	  SDValue Rhs = Op.getOperand(1);
	  SDValue RhsLo = extract128BitVector(Rhs, 0, DAG, dl);
	  SDValue RhsHi = extract128BitVector(Rhs, HalfNumElts, DAG, dl);

	  // Compare each half with the same element type and rejoin the results.
	  MVT HalfVT = MVT::getVectorVT(FullVT.getVectorElementType(), HalfNumElts);
	  const unsigned Opcode = Op.getOpcode();
	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, FullVT,
	                     DAG.getNode(Opcode, dl, HalfVT, LhsLo, RhsLo, CondCode),
	                     DAG.getNode(Opcode, dl, HalfVT, LhsHi, RhsHi, CondCode));
	}

	/// Lower a SETCC whose operands are vectors of i1 into plain bitwise logic.
	///
	/// With one-bit elements every integer predicate is a boolean function of
	/// the two operand bits. Signed and unsigned orderings differ: an i1 with
	/// its bit set is 1 when read as unsigned but -1 when read as signed, so a
	/// signed predicate uses the mirrored formula of its unsigned counterpart
	/// (e.g. x >s y holds exactly when x == 0 and y == -1, i.e. ~x & y).
	///
	/// \param Op  the SETCC node; operand 2 is the condition code.
	/// \returns   the equivalent XOR/AND/OR expression of type Op's result type.
	static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDValue Op0 = Op.getOperand(0);
	  SDValue Op1 = Op.getOperand(1);
	  SDValue CC = Op.getOperand(2);
	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
	         "Unexpected type for boolean compare operation");
	  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();

	  // Build the bitwise complement only for the predicates that need it, so
	  // no dead XOR nodes are left behind for the others.
	  auto GetNot = [&](SDValue V) {
	    return DAG.getNode(ISD::XOR, dl, VT, V, DAG.getConstant(-1, dl, VT));
	  };

	  switch (SetCCOpcode) {
	  default: llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ:
	    // (x == y) -> ~(x ^ y)
	    return GetNot(DAG.getNode(ISD::XOR, dl, VT, Op0, Op1));
	  case ISD::SETNE:
	    // (x != y) -> (x ^ y)
	    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
	  case ISD::SETUGT: // x >u y: x == 1 and y == 0
	  case ISD::SETLT:  // x <s y: x == -1 and y == 0
	    // -> (x & ~y)
	    return DAG.getNode(ISD::AND, dl, VT, Op0, GetNot(Op1));
	  case ISD::SETULT: // x <u y: x == 0 and y == 1
	  case ISD::SETGT:  // x >s y: x == 0 and y == -1
	    // -> (~x & y)
	    return DAG.getNode(ISD::AND, dl, VT, GetNot(Op0), Op1);
	  case ISD::SETULE: // x <=u y: not (x >u y)
	  case ISD::SETGE:  // x >=s y: not (x <s y)
	    // -> (~x | y)
	    return DAG.getNode(ISD::OR, dl, VT, GetNot(Op0), Op1);
	  case ISD::SETUGE: // x >=u y: not (x <u y)
	  case ISD::SETLE:  // x <=s y: not (x >s y)
	    // -> (x | ~y)
	    return DAG.getNode(ISD::OR, dl, VT, Op0, GetNot(Op1));
	  }
	}

	/// Lower an integer vector SETCC that produces a vector of i1 into an
	/// AVX-512 style mask compare node.
	///
	/// SETEQ becomes X86ISD::PCMPEQM and SETGT/SETLT become X86ISD::PCMPGTM
	/// (SETLT with swapped operands). All remaining predicates are emitted as
	/// X86ISD::CMPM, or X86ISD::CMPMU for unsigned predicates, with the
	/// predicate passed as an i8 immediate.
	static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
	  SDValue Lhs = Op.getOperand(0);
	  SDValue Rhs = Op.getOperand(1);
	  SDValue CondNode = Op.getOperand(2);
	  MVT MaskVT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  assert(MaskVT.getVectorElementType() == MVT::i1 &&
	         "Cannot set masked compare for this operation");

	  // DedicatedOpc stays 0 when the predicate has to go through the generic
	  // compare-with-immediate node; Imm is only meaningful in that case.
	  unsigned DedicatedOpc = 0;
	  unsigned Imm = 0;
	  bool IsUnsigned = false;
	  bool SwapOperands = false;

	  switch (cast<CondCodeSDNode>(CondNode)->get()) {
	  default:
	    llvm_unreachable("Unexpected SETCC condition");
	  case ISD::SETEQ:
	    DedicatedOpc = X86ISD::PCMPEQM;
	    break;
	  case ISD::SETGT:
	    DedicatedOpc = X86ISD::PCMPGTM;
	    break;
	  case ISD::SETLT: // GT with the operands exchanged.
	    SwapOperands = true;
	    DedicatedOpc = X86ISD::PCMPGTM;
	    break;
	  case ISD::SETNE:
	    Imm = 4;
	    break;
	  case ISD::SETLE:
	    Imm = 2;
	    break;
	  case ISD::SETGE: // LE with the operands exchanged.
	    SwapOperands = true;
	    Imm = 2;
	    break;
	  case ISD::SETULT:
	    Imm = 1;
	    IsUnsigned = true;
	    break;
	  case ISD::SETULE:
	    Imm = 2;
	    IsUnsigned = true;
	    break;
	  case ISD::SETUGE: // NLT
	    Imm = 5;
	    IsUnsigned = true;
	    break;
	  case ISD::SETUGT:
	    Imm = 6;
	    IsUnsigned = true;
	    break;
	  }

	  if (SwapOperands)
	    std::swap(Lhs, Rhs);

	  if (DedicatedOpc)
	    return DAG.getNode(DedicatedOpc, dl, MaskVT, Lhs, Rhs);

	  unsigned CmpOpc = X86ISD::CMPM;
	  if (IsUnsigned)
	    CmpOpc = X86ISD::CMPMU;
	  return DAG.getNode(CmpOpc, dl, MaskVT, Lhs, Rhs,
	                     DAG.getConstant(Imm, dl, MVT::i8));
	}

	/// \brief Try to turn a VSETULT into a VSETULE by rewriting its second
	/// operand \p Op1 as a build vector with every element decremented by one.
	///
	/// Returns an empty value when \p Op1 is not a build vector, when any
	/// element is not a non-opaque constant of the vector's element type, or
	/// when any element is zero (the decrement would wrap around).
	static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
	                                      SelectionDAG &DAG) {
	  auto *BuildVec = dyn_cast<BuildVectorSDNode>(Op1.getNode());
	  if (!BuildVec)
	    return SDValue();

	  MVT VecVT = Op1.getSimpleValueType();
	  MVT EltVT = VecVT.getVectorElementType();
	  const unsigned NumElts = VecVT.getVectorNumElements();

	  SmallVector<SDValue, 8> Decremented;
	  for (unsigned Idx = 0; Idx != NumElts; ++Idx) {
	    auto *EltC = dyn_cast<ConstantSDNode>(BuildVec->getOperand(Idx));
	    if (!EltC)
	      return SDValue();
	    if (EltC->isOpaque() || EltC->getSimpleValueType(0) != EltVT)
	      return SDValue();

	    // A zero element has no predecessor; give up to avoid underflow.
	    APInt EltVal = EltC->getAPIntValue();
	    if (EltVal == 0)
	      return SDValue();

	    Decremented.push_back(DAG.getConstant(EltVal - 1, dl, EltVT));
	  }

	  return DAG.getBuildVector(VecVT, dl, Decremented);
	}

	static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue CC = Op.getOperand(2);
	MVT VT = Op.getSimpleValueType();
	ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
	bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
	SDLoc dl(Op);

	if (isFP) {
	#ifndef NDEBUG
	MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
	assert(EltVT == MVT::f32 \|\| EltVT == MVT::f64);
	#endif

	unsigned Opc;
	if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
	assert(VT.getVectorNumElements() <= 16);
	Opc = X86ISD::CMPM;
	} else {
	Opc = X86ISD::CMPP;
	// The SSE/AVX packed FP comparison nodes are defined with a
	// floating-point vector result that matches the operand type. This allows
	// them to work with an SSE1 target (integer vector types are not legal).
	VT = Op0.getSimpleValueType();
	}

	// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
	// emit two comparisons and a logic op to tie them together.
	// TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
	// available.
	SDValue Cmp;
	unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
	if (SSECC == 8) {
	// LLVM predicate is SETUEQ or SETONE.
	unsigned CC0, CC1;
	unsigned CombineOpc;
	if (Cond == ISD::SETUEQ) {
	CC0 = 3; // UNORD
	CC1 = 0; // EQ
	CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
	static_cast<unsigned>(ISD::OR);
	} else {
	assert(Cond == ISD::SETONE);
	CC0 = 7; // ORD
	CC1 = 4; // NEQ
	CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
	static_cast<unsigned>(ISD::AND);
	}

	SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC0, dl, MVT::i8));
	SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CC1, dl, MVT::i8));
	Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
	} else {
	// Handle all other FP comparisons here.
	Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(SSECC, dl, MVT::i8));
	}

	// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
	// result type of SETCC. The bitcast is expected to be optimized away
	// during combining/isel.
	if (Opc == X86ISD::CMPP)
	Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

	return Cmp;
	}

	MVT VTOp0 = Op0.getSimpleValueType();
	assert(VTOp0 == Op1.getSimpleValueType() &&
	"Expected operands with same type!");
	assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
	"Invalid number of packed elements for source and destination!");

	if (VT.is128BitVector() && VTOp0.is256BitVector()) {
	// On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
	// legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
	// legalizer firstly checks if the first operand in input to the setcc has
	// a legal type. If so, then it promotes the return type to that same type.
	// Otherwise, the return type is promoted to the 'next legal type' which,
	// for a vector of MVT::i1 is always a 128-bit integer vector type.
	//
	// We reach this code only if the following two conditions are met:
	// 1. Both return type and operand type have been promoted to wider types
	// by the type legalizer.
	// 2. The original operand type has been promoted to a 256-bit vector.
	//
	// Note that condition 2. only applies for AVX targets.
	SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
	return DAG.getZExtOrTrunc(NewOp, dl, VT);
	}

	// The non-AVX512 code below works under the assumption that source and
	// destination types are the same.
	assert((Subtarget.hasAVX512() \|\| (VT == VTOp0)) &&
	"Value types for source and destination must be the same!");

	// Break 256-bit integer vector compare into smaller ones.
	if (VT.is256BitVector() && !Subtarget.hasInt256())
	return Lower256IntVSETCC(Op, DAG);

	// Operands are boolean (vectors of i1)
	MVT OpVT = Op1.getSimpleValueType();
	if (OpVT.getVectorElementType() == MVT::i1)
	return LowerBoolVSETCC_AVX512(Op, DAG);

	// The result is boolean, but operands are int/float
	if (VT.getVectorElementType() == MVT::i1) {
	// In AVX-512 architecture setcc returns mask with i1 elements,
	// But there is no compare instruction for i8 and i16 elements in KNL.
	// In this case use SSE compare
	bool UseAVX512Inst =
	(OpVT.is512BitVector() \|\|
	OpVT.getScalarSizeInBits() >= 32 \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX()));

	if (UseAVX512Inst)
	return LowerIntVSETCC_AVX512(Op, DAG);

	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
	}

	// Lower using XOP integer comparisons.
	if ((VT == MVT::v16i8 \|\| VT == MVT::v8i16 \|\|
	VT == MVT::v4i32 \|\| VT == MVT::v2i64) && Subtarget.hasXOP()) {
	// Translate compare code to XOP PCOM compare mode.
	unsigned CmpMode = 0;
	switch (Cond) {
	default: llvm_unreachable("Unexpected SETCC condition");
	case ISD::SETULT:
	case ISD::SETLT: CmpMode = 0x00; break;
	case ISD::SETULE:
	case ISD::SETLE: CmpMode = 0x01; break;
	case ISD::SETUGT:
	case ISD::SETGT: CmpMode = 0x02; break;
	case ISD::SETUGE:
	case ISD::SETGE: CmpMode = 0x03; break;
	case ISD::SETEQ: CmpMode = 0x04; break;
	case ISD::SETNE: CmpMode = 0x05; break;
	}

	// Are we comparing unsigned or signed integers?
	unsigned Opc =
	ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

	return DAG.getNode(Opc, dl, VT, Op0, Op1,
	DAG.getConstant(CmpMode, dl, MVT::i8));
	}

	// We are handling one of the integer comparisons here. Since SSE only has
	// GT and EQ comparisons for integer, swapping operands and multiple
	// operations may be required for some comparisons.
	unsigned Opc = (Cond == ISD::SETEQ \|\| Cond == ISD::SETNE) ? X86ISD::PCMPEQ
	: X86ISD::PCMPGT;
	bool Swap = Cond == ISD::SETLT \|\| Cond == ISD::SETULT \|\|
	Cond == ISD::SETGE \|\| Cond == ISD::SETUGE;
	bool Invert = Cond == ISD::SETNE \|\|
	(Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

	// If both operands are known non-negative, then an unsigned compare is the
	// same as a signed compare and there's no need to flip signbits.
	// TODO: We could check for more general simplifications here since we're
	// computing known bits.
	bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
	!(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

	// Special case: Use min/max operations for SETULE/SETUGE
	MVT VET = VT.getVectorElementType();
	bool HasMinMax =
	(Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) \|\|
	(Subtarget.hasSSE2() && (VET == MVT::i8));
	bool MinMax = false;
	if (HasMinMax) {
	switch (Cond) {
	default: break;
	case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
	case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
	}

	if (MinMax)
	Swap = Invert = FlipSigns = false;
	}

	bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 \|\| VET == MVT::i16);
	bool Subus = false;
	if (!MinMax && HasSubus) {
	// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
	// Op0 u<= Op1:
	// t = psubus Op0, Op1
	// pcmpeq t, <0..0>
	switch (Cond) {
	default: break;
	case ISD::SETULT: {
	// If the comparison is against a constant we can turn this into a
	// setule. With psubus, setule does not require a swap. This is
	// beneficial because the constant in the register is no longer
	// destructed as the destination so it can be hoisted out of a loop.
	// Only do this pre-AVX since vpcmp* is no longer destructive.
	if (Subtarget.hasAVX())
	break;
	if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
	Op1 = ULEOp1;
	Subus = true; Invert = false; Swap = false;
	}
	break;
	}
	// Psubus is better than flip-sign because it requires no inversion.
	case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
	case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
	}

	if (Subus) {
	Opc = X86ISD::SUBUS;
	FlipSigns = false;
	}
	}

	if (Swap)
	std::swap(Op0, Op1);

	// Check that the operation in question is available (most are plain SSE2,
	// but PCMPGTQ and PCMPEQQ have different requirements).
	if (VT == MVT::v2i64) {
	if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
	assert(Subtarget.hasSSE2() && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations. The lower
	// compare is always unsigned.
	SDValue SB;
	if (FlipSigns) {
	SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
	} else {
	SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
	SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
	SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
	}
	Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
	Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

	// Emulate PCMPGTQ with (hi1 > hi2) \| ((hi1 == hi2) & (lo1 > lo2))
	SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
	SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

	// Create masks for only the low parts/high parts of the 64 bit integers.
	static const int MaskHi[] = { 1, 1, 3, 3 };
	static const int MaskLo[] = { 0, 0, 2, 2 };
	SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
	SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
	SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

	SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
	Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}

	if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
	// If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
	// pcmpeqd + pshufd + pand.
	assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

	// First cast everything to the right type.
	Op0 = DAG.getBitcast(MVT::v4i32, Op0);
	Op1 = DAG.getBitcast(MVT::v4i32, Op1);

	// Do the compare.
	SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

	// Make sure the lower and upper halves are both all-ones.
	static const int Mask[] = { 1, 0, 3, 2 };
	SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
	Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

	if (Invert)
	Result = DAG.getNOT(dl, Result, MVT::v4i32);

	return DAG.getBitcast(VT, Result);
	}
	}

	// Since SSE has no unsigned integer comparisons, we need to flip the sign
	// bits of the inputs before performing those operations.
	if (FlipSigns) {
	MVT EltVT = VT.getVectorElementType();
	SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
	VT);
	Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
	Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
	}

	SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

	// If the logical-not of the result is required, perform that now.
	if (Invert)
	Result = DAG.getNOT(dl, Result, VT);

	if (MinMax)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

	if (Subus)
	Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
	getZeroVector(VT, Subtarget, DAG, dl));

	return Result;
	}

	/// Custom-lower a SETCC node. Vector compares are forwarded to LowerVSETCC.
	/// Scalar compares become an X86 setcc reading EFLAGS, or a BT-based / reused
	/// setcc when one of the special patterns below matches. Returns an empty
	/// SDValue when the condition code cannot be translated.
	SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
	  MVT ResVT = Op.getSimpleValueType();
	  if (ResVT.isVector())
	    return LowerVSETCC(Op, Subtarget, DAG);

	  assert(ResVT == MVT::i8 && "SetCC type must be 8-bit integer");
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDLoc DL(Op);
	  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
	  bool IsEqOrNe = CC == ISD::SETEQ || CC == ISD::SETNE;

	  // Common tail: narrow a freshly built setcc to i1 when that is the
	  // requested result type, otherwise hand it back untouched.
	  auto Finish = [&](SDValue SetCC) -> SDValue {
	    if (ResVT == MVT::i1)
	      return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
	    return SetCC;
	  };

	  // Bit-test forms:
	  //   (X & (1 << N)) == 0      -> BT(X, N)
	  //   ((X >>u N) & 1) != 0     -> BT(X, N)
	  //   ((X >>s N) & 1) != 0     -> BT(X, N)
	  //   (trunc (X >> N) to i1)   -> BT(X, N)
	  if (LHS.hasOneUse() && isNullConstant(RHS) && IsEqOrNe) {
	    if (SDValue BT = LowerToBT(LHS, CC, DL, DAG))
	      return Finish(BT);
	  }

	  // X == 0, X == 1, X != 0 or X != 1 where X is already an X86 setcc:
	  // either reuse X as-is or rebuild it with the opposite condition.
	  if ((isOneConstant(RHS) || isNullConstant(RHS)) && IsEqOrNe &&
	      LHS.getOpcode() == X86ISD::SETCC) {
	    X86::CondCode InnerCC = (X86::CondCode)LHS.getConstantOperandVal(0);
	    bool NeedInvert = (CC == ISD::SETNE) != isNullConstant(RHS);
	    if (!NeedInvert)
	      return LHS;

	    InnerCC = X86::GetOppositeBranchCondition(InnerCC);
	    return Finish(getSETCC(InnerCC, LHS.getOperand(1), DL, DAG));
	  }

	  // i1 operands compared for (in)equality are rewritten as a compare
	  // against zero.
	  if (LHS.getValueType() == MVT::i1 && IsEqOrNe) {
	    if (isOneConstant(RHS)) {
	      ISD::CondCode FlippedCC = ISD::getSetCCInverse(CC, true);
	      return DAG.getSetCC(DL, ResVT, LHS, DAG.getConstant(0, DL, MVT::i1),
	                          FlippedCC);
	    }
	    if (!isNullConstant(RHS)) {
	      SDValue Diff = DAG.getNode(ISD::XOR, DL, MVT::i1, LHS, RHS);
	      return DAG.getSetCC(DL, ResVT, Diff, DAG.getConstant(0, DL, MVT::i1),
	                          CC);
	    }
	  }

	  // Generic path: translate the condition, emit the compare, read the flags.
	  bool IsFP = RHS.getSimpleValueType().isFloatingPoint();
	  X86::CondCode X86CC = TranslateX86CC(CC, DL, IsFP, LHS, RHS, DAG);
	  if (X86CC == X86::COND_INVALID)
	    return SDValue();

	  SDValue Flags = EmitCmp(LHS, RHS, X86CC, DL, DAG);
	  Flags = ConvertCmpIfNecessary(Flags, DAG);
	  return Finish(getSETCC(X86CC, Flags, DL, DAG));
	}

	/// Custom-lower SETCCCARRY (operands: LHS, RHS, incoming carry, condition).
	/// The carry value is turned back into a flag by adding all-ones to it, an
	/// SBB of LHS and RHS consumes that flag, and a setcc reads the SBB's flags.
	SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  SDValue CarryIn = Op.getOperand(2);
	  SDValue CondOp = Op.getOperand(3);

	  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
	  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(CondOp)->get());

	  // Recreate the carry flag: CarryIn + (-1) yields it as the second result.
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDVTList CarryVTs = DAG.getVTList(CarryVT, MVT::i32);
	  CarryIn = DAG.getNode(X86ISD::ADD, DL, CarryVTs, CarryIn,
	                        DAG.getConstant(AllOnes, DL, CarryVT));

	  // Subtract with borrow and test the resulting flags.
	  SDVTList SbbVTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
	  SDValue Sbb = DAG.getNode(X86ISD::SBB, DL, SbbVTs, LHS, RHS,
	                            CarryIn.getValue(1));
	  SDValue SetCC = getSETCC(CC, Sbb.getValue(1), DL, DAG);

	  if (Op.getSimpleValueType() != MVT::i1)
	    return SetCC;
	  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
	}

	/// Return true if \p Op is a value this file treats as an X86 logical
	/// comparison: a pure compare node, or the flag result (result number 1, or
	/// also 2 for UMUL) of a flag-producing arithmetic/logic node.
	static bool isX86LogicalCmp(SDValue Op) {
	  const unsigned ResNo = Op.getResNo();

	  switch (Op.getOpcode()) {
	  // Accepted regardless of which result is referenced.
	  case X86ISD::CMP:
	  case X86ISD::COMI:
	  case X86ISD::UCOMI:
	  case X86ISD::SAHF:
	    return true;

	  // UMUL is accepted for either result 1 or result 2.
	  case X86ISD::UMUL:
	    return ResNo == 1 || ResNo == 2;

	  // Everything else only counts when result 1 is the one referenced.
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::INC:
	  case X86ISD::DEC:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    return ResNo == 1;

	  default:
	    return false;
	  }
	}

	/// Return true if \p V is a TRUNCATE whose source is known to have zeros in
	/// every bit position that the truncation discards.
	static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
	  if (V.getOpcode() == ISD::TRUNCATE) {
	    SDValue Src = V.getOperand(0);
	    unsigned SrcBits = Src.getValueSizeInBits();
	    unsigned DstBits = V.getValueSizeInBits();
	    // Mask covering exactly the bits dropped by the truncate.
	    APInt Dropped = APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
	    return DAG.MaskedValueIsZero(Src, Dropped);
	  }
	  return false;
	}

	/// Custom-lower a SELECT node (operand 0 = condition, operand 1 = value when
	/// the condition holds, operand 2 = value otherwise).
	///
	/// The function tries a series of special cases in order and returns as soon
	/// as one applies:
	///  1. scalar f32/f64 selects fed by a one-use SETCC -> FSETCC-based masks
	///     (or AVX-512 mask select / AVX vector select);
	///  2. AVX-512 scalar FP selects -> X86ISD::SELECTS;
	///  3. selects of vXi1 values -> select on the scalar/bitcast form or on a
	///     widened v8i1;
	///  4. integer idioms involving -1/0 constants -> SETCC_CARRY based sequences;
	///  5. otherwise an X86ISD::CMOV on (CC, flags), emitting a test of the
	///     condition when no existing flag producer could be reused.
	SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
	// AddTest stays true until one of the paths below has put a flag-producing
	// node into Cond and a matching condition code into CC.
	bool AddTest = true;
	SDValue Cond = Op.getOperand(0);
	SDValue Op1 = Op.getOperand(1);
	SDValue Op2 = Op.getOperand(2);
	SDLoc DL(Op);
	MVT VT = Op1.getSimpleValueType();
	SDValue CC;

	// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
	// are available or VBLENDV if AVX is available.
	// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
	if (Cond.getOpcode() == ISD::SETCC &&
	((Subtarget.hasSSE2() && (VT == MVT::f32 \|\| VT == MVT::f64)) \|\|
	(Subtarget.hasSSE1() && VT == MVT::f32)) &&
	VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
	SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
	int SSECC = translateX86FSETCC(
	cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

	// NOTE(review): 8 appears to be translateX86FSETCC's "no single SSE
	// predicate" result; confirm against that helper.
	if (SSECC != 8) {
	if (Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
	CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
	return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
	DL, VT, Cmp, Op1, Op2);
	}

	SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
	DAG.getConstant(SSECC, DL, MVT::i8));

	// If we have AVX, we can use a variable vector select (VBLENDV) instead
	// of 3 logic instructions for size savings and potentially speed.
	// Unfortunately, there is no scalar form of VBLENDV.

	// If either operand is a constant, don't try this. We can expect to
	// optimize away at least one of the logic instructions later in that
	// case, so that sequence would be faster than a variable blend.

	// BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
	// uses XMM0 as the selection register. That may need just as many
	// instructions as the AND/ANDN/OR sequence due to register moves, so
	// don't bother.

	if (Subtarget.hasAVX() &&
	!isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

	// Convert to vectors, do a VSELECT, and convert back to scalar.
	// All of the conversions should be optimized away.

	MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
	SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
	SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
	SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

	// The select mask is given the integer vector type of matching width.
	MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
	VCmp = DAG.getBitcast(VCmpVT, VCmp);

	SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

	return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
	VSel, DAG.getIntPtrConstant(0, DL));
	}
	// Result = (~Cmp & Op2) | (Cmp & Op1).
	SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
	SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
	return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
	}
	}

	// AVX512 fallback is to lower selects of scalar floats to masked moves.
	if ((VT == MVT::f64 \|\| VT == MVT::f32) && Subtarget.hasAVX512()) {
	SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
	return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
	}

	// Selects between vXi1 values: when both inputs are either constant build
	// vectors or bitcasts, select between their scalar/bitcast-source forms
	// instead and convert the result back to the mask type.
	if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
	SDValue Op1Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
	Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
	else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
	Op1Scalar = Op1.getOperand(0);
	SDValue Op2Scalar;
	if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
	Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
	else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
	Op2Scalar = Op2.getOperand(0);
	if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
	SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
	Op1Scalar, Op2Scalar);
	if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
	return DAG.getBitcast(VT, newSelect);
	// Sizes differ: reinterpret the result as v8i1 and take the low part.
	SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
	DAG.getIntPtrConstant(0, DL));
	}
	}

	// v4i1/v2i1 selects are performed on v8i1: both inputs are inserted into
	// an undef v8i1 and the low subvector of the result is extracted.
	if (VT == MVT::v4i1 \|\| VT == MVT::v2i1) {
	SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
	Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
	Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
	DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
	SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
	}

	if (Cond.getOpcode() == ISD::SETCC) {
	if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
	Cond = NewCond;
	// If the condition was updated, it's possible that the operands of the
	// select were also updated (for example, EmitTest has a RAUW). Refresh
	// the local references to the select operands in case they got stale.
	Op1 = Op.getOperand(1);
	Op2 = Op.getOperand(2);
	}
	}

	// (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
	// (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
	// (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
	// (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
	// (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
	// (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
	if (Cond.getOpcode() == X86ISD::SETCC &&
	Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(Cond.getOperand(1).getOperand(1))) {
	SDValue Cmp = Cond.getOperand(1);
	unsigned CondCode =
	cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

	if ((isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(CondCode == X86::COND_E \|\| CondCode == X86::COND_NE)) {
	// Y is the select operand that is not the all-ones constant (Op1 when
	// Op2 is all-ones, otherwise Op2).
	SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
	SDValue CmpOp0 = Cmp.getOperand(0);

	// Apply further optimizations for special cases
	// (select (x != 0), -1, 0) -> neg & sbb
	// (select (x == 0), 0, -1) -> neg & sbb
	if (isNullConstant(Y) &&
	(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
	SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
	SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
	// SETCC_CARRY consumes the flag result (value #1) of the 0 - x SUB.
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	return Res;
	}

	// Compare x against 1 instead of 0, then read COND_B via SETCC_CARRY.
	Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
	CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
	Cmp = ConvertCmpIfNecessary(Cmp, DAG);

	SDValue Res = // Res = 0 or -1.
	DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
	Res = DAG.getNOT(DL, Res, Res.getValueType());

	if (!isNullConstant(Op2))
	Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
	return Res;
	} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
	Cmp.getOperand(0).getOpcode() == ISD::AND &&
	isOneConstant(Cmp.getOperand(0).getOperand(1))) {
	SDValue CmpOp0 = Cmp.getOperand(0);
	SDValue Src1, Src2;
	// true if Op2 is XOR or OR operator and one of its operands
	// is equal to Op1
	// ( a , a op b) || ( b , a op b)
	// On a match, Src1 receives the other operand of Op2 and Src2 receives Op1.
	auto isOrXorPattern = [&]() {
	if ((Op2.getOpcode() == ISD::XOR \|\| Op2.getOpcode() == ISD::OR) &&
	(Op2.getOperand(0) == Op1 \|\| Op2.getOperand(1) == Op1)) {
	Src1 =
	Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
	Src2 = Op1;
	return true;
	}
	return false;
	};

	if (isOrXorPattern()) {
	SDValue Neg;
	unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
	// we need mask of all zeros or ones with same size of the other
	// operands.
	if (CmpSz > VT.getSizeInBits())
	Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
	else if (CmpSz < VT.getSizeInBits())
	Neg = DAG.getNode(ISD::AND, DL, VT,
	DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
	DAG.getConstant(1, DL, VT));
	else
	Neg = CmpOp0;
	SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
	Neg); // -(and (x, 0x1))
	SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
	return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
	}
	}
	}

	// Look past (and (setcc_carry (cmp ...)), 1).
	if (Cond.getOpcode() == ISD::AND &&
	Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	isOneConstant(Cond.getOperand(1)))
	Cond = Cond.getOperand(0);

	// If condition flag is set by a X86ISD::CMP, then use it as the condition
	// setting operand in place of the X86ISD::SETCC.
	unsigned CondOpcode = Cond.getOpcode();
	if (CondOpcode == X86ISD::SETCC \|\|
	CondOpcode == X86ISD::SETCC_CARRY) {
	CC = Cond.getOperand(0);

	SDValue Cmp = Cond.getOperand(1);
	unsigned Opc = Cmp.getOpcode();
	// Note: this VT (the select's result type) shadows the outer VT, which was
	// taken from Op1.
	MVT VT = Op.getSimpleValueType();

	bool IllegalFPCMov = false;
	if (VT.isFloatingPoint() && !VT.isVector() &&
	!isScalarFPTypeInSSEReg(VT)) // FPStack?
	IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

	if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) \|\|
	Opc == X86ISD::BT) { // FIXME
	Cond = Cmp;
	AddTest = false;
	}
	} else if (CondOpcode == ISD::USUBO \|\| CondOpcode == ISD::SSUBO \|\|
	CondOpcode == ISD::UADDO \|\| CondOpcode == ISD::SADDO \|\|
	((CondOpcode == ISD::UMULO \|\| CondOpcode == ISD::SMULO) &&
	Cond.getOperand(0).getValueType() != MVT::i8)) {
	// The condition is the overflow bit of an arithmetic-with-overflow node:
	// rebuild the arithmetic as the X86 flag-producing node and select on the
	// corresponding condition code.
	SDValue LHS = Cond.getOperand(0);
	SDValue RHS = Cond.getOperand(1);
	unsigned X86Opcode;
	unsigned X86Cond;
	SDVTList VTs;
	switch (CondOpcode) {
	case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	default: llvm_unreachable("unexpected overflowing operator");
	}
	// UMUL is built with two value results plus flags, so its flags are
	// result #2; the other nodes have one value result and flags at #1.
	if (CondOpcode == ISD::UMULO)
	VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	MVT::i32);
	else
	VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

	if (CondOpcode == ISD::UMULO)
	Cond = X86Op.getValue(2);
	else
	Cond = X86Op.getValue(1);

	CC = DAG.getConstant(X86Cond, DL, MVT::i8);
	AddTest = false;
	}

	if (AddTest) {
	// Look past the truncate if the high bits are known zero.
	if (isTruncWithZeroHighBitsInput(Cond, DAG))
	Cond = Cond.getOperand(0);

	// We know the result of AND is compared against zero. Try to match
	// it to BT.
	if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
	if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
	CC = NewSetCC.getOperand(0);
	Cond = NewSetCC.getOperand(1);
	AddTest = false;
	}
	}
	}

	// Nothing reusable was found: test the condition value itself against
	// zero and select on "not equal".
	if (AddTest) {
	CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
	Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
	}

	// a < b ? -1 : 0 -> RES = ~setcc_carry
	// a < b ? 0 : -1 -> RES = setcc_carry
	// a >= b ? -1 : 0 -> RES = setcc_carry
	// a >= b ? 0 : -1 -> RES = ~setcc_carry
	if (Cond.getOpcode() == X86ISD::SUB) {
	Cond = ConvertCmpIfNecessary(Cond, DAG);
	unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

	if ((CondCode == X86::COND_AE \|\| CondCode == X86::COND_B) &&
	(isAllOnesConstant(Op1) \|\| isAllOnesConstant(Op2)) &&
	(isNullConstant(Op1) \|\| isNullConstant(Op2))) {
	SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Cond);
	if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
	return DAG.getNOT(DL, Res, Res.getValueType());
	return Res;
	}
	}

	// X86 doesn't have an i8 cmov. If both operands are the result of a truncate
	// widen the cmov and push the truncate through. This avoids introducing a new
	// branch during isel and doesn't add any extensions.
	if (Op.getValueType() == MVT::i8 &&
	Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
	SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
	if (T1.getValueType() == T2.getValueType() &&
	// Blacklist CopyFromReg to avoid partial register stalls.
	T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
	SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
	SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
	return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
	}
	}

	// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
	// condition is true.
	// Hence the operand order below: Op2 first, then Op1.
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
	SDValue Ops[] = { Op2, Op1, CC, Cond };
	return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
	}

	/// Lower a vector SIGN_EXTEND on AVX-512 targets. This covers extension from
	/// vXi1 mask vectors and extension to 512-bit vectors. Returns an empty
	/// SDValue when the source is not a mask vector and the 512-bit path does
	/// not apply.
	static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();
	  MVT VTElt = VT.getVectorElementType();
	  MVT InVTElt = InVT.getVectorElementType();
	  const bool FromMask = InVTElt == MVT::i1;

	  // SKX processor: a mask can be extended directly when the matching
	  // feature exists for the destination element width.
	  if (FromMask) {
	    const bool NarrowWithBWI =
	        Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16;
	    const bool WideWithDQI =
	        Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32;
	    if (NarrowWithBWI || WideWithDQI)
	      return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
	  }

	  unsigned NumElts = VT.getVectorNumElements();

	  // Non-mask source extended to a 512-bit result.
	  if (VT.is512BitVector() && !FromMask &&
	      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
	    unsigned InOpc = In.getOpcode();
	    // Fold through an existing vector extend of the input.
	    if (InOpc == X86ISD::VSEXT || InOpc == X86ISD::VZEXT)
	      return getExtendInVec(InOpc, dl, VT, In.getOperand(0), DAG);
	    return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
	  }

	  if (!FromMask)
	    return SDValue();

	  // Without VLX, sub-512-bit results are first computed in a 512-bit type
	  // with the same element count.
	  MVT ExtVT = VT;
	  if (!VT.is512BitVector() && !Subtarget.hasVLX())
	    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512 / NumElts), NumElts);

	  SDValue Wide;
	  if (!Subtarget.hasDQI()) {
	    // No direct mask extend: select between all-ones and zero per lane.
	    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
	    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
	    Wide = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
	    if (ExtVT == VT)
	      return Wide;
	  } else {
	    Wide = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
	    assert(!VT.is512BitVector() && "Unexpected vector type");
	  }

	  return DAG.getNode(X86ISD::VTRUNC, dl, VT, Wide);
	}

	// Custom lowering of SIGN_EXTEND_VECTOR_INREG / ZERO_EXTEND_VECTOR_INREG.
	// The sign-extend form has to cope with every vector width on both SSE4.1
	// and pre-SSE4.1 targets. The zero-extend form is only expected for
	// MVT::v64i8 inputs on AVX512 targets that lack BWI.
	static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  SDValue In = Op->getOperand(0);
	  MVT VT = Op->getSimpleValueType(0);
	  MVT InVT = In.getSimpleValueType();
	  assert(VT.getSizeInBits() == InVT.getSizeInBits());

	  MVT SVT = VT.getVectorElementType();
	  MVT InSVT = InVT.getVectorElementType();
	  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

	  // Bail out on element types or vector widths this lowering cannot handle.
	  const bool DstEltOK =
	      SVT == MVT::i64 || SVT == MVT::i32 || SVT == MVT::i16;
	  if (!DstEltOK)
	    return SDValue();
	  const bool SrcEltOK =
	      InSVT == MVT::i32 || InSVT == MVT::i16 || InSVT == MVT::i8;
	  if (!SrcEltOK)
	    return SDValue();
	  const bool WidthOK = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
	                       (VT.is256BitVector() && Subtarget.hasInt256()) ||
	                       (VT.is512BitVector() && Subtarget.hasAVX512());
	  if (!WidthOK)
	    return SDValue();

	  SDLoc dl(Op);
	  const bool IsSigned = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG;

	  // A 256-bit result only consumes the low 128 bits of the input; a 512-bit
	  // result consumes 128 or 256 bits.
	  if (VT.getSizeInBits() > 128) {
	    // Keep at least as many input elements as there are output elements,
	    // and never less than 128 bits.
	    int NeededBits = InSVT.getSizeInBits() * VT.getVectorNumElements();
	    In = extractSubVector(In, 0, DAG, dl, std::max(NeededBits, 128));
	  }

	  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
	          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

	  // With SSE41 the 128-bit pmovsx* forms are legal and never reach this
	  // point; the AVX2/AVX512 forms for 256/512-bit results are emitted here.
	  if (Subtarget.hasInt256()) {
	    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
	    unsigned ExtOpc = IsSigned ? X86ISD::VSEXT : X86ISD::VZEXT;
	    return DAG.getNode(ExtOpc, dl, VT, In);
	  }

	  // Only sign extension is expected from here on.
	  assert(IsSigned &&
	         "Unexpected opcode!");

	  // Pre-SSE41: unpack the low lanes into the high half of wider elements,
	  // then shift right arithmetically to replicate the sign.
	  SDValue Widened = In;
	  MVT WideVT = InVT;

	  // SRAI exists only for i16/i32 elements, so widen no further than i32;
	  // i64 results get dedicated handling below.
	  while (WideVT != VT && WideVT.getVectorElementType() != MVT::i32) {
	    Widened = DAG.getNode(X86ISD::UNPCKL, dl, WideVT, DAG.getUNDEF(WideVT),
	                          Widened);
	    MVT DoubledElt = MVT::getIntegerVT(WideVT.getScalarSizeInBits() * 2);
	    WideVT = MVT::getVectorVT(DoubledElt, WideVT.getVectorNumElements() / 2);
	    Widened = DAG.getBitcast(WideVT, Widened);
	  }

	  SDValue SignExt = Widened;
	  if (WideVT != InVT) {
	    unsigned ShiftAmt = WideVT.getScalarSizeInBits() - InSVT.getSizeInBits();
	    SignExt = DAG.getNode(X86ISD::VSRAI, dl, WideVT, Widened,
	                          DAG.getConstant(ShiftAmt, dl, MVT::i8));
	  }

	  if (WideVT == VT)
	    return SignExt;

	  // v4i32 -> v2i64: interleave each extended value with its sign word.
	  if (VT == MVT::v2i64 && WideVT == MVT::v4i32) {
	    SDValue SignWords = DAG.getNode(X86ISD::VSRAI, dl, WideVT, Widened,
	                                    DAG.getConstant(31, dl, MVT::i8));
	    SDValue Interleaved =
	        DAG.getVectorShuffle(WideVT, dl, SignExt, SignWords, {0, 4, 1, 5});
	    return DAG.getBitcast(VT, Interleaved);
	  }

	  return SDValue();
	}

	/// Custom-lower a vector SIGN_EXTEND. 512-bit results and vXi1 sources go to
	/// the AVX-512 helper. Of the rest, only v4i32->v4i64, v8i16->v8i32 and
	/// v16i8->v16i16 are handled: directly with VSEXT when 256-bit integer ops
	/// exist, otherwise by extending each half and concatenating.
	static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op->getSimpleValueType(0);
	  SDValue In = Op->getOperand(0);
	  MVT InVT = In.getSimpleValueType();

	  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
	    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);

	  const bool Handled = (VT == MVT::v4i64 && InVT == MVT::v4i32) ||
	                       (VT == MVT::v8i32 && InVT == MVT::v8i16) ||
	                       (VT == MVT::v16i16 && InVT == MVT::v16i8);
	  if (!Handled)
	    return SDValue();

	  if (Subtarget.hasInt256())
	    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

	  // AVX without 256-bit integer ops:
	  //  - split the input into its low and high halves with shuffles, e.g. for
	  //    v4i32 the masks are {0, 1, -1, -1} and {2, 3, -1, -1};
	  //  - sign-extend each half in-register (v4i32 -> v2i64, v8i16 -> v4i32);
	  //  - concatenate the two halves back into the result type.
	  const unsigned NumElems = InVT.getVectorNumElements();
	  const unsigned Half = NumElems / 2;
	  SDValue Undef = DAG.getUNDEF(InVT);

	  SmallVector<int,8> LoMask(NumElems, -1);
	  SmallVector<int,8> HiMask(NumElems, -1);
	  for (unsigned Idx = 0; Idx != Half; ++Idx) {
	    LoMask[Idx] = Idx;
	    HiMask[Idx] = Idx + Half;
	  }

	  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, LoMask);
	  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, HiMask);

	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
	                                VT.getVectorNumElements() / 2);

	  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
	  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
	}

	// Custom lowering of a truncating store whose memory type is a vXi1 vector.
	// Depending on the available AVX-512 features and the element count, the
	// value is truncated directly, widened to 8 elements first, or (for v32i8)
	// split into two v16i1 stores.
	static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
	  SDLoc dl(St);
	  EVT MemVT = St->getMemoryVT();
	  assert(St->isTruncatingStore() && "We only custom truncating store.");
	  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
	         "Expected truncstore of i1 vector");

	  // Emit a plain store of Val to Ptr reusing the original chain and
	  // memory operand.
	  auto EmitStore = [&](SDValue Val, SDValue Ptr) -> SDValue {
	    return DAG.getStore(St->getChain(), dl, Val, Ptr, St->getMemOperand());
	  };

	  SDValue Val = St->getValue();
	  MVT ValVT = Val.getValueType().getSimpleVT();
	  const unsigned NumElts = ValVT.getVectorNumElements();
	  const bool HasFullFeatureSet =
	      Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI();

	  if (HasFullFeatureSet || NumElts == 16) {
	    // Truncate and store - everything is legal
	    Val = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Val);
	    // Masks narrower than 8 bits are placed in the low part of a v8i1.
	    if (MemVT.getSizeInBits() < 8)
	      Val = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
	                        DAG.getUNDEF(MVT::v8i1), Val,
	                        DAG.getIntPtrConstant(0, dl));
	    return EmitStore(Val, St->getBasePtr());
	  }

	  // A subset, assume that we have only AVX-512F
	  if (NumElts <= 8) {
	    if (NumElts < 8) {
	      // Extend to 8-elts vector
	      MVT WideVT = MVT::getVectorVT(ValVT.getScalarType(), 8);
	      Val = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT,
	                        DAG.getUNDEF(WideVT), Val,
	                        DAG.getIntPtrConstant(0, dl));
	    }
	    Val = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Val);
	    return EmitStore(Val, St->getBasePtr());
	  }

	  // v32i8
	  assert(ValVT == MVT::v32i8 && "Unexpected operand type");
	  // Divide the vector into 2 parts and store each part separately
	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Val,
	                               DAG.getIntPtrConstant(0, dl));
	  LoHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, LoHalf);
	  SDValue BasePtr = St->getBasePtr();
	  SDValue StLo = EmitStore(LoHalf, BasePtr);

	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Val,
	                               DAG.getIntPtrConstant(16, dl));
	  HiHalf = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, HiHalf);

	  // The second half is stored at BasePtr + 2.
	  EVT PtrVT = BasePtr.getValueType();
	  SDValue BasePtrHi = DAG.getNode(ISD::ADD, dl, PtrVT, BasePtr,
	                                  DAG.getConstant(2, dl, PtrVT));
	  SDValue StHi = EmitStore(HiHalf, BasePtrHi);

	  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
	}

	/// Lower an extending load whose memory type is a vector of i1.
	///
	/// The mask bits are loaded (as a vXi1 vector or, when only AVX-512F is
	/// assumed, as an i8 that is bitcast to v8i1) and then zero- or
	/// sign-extended to the result type according to the load's extension kind.
	/// In every path the users of the original load's chain result are
	/// redirected to the chain of the newly created load(s).
	static SDValue LowerExtended1BitVectorLoad(SDValue Op,
	const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {

	LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	SDLoc dl(Ld);
	EVT MemVT = Ld->getMemoryVT();
	assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
	"Expected i1 vector load");
	// ZEXTLOAD maps to ZERO_EXTEND; every other extension kind is treated as
	// SIGN_EXTEND.
	unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
	ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
	MVT VT = Op.getValueType().getSimpleVT();
	unsigned NumElts = VT.getVectorNumElements();

	if ((Subtarget.hasBWI() && NumElts >= 32) \|\|
	(Subtarget.hasDQI() && NumElts < 16) \|\|
	NumElts == 16) {
	// Load and extend - everything is legal
	if (NumElts < 8) {
	// Fewer than 8 elements: load a full v8i1, extend to an 8-element vector
	// and keep only the low NumElts lanes.
	SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
	MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}
	SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	// Finally, do a normal extend (zero or sign, per ExtOpcode) to the desired
	// register.
	return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
	}

	if (NumElts <= 8) {
	// A subset, assume that we have only AVX-512F
	// Load the mask as an 8-bit integer and reinterpret it as v8i1.
	unsigned NumBitsToLoad = 8;
	MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
	SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
	SDValue BitVec = DAG.getBitcast(MaskVT, Load);

	if (NumElts == 8)
	return DAG.getNode(ExtOpcode, dl, VT, BitVec);

	// v4i1 and v2i1 need extra care: extend all 8 lanes, then extract the low
	// NumElts lanes.

	MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
	SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
	DAG.getIntPtrConstant(0, dl));
	}

	// Remaining case: a v32i8 result, built from two v16i1 loads.
	assert(VT == MVT::v32i8 && "Unexpected extload type");

	SmallVector<SDValue, 2> Chains;

	SDValue BasePtr = Ld->getBasePtr();
	SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
	Ld->getBasePtr(),
	Ld->getMemOperand());
	Chains.push_back(LoadLo.getValue(1));

	// The second v16i1 is loaded from BasePtr + 2.
	SDValue BasePtrHi =
	DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
	DAG.getConstant(2, dl, BasePtr.getValueType()));

	SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
	BasePtrHi,
	Ld->getMemOperand());
	Chains.push_back(LoadHi.getValue(1));
	// Join both load chains and make them the replacement for the original
	// load's chain.
	SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

	SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
	SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
	return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
	}

	// Lower vector extended loads using a shuffle. If SSSE3 is not available we
	// may emit an illegal shuffle but the expansion is still better than scalar
	// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
	// we'll emit a shuffle and an arithmetic shift.
	// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
	// TODO: It is possible to support ZExt by zeroing the undef values during
	// the shuffle phase or after the shuffle.
	static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT RegVT = Op.getSimpleValueType();
	assert(RegVT.isVector() && "We only custom lower vector sext loads.");
	assert(RegVT.isInteger() &&
	"We only custom lower integer vector sext loads.");

	// Nothing useful we can do without SSE2 shuffles.
	assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

	LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
	SDLoc dl(Ld);
	EVT MemVT = Ld->getMemoryVT();
	if (MemVT.getScalarType() == MVT::i1)
	return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned RegSz = RegVT.getSizeInBits();

	ISD::LoadExtType Ext = Ld->getExtensionType();

	assert((Ext == ISD::EXTLOAD \|\| Ext == ISD::SEXTLOAD)
	&& "Only anyext and sext are currently implemented.");
	assert(MemVT != RegVT && "Cannot extend to the same type");
	assert(MemVT.isVector() && "Must load a vector from memory");

	unsigned NumElems = RegVT.getVectorNumElements();
	unsigned MemSz = MemVT.getSizeInBits();
	assert(RegSz > MemSz && "Register size must be greater than the mem size");

	if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
	// The only way in which we have a legal 256-bit vector result but not the
	// integer 256-bit operations needed to directly lower a sextload is if we
	// have AVX1 but not AVX2. In that case, we can always emit a sextload to
	// a 128-bit vector and a normal sign_extend to 256-bits that should get
	// correctly legalized. We do this late to allow the canonical form of
	// sextload to persist throughout the rest of the DAG combiner -- it wants
	// to fold together any extensions it can, and so will fuse a sign_extend
	// of an sextload into a sextload targeting a wider value.
	SDValue Load;
	if (MemSz == 128) {
	// Just switch this to a normal load.
	assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
	"it must be a legal 128-bit vector "
	"type!");
	Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	} else {
	assert(MemSz < 128 &&
	"Can't extend a type wider than 128 bits to a 256 bit vector!");
	// Do an sext load to a 128-bit vector type. We want to use the same
	// number of elements, but elements half as wide. This will end up being
	// recursively lowered by this routine, but will succeed as we definitely
	// have all the necessary features if we're using AVX1.
	EVT HalfEltVT =
	EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
	EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
	Load =
	DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	}

	// Replace chain users with the new chain.
	assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

	// Finally, do a normal sign-extend to the desired register.
	return DAG.getSExtOrTrunc(Load, dl, RegVT);
	}

	// All sizes must be a power of two.
	assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
	"Non-power-of-two elements are not custom lowered!");

	// Attempt to load the original value using scalar loads.
	// Find the largest scalar type that divides the total loaded size.
	MVT SclrLoadTy = MVT::i8;
	for (MVT Tp : MVT::integer_valuetypes()) {
	if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
	SclrLoadTy = Tp;
	}
	}

	// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
	(64 <= MemSz))
	SclrLoadTy = MVT::f64;

	// Calculate the number of scalar loads that we need to perform
	// in order to load our vector from memory.
	unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();

	assert((Ext != ISD::SEXTLOAD \|\| NumLoads == 1) &&
	"Can only lower sext loads with a single scalar load!");

	unsigned loadRegZize = RegSz;
	if (Ext == ISD::SEXTLOAD && RegSz >= 256)
	loadRegZize = 128;

	// Represent our vector as a sequence of elements which are the
	// largest scalar that we can load.
	EVT LoadUnitVecVT = EVT::getVectorVT(
	*DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());

	// Represent the data using the same element type that is stored in
	// memory. In practice, we ''widen'' MemVT.
	EVT WideVecVT =
	EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	loadRegZize / MemVT.getScalarSizeInBits());

	assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
	"Invalid vector type");

	// We can't shuffle using an illegal type.
	assert(TLI.isTypeLegal(WideVecVT) &&
	"We only lower types that form legal widened vector types");

	SmallVector<SDValue, 8> Chains;
	SDValue Ptr = Ld->getBasePtr();
	SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
	TLI.getPointerTy(DAG.getDataLayout()));
	SDValue Res = DAG.getUNDEF(LoadUnitVecVT);

	for (unsigned i = 0; i < NumLoads; ++i) {
	// Perform a single load.
	SDValue ScalarLoad =
	DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
	Ld->getAlignment(), Ld->getMemOperand()->getFlags());
	Chains.push_back(ScalarLoad.getValue(1));
	// Create the first element type using SCALAR_TO_VECTOR in order to avoid
	// another round of DAGCombining.
	if (i == 0)
	Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
	else
	Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
	ScalarLoad, DAG.getIntPtrConstant(i, dl));

	Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
	}

	SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);

	// Bitcast the loaded value to a vector of the original element type, in
	// the size of the target vector type.
	SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
	unsigned SizeRatio = RegSz / MemSz;

	if (Ext == ISD::SEXTLOAD) {
	// If we have SSE4.1, we can directly emit a VSEXT node.
	if (Subtarget.hasSSE41()) {
	SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Sext;
	}

	// Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
	// lanes.
	assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
	"We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");

	SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Shuff;
	}

	// Redistribute the loaded elements into the different locations.
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i * SizeRatio] = i;

	SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
	DAG.getUNDEF(WideVecVT), ShuffleVec);

	// Bitcast to the requested type.
	Shuff = DAG.getBitcast(RegVT, Shuff);
	DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
	return Shuff;
	}

	/// Check whether \p Op is an ISD::AND or ISD::OR whose two operands are both
	/// X86ISD::SETCC nodes with exactly one use each.
	/// \p Opc always receives the opcode of \p Op, even when false is returned.
	static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
	  Opc = Op.getOpcode();
	  const bool IsAndOrOr = (Opc == ISD::OR || Opc == ISD::AND);
	  if (!IsAndOrOr)
	    return false;

	  auto IsSingleUseSetCC = [](SDValue V) {
	    return V.getOpcode() == X86ISD::SETCC && V.hasOneUse();
	  };
	  return IsSingleUseSetCC(Op.getOperand(0)) &&
	         IsSingleUseSetCC(Op.getOperand(1));
	}

	/// Check whether \p Op has the form (xor (X86ISD::SETCC ...), 1) where the
	/// SETCC operand has exactly one use.
	static bool isXor1OfSetCC(SDValue Op) {
	  if (Op.getOpcode() != ISD::XOR || !isOneConstant(Op.getOperand(1)))
	    return false;

	  SDValue Inner = Op.getOperand(0);
	  return Inner.getOpcode() == X86ISD::SETCC && Inner.hasOneUse();
	}

	/// Lower a conditional branch to X86ISD::BRCOND.
	///
	/// Operands of \p Op: 0 = chain, 1 = condition, 2 = destination block.
	/// The routine tries to branch directly on the flags produced by an existing
	/// node (X86ISD::SETCC / SETCC_CARRY input, an overflow-reporting arithmetic
	/// node, a BT pattern) instead of materializing the condition and testing it.
	/// For the two-flag FP predicates (OEQ / UNE, and AND/OR of two SETCCs on the
	/// same compare) it emits two conditional branches. When none of the patterns
	/// match, an explicit test against zero is emitted.
	///
	/// \returns the final X86ISD::BRCOND node.
	SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
	  bool addTest = true;
	  SDValue Chain = Op.getOperand(0);
	  SDValue Cond = Op.getOperand(1);
	  SDValue Dest = Op.getOperand(2);
	  SDLoc dl(Op);
	  SDValue CC;
	  // Set when the condition was "overflow flag == 0", i.e. the branch is taken
	  // when the arithmetic did NOT overflow.
	  bool Inverted = false;

	  if (Cond.getOpcode() == ISD::SETCC) {
	    // Check for setcc([su]{add,sub,mul}o == 0).
	    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
	        isNullConstant(Cond.getOperand(1)) &&
	        Cond.getOperand(0).getResNo() == 1 &&
	        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
	         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
	         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
	         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
	      Inverted = true;
	      Cond = Cond.getOperand(0);
	    } else {
	      if (SDValue NewCond = LowerSETCC(Cond, DAG))
	        Cond = NewCond;
	    }
	  }
	#if 0
	  // FIXME: LowerXALUO doesn't handle these!!
	  else if (Cond.getOpcode() == X86ISD::ADD ||
	           Cond.getOpcode() == X86ISD::SUB ||
	           Cond.getOpcode() == X86ISD::SMUL ||
	           Cond.getOpcode() == X86ISD::UMUL)
	    Cond = LowerXALUO(Cond, DAG);
	#endif

	  // Look past (and (setcc_carry (cmp ...)), 1).
	  if (Cond.getOpcode() == ISD::AND &&
	      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
	      isOneConstant(Cond.getOperand(1)))
	    Cond = Cond.getOperand(0);

	  // If condition flag is set by a X86ISD::CMP, then use it as the condition
	  // setting operand in place of the X86ISD::SETCC.
	  unsigned CondOpcode = Cond.getOpcode();
	  if (CondOpcode == X86ISD::SETCC ||
	      CondOpcode == X86ISD::SETCC_CARRY) {
	    CC = Cond.getOperand(0);

	    SDValue Cmp = Cond.getOperand(1);
	    unsigned Opc = Cmp.getOpcode();
	    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
	    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
	      Cond = Cmp;
	      addTest = false;
	    } else {
	      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
	      default: break;
	      case X86::COND_O:
	      case X86::COND_B:
	        // These can only come from an arithmetic instruction with overflow,
	        // e.g. SADDO, UADDO.
	        Cond = Cond.getOperand(1);
	        addTest = false;
	        break;
	      }
	    }
	  }
	  CondOpcode = Cond.getOpcode();
	  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
	      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
	      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
	       Cond.getOperand(0).getValueType() != MVT::i8)) {
	    SDValue LHS = Cond.getOperand(0);
	    SDValue RHS = Cond.getOperand(1);
	    unsigned X86Opcode;
	    unsigned X86Cond;
	    SDVTList VTs;
	    // Keep this in sync with LowerXALUO, otherwise we might create redundant
	    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
	    // X86ISD::INC).
	    switch (CondOpcode) {
	    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
	    case ISD::SADDO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
	    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
	    case ISD::SSUBO:
	      if (isOneConstant(RHS)) {
	        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
	        break;
	      }
	      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
	    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
	    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
	    default: llvm_unreachable("unexpected overflowing operator");
	    }
	    if (Inverted)
	      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
	    // UMUL is built with two value results plus flags; the other nodes with
	    // one value result plus flags.
	    if (CondOpcode == ISD::UMULO)
	      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
	                          MVT::i32);
	    else
	      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

	    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

	    // The flags result is the last value of the node built above.
	    if (CondOpcode == ISD::UMULO)
	      Cond = X86Op.getValue(2);
	    else
	      Cond = X86Op.getValue(1);

	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    addTest = false;
	  } else {
	    unsigned CondOpc;
	    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
	      SDValue Cmp = Cond.getOperand(0).getOperand(1);
	      if (CondOpc == ISD::OR) {
	        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
	        // two branches instead of an explicit OR instruction with a
	        // separate test.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp)) {
	          CC = Cond.getOperand(0).getOperand(0);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = Cond.getOperand(1).getOperand(0);
	          Cond = Cmp;
	          addTest = false;
	        }
	      } else { // ISD::AND
	        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
	        // two branches instead of an explicit AND instruction with a
	        // separate test. However, we only do this if this block doesn't
	        // have a fall-through edge, because this requires an explicit
	        // jmp when the condition is false.
	        if (Cmp == Cond.getOperand(1).getOperand(1) &&
	            isX86LogicalCmp(Cmp) &&
	            Op.getNode()->hasOneUse()) {
	          X86::CondCode FirstCC =
	              (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	          FirstCC = X86::GetOppositeBranchCondition(FirstCC);
	          CC = DAG.getConstant(FirstCC, dl, MVT::i8);
	          SDNode *User = *Op.getNode()->use_begin();
	          // Look for an unconditional branch following this conditional branch.
	          // We need this because we need to reverse the successors in order
	          // to implement FCMP_OEQ.
	          if (User->getOpcode() == ISD::BR) {
	            SDValue FalseBB = User->getOperand(1);
	            SDNode *NewBR =
	                DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	            assert(NewBR == User);
	            (void)NewBR;
	            Dest = FalseBB;

	            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                                Chain, Dest, CC, Cmp);
	            X86::CondCode SecondCC =
	                (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
	            SecondCC = X86::GetOppositeBranchCondition(SecondCC);
	            CC = DAG.getConstant(SecondCC, dl, MVT::i8);
	            Cond = Cmp;
	            addTest = false;
	          }
	        }
	      }
	    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
	      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
	      // It should be transformed during dag combiner except when the condition
	      // is set by a arithmetics with overflow node.
	      X86::CondCode CCode =
	          (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
	      CCode = X86::GetOppositeBranchCondition(CCode);
	      CC = DAG.getConstant(CCode, dl, MVT::i8);
	      Cond = Cond.getOperand(0).getOperand(1);
	      addTest = false;
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
	      // For FCMP_OEQ, we can emit
	      // two branches instead of an explicit AND instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because we need to reverse the successors in order
	        // to implement FCMP_OEQ.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;
	          Dest = FalseBB;

	          // Branch to the false block on NE, then again on P; the
	          // retargeted unconditional branch covers the remaining case.
	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	        }
	      }
	    } else if (Cond.getOpcode() == ISD::SETCC &&
	               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
	      // For FCMP_UNE, we can emit
	      // two branches instead of an explicit OR instruction with a
	      // separate test. However, we only do this if this block doesn't
	      // have a fall-through edge, because this requires an explicit
	      // jmp when the condition is false.
	      if (Op.getNode()->hasOneUse()) {
	        SDNode *User = *Op.getNode()->use_begin();
	        // Look for an unconditional branch following this conditional branch.
	        // We need this because the unconditional branch is retargeted to the
	        // true destination in order to implement FCMP_UNE.
	        if (User->getOpcode() == ISD::BR) {
	          SDValue FalseBB = User->getOperand(1);
	          SDNode *NewBR =
	              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
	          assert(NewBR == User);
	          (void)NewBR;

	          // Branch to the true block on NE, then to the false block on NP;
	          // the retargeted unconditional branch covers the remaining case.
	          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
	                                    Cond.getOperand(0), Cond.getOperand(1));
	          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
	          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
	          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                              Chain, Dest, CC, Cmp);
	          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
	          Cond = Cmp;
	          addTest = false;
	          Dest = FalseBB;
	        }
	      }
	    }
	  }

	  if (addTest) {
	    // Look past the truncate if the high bits are known zero.
	    if (isTruncWithZeroHighBitsInput(Cond, DAG))
	      Cond = Cond.getOperand(0);

	    // We know the result is compared against zero. Try to match it to BT.
	    if (Cond.hasOneUse()) {
	      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
	        CC = NewSetCC.getOperand(0);
	        Cond = NewSetCC.getOperand(1);
	        addTest = false;
	      }
	    }
	  }

	  if (addTest) {
	    // No flag-producing node could be reused: test the condition value
	    // against zero explicitly.
	    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
	    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
	    Cond = EmitTest(Cond, X86Cond, dl, DAG);
	  }
	  Cond = ConvertCmpIfNecessary(Cond, DAG);
	  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
	                     Chain, Dest, CC, Cond);
	}

	// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
	// Calls to _alloca are needed to probe the stack when allocating more than 4k
	// bytes in one go. Touching the stack at 4K increments is necessary to ensure
	// that the guard pages used by the OS virtual memory manager are allocated in
	// correct sequence.
	//
	// Three strategies are used, selected below:
	//  * no special lowering required: subtract the size from the stack pointer
	//    register directly (and realign if the request exceeds the stack
	//    alignment);
	//  * split stacks: emit X86ISD::SEG_ALLOCA;
	//  * otherwise (Windows non-MachO, or a stack probe symbol is configured):
	//    emit X86ISD::WIN_ALLOCA and read back the stack pointer.
	//
	// Operands of Op: 0 = chain, 1 = size, 2 = alignment (constant).
	// Returns the merged pair {allocated address, output chain}.
	SDValue
	X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  bool SplitStack = MF.shouldSplitStack();
	  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
	  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
	               SplitStack || EmitStackProbe;
	  SDLoc dl(Op);

	  // Get the inputs.
	  SDNode *Node = Op.getNode();
	  SDValue Chain = Op.getOperand(0);
	  SDValue Size = Op.getOperand(1);
	  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
	  EVT VT = Node->getValueType(0);

	  // Chain the dynamic stack allocation so that it doesn't modify the stack
	  // pointer when other instructions are using the stack.
	  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

	  bool Is64Bit = Subtarget.is64Bit();
	  MVT SPTy = getPointerTy(DAG.getDataLayout());

	  SDValue Result;
	  if (!Lower) {
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
	    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
	                    " not tell us which reg is the stack pointer!");

	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
	    Chain = SP.getValue(1);
	    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	    unsigned StackAlign = TFI.getStackAlignment();
	    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
	    // Only realign when the request is stricter than the stack alignment.
	    if (Align > StackAlign)
	      Result = DAG.getNode(ISD::AND, dl, VT, Result,
	                           DAG.getConstant(-(uint64_t)Align, dl, VT));
	    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
	  } else if (SplitStack) {
	    MachineRegisterInfo &MRI = MF.getRegInfo();

	    if (Is64Bit) {
	      // The 64 bit implementation of segmented stacks needs to clobber both r10
	      // r11. This makes it impossible to use it along with nested parameters.
	      const Function *F = MF.getFunction();
	      for (const auto &A : F->args()) {
	        if (A.hasNestAttr())
	          report_fatal_error("Cannot use segmented stacks with functions that "
	                             "have nested arguments.");
	      }
	    }

	    // Pass the size through a fresh virtual register of pointer class.
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
	    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
	    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
	                         DAG.getRegister(Vreg, SPTy));
	  } else {
	    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
	    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

	    // The allocated address is the stack pointer after WIN_ALLOCA.
	    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	    unsigned SPReg = RegInfo->getStackRegister();
	    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
	    Chain = SP.getValue(1);

	    if (Align) {
	      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
	                       DAG.getConstant(-(uint64_t)Align, dl, VT));
	      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
	    }

	    Result = SP;
	  }

	  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
	                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

	  SDValue Ops[2] = {Result, Chain};
	  return DAG.getMergeValues(Ops, dl);
	}

	/// Lower va_start.
	///
	/// Operands of \p Op: 0 = chain, 1 = address of the va_list object,
	/// 2 = source value used for the memory operands.
	///
	/// On 32-bit targets and for the Win64 calling convention the va_list is a
	/// single pointer, so one store of the VarArgsFrameIndex address is emitted.
	/// Otherwise the four fields of the __va_list_tag structure are stored and
	/// the stores are joined with a TokenFactor.
	SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  auto PtrVT = getPointerTy(MF.getDataLayout());
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  SDLoc DL(Op);

	  if (!Subtarget.is64Bit() ||
	      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
	    // vastart just stores the address of the VarArgsFrameIndex slot into the
	    // memory location argument.
	    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
	                        MachinePointerInfo(SV));
	  }

	  // The two pointer fields are 8 bytes wide on LP64 and 4 bytes wide
	  // otherwise, which shifts the offset of reg_save_area (16 vs. 12).
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // __va_list_tag:
	  //   gp_offset         (0 - 6 * 8)
	  //   fp_offset         (48 - 48 + 8 * 16)
	  //   overflow_arg_area (point to parameters coming in memory).
	  //   reg_save_area
	  SmallVector<SDValue, 8> MemOps;
	  SDValue FIN = Op.getOperand(1);
	  // Store gp_offset
	  SDValue Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV));
	  MemOps.push_back(Store);

	  // Store fp_offset
	  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL,
	      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
	      MachinePointerInfo(SV, 4));
	  MemOps.push_back(Store);

	  // Store ptr to overflow_arg_area
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
	  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
	  Store =
	      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
	  MemOps.push_back(Store);

	  // Store ptr to reg_save_area.
	  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN,
	                    DAG.getIntPtrConstant(IsLP64 ? 8 : 4, DL));
	  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
	  Store = DAG.getStore(
	      Op.getOperand(0), DL, RSFIN, FIN,
	      MachinePointerInfo(SV, IsLP64 ? 16 : 12));
	  MemOps.push_back(Store);
	  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
	}

	/// Lower va_arg for 64-bit targets.
	///
	/// Operands of \p Op: 0 = chain, 1 = address of the va_list, 2 = source
	/// value, 3 = alignment (constant).
	///
	/// For the Win64 calling convention the generic expansion is used. Otherwise
	/// an X86ISD::VAARG_64 node computes the address of the next argument, and a
	/// load of the argument type from that address is returned.
	SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.is64Bit() &&
	         "LowerVAARG only handles 64-bit va_arg!");
	  assert(Op.getNumOperands() == 4);

	  MachineFunction &MF = DAG.getMachineFunction();
	  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
	    // The Win64 ABI uses char* instead of a structure.
	    return DAG.expandVAArg(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue SrcPtr = Op.getOperand(1);
	  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
	  unsigned Align = Op.getConstantOperandVal(3);
	  SDLoc dl(Op);

	  EVT ArgVT = Op.getNode()->getValueType(0);
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
	  uint8_t ArgMode;

	  // Decide which area this value should be read from.
	  // TODO: Implement the AMD64 ABI in its entirety. This simple
	  // selection mechanism works only for the basic types.
	  if (ArgVT == MVT::f80) {
	    llvm_unreachable("va_arg for f80 not yet implemented");
	  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
	    ArgMode = 2;  // Argument passed in XMM register. Use fp_offset.
	  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
	    ArgMode = 1;  // Argument passed in GPR64 register(s). Use gp_offset.
	  } else {
	    llvm_unreachable("Unhandled argument type in LowerVAARG");
	  }

	  if (ArgMode == 2) {
	    // Sanity Check: Make sure using fp_offset makes sense.
	    assert(!Subtarget.useSoftFloat() &&
	           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
	           Subtarget.hasSSE1());
	  }

	  // Insert VAARG_64 node into the DAG
	  // VAARG_64 returns two values: Variable Argument Address, Chain
	  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
	                       DAG.getConstant(ArgMode, dl, MVT::i8),
	                       DAG.getConstant(Align, dl, MVT::i32)};
	  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
	  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
	                                          VTs, InstOps, MVT::i64,
	                                          MachinePointerInfo(SV),
	                                          /*Align=*/0,
	                                          /*Volatile=*/false,
	                                          /*ReadMem=*/true,
	                                          /*WriteMem=*/true);
	  Chain = VAARG.getValue(1);

	  // Load the next argument and return it
	  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
	}

	/// Lower va_copy for 64-bit targets.
	///
	/// Operands of \p Op: 0 = chain, 1 = destination va_list address, 2 = source
	/// va_list address, 3 = destination source value, 4 = source source value.
	///
	/// For the Win64 calling convention the generic expansion is used. Otherwise
	/// the whole va_list structure is copied with a memcpy.
	static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
	  // where a va_list is still an i8*.
	  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
	  if (Subtarget.isCallingConvWin64(
	          DAG.getMachineFunction().getFunction()->getCallingConv()))
	    // Probably a Win64 va_copy.
	    return DAG.expandVACopy(Op.getNode());

	  SDValue Chain = Op.getOperand(0);
	  SDValue DstPtr = Op.getOperand(1);
	  SDValue SrcPtr = Op.getOperand(2);
	  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
	  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  SDLoc DL(Op);

	  // With 8-byte pointers (LP64) the structure is 24 bytes, 8-byte aligned.
	  // With 4-byte pointers it is 16 bytes, 4-byte aligned; this matches the
	  // field offsets LowerVASTART uses (reg_save_area at 16 vs. 12).
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();
	  const unsigned VaListSize = IsLP64 ? 24 : 16;
	  const unsigned VaListAlign = IsLP64 ? 8 : 4;

	  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
	                       DAG.getIntPtrConstant(VaListSize, DL), VaListAlign,
	                       /*isVolatile*/false,
	                       false, false,
	                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
	}

	/// Handle vector element shifts where the shift amount is a constant.
	/// Takes immediate version of shift as input.
	///
	/// \p Opc must be X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI.
	/// \p SrcOp is bitcast to \p VT first if its type differs.
	///
	/// Simplifications performed before emitting a node:
	///  * a shift by 0 returns the (bitcast) source;
	///  * a shift by at least the element width is clamped to width-1 for
	///    VSRAI and folds to the zero vector otherwise;
	///  * a build vector of constants/undefs is folded element-wise.
	static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                          SDValue SrcOp, uint64_t ShiftAmt,
	                                          SelectionDAG &DAG) {
	  MVT ElementType = VT.getVectorElementType();

	  // Bitcast the source vector to the output type, this is mainly necessary for
	  // vXi8/vXi64 shifts.
	  if (VT != SrcOp.getSimpleValueType())
	    SrcOp = DAG.getBitcast(VT, SrcOp);

	  // Fold this packed shift into its first operand if ShiftAmt is 0.
	  if (ShiftAmt == 0)
	    return SrcOp;

	  // Check for ShiftAmt >= element width
	  if (ShiftAmt >= ElementType.getSizeInBits()) {
	    if (Opc == X86ISD::VSRAI)
	      ShiftAmt = ElementType.getSizeInBits() - 1;
	    else
	      return DAG.getConstant(0, dl, VT);
	  }

	  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
	         && "Unknown target vector shift-by-constant node");

	  // Fold this packed vector shift into a build vector if SrcOp is a
	  // vector of Constants or UNDEFs.
	  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
	    SmallVector<SDValue, 8> Elts;
	    unsigned NumElts = SrcOp->getNumOperands();

	    for (unsigned i = 0; i != NumElts; ++i) {
	      SDValue CurrentOp = SrcOp->getOperand(i);
	      // Undef elements stay undef.
	      if (CurrentOp->isUndef()) {
	        Elts.push_back(CurrentOp);
	        continue;
	      }
	      const APInt &C = cast<ConstantSDNode>(CurrentOp)->getAPIntValue();
	      switch (Opc) {
	      default: llvm_unreachable("Unknown opcode!");
	      case X86ISD::VSHLI:
	        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
	        break;
	      case X86ISD::VSRLI:
	        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
	        break;
	      case X86ISD::VSRAI:
	        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
	        break;
	      }
	    }

	    return DAG.getBuildVector(VT, dl, Elts);
	  }

	  return DAG.getNode(Opc, dl, VT, SrcOp,
	                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
	}

	/// Handle vector element shifts where the shift amount may or may not be a
	/// constant. Takes immediate version of shift as input.
	///
	/// \p Opc must be X86ISD::VSHLI, X86ISD::VSRLI or X86ISD::VSRAI; \p ShAmt
	/// must be of type i32 or i64. A constant amount is forwarded to
	/// getTargetVShiftByConstNode. Otherwise the opcode is switched to its
	/// non-immediate counterpart and the amount is placed in the low element of a
	/// 128-bit vector.
	static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
	                                   SDValue SrcOp, SDValue ShAmt,
	                                   const X86Subtarget &Subtarget,
	                                   SelectionDAG &DAG) {
	  MVT SVT = ShAmt.getSimpleValueType();
	  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

	  // Catch shift-by-constant.
	  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
	    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
	                                      CShAmt->getZExtValue(), DAG);

	  // Change opcode to non-immediate version
	  switch (Opc) {
	  default: llvm_unreachable("Unknown target vector shift node");
	  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
	  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
	  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
	  }

	  // Need to build a vector containing shift amount.
	  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
	  // +=================+============+=======================================+
	  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
	  // +=================+============+=======================================+
	  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
	  // | i32             | Yes        | zero-extend in-reg                    |
	  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
	  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
	  // +=================+============+=======================================+

	  if (SVT == MVT::i64)
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
	  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
	           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
	    // Use the i16 source of the zext directly and zero-extend in-register.
	    ShAmt = ShAmt.getOperand(0);
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
	    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	  } else if (Subtarget.hasSSE41() &&
	             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
	    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
	  } else {
	    // Explicitly zero element 1 so the low 64 bits hold the i32 amount.
	    SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
	                                     DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
	    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
	  }

	  // The return type has to be a 128-bit type with the same element
	  // type as the input type.
	  MVT EltVT = VT.getVectorElementType();
	  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

	  ShAmt = DAG.getBitcast(ShVT, ShAmt);
	  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
	}

	/// \brief Convert the scalar \p Mask operand of a masking intrinsic into a
	/// value of the vXi1 type \p MaskVT, casting or extending as needed.
	///
	/// All-ones and zero masks become target constants 1 and 0 of type
	/// \p MaskVT. An i64 mask on a 32-bit subtarget is split (v64i1) or
	/// truncated (narrower types) because it cannot be bitcast directly.
	static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
	                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
	                           const SDLoc &dl) {
	  // Constant masks need no conversion.
	  if (isAllOnesConstant(Mask))
	    return DAG.getTargetConstant(1, dl, MaskVT);
	  if (X86::isZeroNode(Mask))
	    return DAG.getTargetConstant(0, dl, MaskVT);

	  // Widen a mask that is narrower than the requested mask type.
	  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
	    MVT WideIntVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
	    Mask = DAG.getNode(ISD::ANY_EXTEND, dl, WideIntVT, Mask);
	  }

	  const bool IsI64On32Bit =
	      Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit();

	  if (!IsI64On32Bit) {
	    MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	                                     Mask.getSimpleValueType().getSizeInBits());
	    // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
	    // are extracted by EXTRACT_SUBVECTOR.
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
	                       DAG.getBitcast(BitcastVT, Mask),
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  if (MaskVT != MVT::v64i1) {
	    // MaskVT require < 64bit. Truncate mask (should succeed in any case),
	    // and bitcast.
	    MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
	    return DAG.getBitcast(MaskVT,
	                          DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
	  }

	  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
	  // In case 32bit mode, bitcast i64 is illegal, extend/split it.
	  SDValue LoHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(0, dl, MVT::i32));
	  SDValue HiHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
	                               DAG.getConstant(1, dl, MVT::i32));

	  LoHalf = DAG.getBitcast(MVT::v32i1, LoHalf);
	  HiHalf = DAG.getBitcast(MVT::v32i1, HiHalf);

	  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, LoHalf, HiHalf);
	}

	/// \brief Apply \p Mask to the vector operation \p Op when lowering masking
	/// intrinsics.
	///
	/// Depending on the opcode of \p Op the result is:
	///  - (and \p Op, mask) for PCMPEQM, PCMPGTM, CMPM and CMPMU,
	///  - (or \p Op, mask) for VFPCLASS and VFPCLASSS,
	///  - (select mask, \p Op, \p PreservedSrc) for everything else.
	/// The mask is first converted to the matching i1 vector type by
	/// getMaskNode. An undef \p PreservedSrc is replaced by a zero vector.
	/// An all-ones constant \p Mask returns \p Op unchanged.
	static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());

	  // Nothing to mask off.
	  if (isAllOnesConstant(Mask))
	    return Op;

	  SDLoc dl(Op);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  unsigned SelectOpc = ISD::VSELECT;
	  switch (Op.getOpcode()) {
	  case X86ISD::PCMPEQM:
	  case X86ISD::PCMPGTM:
	  case X86ISD::CMPM:
	  case X86ISD::CMPMU:
	    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);

	  case X86ISD::VFPCLASS:
	  case X86ISD::VFPCLASSS:
	    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);

	  case X86ISD::VTRUNC:
	  case X86ISD::VTRUNCS:
	  case X86ISD::VTRUNCUS:
	  case X86ISD::CVTPS2PH:
	    // ISD::VSELECT is not always "Legal" for the destination type here.
	    // For example vpmovqb requires only AVX512, while a vselect operating
	    // on byte elements requires BWI. Use the target select node instead.
	    SelectOpc = X86ISD::SELECT;
	    break;

	  default:
	    break;
	  }

	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(SelectOpc, dl, VT, VMask, Op, PreservedSrc);
	}

	/// \brief Build the masked form of the scalar operation \p Op when lowering
	/// masking intrinsics.
	///
	/// \p Mask is wrapped into an MVT::v1i1 value with SCALAR_TO_VECTOR. The
	/// result is then:
	///  - (and \p Op, mask) for FSETCCM and FSETCCM_RND,
	///  - (or \p Op, mask) for VFPCLASSS,
	///  - (X86ISD::SELECTS mask, \p Op, \p PreservedSrc) otherwise.
	/// A constant \p Mask with its lowest bit set returns \p Op unchanged, and
	/// an undef \p PreservedSrc is replaced by a zero vector. Unlike
	/// getVectorMaskingNode this never emits "vselect", which cannot be created
	/// for a scalar instruction.
	static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
	                                    SDValue PreservedSrc,
	                                    const X86Subtarget &Subtarget,
	                                    SelectionDAG &DAG) {
	  // A constant mask whose bit 0 is set selects Op outright.
	  const auto *MaskConst = dyn_cast<ConstantSDNode>(Mask);
	  if (MaskConst && (MaskConst->getZExtValue() & 0x1) != 0)
	    return Op;

	  MVT VT = Op.getSimpleValueType();
	  SDLoc dl(Op);

	  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);

	  switch (Op.getOpcode()) {
	  case X86ISD::FSETCCM:
	  case X86ISD::FSETCCM_RND:
	    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
	  case X86ISD::VFPCLASSS:
	    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
	  default:
	    break;
	  }

	  if (PreservedSrc.isUndef())
	    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
	  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
	}

	static int getSEHRegistrationNodeSize(const Function *Fn) {
	if (!Fn->hasPersonalityFn())
	report_fatal_error(
	"querying registration node size for function without personality");
	// The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
	// WinEHStatePass for the full struct definition.
	switch (classifyEHPersonality(Fn->getPersonalityFn())) {
	case EHPersonality::MSVC_X86SEH: return 24;
	case EHPersonality::MSVC_CXX: return 16;
	default: break;
	}
	report_fatal_error(
	"can only recover FP for 32-bit MSVC EH personality functions");
	}

	/// Recover the parent function's frame pointer from the EBP value that is
	/// live on entry (\p EntryEBP), i.e. when the MSVC runtime transfers control
	/// to an outlined function or back to a parent frame after catching an
	/// exception.
	///
	/// On 32-bit targets the computation is:
	///   RegNodeBase = EntryEBP - RegNodeSize
	///   ParentFP    = RegNodeBase - ParentFrameOffset
	/// Subtracting RegNodeSize lands on the registration node, and subtracting
	/// the (negative on x86) frame offset gets back to the parent FP.
	/// On 64-bit targets the result is simply EntryEBP + ParentFrameOffset.
	/// If \p Fn has no personality function, \p EntryEBP is returned unchanged.
	static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
	                                   SDValue EntryEBP) {
	  // The parent may have lost its personality function if the exceptional
	  // code was optimized away; the incoming EBP is then used as is.
	  if (!Fn->hasPersonalityFn())
	    return EntryEBP;

	  SDLoc dl;
	  MVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());

	  // Look up (or create) the MCSymbol for this function's parent frame
	  // offset and wrap it in a LOCAL_RECOVER node.
	  MCSymbol *OffsetSym =
	      DAG.getMachineFunction()
	          .getMMI()
	          .getContext()
	          .getOrCreateParentFrameOffsetSymbol(
	              GlobalValue::dropLLVMManglingEscape(Fn->getName()));
	  SDValue ParentFrameOffset = DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT,
	                                          DAG.getMCSymbol(OffsetSym, PtrVT));

	  // x64: EntryEBP + ParentFrameOffset adjusts from RSP after the prologue
	  // to RBP in the parent function.
	  if (static_cast<const X86Subtarget &>(DAG.getSubtarget()).is64Bit())
	    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

	  // 32-bit: step back over the registration node, then over the frame
	  // offset.
	  SDValue RegNodeBase =
	      DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
	                  DAG.getConstant(getSEHRegistrationNodeSize(Fn), dl, PtrVT));
	  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
	}

	static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	// Helper to detect if the operand is CUR_DIRECTION rounding mode.
	auto isRoundModeCurDirection = [](SDValue Rnd) {
	if (!isa<ConstantSDNode>(Rnd))
	return false;

	unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
	return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
	};

	SDLoc dl(Op);
	unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	MVT VT = Op.getSimpleValueType();
	const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
	if (IntrData) {
	switch(IntrData->Type) {
	case INTR_TYPE_1OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
	case INTR_TYPE_2OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2));
	case INTR_TYPE_3OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3));
	case INTR_TYPE_4OP:
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
	Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
	case INTR_TYPE_1OP_MASK_RM: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue RoundingMode;
	// We always add rounding mode to the Node.
	// If the rounding mode is not specified, we add the
	// "current direction" mode.
	if (Op.getNumOperands() == 4)
	RoundingMode =
	DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	else
	RoundingMode = Op.getOperand(4);
	assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	RoundingMode),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_1OP_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue passThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, VT, Src1, Src2, Rnd),
	Mask, passThru, Subtarget, DAG);
	}
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
	Mask, passThru, Subtarget, DAG);
	}
	case INTR_TYPE_SCALAR_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src0 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// There are 2 kinds of intrinsics in this group:
	// (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
	// (2) With rounding mode and sae - 7 operands.
	if (Op.getNumOperands() == 6) {
	SDValue Sae = Op.getOperand(5);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	Sae),
	Mask, Src0, Subtarget, DAG);
	}
	assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
	SDValue RoundingMode = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
	RoundingMode, Sae),
	Mask, Src0, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK:
	case INTR_TYPE_2OP_IMM8_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
	Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	// TODO: Intrinsics should have fast-math-flags to propagate.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_2OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (6 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 6)
	Rnd = Op.getOperand(5);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_SCALAR_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Sae = Op.getOperand(6);

	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
	Src2, Src3, Sae),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_MASK_RM: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Imm = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Imm, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case INTR_TYPE_3OP_IMM8_MASK:
	case INTR_TYPE_3OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue PassThru = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);

	if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
	Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(6);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_2OP_MASK : {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue PassThru = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
	Mask, PassThru, Subtarget, DAG);
	}
	case VPERM_3OP_MASKZ:
	case VPERM_3OP_MASK:{
	MVT VT = Op.getSimpleValueType();
	// Src2 is the PassThru
	SDValue Src1 = Op.getOperand(1);
	// PassThru needs to be the same type as the destination in order
	// to pattern match correctly.
	SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == VPERM_3OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else
	PassThru = Src2;

	// Swap Src1 and Src2 in the node creation
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src2, Src1, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_MASK3:
	case FMA_OP_MASKZ:
	case FMA_OP_MASK: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src1, Src2, Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
	dl, Op.getValueType(),
	Src1, Src2, Src3),
	Mask, PassThru, Subtarget, DAG);
	}
	case FMA_OP_SCALAR_MASK:
	case FMA_OP_SCALAR_MASK3:
	case FMA_OP_SCALAR_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Mask = Op.getOperand(4);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = SDValue();

	// set PassThru element
	if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);
	else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
	PassThru = Src3;
	else
	PassThru = Src1;

	SDValue Rnd = Op.getOperand(5);
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
	Op.getValueType(), Src1, Src2,
	Src3, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	case TERLOG_OP_MASK:
	case TERLOG_OP_MASKZ: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
	SDValue Mask = Op.getOperand(5);
	MVT VT = Op.getSimpleValueType();
	SDValue PassThru = Src1;
	// Set PassThru element.
	if (IntrData->Type == TERLOG_OP_MASKZ)
	PassThru = getZeroVector(VT, Subtarget, DAG, dl);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Src4),
	Mask, PassThru, Subtarget, DAG);
	}
	case CVTPD2PS:
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
	DAG.getIntPtrConstant(0, dl));
	case CVTPD2PS_MASK: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	// We add rounding mode to the Node when
	// - RM Opcode is specified and
	// - RM is not "current direction".
	unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
	if (IntrWithRoundingModeOpcode != 0) {
	SDValue Rnd = Op.getOperand(4);
	if (!isRoundModeCurDirection(Rnd)) {
	return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
	dl, Op.getValueType(),
	Src, Rnd),
	Mask, PassThru, Subtarget, DAG);
	}
	}
	assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
	// ISD::FP_ROUND has a second argument that indicates if the truncation
	// does not change the value. Set it to 0 since it can change.
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
	DAG.getIntPtrConstant(0, dl)),
	Mask, PassThru, Subtarget, DAG);
	}
	case FPCLASS: {
	// FPclass intrinsics with mask
	SDValue Src1 = Op.getOperand(1);
	MVT VT = Src1.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
	SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
	DAG.getTargetConstant(0, dl, MaskVT),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case FPCLASSS: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Imm = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
	SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
	DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
	return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case CMP_MASK:
	case CMP_MASK_CC: {
	// Comparison intrinsics with masks.
	// Example of transformation:
	// (i8 (int_x86_avx512_mask_pcmpeq_q_128
	// (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
	// (i8 (bitcast
	// (v8i1 (insert_subvector undef,
	// (v2i1 (and (PCMPEQM %a, %b),
	// (extract_subvector
	// (v8i1 (bitcast %mask)), 0))), 0))))
	MVT VT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
	MVT BitcastVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	SDValue Cmp;
	if (IntrData->Type == CMP_MASK_CC) {
	SDValue CC = Op.getOperand(3);
	CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
	// We specify 2 possible opcodes for intrinsics with rounding modes.
	// First, we check if the intrinsic may have non-default rounding mode,
	// (IntrData->Opc1 != 0), then we check the rounding mode operand.
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2), CC);

	} else {
	assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
	Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
	Op.getOperand(2));
	}
	SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
	DAG.getTargetConstant(0, dl,
	MaskVT),
	Subtarget, DAG);
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CmpMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case CMP_MASK_SCALAR_CC: {
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
	SDValue Mask = Op.getOperand(4);

	SDValue Cmp;
	if (IntrData->Opc1 != 0) {
	SDValue Rnd = Op.getOperand(5);
	if (!isRoundModeCurDirection(Rnd))
	Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
	}
	//default rounding mode
	if(!Cmp.getNode())
	Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

	SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
	DAG.getTargetConstant(0, dl,
	MVT::i1),
	Subtarget, DAG);
	return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
	DAG.getIntPtrConstant(0, dl));
	}
	case COMI: { // Comparison intrinsics
	ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
	SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
	SDValue SetCC;
	switch (CC) {
	case ISD::SETEQ: { // (ZF = 0 and PF = 0)
	SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
	SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
	break;
	}
	case ISD::SETNE: { // (ZF = 1 or PF = 1)
	SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
	SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
	SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
	break;
	}
	case ISD::SETGT: // (CF = 0 and ZF = 0)
	SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
	break;
	case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
	SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
	break;
	}
	case ISD::SETGE: // CF = 0
	SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
	break;
	case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
	SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
	break;
	default:
	llvm_unreachable("Unexpected illegal condition!");
	}
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case COMI_RM: { // Comparison intrinsics with Sae
	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
	SDValue Sae = Op.getOperand(4);

	SDValue FCmp;
	if (isRoundModeCurDirection(Sae))
	FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8));
	else
	FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
	DAG.getConstant(CondVal, dl, MVT::i8), Sae);
	return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
	DAG.getIntPtrConstant(0, dl));
	}
	case VSHIFT:
	return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
	Op.getOperand(1), Op.getOperand(2), Subtarget,
	DAG);
	case COMPRESS_EXPAND_IN_REG: {
	SDValue Mask = Op.getOperand(3);
	SDValue DataToCompress = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	if (isAllOnesConstant(Mask)) // return data as is
	return Op.getOperand(1);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	DataToCompress),
	Mask, PassThru, Subtarget, DAG);
	}
	case BROADCASTM: {
	SDValue Mask = Op.getOperand(1);
	MVT MaskVT = MVT::getVectorVT(MVT::i1,
	Mask.getSimpleValueType().getSizeInBits());
	Mask = DAG.getBitcast(MaskVT, Mask);
	return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
	}
	case KUNPCK: {
	MVT VT = Op.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);

	SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
	SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
	// Arguments should be swapped.
	SDValue Res = DAG.getNode(IntrData->Opc0, dl,
	MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
	Src2, Src1);
	return DAG.getBitcast(VT, Res);
	}
	case MASK_BINOP: {
	MVT VT = Op.getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
	SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
	SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
	return DAG.getBitcast(VT, Res);
	}
	case FIXUPIMMS:
	case FIXUPIMMS_MASKZ:
	case FIXUPIMM:
	case FIXUPIMM_MASKZ:{
	SDValue Src1 = Op.getOperand(1);
	SDValue Src2 = Op.getOperand(2);
	SDValue Src3 = Op.getOperand(3);
	SDValue Imm = Op.getOperand(4);
	SDValue Mask = Op.getOperand(5);
	SDValue Passthru = (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMMS ) ?
	Src1 : getZeroVector(VT, Subtarget, DAG, dl);
	// We specify 2 possible modes for intrinsics, with/without rounding
	// modes.
	// First, we check if the intrinsic have rounding mode (7 operands),
	// if not, we set rounding mode to "current".
	SDValue Rnd;
	if (Op.getNumOperands() == 7)
	Rnd = Op.getOperand(6);
	else
	Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
	if (IntrData->Type == FIXUPIMM \|\| IntrData->Type == FIXUPIMM_MASKZ)
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
	return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	Src1, Src2, Src3, Imm, Rnd),
	Mask, Passthru, Subtarget, DAG);
	}
	case CONVERT_TO_MASK: {
	MVT SrcVT = Op.getOperand(1).getSimpleValueType();
	MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
	MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

	SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
	Op.getOperand(1));
	SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
	DAG.getUNDEF(BitcastVT), CvtMask,
	DAG.getIntPtrConstant(0, dl));
	return DAG.getBitcast(Op.getValueType(), Res);
	}
	case BRCST_SUBVEC_TO_VEC: {
	SDValue Src = Op.getOperand(1);
	SDValue Passthru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);
	EVT resVT = Passthru.getValueType();
	SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
	DAG.getUNDEF(resVT), Src,
	DAG.getIntPtrConstant(0, dl));
	SDValue immVal;
	if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
	immVal = DAG.getConstant(0x44, dl, MVT::i8);
	else
	immVal = DAG.getConstant(0, dl, MVT::i8);
	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
	subVec, subVec, immVal),
	Mask, Passthru, Subtarget, DAG);
	}
	case BRCST32x2_TO_VEC: {
	SDValue Src = Op.getOperand(1);
	SDValue PassThru = Op.getOperand(2);
	SDValue Mask = Op.getOperand(3);

	assert((VT.getScalarType() == MVT::i32 \|\|
	VT.getScalarType() == MVT::f32) && "Unexpected type!");
	//bitcast Src to packed 64
	MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
	MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
	Src = DAG.getBitcast(BitcastVT, Src);

	return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
	Mask, PassThru, Subtarget, DAG);
	}
	default:
	break;
	}
	}

	switch (IntNo) {
	default: return SDValue(); // Don't custom lower most intrinsics.

	case Intrinsic::x86_avx2_permd:
	case Intrinsic::x86_avx2_permps:
	// Operands intentionally swapped. Mask is last operand to intrinsic,
	// but second operand for node/instruction.
	return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
	Op.getOperand(2), Op.getOperand(1));

	// ptest and testp intrinsics. The intrinsic these come from are designed to
	// return an integer value, not just an instruction so lower it to the ptest
	// or testp pattern and a setcc for the result.
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestz_256:
	case Intrinsic::x86_avx_ptestc_256:
	case Intrinsic::x86_avx_ptestnzc_256:
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256: {
	bool IsTestPacked = false;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
	case Intrinsic::x86_avx_vtestz_ps:
	case Intrinsic::x86_avx_vtestz_pd:
	case Intrinsic::x86_avx_vtestz_ps_256:
	case Intrinsic::x86_avx_vtestz_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestz:
	case Intrinsic::x86_avx_ptestz_256:
	// ZF = 1
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_avx_vtestc_ps:
	case Intrinsic::x86_avx_vtestc_pd:
	case Intrinsic::x86_avx_vtestc_ps_256:
	case Intrinsic::x86_avx_vtestc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestc:
	case Intrinsic::x86_avx_ptestc_256:
	// CF = 1
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_avx_vtestnzc_ps:
	case Intrinsic::x86_avx_vtestnzc_pd:
	case Intrinsic::x86_avx_vtestnzc_ps_256:
	case Intrinsic::x86_avx_vtestnzc_pd_256:
	IsTestPacked = true;
	LLVM_FALLTHROUGH;
	case Intrinsic::x86_sse41_ptestnzc:
	case Intrinsic::x86_avx_ptestnzc_256:
	// ZF and CF = 0
	X86CC = X86::COND_A;
	break;
	}

	SDValue LHS = Op.getOperand(1);
	SDValue RHS = Op.getOperand(2);
	unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
	SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}
	case Intrinsic::x86_avx512_kortestz_w:
	case Intrinsic::x86_avx512_kortestc_w: {
	X86::CondCode X86CC =
	(IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
	SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_avx512_knot_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kandn_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	// Invert LHS for the not.
	LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
	DAG.getConstant(1, dl, MVT::v16i1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_avx512_kxnor_w: {
	SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
	SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
	SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
	// Invert result for the not.
	Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
	DAG.getConstant(1, dl, MVT::v16i1));
	return DAG.getBitcast(MVT::i16, Res);
	}

	case Intrinsic::x86_sse42_pcmpistria128:
	case Intrinsic::x86_sse42_pcmpestria128:
	case Intrinsic::x86_sse42_pcmpistric128:
	case Intrinsic::x86_sse42_pcmpestric128:
	case Intrinsic::x86_sse42_pcmpistrio128:
	case Intrinsic::x86_sse42_pcmpestrio128:
	case Intrinsic::x86_sse42_pcmpistris128:
	case Intrinsic::x86_sse42_pcmpestris128:
	case Intrinsic::x86_sse42_pcmpistriz128:
	case Intrinsic::x86_sse42_pcmpestriz128: {
	unsigned Opcode;
	X86::CondCode X86CC;
	switch (IntNo) {
	default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
	case Intrinsic::x86_sse42_pcmpistria128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpestria128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_A;
	break;
	case Intrinsic::x86_sse42_pcmpistric128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpestric128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_B;
	break;
	case Intrinsic::x86_sse42_pcmpistrio128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpestrio128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_O;
	break;
	case Intrinsic::x86_sse42_pcmpistris128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpestris128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_S;
	break;
	case Intrinsic::x86_sse42_pcmpistriz128:
	Opcode = X86ISD::PCMPISTRI;
	X86CC = X86::COND_E;
	break;
	case Intrinsic::x86_sse42_pcmpestriz128:
	Opcode = X86ISD::PCMPESTRI;
	X86CC = X86::COND_E;
	break;
	}
	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
	SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
	return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
	}

	case Intrinsic::x86_sse42_pcmpistri128:
	case Intrinsic::x86_sse42_pcmpestri128: {
	unsigned Opcode;
	if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
	Opcode = X86ISD::PCMPISTRI;
	else
	Opcode = X86ISD::PCMPESTRI;

	SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
	SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
	return DAG.getNode(Opcode, dl, VTs, NewOps);
	}

	case Intrinsic::eh_sjlj_lsda: {
	MachineFunction &MF = DAG.getMachineFunction();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	auto &Context = MF.getMMI().getContext();
	MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
	Twine(MF.getFunctionNumber()));
	return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
	}

	case Intrinsic::x86_seh_lsda: {
	// Compute the symbol for the LSDA. We know it'll get emitted later.
	MachineFunction &MF = DAG.getMachineFunction();
	SDValue Op1 = Op.getOperand(1);
	auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
	MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
	GlobalValue::dropLLVMManglingEscape(Fn->getName()));

	// Generate a simple absolute symbol reference. This intrinsic is only
	// supported on 32-bit Windows, which isn't PIC.
	SDValue Result = DAG.getMCSymbol(LSDASym, VT);
	return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
	}

	case Intrinsic::x86_seh_recoverfp: {
	SDValue FnOp = Op.getOperand(1);
	SDValue IncomingFPOp = Op.getOperand(2);
	GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
	auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
	if (!Fn)
	report_fatal_error(
	"llvm.x86.seh.recoverfp must take a function as the first argument");
	return recoverFramePointer(DAG, Fn, IncomingFPOp);
	}

	case Intrinsic::localaddress: {
	// Returns one of the stack, base, or frame pointer registers, depending on
	// which is used to reference local variables.
	MachineFunction &MF = DAG.getMachineFunction();
	const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	unsigned Reg;
	if (RegInfo->hasBasePointer(MF))
	Reg = RegInfo->getBaseRegister();
	else // This function handles the SP or FP case.
	Reg = RegInfo->getPtrSizedFrameRegister(MF);
	return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
	}
	}
	}

	/// Build the machine node for a gather whose mask is used as-is.
	///
	/// Opc is the machine opcode to emit; Op supplies the debug location and the
	/// value type of the gathered result. The mask's own value type becomes the
	/// second result type of the new node.
	///
	/// Returns an empty SDValue when ScaleOp is not a constant. Otherwise returns
	/// the merged pair {result #0 (gathered value), result #2 (chain)}.
	static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                                 SDValue Src, SDValue Mask, SDValue Base,
	                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
	                                 const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
	  // Scale must be constant.
	  if (!C)
	    return SDValue();
	  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
	  EVT MaskVT = Mask.getValueType();
	  // Results: gathered value, mask-typed value, chain.
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);
	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
	  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
	  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
	  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Build the machine node for a gather whose mask is converted to a vXi1
	/// vector.
	///
	/// Opc is the machine opcode to emit; Op supplies the debug location and the
	/// value type of the gathered result. The mask is converted with getMaskNode
	/// to an i1 vector that has one element per element of Index.
	///
	/// Returns an empty SDValue when ScaleOp is not a constant. Otherwise returns
	/// the merged pair {result #0 (gathered value), result #2 (chain)}.
	static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                             SDValue Src, SDValue Mask, SDValue Base,
	                             SDValue Index, SDValue ScaleOp, SDValue Chain,
	                             const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);
	  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
	  // Scale must be constant.
	  if (!C)
	    return SDValue();
	  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
	  MVT MaskVT = MVT::getVectorVT(MVT::i1,
	                                Index.getSimpleValueType().getVectorNumElements());

	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	  // Results: gathered value, mask-typed value, chain.
	  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
	  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue Segment = DAG.getRegister(0, MVT::i32);
	  // If source is undef or we know it won't be used, use a zero vector
	  // to break register dependency.
	  // TODO: use undef instead and let ExecutionDepsFix deal with it?
	  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
	    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
	  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
	  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
	  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
	  return DAG.getMergeValues(RetOps, dl);
	}

	/// Build the machine node for a masked scatter.
	///
	/// Returns an empty SDValue when ScaleOp is not a constant; otherwise returns
	/// result #1 (the chain) of the newly created machine node.
	static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                              SDValue Src, SDValue Mask, SDValue Base,
	                              SDValue Index, SDValue ScaleOp, SDValue Chain,
	                              const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Bail out unless the scale operand is a constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDValue ScaleImm =
	      DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDValue ZeroDisp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue NoSegment = DAG.getRegister(0, MVT::i32);

	  // The mask gets one i1 element per element of the index vector.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDVTList ResultTys = DAG.getVTList(MaskVT, MVT::Other);
	  SDValue Ops[] = {Base,      ScaleImm, Index, ZeroDisp,
	                   NoSegment, VMask,    Src,   Chain};
	  SDNode *Scatter = DAG.getMachineNode(Opc, dl, ResultTys, Ops);
	  return SDValue(Scatter, 1);
	}

	/// Build the machine node for a masked gather/scatter prefetch.
	///
	/// Returns an empty SDValue when ScaleOp is not a constant; otherwise returns
	/// result #0 of the new node, whose only result type is MVT::Other.
	static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
	                               SDValue Mask, SDValue Base, SDValue Index,
	                               SDValue ScaleOp, SDValue Chain,
	                               const X86Subtarget &Subtarget) {
	  SDLoc dl(Op);

	  // Bail out unless the scale operand is a constant.
	  auto *ScaleC = dyn_cast<ConstantSDNode>(ScaleOp);
	  if (ScaleC == nullptr)
	    return SDValue();

	  SDValue ScaleImm =
	      DAG.getTargetConstant(ScaleC->getZExtValue(), dl, MVT::i8);
	  SDValue ZeroDisp = DAG.getTargetConstant(0, dl, MVT::i32);
	  SDValue NoSegment = DAG.getRegister(0, MVT::i32);

	  // The mask gets one i1 element per element of the index vector.
	  unsigned NumIndexElts = Index.getSimpleValueType().getVectorNumElements();
	  MVT MaskVT = MVT::getVectorVT(MVT::i1, NumIndexElts);
	  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	  SDValue Ops[] = {VMask, Base, ScaleImm, Index, ZeroDisp, NoSegment, Chain};
	  SDNode *Prefetch = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
	  return SDValue(Prefetch, 0);
	}

	/// Lowers the builtin intrinsic that returns the value of an extended
	/// control register. Appends the 64-bit value and then the output chain to
	/// Results.
	static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
	                                       SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget,
	                                       SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	  const bool Is64Bit = Subtarget.is64Bit();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // ECX selects the index of the XCR register to read.
	  SDValue InChain =
	      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
	  SDNode *XGetBV = DAG.getMachineNode(X86::XGETBV, DL, NodeTys, InChain);

	  // The XCR contents are returned in EDX:EAX. On 64-bit targets the halves
	  // are read through RAX/RDX as i64 values.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo = DAG.getCopyFromReg(SDValue(XGetBV, 0), DL, LoReg, HalfVT,
	                                  SDValue(XGetBV, 1));
	  SDValue Hi =
	      DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT, Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  SDValue Merged;
	  if (Is64Bit) {
	    // Combine the halves as Lo | (Hi << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to form the 64-bit value from the two i32 halves.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Lowers the builtin intrinsics that read performance monitor counters
	/// (x86_rdpmc). Appends the 64-bit counter value and then the output chain
	/// to Results.
	static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
	                                      SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget,
	                                      SmallVectorImpl<SDValue> &Results) {
	  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
	  const bool Is64Bit = Subtarget.is64Bit();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);

	  // ECX selects the index of the performance counter to read.
	  SDValue InChain =
	      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
	  SDValue ReadNode = DAG.getNode(X86ISD::RDPMC_DAG, DL, NodeTys, InChain);

	  // The counter value is returned in EDX:EAX. On 64-bit targets the halves
	  // are read through RAX/RDX as i64 values.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo =
	      DAG.getCopyFromReg(ReadNode, DL, LoReg, HalfVT, ReadNode.getValue(1));
	  SDValue Hi =
	      DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT, Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  SDValue Merged;
	  if (Is64Bit) {
	    // EAX holds the low-order 32 bits and EDX the supported high-order bits
	    // of the counter; combine them as Lo | (Hi << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to form the 64-bit value from the two i32 halves.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Lowers the builtin intrinsics that read the time stamp counter
	/// (x86_rdtsc and x86_rdtscp); also used to custom lower READCYCLECOUNTER
	/// nodes. Appends the 64-bit counter value and then the output chain to
	/// Results.
	static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
	                                    SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    SmallVectorImpl<SDValue> &Results) {
	  const bool Is64Bit = Subtarget.is64Bit();
	  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue ReadNode = DAG.getNode(Opcode, DL, NodeTys, N->getOperand(0));

	  // The time-stamp counter (a 64-bit MSR) is returned in EDX:EAX, with the
	  // high-order 32 bits in EDX and the low-order 32 bits in EAX. On 64-bit
	  // targets the halves are read through RAX/RDX as i64 values.
	  const unsigned LoReg = Is64Bit ? X86::RAX : X86::EAX;
	  const unsigned HiReg = Is64Bit ? X86::RDX : X86::EDX;
	  const MVT HalfVT = Is64Bit ? MVT::i64 : MVT::i32;
	  SDValue Lo =
	      DAG.getCopyFromReg(ReadNode, DL, LoReg, HalfVT, ReadNode.getValue(1));
	  SDValue Hi =
	      DAG.getCopyFromReg(Lo.getValue(1), DL, HiReg, HalfVT, Lo.getValue(2));
	  SDValue OutChain = Hi.getValue(1);

	  if (Opcode == X86ISD::RDTSCP_DAG) {
	    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

	    // RDTSCP also loads IA32:TSC_AUX_MSR (address C000_0103H) into ECX.
	    // Read ECX explicitly on the chain.
	    SDValue AuxVal = DAG.getCopyFromReg(OutChain, DL, X86::ECX, MVT::i32,
	                                        Hi.getValue(2));
	    // Store ECX to the location passed as operand 2 of the 'rdtscp'
	    // intrinsic.
	    OutChain = DAG.getStore(AuxVal.getValue(1), DL, AuxVal, N->getOperand(2),
	                            MachinePointerInfo());
	  }

	  SDValue Merged;
	  if (Is64Bit) {
	    // Combine the halves as Lo | (Hi << 32).
	    SDValue ShiftAmt = DAG.getConstant(32, DL, MVT::i8);
	    SDValue HiShifted = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, ShiftAmt);
	    Merged = DAG.getNode(ISD::OR, DL, MVT::i64, Lo, HiShifted);
	  } else {
	    // Use a BUILD_PAIR to form the 64-bit value from the two i32 halves.
	    SDValue Halves[] = { Lo, Hi };
	    Merged = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Halves);
	  }

	  Results.push_back(Merged);
	  Results.push_back(OutChain);
	}

	/// Custom lowering for READCYCLECOUNTER: emits an RDTSC_DAG read through
	/// getReadTimeStampCounter and merges the values it produces.
	static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
	                                     SelectionDAG &DAG) {
	  SDLoc DL(Op);
	  SDNode *Node = Op.getNode();
	  SmallVector<SDValue, 2> Parts;
	  getReadTimeStampCounter(Node, DL, X86ISD::RDTSC_DAG, DAG, Subtarget, Parts);
	  return DAG.getMergeValues(Parts, DL);
	}

	/// Records the frame index of the EH registration node (operand 2 of Op) in
	/// the function's WinEHFuncInfo. No DAG nodes are created; the incoming
	/// chain (operand 0) is returned unchanged.
	static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  SDValue InChain = Op.getOperand(0);
	  SDValue RegNodeOp = Op.getOperand(2);

	  WinEHFuncInfo *WinEHInfo = MF.getWinEHFuncInfo();
	  if (WinEHInfo == nullptr)
	    report_fatal_error("EH registrations only live in functions using WinEH");

	  // The operand must be a frame index; remember which one.
	  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(RegNodeOp);
	  if (FrameIdxNode == nullptr)
	    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
	  WinEHInfo->EHRegNodeFrameIndex = FrameIdxNode->getIndex();

	  return InChain;
	}

	/// Records the frame index of the EH guard slot (operand 2 of Op) in the
	/// function's WinEHFuncInfo. No DAG nodes are created; the incoming chain
	/// (operand 0) is returned unchanged.
	static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
	  MachineFunction &MF = DAG.getMachineFunction();
	  SDValue InChain = Op.getOperand(0);
	  SDValue GuardOp = Op.getOperand(2);

	  WinEHFuncInfo *WinEHInfo = MF.getWinEHFuncInfo();
	  if (WinEHInfo == nullptr)
	    report_fatal_error("EHGuard only live in functions using WinEH");

	  // The operand must be a frame index; remember which one.
	  auto *FrameIdxNode = dyn_cast<FrameIndexSDNode>(GuardOp);
	  if (FrameIdxNode == nullptr)
	    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
	  WinEHInfo->EHGuardFrameIndex = FrameIdxNode->getIndex();

	  return InChain;
	}

	/// Emit a truncating store with saturation: a TruncSStoreSDNode when
	/// SignedSat is true, a TruncUSStoreSDNode otherwise.
	static SDValue
	EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
	                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
	                SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  // The fourth operand is an undef of the pointer's type.
	  SDValue UndefOffset = DAG.getUNDEF(Ptr.getValueType());
	  SDValue StoreOps[] = { Chain, Val, Ptr, UndefOffset };

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<TruncSStoreSDNode>(ChainOnly, StoreOps, Dl,
	                                                     MemVT, MMO);
	  return DAG.getTargetMemSDNode<TruncUSStoreSDNode>(ChainOnly, StoreOps, Dl,
	                                                    MemVT, MMO);
	}

	/// Emit a masked truncating store with saturation: a
	/// MaskedTruncSStoreSDNode when SignedSat is true, a
	/// MaskedTruncUSStoreSDNode otherwise.
	static SDValue
	EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
	                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
	                      MachineMemOperand *MMO, SelectionDAG &DAG) {
	  SDVTList ChainOnly = DAG.getVTList(MVT::Other);
	  // Operand order is chain, pointer, mask, value.
	  SDValue StoreOps[] = { Chain, Ptr, Mask, Val };

	  if (SignedSat)
	    return DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(ChainOnly, StoreOps,
	                                                           Dl, MemVT, MMO);
	  return DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(ChainOnly, StoreOps,
	                                                          Dl, MemVT, MMO);
	}

	/// Custom-lower an intrinsic node that carries a chain.
	///
	/// Operand 0 of Op is the chain and operand 1 is the intrinsic ID.
	/// Intrinsics without an entry in the getIntrinsicWithChain() table are
	/// handled by the first switch; all others are dispatched on IntrData->Type.
	/// An empty SDValue is returned when no replacement node is produced here.
	static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

	  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
	  if (!IntrData) {
	    switch (IntNo) {
	    case llvm::Intrinsic::x86_seh_ehregnode:
	      return MarkEHRegistrationNode(Op, DAG);
	    case llvm::Intrinsic::x86_seh_ehguard:
	      return MarkEHGuard(Op, DAG);
	    case llvm::Intrinsic::x86_flags_read_u32:
	    case llvm::Intrinsic::x86_flags_read_u64:
	    case llvm::Intrinsic::x86_flags_write_u32:
	    case llvm::Intrinsic::x86_flags_write_u64: {
	      // We need a frame pointer because this will get lowered to a PUSH/POP
	      // sequence.
	      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
	      MFI.setHasCopyImplyingStackAdjustment(true);
	      // Don't do anything here, we will expand these intrinsics out later
	      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
	      return SDValue();
	    }
	    case Intrinsic::x86_lwpins32:
	    case Intrinsic::x86_lwpins64: {
	      SDLoc dl(Op);
	      SDValue Chain = Op->getOperand(0);
	      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
	      SDValue LwpIns =
	          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
	                      Op->getOperand(3), Op->getOperand(4));
	      // The intrinsic's value is derived from COND_B on the LWPINS result.
	      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
	      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
	      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
	                         LwpIns.getValue(1));
	    }
	    }
	    return SDValue();
	  }

	  SDLoc dl(Op);
	  switch(IntrData->Type) {
	  default: llvm_unreachable("Unknown Intrinsic Type");
	  case RDSEED:
	  case RDRAND: {
	    // Emit the node with the right value type.
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
	    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
	    // Otherwise return the value from Rand, which is always 0, casted to i32.
	    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
	                      DAG.getConstant(1, dl, Op->getValueType(1)),
	                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
	                      SDValue(Result.getNode(), 1) };
	    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
	                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
	                                  Ops);

	    // Return { result, isValid, chain }.
	    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
	                       SDValue(Result.getNode(), 2));
	  }
	  case GATHER_AVX2: {
	    SDValue Chain = Op.getOperand(0);
	    SDValue Src = Op.getOperand(2);
	    SDValue Base = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Mask = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	                             Scale, Chain, Subtarget);
	  }
	  case GATHER: {
	    //gather(v1, mask, index, base, scale);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Src = Op.getOperand(2);
	    SDValue Base = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Mask = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
	                         Chain, Subtarget);
	  }
	  case SCATTER: {
	    //scatter(base, mask, index, v1, scale);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Base = Op.getOperand(2);
	    SDValue Mask = Op.getOperand(3);
	    SDValue Index = Op.getOperand(4);
	    SDValue Src = Op.getOperand(5);
	    SDValue Scale = Op.getOperand(6);
	    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
	                          Scale, Chain, Subtarget);
	  }
	  case PREFETCH: {
	    SDValue Hint = Op.getOperand(6);
	    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
	    assert((HintVal == 2 || HintVal == 3) &&
	           "Wrong prefetch hint in intrinsic: should be 2 or 3");
	    // Hint 2 selects Opc1, hint 3 selects Opc0.
	    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
	    SDValue Chain = Op.getOperand(0);
	    SDValue Mask = Op.getOperand(2);
	    SDValue Index = Op.getOperand(3);
	    SDValue Base = Op.getOperand(4);
	    SDValue Scale = Op.getOperand(5);
	    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
	                           Subtarget);
	  }
	  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
	  case RDTSC: {
	    SmallVector<SDValue, 2> Results;
	    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
	                            Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // Read Performance Monitoring Counters.
	  case RDPMC: {
	    SmallVector<SDValue, 2> Results;
	    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // Get Extended Control Register.
	  case XGETBV: {
	    SmallVector<SDValue, 2> Results;
	    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
	    return DAG.getMergeValues(Results, dl);
	  }
	  // XTEST intrinsics.
	  case XTEST: {
	    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

	    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
	    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
	    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
	                       Ret, SDValue(InTrans.getNode(), 1));
	  }
	  // ADC/ADCX/SBB
	  case ADX: {
	    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
	    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
	    // Adding -1 to operand 2 produces the carry consumed by the operation
	    // below.
	    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
	                                DAG.getConstant(-1, dl, MVT::i8));
	    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
	                              Op.getOperand(4), GenCF.getValue(1));
	    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
	                                 Op.getOperand(5), MachinePointerInfo());
	    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
	    SDValue Results[] = { SetCC, Store };
	    return DAG.getMergeValues(Results, dl);
	  }
	  case COMPRESS_TO_MEM: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue DataToCompress = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);
	    MVT VT = DataToCompress.getSimpleValueType();

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    if (isAllOnesConstant(Mask)) // return just a store
	      return DAG.getStore(Chain, dl, DataToCompress, Addr,
	                          MemIntr->getMemOperand());

	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	    // Both flags must be passed explicitly; the second one marks the store
	    // as compressing.
	    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
	                              MemIntr->getMemOperand(),
	                              false /* truncating */, true /* compressing */);
	  }
	  case TRUNCATE_TO_MEM_VI8:
	  case TRUNCATE_TO_MEM_VI16:
	  case TRUNCATE_TO_MEM_VI32: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue DataToTruncate = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    EVT MemVT = MemIntr->getMemoryVT();

	    uint16_t TruncationOp = IntrData->Opc0;
	    switch (TruncationOp) {
	    case X86ISD::VTRUNC: {
	      if (isAllOnesConstant(Mask)) // return just a truncate store
	        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
	                                 MemIntr->getMemOperand());

	      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
	                                MemIntr->getMemOperand(), true /* truncating */);
	    }
	    case X86ISD::VTRUNCUS:
	    case X86ISD::VTRUNCS: {
	      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
	      if (isAllOnesConstant(Mask))
	        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
	                               MemIntr->getMemOperand(), DAG);

	      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
	      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

	      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
	                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
	    }
	    default:
	      llvm_unreachable("Unsupported truncstore intrinsic");
	    }
	  }

	  case EXPAND_FROM_MEM: {
	    SDValue Mask = Op.getOperand(4);
	    SDValue PassThru = Op.getOperand(3);
	    SDValue Addr = Op.getOperand(2);
	    SDValue Chain = Op.getOperand(0);
	    MVT VT = Op.getSimpleValueType();

	    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
	    assert(MemIntr && "Expected MemIntrinsicSDNode!");

	    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
	      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
	    if (X86::isZeroNode(Mask))
	      return DAG.getUNDEF(VT);

	    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
	    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
	                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
	                             true /* expanding */);
	  }
	  }
	}

	/// Lower RETURNADDR. Marks the return address as taken, then loads it either
	/// from the return-address frame index (depth 0) or from one slot above the
	/// frame address computed by LowerFRAMEADDR (depth > 0). Returns an empty
	/// SDValue when verifyReturnAddressArgumentIsConstant reports a problem.
	SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
	                                           SelectionDAG &DAG) const {
	  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);

	  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
	    return SDValue();

	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  SDLoc dl(Op);
	  EVT PtrVT = getPointerTy(DAG.getDataLayout());

	  if (Depth == 0) {
	    // Just load the return address.
	    SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
	    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
	                       MachinePointerInfo());
	  }

	  // The return address is loaded from one slot past the frame address.
	  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  SDValue SlotOffset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
	  SDValue RetAddrSlot =
	      DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, SlotOffset);
	  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrSlot,
	                     MachinePointerInfo());
	}

	/// Lower ADDROFRETURNADDR: marks the return address as taken and returns the
	/// return-address frame index.
	SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
	                                                 SelectionDAG &DAG) const {
	  MachineFrameInfo &FrameInfo = DAG.getMachineFunction().getFrameInfo();
	  FrameInfo.setReturnAddressIsTaken(true);
	  return getReturnAddressFrameIndex(DAG);
	}

	/// Lower FRAMEADDR. Marks the frame address as taken.
	///
	/// On targets using Windows CFI the result is a fixed frame object (created
	/// on first use and cached via the function info's FAIndex). Otherwise the
	/// frame register is read and dereferenced Depth times, where Depth is the
	/// constant operand 0 of Op.
	SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
	  MachineFunction &MF = DAG.getMachineFunction();
	  MachineFrameInfo &MFI = MF.getFrameInfo();
	  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  EVT VT = Op.getValueType();

	  MFI.setFrameAddressIsTaken(true);

	  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
	    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
	    // is not possible to crawl up the stack without looking at the unwind codes
	    // simultaneously.
	    int FrameAddrIndex = FuncInfo->getFAIndex();
	    if (!FrameAddrIndex) {
	      // Set up a frame object for the return address.
	      unsigned SlotSize = RegInfo->getSlotSize();
	      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
	          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
	      FuncInfo->setFAIndex(FrameAddrIndex);
	    }
	    return DAG.getFrameIndex(FrameAddrIndex, VT);
	  }

	  unsigned FrameReg =
	      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	  SDLoc dl(Op); // FIXME probably not meaningful
	  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
	  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
	          (FrameReg == X86::EBP && VT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
	  // Each load through the current frame address steps up one level.
	  while (Depth--)
	    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
	                            MachinePointerInfo());
	  return FrameAddr;
	}

	// FIXME? Maybe this could be a TableGen attribute on some registers and
	// this table could be generated automatically from RegInfo.
	/// Map a register name ("esp", "rsp", "ebp", "rbp") to its X86 register.
	///
	/// Reports a fatal error for any other name, and also when a frame-pointer
	/// register is requested in a function that has no frame pointer.
	unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
	                                              SelectionDAG &DAG) const {
	  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
	  const MachineFunction &MF = DAG.getMachineFunction();

	  // Unknown names map to 0 and are rejected at the end.
	  unsigned Reg = StringSwitch<unsigned>(RegName)
	                     .Case("esp", X86::ESP)
	                     .Case("rsp", X86::RSP)
	                     .Case("ebp", X86::EBP)
	                     .Case("rbp", X86::RBP)
	                     .Default(0);

	  if (Reg == X86::EBP || Reg == X86::RBP) {
	    if (!TFI.hasFP(MF))
	      report_fatal_error("register " + StringRef(RegName) +
	                         " is allocatable: function has no frame pointer");
	#ifndef NDEBUG
	    else {
	      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	      unsigned FrameReg =
	          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
	      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
	             "Invalid Frame Register!");
	    }
	#endif
	  }

	  if (Reg)
	    return Reg;

	  report_fatal_error("Invalid register name global variable");
	}

	/// Lower FRAME_TO_ARGS_OFFSET to a pointer-sized constant equal to twice the
	/// stack slot size.
	SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
	                                                     SelectionDAG &DAG) const {
	  const unsigned SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
	  return DAG.getIntPtrConstant(2 * SlotSize, SDLoc(Op));
	}

	/// Return the register holding the exception pointer: RDX/EDX for the
	/// CoreCLR personality, RAX/EAX otherwise. The 64-bit register is chosen
	/// when the target is 64-bit LP64.
	unsigned X86TargetLowering::getExceptionPointerRegister(
	    const Constant *PersonalityFn) const {
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();
	  const bool IsCoreCLR =
	      classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR;

	  if (IsCoreCLR)
	    return IsLP64 ? X86::RDX : X86::EDX;
	  return IsLP64 ? X86::RAX : X86::EAX;
	}

	/// Return the register holding the exception selector: RDX on 64-bit LP64
	/// targets, EDX otherwise.
	unsigned X86TargetLowering::getExceptionSelectorRegister(
	    const Constant *PersonalityFn) const {
	  // Funclet personalities have no selector; the runtime does the selection.
	  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
	  if (Subtarget.isTarget64BitLP64())
	    return X86::RDX;
	  return X86::EDX;
	}

	/// Fixed catch objects are needed exactly when targeting Win64.
	bool X86TargetLowering::needsFixedCatchObjects() const {
	  const bool IsWin64 = Subtarget.isTargetWin64();
	  return IsWin64;
	}

	/// Lower EH_RETURN.
	///
	/// Stores Handler (operand 2) at FrameReg + SlotSize + Offset (operand 1),
	/// copies that address into RCX/ECX, and emits an X86ISD::EH_RETURN node
	/// that takes the register as its operand.
	SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
	  SDValue Chain = Op.getOperand(0);
	  SDValue Offset = Op.getOperand(1);
	  SDValue Handler = Op.getOperand(2);
	  SDLoc dl (Op);

	  EVT PtrVT = getPointerTy(DAG.getDataLayout());
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
	  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
	          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
	         "Invalid Frame Register!");
	  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
	  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

	  // StoreAddr = Frame + SlotSize + Offset.
	  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
	                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
	                                                        dl));
	  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
	  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
	  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

	  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
	                     DAG.getRegister(StoreAddrReg, PtrVT));
	}

	/// Lower EH_SJLJ_SETJMP to an X86ISD::EH_SJLJ_SETJMP node producing an i32
	/// value and a chain.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
	                                               SelectionDAG &DAG) const {
	  SDLoc DL(Op);

	  // On non-64-bit subtargets the global base register may be needed after
	  // the isel pseudos are expanded, i.e. after the CGBR pass has already run.
	  // Request it now so that the pass inserts its definition; otherwise we
	  // could end up referencing a virtual register that is never defined.
	  if (!Subtarget.is64Bit())
	    (void)Subtarget.getInstrInfo()->getGlobalBaseReg(
	        &DAG.getMachineFunction());

	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, ResultTys, Op.getOperand(0),
	                     Op.getOperand(1));
	}

	/// Lower EH_SJLJ_LONGJMP to an X86ISD::EH_SJLJ_LONGJMP node that forwards
	/// operands 0 and 1 of Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue InChain = Op.getOperand(0);
	  SDValue JmpBuf = Op.getOperand(1);
	  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, InChain, JmpBuf);
	}

	/// Lower EH_SJLJ_SETUP_DISPATCH to an X86ISD::EH_SJLJ_SETUP_DISPATCH node
	/// that forwards operand 0 of Op.
	SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
	                                                       SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  SDValue InChain = Op.getOperand(0);
	  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, InChain);
	}

	/// Lower ADJUST_TRAMPOLINE by returning operand 0 of Op unchanged.
	static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
	  SDValue Trampoline = Op.getOperand(0);
	  return Trampoline;
	}

	/// Lowers an INIT_TRAMPOLINE node by emitting the stores that write the
	/// trampoline's machine code into memory.
	///
	/// Operands of \p Op as read here:
	///   0 - value used as the chain of every emitted store
	///   1 - trampoline address
	///   2 - nested function
	///   3 - 'nest' parameter value
	///   4 - SrcValue describing the trampoline address
	///   5 - SrcValue wrapping the nested Function (read on the 32-bit path only)
	///
	/// 64-bit layout (byte offsets from the trampoline address):
	///    0: REX.WB, MOV64ri|r11   (movabsq <FPtr>, r11)
	///    2: FPtr
	///   10: REX.WB, MOV64ri|r10   (movabsq <Nest>, r10)
	///   12: Nest
	///   20: REX.WB, 0xFF          (jmpq *...)
	///   22: ModRM                 (...r11)
	///
	/// 32-bit layout:
	///    0: MOV32ri|NestReg
	///    1: Nest
	///    5: 0xE9                  (jmp <32bit dst>)
	///    6: FPtr - (Trmp + 10)
	///
	/// \returns a TokenFactor joining all emitted stores.
	SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
	                                                SelectionDAG &DAG) const {
	  SDValue Root = Op.getOperand(0);
	  SDValue Trmp = Op.getOperand(1); // trampoline
	  SDValue FPtr = Op.getOperand(2); // nested function
	  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
	  SDLoc dl(Op);

	  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  if (Subtarget.is64Bit()) {
	    SDValue OutChains[6];

	    // Large code-model.
	    const unsigned char JMP64r = 0xFF;  // 64-bit jmp through register opcode.
	    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

	    // Only the low three bits of the register encoding go into the opcode /
	    // ModRM byte; the fourth bit is carried by the REX prefix below.
	    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
	    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

	    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

	    // Load the pointer to the nested function into R11.
	    // The i16 store is little-endian: the REX prefix lands in the first byte
	    // and the opcode in the second.
	    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
	    SDValue Addr = Trmp;
	    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(2, dl, MVT::i64));
	    OutChains[1] =
	        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
	                     /* Alignment = */ 2);

	    // Load the 'nest' parameter value into R10.
	    // R10 is specified in X86CallingConv.td
	    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(10, dl, MVT::i64));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 10));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(12, dl, MVT::i64));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
	                     /* Alignment = */ 2);

	    // Jump to the nested function.
	    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(20, dl, MVT::i64));
	    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
	                                Addr, MachinePointerInfo(TrmpAddr, 20));

	    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
	                       DAG.getConstant(22, dl, MVT::i64));
	    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 22));

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  } else {
	    const Function *Func =
	        cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
	    CallingConv::ID CC = Func->getCallingConv();
	    unsigned NestReg;

	    switch (CC) {
	    default:
	      llvm_unreachable("Unsupported calling convention");
	    case CallingConv::C:
	    case CallingConv::X86_StdCall: {
	      // Pass 'nest' parameter in ECX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::ECX;

	      // Check that ECX wasn't needed by an 'inreg' parameter.
	      FunctionType *FTy = Func->getFunctionType();
	      const AttributeList &Attrs = Func->getAttributes();

	      if (!Attrs.isEmpty() && !Func->isVarArg()) {
	        unsigned InRegCount = 0;
	        unsigned Idx = 1; // attribute index of the current parameter
	        const auto &Layout = DAG.getDataLayout();

	        for (FunctionType::param_iterator I = FTy->param_begin(),
	                                          E = FTy->param_end();
	             I != E; ++I, ++Idx)
	          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
	            // Count the 32-bit registers this parameter occupies.
	            // FIXME: should only count parameters that are lowered to
	            // integers.
	            InRegCount += (Layout.getTypeSizeInBits(*I) + 31) / 32;
	          }

	        if (InRegCount > 2) {
	          report_fatal_error("Nest register in use - reduce number of inreg"
	                             " parameters!");
	        }
	      }
	      break;
	    }
	    case CallingConv::X86_FastCall:
	    case CallingConv::X86_ThisCall:
	    case CallingConv::Fast:
	      // Pass 'nest' parameter in EAX.
	      // Must be kept in sync with X86CallingConv.td
	      NestReg = X86::EAX;
	      break;
	    }

	    SDValue OutChains[4];
	    SDValue Addr, Disp;

	    // The jump displacement is taken relative to Trmp + 10, i.e. the byte
	    // following the 4-byte displacement stored at offset 6.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(10, dl, MVT::i32));
	    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

	    // This is storing the opcode for MOV32ri.
	    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
	    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
	    OutChains[0] =
	        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
	                     Trmp, MachinePointerInfo(TrmpAddr));

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(1, dl, MVT::i32));
	    OutChains[1] =
	        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
	                     /* Alignment = */ 1);

	    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(5, dl, MVT::i32));
	    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
	                                Addr, MachinePointerInfo(TrmpAddr, 5),
	                                /* Alignment = */ 1);

	    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
	                       DAG.getConstant(6, dl, MVT::i32));
	    OutChains[3] =
	        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
	                     /* Alignment = */ 1);

	    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
	  }
	}

	/// Computes the FLT_ROUNDS value from the x87 FP control word.
	///
	/// The control word is stored to a fresh 2-byte stack slot with FNSTCW16m,
	/// reloaded as an i16 and remapped. Its rounding mode lives in bits 11:10:
	///
	///   00 Round to nearest
	///   01 Round to -inf
	///   10 Round to +inf
	///   11 Round to 0
	///
	/// whereas FLT_ROUNDS expects:
	///
	///   -1 Undefined
	///    0 Round to 0
	///    1 Round to nearest
	///    2 Round to +inf
	///    3 Round to -inf
	///
	/// The conversion performed is:
	///   (((((CW & 0x800) >> 11) | ((CW & 0x400) >> 9)) + 1) & 3)
	///
	/// The i16 result is finally truncated or zero-extended to the type of
	/// \p Op.
	SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
	                                            SelectionDAG &DAG) const {
	  SDLoc DL(Op);
	  MVT VT = Op.getSimpleValueType();
	  MachineFunction &MF = DAG.getMachineFunction();
	  unsigned StackAlignment = Subtarget.getFrameLowering()->getStackAlignment();

	  // Reserve a 2-byte stack slot that receives the control word.
	  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
	  SDValue StackSlot =
	      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

	  // Store the control word into the slot.
	  MachineMemOperand *MMO =
	      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
	                              MachineMemOperand::MOStore, 2, 2);
	  SDValue StoreOps[] = {DAG.getEntryNode(), StackSlot};
	  SDValue Chain =
	      DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other),
	                              StoreOps, MVT::i16, MMO);

	  // Read it back as a plain i16.
	  SDValue CWD =
	      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

	  // Bit 11 moves down to bit 0.
	  SDValue Bit11 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
	                              DAG.getConstant(0x800, DL, MVT::i16));
	  SDValue CWD1 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit11,
	                             DAG.getConstant(11, DL, MVT::i8));

	  // Bit 10 moves down to bit 1.
	  SDValue Bit10 = DAG.getNode(ISD::AND, DL, MVT::i16, CWD,
	                              DAG.getConstant(0x400, DL, MVT::i16));
	  SDValue CWD2 = DAG.getNode(ISD::SRL, DL, MVT::i16, Bit10,
	                             DAG.getConstant(9, DL, MVT::i8));

	  // Combine, add one and keep the low two bits.
	  SDValue Merged = DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2);
	  SDValue Bumped = DAG.getNode(ISD::ADD, DL, MVT::i16, Merged,
	                               DAG.getConstant(1, DL, MVT::i16));
	  SDValue RetVal = DAG.getNode(ISD::AND, DL, MVT::i16, Bumped,
	                               DAG.getConstant(3, DL, MVT::i16));

	  // Adjust the i16 result to the requested type.
	  unsigned CastOpc = ISD::ZERO_EXTEND;
	  if (VT.getSizeInBits() < 16)
	    CastOpc = ISD::TRUNCATE;
	  return DAG.getNode(CastOpc, DL, VT, RetVal);
	}

	// Splits a unary vector op in two: the operand is cut into a low and a high
	// half, the same opcode is applied to each half, and the two half-width
	// results are concatenated back into the original type.
	static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  MVT FullVT = Op.getSimpleValueType();
	  unsigned Opc = Op.getOpcode();
	  unsigned HalfElts = FullVT.getVectorNumElements() / 2;
	  unsigned HalfBits = FullVT.getSizeInBits() / 2;
	  MVT HalfVT = MVT::getVectorVT(FullVT.getVectorElementType(), HalfElts);

	  // Cut the operand into its two halves.
	  SDValue Input = Op.getOperand(0);
	  SDValue LoHalf = extractSubVector(Input, 0, DAG, Loc, HalfBits);
	  SDValue HiHalf = extractSubVector(Input, HalfElts, DAG, Loc, HalfBits);

	  // Apply the op to each half and glue the results together.
	  SDValue LoRes = DAG.getNode(Opc, Loc, HalfVT, LoHalf);
	  SDValue HiRes = DAG.getNode(Opc, Loc, HalfVT, HiHalf);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Loc, FullVT, LoRes, HiRes);
	}

	// Splits a 256-bit integer unary op into two 128-bit ops.
	static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  (void)VT; // only inspected by the assert below
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return LowerVectorIntUnary(Op, DAG);
	}

	// Splits a 512-bit integer unary op into two 256-bit ops.
	static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  (void)VT; // only inspected by the assert below
	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Only handle AVX 512-bit vector integer operation");
	  return LowerVectorIntUnary(Op, DAG);
	}

	/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
	/// instruction.
	///
	/// i8/i16 element vectors are implemented with the dword LZCNT vector
	/// instruction as sub(trunc(lzcnt(zext32(x))), 32 - EltBits). Vectors with
	/// more than 16 elements are first split in two; the Lo and Hi parts are
	/// lowered separately and the results concatenated.
	static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
	  assert(Op.getOpcode() == ISD::CTLZ);
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned NumElems = VT.getVectorNumElements();

	  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
	         "Unsupported element type");

	  // Split the vector; its Lo and Hi parts will be handled in a later
	  // iteration.
	  if (16 < NumElems)
	    return LowerVectorIntUnary(Op, DAG);

	  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
	  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
	         "Unsupported value type for operation");

	  // Use native supported vector instruction vplzcntd.
	  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
	  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
	  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
	  // The zero-extension added (32 - EltBits) leading zeros to every element;
	  // subtract them again.
	  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);

	  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
	}

	// Lower CTLZ using a PSHUFB lookup table implementation.
	//
	// The input is viewed as a vector of bytes. Each byte is split into its low
	// and high nibble, the leading-zero count of each nibble is looked up with
	// PSHUFB, and the two counts are combined into a per-byte count. The
	// per-byte counts are then merged pairwise, doubling the element width each
	// round, until the element type of Op is reached.
	static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  int NumElts = VT.getVectorNumElements();
	  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
	  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

	  // Per-nibble leading zero PSHUFB lookup table: entry i holds the number of
	  // leading zero bits of the 4-bit value i.
	  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
	                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
	                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};

	  // Repeat the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumBytes; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

	  // Begin by bitcasting the input to byte vector, then split those bytes
	  // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
	  // If the hi input nibble is zero then we add both results together, otherwise
	  // we just take the hi result (by masking the lo result to zero before the
	  // add).
	  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
	  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

	  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
	  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
	  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);

	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
	  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
	  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

	  // Merge result back from vXi8 back to VT, working on the lo/hi halves
	  // of the current vector width in the same way we did for the nibbles.
	  // If the upper half of the input element is zero then add the halves'
	  // leading zero counts together, otherwise just use the upper half's.
	  // Double the width of the result until we are at target width.
	  while (CurrVT != VT) {
	    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
	    int CurrNumElts = CurrVT.getVectorNumElements();
	    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
	    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
	    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

	    // Check if the upper half of the input element is zero.
	    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
	                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
	    HiZ = DAG.getBitcast(NextVT, HiZ);

	    // Move the upper/lower halves to the lower bits as we'll be extending to
	    // NextVT. Mask the lower result to zero if HiZ is true and add the results
	    // together.
	    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
	    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
	    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
	    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
	    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
	    CurrVT = NextVT;
	  }

	  return Res;
	}

	/// Picks the lowering strategy for a vector CTLZ, in priority order:
	/// the CDI path, splitting 256-bit vectors without Int256, splitting 512-bit
	/// vectors without BWI, and finally the PSHUFB lookup-table implementation.
	static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
	                               const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  if (Subtarget.hasCDI())
	    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

	  MVT VT = Op.getSimpleValueType();

	  // 256-bit ops become two 128-bit ops when Int256 is unavailable.
	  const bool Split256 = VT.is256BitVector() && !Subtarget.hasInt256();
	  if (Split256)
	    return Lower256IntUnary(Op, DAG);

	  // 512-bit ops become two 256-bit ops when BWI is unavailable.
	  const bool Split512 = VT.is512BitVector() && !Subtarget.hasBWI();
	  if (Split512)
	    return Lower512IntUnary(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
	  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
	}

	/// Lowers a count-leading-zeros node.
	///
	/// Vector types are forwarded to LowerVectorCTLZ. Scalars are lowered to a
	/// BSR followed by an XOR with NumBits-1; for ISD::CTLZ a CMOV on the BSR
	/// flags substitutes 2*NumBits-1 first, so that a zero input ends up as
	/// NumBits after the XOR. i8 inputs are widened to i32 for the scan and the
	/// result is truncated back.
	static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  MVT ResVT = Op.getSimpleValueType();

	  if (ResVT.isVector())
	    return LowerVectorCTLZ(Op, Loc, Subtarget, DAG);

	  const unsigned Width = ResVT.getSizeInBits();
	  const unsigned NodeOpc = Op.getOpcode();
	  const bool IsByte = ResVT == MVT::i8;

	  // There is no i8 bsr, so scan a zero-extended i32 instead.
	  MVT ScanVT = ResVT;
	  SDValue Val = Op.getOperand(0);
	  if (IsByte) {
	    ScanVT = MVT::i32;
	    Val = DAG.getNode(ISD::ZERO_EXTEND, Loc, ScanVT, Val);
	  }

	  // bsr (scan bits in reverse) also produces EFLAGS as a second result.
	  Val = DAG.getNode(X86ISD::BSR, Loc, DAG.getVTList(ScanVT, MVT::i32), Val);

	  if (NodeOpc == ISD::CTLZ) {
	    // If src is zero (i.e. bsr sets ZF), returns NumBits.
	    SDValue CMovOps[] = {Val, DAG.getConstant(Width + Width - 1, Loc, ScanVT),
	                         DAG.getConstant(X86::COND_E, Loc, MVT::i8),
	                         Val.getValue(1)};
	    Val = DAG.getNode(X86ISD::CMOV, Loc, ScanVT, CMovOps);
	  }

	  // Finally xor with NumBits-1.
	  SDValue FlipMask = DAG.getConstant(Width - 1, Loc, ScanVT);
	  Val = DAG.getNode(ISD::XOR, Loc, ScanVT, Val, FlipMask);

	  if (!IsByte)
	    return Val;
	  return DAG.getNode(ISD::TRUNCATE, Loc, MVT::i8, Val);
	}

	/// Lowers a count-trailing-zeros node.
	///
	/// Scalars: BSF plus a CMOV on its flags that yields NumBits when the source
	/// is zero. Vectors: isolate the lowest set bit with (x & -x), then use
	/// either (width - 1) - ctlz(lsb) for CTTZ_ZERO_UNDEF or ctpop(lsb - 1)
	/// otherwise.
	static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
	  SDLoc Loc(Op);
	  MVT VT = Op.getSimpleValueType();
	  unsigned EltBits = VT.getScalarSizeInBits();
	  SDValue Src = Op.getOperand(0);

	  if (!VT.isVector()) {
	    assert(Op.getOpcode() == ISD::CTTZ &&
	           "Only scalar CTTZ requires custom lowering");

	    // bsf (scan bits forward) also produces EFLAGS as a second result.
	    SDValue Scan =
	        DAG.getNode(X86ISD::BSF, Loc, DAG.getVTList(VT, MVT::i32), Src);

	    // If src is zero (i.e. bsf sets ZF), returns NumBits.
	    SDValue CMovOps[] = {Scan, DAG.getConstant(EltBits, Loc, VT),
	                         DAG.getConstant(X86::COND_E, Loc, MVT::i8),
	                         Scan.getValue(1)};
	    return DAG.getNode(X86ISD::CMOV, Loc, VT, CMovOps);
	  }

	  // lsb(x) = (x & -x)
	  SDValue Zeros = DAG.getConstant(0, Loc, VT);
	  SDValue Negated = DAG.getNode(ISD::SUB, Loc, VT, Zeros, Src);
	  SDValue LowestBit = DAG.getNode(ISD::AND, Loc, VT, Src, Negated);

	  if (Op.getOpcode() != ISD::CTTZ_ZERO_UNDEF) {
	    // cttz(x) = ctpop(lsb - 1)
	    SDValue Ones = DAG.getConstant(1, Loc, VT);
	    SDValue BelowLsb = DAG.getNode(ISD::SUB, Loc, VT, LowestBit, Ones);
	    return DAG.getNode(ISD::CTPOP, Loc, VT, BelowLsb);
	  }

	  // cttz_undef(x) = (width - 1) - ctlz(lsb)
	  SDValue TopIndex = DAG.getConstant(EltBits - 1, Loc, VT);
	  SDValue LeadZeros = DAG.getNode(ISD::CTLZ, Loc, VT, LowestBit);
	  return DAG.getNode(ISD::SUB, Loc, VT, TopIndex, LeadZeros);
	}

	/// Splits a binary 256-bit integer vector op into two 128-bit ops with the
	/// same opcode and concatenates their results.
	static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc Loc(Op);
	  unsigned Opc = Op.getOpcode();
	  unsigned HalfElts = VT.getVectorNumElements() / 2;
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElts);

	  // Low and high 128-bit halves of the left operand.
	  SDValue Left = Op.getOperand(0);
	  SDValue LeftLo = extract128BitVector(Left, 0, DAG, Loc);
	  SDValue LeftHi = extract128BitVector(Left, HalfElts, DAG, Loc);

	  // Low and high 128-bit halves of the right operand.
	  SDValue Right = Op.getOperand(1);
	  SDValue RightLo = extract128BitVector(Right, 0, DAG, Loc);
	  SDValue RightHi = extract128BitVector(Right, HalfElts, DAG, Loc);

	  SDValue LoRes = DAG.getNode(Opc, Loc, HalfVT, LeftLo, RightLo);
	  SDValue HiRes = DAG.getNode(Opc, Loc, HalfVT, LeftHi, RightHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Loc, VT, LoRes, HiRes);
	}

	/// Splits a binary 512-bit integer vector op into two 256-bit ops with the
	/// same opcode and concatenates their results.
	static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  assert(VT.is512BitVector() && VT.isInteger() &&
	         "Unsupported value type for operation");

	  SDLoc Loc(Op);
	  unsigned Opc = Op.getOpcode();
	  unsigned HalfElts = VT.getVectorNumElements() / 2;
	  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfElts);

	  // Low and high 256-bit halves of the left operand.
	  SDValue Left = Op.getOperand(0);
	  SDValue LeftLo = extract256BitVector(Left, 0, DAG, Loc);
	  SDValue LeftHi = extract256BitVector(Left, HalfElts, DAG, Loc);

	  // Low and high 256-bit halves of the right operand.
	  SDValue Right = Op.getOperand(1);
	  SDValue RightLo = extract256BitVector(Right, 0, DAG, Loc);
	  SDValue RightHi = extract256BitVector(Right, HalfElts, DAG, Loc);

	  SDValue LoRes = DAG.getNode(Opc, Loc, HalfVT, LeftLo, RightLo);
	  SDValue HiRes = DAG.getNode(Opc, Loc, HalfVT, LeftHi, RightHi);
	  return DAG.getNode(ISD::CONCAT_VECTORS, Loc, VT, LoRes, HiRes);
	}

	/// Lowers an ADD/SUB node: i1-element types become an XOR of the two
	/// operands, everything else must be a 256-bit integer vector and is split
	/// into 128-bit halves.
	static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() != MVT::i1) {
	    assert(VT.is256BitVector() && VT.isInteger() &&
	           "Only handle AVX 256-bit vector integer operation");
	    return Lower256IntArith(Op, DAG);
	  }

	  SDValue LHS = Op.getOperand(0);
	  SDValue RHS = Op.getOperand(1);
	  return DAG.getNode(ISD::XOR, SDLoc(Op), VT, LHS, RHS);
	}

	/// Lowers a 256-bit integer vector ABS by splitting it into 128-bit halves.
	static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  (void)VT; // only inspected by the assert below
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return Lower256IntUnary(Op, DAG);
	}

	/// Lowers a 256-bit integer vector min/max by splitting it into 128-bit
	/// halves.
	static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  (void)VT; // only inspected by the assert below
	  assert(VT.is256BitVector() && VT.isInteger() &&
	         "Only handle AVX 256-bit vector integer operation");
	  return Lower256IntArith(Op, DAG);
	}

	/// Custom lowering for vector MUL.
	///
	/// Strategies, in the order they are tried:
	///  - i1 elements: the product is the AND of the operands.
	///  - 256-bit vectors without Int256: split into two 128-bit multiplies.
	///  - v16i8/v32i8/v64i8: sign-extend to i16 elements, multiply, and narrow
	///    the result again (TRUNCATE with Int256, mask + PACKUS otherwise).
	///  - v4i32: two PMULUDQ (even and odd elements) merged with a shuffle.
	///  - v2i64/v4i64/v8i64: a single PMULDQ when both operands have more than
	///    32 sign bits, otherwise up to three PMULUDQ partial products combined
	///    as AloBlo + ((AloBhi + AhiBlo) << 32).
	static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
	                        SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  if (VT.getScalarType() == MVT::i1)
	    return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntArith(Op, DAG);

	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
	  // vector pairs, multiply and truncate.
	  if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
	    if (Subtarget.hasInt256()) {
	      // For 512-bit vectors, split into 256-bit vectors to allow the
	      // sign-extension to occur.
	      if (VT == MVT::v64i8)
	        return Lower512IntArith(Op, DAG);

	      // For 256-bit vectors, split into 128-bit vectors to allow the
	      // sign-extension to occur. We don't need this on AVX512BW as we can
	      // safely sign-extend to v32i16.
	      if (VT == MVT::v32i8 && !Subtarget.hasBWI())
	        return Lower256IntArith(Op, DAG);

	      MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
	      return DAG.getNode(
	          ISD::TRUNCATE, dl, VT,
	          DAG.getNode(ISD::MUL, dl, ExVT,
	                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
	                      DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
	    }

	    assert(VT == MVT::v16i8 &&
	           "Pre-AVX2 support only supports v16i8 multiplication");
	    MVT ExVT = MVT::v8i16;

	    // Extract the lo parts and sign extend to i16
	    SDValue ALo, BLo;
	    if (Subtarget.hasSSE41()) {
	      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
	      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
	    } else {
	      // Place each source byte in the upper byte of an i16 lane, then shift
	      // it back down arithmetically to sign-extend it.
	      const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	                              -1, 4, -1, 5, -1, 6, -1, 7};
	      ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      ALo = DAG.getBitcast(ExVT, ALo);
	      BLo = DAG.getBitcast(ExVT, BLo);
	      ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	    }

	    // Extract the hi parts and sign extend to i16
	    SDValue AHi, BHi;
	    if (Subtarget.hasSSE41()) {
	      // Move the upper eight bytes into the lower half first.
	      const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                              -1, -1, -1, -1, -1, -1, -1, -1};
	      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
	      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
	    } else {
	      const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
	                              -1, 12, -1, 13, -1, 14, -1, 15};
	      AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	      BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	      AHi = DAG.getBitcast(ExVT, AHi);
	      BHi = DAG.getBitcast(ExVT, BHi);
	      AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	    }

	    // Multiply, mask the lower 8bits of the lo/hi results and pack
	    SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	    SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	    RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
	    RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	  }

	  // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
	  if (VT == MVT::v4i32) {
	    assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
	           "Should not custom lower when pmuldq is available!");

	    // Extract the odd parts.
	    static const int UnpackMask[] = { 1, -1, 3, -1 };
	    SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
	    SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);

	    // Multiply the even parts.
	    SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
	    // Now multiply odd parts.
	    SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);

	    Evens = DAG.getBitcast(VT, Evens);
	    Odds = DAG.getBitcast(VT, Odds);

	    // Merge the two vectors back together with a shuffle. This expands into 2
	    // shuffles.
	    static const int ShufMask[] = { 0, 4, 2, 6 };
	    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
	  }

	  assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
	         "Only know how to lower V2I64/V4I64/V8I64 multiply");

	  // 32-bit vector types used for MULDQ/MULUDQ.
	  MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);

	  // MULDQ returns the 64-bit result of the signed multiplication of the lower
	  // 32-bits. We can lower with this if the sign bits stretch that far.
	  if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
	      DAG.ComputeNumSignBits(B) > 32) {
	    return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
	                       DAG.getBitcast(MulVT, B));
	  }

	  //  Ahi = psrlqi(a, 32);
	  //  Bhi = psrlqi(b, 32);
	  //
	  //  AloBlo = pmuludq(a, b);
	  //  AloBhi = pmuludq(a, Bhi);
	  //  AhiBlo = pmuludq(Ahi, b);
	  //
	  //  Hi = psllqi(AloBhi + AhiBlo, 32);
	  //  return AloBlo + Hi;
	  APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
	  bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
	  bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);

	  APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
	  bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
	  bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);

	  // Bit cast to 32-bit vectors for MULUDQ.
	  SDValue Alo = DAG.getBitcast(MulVT, A);
	  SDValue Blo = DAG.getBitcast(MulVT, B);

	  SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);

	  // Only multiply lo/hi halves that aren't known to be zero.
	  SDValue AloBlo = Zero;
	  if (!ALoIsZero && !BLoIsZero)
	    AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);

	  SDValue AloBhi = Zero;
	  if (!ALoIsZero && !BHiIsZero) {
	    SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
	    Bhi = DAG.getBitcast(MulVT, Bhi);
	    AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
	  }

	  SDValue AhiBlo = Zero;
	  if (!AHiIsZero && !BLoIsZero) {
	    SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
	    Ahi = DAG.getBitcast(MulVT, Ahi);
	    AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
	  }

	  SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
	  Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);

	  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
	}

	/// Custom lowering for the vector high-half multiplies.
	///
	/// 256-bit vectors without Int256 are split into two 128-bit ops. The
	/// remaining cases are v16i8 and (with Int256) v32i8: the operands are
	/// extended to i16 elements (zero-extended for ISD::MULHU, sign-extended
	/// for the other opcode), multiplied, the upper 8 bits of each product are
	/// shifted down, and the results are packed back to i8 with PACKUS.
	static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
	                         SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  MVT VT = Op.getSimpleValueType();

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntArith(Op, DAG);

	  // Only i8 vectors should need custom lowering after this.
	  assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
	         "Unsupported vector type");

	  // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
	  // logical shift down the upper half and pack back to i8.
	  SDValue A = Op.getOperand(0);
	  SDValue B = Op.getOperand(1);

	  // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
	  // and then ashr/lshr the upper bits down to the lower bits before multiply.
	  unsigned Opcode = Op.getOpcode();
	  unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
	  unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);

	  // AVX2 implementations - extend xmm subvectors to ymm.
	  if (Subtarget.hasInt256()) {
	    // Subvector start indices of the low and high halves; both variables are
	    // reused below to hold the shifted products.
	    SDValue Lo = DAG.getIntPtrConstant(0, dl);
	    SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);

	    if (VT == MVT::v32i8) {
	      SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
	      SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
	      SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
	      SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
	      ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
	      BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
	      AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
	      BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
	      Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
	                       DAG.getConstant(8, dl, MVT::v16i16));
	      Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
	                       DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
	                       DAG.getConstant(8, dl, MVT::v16i16));
	      // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
	      // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
	      const int LoMask[] = {0,  1,  2,  3,  4,  5,  6,  7,
	                            16, 17, 18, 19, 20, 21, 22, 23};
	      const int HiMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                            24, 25, 26, 27, 28, 29, 30, 31};
	      return DAG.getNode(X86ISD::PACKUS, dl, VT,
	                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
	                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
	    }

	    SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
	    SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
	    SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
	    SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
	                               DAG.getConstant(8, dl, MVT::v16i16));
	    Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
	    Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
	    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	  }

	  assert(VT == MVT::v16i8 &&
	         "Pre-AVX2 support only supports v16i8 multiplication");
	  MVT ExVT = MVT::v8i16;

	  // Extract the lo parts and zero/sign extend to i16.
	  SDValue ALo, BLo;
	  if (Subtarget.hasSSE41()) {
	    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
	    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
	  } else {
	    // Place each source byte in the upper byte of an i16 lane, then shift it
	    // back down (logically or arithmetically) to extend it.
	    const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
	                            -1, 4, -1, 5, -1, 6, -1, 7};
	    ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    ALo = DAG.getBitcast(ExVT, ALo);
	    BLo = DAG.getBitcast(ExVT, BLo);
	    ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
	    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
	  }

	  // Extract the hi parts and zero/sign extend to i16.
	  SDValue AHi, BHi;
	  if (Subtarget.hasSSE41()) {
	    // Move the upper eight bytes into the lower half first.
	    const int ShufMask[] = {8,  9,  10, 11, 12, 13, 14, 15,
	                            -1, -1, -1, -1, -1, -1, -1, -1};
	    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
	    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
	  } else {
	    const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
	                            -1, 12, -1, 13, -1, 14, -1, 15};
	    AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
	    BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
	    AHi = DAG.getBitcast(ExVT, AHi);
	    BHi = DAG.getBitcast(ExVT, BHi);
	    AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
	    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
	  }

	  // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
	  // pack back to v16i8.
	  SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
	  SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
	  RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
	  RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
	  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}

	/// Lowers a 128-bit integer division/remainder on Win64 to a runtime
	/// library call.
	///
	/// Every operand of \p Op is stored to its own 16-byte-aligned stack
	/// temporary and passed to the libcall by pointer. The call is declared as
	/// returning v2i64, and that value is bitcast back to the 128-bit integer
	/// type of \p Op.
	///
	/// Handles SDIV, UDIV, SREM, UREM, SDIVREM and UDIVREM; any other opcode is
	/// unreachable.
	SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
	  assert(Subtarget.isTargetWin64() && "Unexpected target");
	  EVT VT = Op.getValueType();
	  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
	         "Unexpected return type for lowering");

	  // Select the libcall and remember its signedness.
	  RTLIB::Libcall LC;
	  bool isSigned;
	  switch (Op->getOpcode()) {
	  default: llvm_unreachable("Unexpected request for libcall!");
	  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
	  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
	  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
	  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
	  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
	  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
	  }

	  SDLoc dl(Op);
	  SDValue InChain = DAG.getEntryNode();

	  // Spill each operand to the stack and pass its address instead.
	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;
	  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
	    EVT ArgVT = Op->getOperand(i).getValueType();
	    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
	           "Unexpected argument type for lowering");
	    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
	    Entry.Node = StackPtr;
	    // Thread the stores through InChain so the call is ordered after them.
	    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
	                           MachinePointerInfo(), /* Alignment = */ 16);
	    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
	    Entry.Ty = PointerType::get(ArgTy, 0);
	    Entry.IsSExt = false;
	    Entry.IsZExt = false;
	    Args.push_back(Entry);
	  }

	  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
	                                         getPointerTy(DAG.getDataLayout()));

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(InChain)
	      .setLibCallee(
	          getLibcallCallingConv(LC),
	          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
	          std::move(Args))
	      .setInRegister()
	      .setSExtResult(isSigned)
	      .setZExtResult(!isSigned);

	  // CallInfo.first is the call's return value (v2i64); reinterpret it as VT.
	  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
	  return DAG.getBitcast(VT, CallInfo.first);
	}

	// Lower a vector ISD::SMUL_LOHI / ISD::UMUL_LOHI node.
	//
	// 256-bit vectors on subtargets without hasInt256() are split into two
	// 128-bit halves that are re-emitted with the same opcode. Otherwise the
	// type must be v4i32 or v8i32: the products are formed with two
	// PMULUDQ/PMULDQ nodes (even lanes, then odd lanes moved to even positions)
	// and shuffled back into separate low and high vectors.
	//
	// Returns a merge-values node whose first result is the low halves and whose
	// second result is the high halves.
	static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
	MVT VT = Op0.getSimpleValueType();
	SDLoc dl(Op);

	// Decompose 256-bit ops into smaller 128-bit ops.
	if (VT.is256BitVector() && !Subtarget.hasInt256()) {
	unsigned Opcode = Op.getOpcode();
	unsigned NumElems = VT.getVectorNumElements();
	MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
	SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
	SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
	SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
	SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
	// Each half-width node still produces two results (low and high parts).
	SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
	SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
	// Re-join result 0 of both halves and result 1 of both halves.
	SDValue Ops[] = {
	DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
	DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
	};
	return DAG.getMergeValues(Ops, dl);
	}

	assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) \|\|
	(VT == MVT::v8i32 && Subtarget.hasInt256()));

	// PMULxD operations multiply each even value (starting at 0) of LHS with
	// the related value of RHS and produce a widened result.
	// E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	// => <2 x i64> <ae|cg>
	//
	// In other words, to have all the results, we need to perform two PMULxD:
	// 1. one with the even values.
	// 2. one with the odd values.
	// To achieve #2, we need to place the odd values at an even position.
	//
	// Place the odd value at an even position (basically, shift all values 1
	// step to the left):
	// The mask is sized for v8i32; only the first getVectorNumElements() entries
	// are used, so the same table serves v4i32.
	const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
	// <a|b|c|d> => <b|undef|d|undef>
	SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
	makeArrayRef(&Mask[0], VT.getVectorNumElements()));
	// <e|f|g|h> => <f|undef|h|undef>
	SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
	makeArrayRef(&Mask[0], VT.getVectorNumElements()));

	// Emit two multiplies, one for the lower 2 ints and one for the higher 2
	// ints.
	MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
	bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
	// PMULDQ is only selected for signed multiplies when hasSSE41() is true;
	// every other case uses PMULUDQ (signed results are fixed up below).
	unsigned Opcode =
	(!IsSigned \|\| !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
	// PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
	// => <2 x i64> <ae|cg>
	SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
	// PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
	// => <2 x i64> <bf|dh>
	SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));

	// Shuffle it back into the right order.
	// After the bitcasts, even i32 lanes hold low halves and odd i32 lanes hold
	// high halves; the masks interleave Mul1 (even inputs) and Mul2 (odd inputs).
	SDValue Highs, Lows;
	if (VT == MVT::v8i32) {
	const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
	Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
	const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
	Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
	} else {
	const int HighMask[] = {1, 5, 3, 7};
	Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
	const int LowMask[] = {0, 4, 2, 6};
	Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
	}

	// If we have a signed multiply but no PMULDQ, fix up the high parts of an
	// unsigned multiply.
	if (IsSigned && !Subtarget.hasSSE41()) {
	SDValue ShAmt = DAG.getConstant(
	31, dl,
	DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
	// T1 = (Op0 >>s 31) & Op1 and T2 = (Op1 >>s 31) & Op0; their sum is
	// subtracted from the unsigned high halves.
	SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
	SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
	DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);

	SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
	Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
	}

	// The first result of MUL_LOHI is actually the low value, followed by the
	// high value.
	SDValue Ops[] = {Lows, Highs};
	return DAG.getMergeValues(Ops, dl);
	}

	// Report whether the shift-by-immediate form selected by Opcode is natively
	// available for vector type VT on this Subtarget.
	static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
	                                        unsigned Opcode) {
	  const unsigned EltBits = VT.getScalarSizeInBits();

	  // Elements narrower than 16 bits are never supported.
	  if (EltBits < 16)
	    return false;

	  // 512-bit vectors: need hasAVX512(), and 16-bit elements also need hasBWI().
	  if (VT.is512BitVector() && Subtarget.hasAVX512()) {
	    if (EltBits > 16)
	      return true;
	    if (Subtarget.hasBWI())
	      return true;
	  }

	  // Logical shifts: 128-bit with hasSSE2(), or 256-bit with hasInt256().
	  bool HasLogical = false;
	  if (VT.is128BitVector() && Subtarget.hasSSE2())
	    HasLogical = true;
	  else if (VT.is256BitVector() && Subtarget.hasInt256())
	    HasLogical = true;

	  // Without the logical form there is no arithmetic form either.
	  if (!HasLogical)
	    return false;
	  if (Opcode != ISD::SRA)
	    return true;

	  // Arithmetic right shift: v2i64/v4i64 are only supported with hasAVX512().
	  if (Subtarget.hasAVX512())
	    return true;
	  return VT != MVT::v2i64 && VT != MVT::v4i64;
	}

	// Shift by a variable amount that is uniform across all vector lanes.
	// These instructions are defined together with the shift-immediate forms, so
	// the answer is simply forwarded from SupportedVectorShiftWithImm.
	static bool SupportedVectorShiftWithBaseAmnt(MVT VT,
	                                             const X86Subtarget &Subtarget,
	                                             unsigned Opcode) {
	  const bool HasImmForm = SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
	  return HasImmForm;
	}

	// Report whether the per-element variable-shift form selected by Opcode is
	// natively available for vector type VT on this Subtarget.
	static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
	                                    unsigned Opcode) {
	  // Nothing is supported without hasInt256().
	  if (!Subtarget.hasInt256())
	    return false;

	  const unsigned EltBits = VT.getScalarSizeInBits();
	  if (EltBits < 16)
	    return false;

	  // vXi16 supported only on AVX-512, BWI
	  if (EltBits == 16 && !Subtarget.hasBWI())
	    return false;

	  // With hasAVX512() every remaining type/opcode combination is accepted.
	  if (Subtarget.hasAVX512())
	    return true;

	  // Otherwise only 128-bit and 256-bit vectors have variable shifts.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return false;

	  // Arithmetic right shift excludes the 64-bit element types.
	  if (Opcode == ISD::SRA)
	    return VT != MVT::v2i64 && VT != MVT::v4i64;
	  return true;
	}

	// Try to lower a vector shift (SHL/SRL/SRA) whose shift amount is the same
	// constant in every lane.
	//
	// Handles, in order: a constant-splat BUILD_VECTOR amount (native immediate
	// shifts, emulated i64 SRA, and vXi8 shifts done as vXi16 shift + mask), and
	// the 32-bit-mode case where an i64 amount vector shows up as a BITCAST of a
	// BUILD_VECTOR of narrower constants.
	//
	// Returns the lowered node, or an empty SDValue() when no case applies.
	static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);

	// Any opcode other than SHL/SRL maps to VSRAI.
	unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	// Emulate an arithmetic right shift of 64-bit elements by ShiftAmt using
	// shifts on the vector reinterpreted as i32 elements plus a shuffle.
	auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
	assert((VT == MVT::v2i64 \|\| VT == MVT::v4i64) && "Unexpected SRA type");
	MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
	SDValue Ex = DAG.getBitcast(ExVT, R);

	// ashr(R, 63) === cmp_slt(R, 0)
	if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
	assert((VT != MVT::v4i64 \|\| Subtarget.hasInt256()) &&
	"Unsupported PCMPGT op");
	return DAG.getNode(X86ISD::PCMPGT, dl, VT,
	getZeroVector(VT, Subtarget, DAG, dl), R);
	}

	if (ShiftAmt >= 32) {
	// Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
	SDValue Upper =
	getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
	SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	ShiftAmt - 32, DAG);
	// Odd result lanes come from Upper, even result lanes from the odd
	// (upper-half) lanes of Lower.
	if (VT == MVT::v2i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
	if (VT == MVT::v4i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	{9, 1, 11, 3, 13, 5, 15, 7});
	} else {
	// SRA upper i32, SRL whole i64 and select lower i32.
	SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
	ShiftAmt, DAG);
	SDValue Lower =
	getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
	Lower = DAG.getBitcast(ExVT, Lower);
	// Odd result lanes come from Upper, even result lanes from the even
	// (lower-half) lanes of Lower.
	if (VT == MVT::v2i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
	if (VT == MVT::v4i64)
	Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
	{8, 1, 10, 3, 12, 5, 14, 7});
	}
	return DAG.getBitcast(VT, Ex);
	};

	// Optimize shl/srl/sra with constant shift amount.
	if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
	uint64_t ShiftAmt = ShiftConst->getZExtValue();

	if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	// i64 SRA needs to be performed as partial shifts.
	if (((!Subtarget.hasXOP() && VT == MVT::v2i64) \|\|
	(Subtarget.hasInt256() && VT == MVT::v4i64)) &&
	Op.getOpcode() == ISD::SRA)
	return ArithmeticShiftRight64(ShiftAmt);

	// vXi8 shifts: done as a vXi16 shift followed by masking off the bits that
	// crossed a byte boundary.
	if (VT == MVT::v16i8 \|\|
	(Subtarget.hasInt256() && VT == MVT::v32i8) \|\|
	VT == MVT::v64i8) {
	unsigned NumElts = VT.getVectorNumElements();
	MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);

	// Simple i8 add case
	if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
	return DAG.getNode(ISD::ADD, dl, VT, R, R);

	// ashr(R, 7) === cmp_slt(R, 0)
	if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
	SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
	if (VT.is512BitVector()) {
	assert(VT == MVT::v64i8 && "Unexpected element type!");
	// The 512-bit compare yields a v64i1 mask that is sign-extended back to VT.
	SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
	return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
	}
	return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
	}

	// XOP can shift v16i8 directly instead of as shift v8i16 + mask.
	if (VT == MVT::v16i8 && Subtarget.hasXOP())
	return SDValue();

	// NOTE(review): ShiftAmt is a uint64_t with no range check visible in this
	// function. `-1U << ShiftAmt` below is undefined in C++ once ShiftAmt reaches
	// the width of unsigned; presumably earlier combines guarantee ShiftAmt < 8
	// for i8 elements - confirm.
	if (Op.getOpcode() == ISD::SHL) {
	// Make a large shift.
	SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
	R, ShiftAmt, DAG);
	SHL = DAG.getBitcast(VT, SHL);
	// Zero out the rightmost bits.
	return DAG.getNode(ISD::AND, dl, VT, SHL,
	DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
	}
	if (Op.getOpcode() == ISD::SRL) {
	// Make a large shift.
	SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
	R, ShiftAmt, DAG);
	SRL = DAG.getBitcast(VT, SRL);
	// Zero out the leftmost bits.
	return DAG.getNode(ISD::AND, dl, VT, SRL,
	DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
	}
	if (Op.getOpcode() == ISD::SRA) {
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
	// The generic ISD::SRL node created here is expected to be lowered again.
	SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);

	// Mask is the i8 sign bit (128) moved right by the shift amount.
	SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
	Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
	Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
	return Res;
	}
	llvm_unreachable("Unknown shift opcode.");
	}
	}
	}

	// Special case in 32-bit mode, where i64 is expanded into high and low parts.
	// TODO: Replace constant extraction with getTargetConstantBitsFromNode.
	if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| (Subtarget.hasInt256() && VT == MVT::v4i64) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v8i64))) {

	// AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
	unsigned SubVectorScale = 1;
	if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
	SubVectorScale =
	Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
	Amt = Amt.getOperand(0);
	}

	// Peek through any splat that was introduced for i64 shift vectorization.
	int SplatIndex = -1;
	if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
	if (SVN->isSplat()) {
	SplatIndex = SVN->getSplatIndex();
	Amt = Amt.getOperand(0);
	assert(SplatIndex < (int)VT.getVectorNumElements() &&
	"Splat shuffle referencing second operand");
	}

	// Only a BITCAST of a BUILD_VECTOR is understood from here on.
	if (Amt.getOpcode() != ISD::BITCAST \|\|
	Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
	return SDValue();

	Amt = Amt.getOperand(0);
	// Ratio = number of narrow BUILD_VECTOR operands that make up one i64 lane.
	unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	(SubVectorScale * VT.getVectorNumElements());
	unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
	uint64_t ShiftAmt = 0;
	// With a splat, start at the group of operands forming the splatted lane.
	unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
	// Reassemble the 64-bit amount of the first (or splatted) lane from its
	// Ratio pieces; piece i is placed at bit offset i * (64 / 2^RatioInLog2).
	for (unsigned i = 0; i != Ratio; ++i) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
	if (!C)
	return SDValue();
	// 6 == Log2(64)
	ShiftAmt \|= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
	}

	// Check remaining shift amounts (if not a splat).
	if (SplatIndex < 0) {
	for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	uint64_t ShAmt = 0;
	for (unsigned j = 0; j != Ratio; ++j) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
	if (!C)
	return SDValue();
	// 6 == Log2(64)
	ShAmt \|= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
	}
	// Every lane must carry the same amount as the first one.
	if (ShAmt != ShiftAmt)
	return SDValue();
	}
	}

	if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
	return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);

	if (Op.getOpcode() == ISD::SRA)
	return ArithmeticShiftRight64(ShiftAmt);
	}

	return SDValue();
	}

	// Try to lower a vector shift whose (possibly non-constant) amount is the
	// same scalar in every lane.
	//
	// If the amount vector is a splat (BUILD_VECTOR splat, or a splat shuffle,
	// optionally seen through an EXTRACT_SUBVECTOR), the scalar is extracted,
	// zero-extended as needed and passed to getTargetVShiftNode. A second path
	// handles the 32-bit-mode v2i64 case where the amount is a BITCAST of a
	// BUILD_VECTOR whose per-lane operand groups are all identical.
	//
	// Returns the lowered node, or an empty SDValue() when no case applies.
	static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);

	// Immediate-form opcode; any opcode other than SHL/SRL maps to VSRAI.
	unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;

	// Vector-amount-form opcode; any opcode other than SHL/SRL maps to VSRA.
	unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
	(Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;

	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
	SDValue BaseShAmt;
	MVT EltVT = VT.getVectorElementType();

	if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
	// Check if this build_vector node is doing a splat.
	// If so, then set BaseShAmt equal to the splat value.
	BaseShAmt = BV->getSplatValue();
	// An undef splat value is treated as "no splat found".
	if (BaseShAmt && BaseShAmt.isUndef())
	BaseShAmt = SDValue();
	} else {
	// Look through a subvector extract to the wider source vector.
	if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
	Amt = Amt.getOperand(0);

	ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
	if (SVN && SVN->isSplat()) {
	unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
	SDValue InVec = Amt.getOperand(0);
	if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
	assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
	"Unexpected shuffle index found!");
	// Take the splatted scalar directly from the build_vector.
	BaseShAmt = InVec.getOperand(SplatIdx);
	} else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
	// Use the inserted scalar if it was inserted at the splatted index.
	if (ConstantSDNode *C =
	dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
	if (C->getZExtValue() == SplatIdx)
	BaseShAmt = InVec.getOperand(1);
	}
	}

	if (!BaseShAmt)
	// Avoid introducing an extract element from a shuffle.
	// (The extract below reads from the shuffle's input vector instead.)
	BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
	DAG.getIntPtrConstant(SplatIdx, dl));
	}
	}

	if (BaseShAmt.getNode()) {
	assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
	// Widen the scalar amount: element types between i32 and i64 go to i64,
	// element types narrower than i32 go to i32.
	if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
	else if (EltVT.bitsLT(MVT::i32))
	BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);

	return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
	}
	}

	// Special case in 32-bit mode, where i64 is expanded into high and low parts.
	if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
	Amt.getOpcode() == ISD::BITCAST &&
	Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
	Amt = Amt.getOperand(0);
	// Ratio = number of BUILD_VECTOR operands that make up one i64 lane.
	unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
	VT.getVectorNumElements();
	// Remember the operands of the first lane and require every following lane
	// to be made of the very same operands.
	std::vector<SDValue> Vals(Ratio);
	for (unsigned i = 0; i != Ratio; ++i)
	Vals[i] = Amt.getOperand(i);
	for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
	for (unsigned j = 0; j != Ratio; ++j)
	if (Vals[j] != Amt.getOperand(i + j))
	return SDValue();
	}

	// The original (un-peeled) amount operand is used for the target node.
	if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
	return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
	}
	return SDValue();
	}

	// Custom-lower a vector shift (ISD::SHL / ISD::SRL / ISD::SRA).
	//
	// Strategies are tried in order and the first that applies wins:
	//  1. uniform constant amount (LowerScalarImmediateShift),
	//  2. uniform variable amount (LowerScalarVariableShift),
	//  3. natively supported per-element variable shift (node returned as-is),
	//  4. XOP VPSHL/VPSHA,
	//  5. type-specific emulations (v2i64/v4i64, multiply for constant SHL,
	//     blends of uniform shifts, widening, and bit-serial select sequences),
	//  6. splitting 256-bit vectors via Lower256IntArith.
	//
	// Returns the lowered node, or an empty SDValue() if nothing applies.
	static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	MVT VT = Op.getSimpleValueType();
	SDLoc dl(Op);
	SDValue R = Op.getOperand(0);
	SDValue Amt = Op.getOperand(1);
	bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	assert(VT.isVector() && "Custom lowering only for vector shifts!");
	assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");

	if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
	return V;

	if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
	return V;

	// Natively supported: keep the generic node unchanged.
	if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
	return Op;

	// XOP has 128-bit variable logical/arithmetic shifts.
	// +ve/-ve Amt = shift left/right.
	if (Subtarget.hasXOP() &&
	(VT == MVT::v2i64 \|\| VT == MVT::v4i32 \|\|
	VT == MVT::v8i16 \|\| VT == MVT::v16i8)) {
	// Right shifts are expressed as left shifts by the negated amount.
	if (Op.getOpcode() == ISD::SRL \|\| Op.getOpcode() == ISD::SRA) {
	SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
	Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
	}
	if (Op.getOpcode() == ISD::SHL \|\| Op.getOpcode() == ISD::SRL)
	return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
	if (Op.getOpcode() == ISD::SRA)
	return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
	}

	// 2i64 vector logical shifts can efficiently avoid scalarization - do the
	// shifts per-lane and then shuffle the partial results back together.
	if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
	// Splat the shift amounts so the scalar shifts above will catch it.
	SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
	SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
	SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
	// Lane 0 from the shift by Amt[0], lane 1 from the shift by Amt[1].
	return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
	}

	// i64 vector arithmetic shift can be emulated with the transform:
	// M = lshr(SIGN_MASK, Amt)
	// ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
	if ((VT == MVT::v2i64 \|\| (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
	Op.getOpcode() == ISD::SRA) {
	SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
	SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
	R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
	R = DAG.getNode(ISD::XOR, dl, VT, R, M);
	R = DAG.getNode(ISD::SUB, dl, VT, R, M);
	return R;
	}

	// If possible, lower this packed shift into a vector multiply instead of
	// expanding it into a sequence of scalar shifts.
	// Do this only if the vector shift count is a constant build_vector.
	if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
	(VT == MVT::v8i16 \|\| VT == MVT::v4i32 \|\|
	(Subtarget.hasInt256() && VT == MVT::v16i16))) {
	SmallVector<SDValue, 8> Elts;
	MVT SVT = VT.getVectorElementType();
	unsigned SVTBits = SVT.getSizeInBits();
	APInt One(SVTBits, 1);
	unsigned NumElems = VT.getVectorNumElements();

	// Build the multiplier vector: element i is (1 << Amt[i]); undef or
	// out-of-range amounts yield an undef multiplier element.
	for (unsigned i=0; i !=NumElems; ++i) {
	// Note: this local `Op` shadows the function parameter inside the loop.
	SDValue Op = Amt->getOperand(i);
	if (Op->isUndef()) {
	Elts.push_back(Op);
	continue;
	}

	ConstantSDNode *ND = cast<ConstantSDNode>(Op);
	APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
	uint64_t ShAmt = C.getZExtValue();
	if (ShAmt >= SVTBits) {
	Elts.push_back(DAG.getUNDEF(SVT));
	continue;
	}
	Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
	}
	SDValue BV = DAG.getBuildVector(VT, dl, Elts);
	return DAG.getNode(ISD::MUL, dl, VT, R, BV);
	}

	// Lower SHL with variable shift amount.
	// (1 << Amt) is materialised through a float: Amt is moved into bit 23 and
	// up, 0x3f800000 (the bit pattern of 1.0f) is added, the result is
	// reinterpreted as v4f32 and converted back to integer, then multiplied by R.
	if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
	Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));

	Op = DAG.getNode(ISD::ADD, dl, VT, Op,
	DAG.getConstant(0x3f800000U, dl, VT));
	Op = DAG.getBitcast(MVT::v4f32, Op);
	Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
	return DAG.getNode(ISD::MUL, dl, VT, Op, R);
	}

	// If possible, lower this shift as a sequence of two shifts by
	// constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
	// Example:
	// (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
	//
	// Could be rewritten as:
	// (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
	//
	// The advantage is that the two shifts from the example would be
	// lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
	// the vector shift into four scalar shifts plus four pairs of vector
	// insert/extract.
	if (ConstantAmt && (VT == MVT::v8i16 \|\| VT == MVT::v4i32)) {
	// TargetOpcode only records which blend shape matched (MOVSS: first 32 bits
	// differ, MOVSD: first 64 bits differ); the result is built as a shuffle.
	unsigned TargetOpcode = X86ISD::MOVSS;
	bool CanBeSimplified;
	// The splat value for the first packed shift (the 'X' from the example).
	SDValue Amt1 = Amt->getOperand(0);
	// The splat value for the second packed shift (the 'Y' from the example).
	SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);

	// See if it is possible to replace this node with a sequence of
	// two shifts followed by a MOVSS/MOVSD/PBLEND.
	if (VT == MVT::v4i32) {
	// Check if it is legal to use a MOVSS.
	CanBeSimplified = Amt2 == Amt->getOperand(2) &&
	Amt2 == Amt->getOperand(3);
	if (!CanBeSimplified) {
	// Otherwise, check if we can still simplify this node using a MOVSD.
	CanBeSimplified = Amt1 == Amt->getOperand(1) &&
	Amt->getOperand(2) == Amt->getOperand(3);
	TargetOpcode = X86ISD::MOVSD;
	Amt2 = Amt->getOperand(2);
	}
	} else {
	// Do similar checks for the case where the machine value type
	// is MVT::v8i16.
	// MOVSS shape: elements 0-1 equal Amt1, elements 2-7 equal Amt2.
	CanBeSimplified = Amt1 == Amt->getOperand(1);
	for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
	CanBeSimplified = Amt2 == Amt->getOperand(i);

	if (!CanBeSimplified) {
	// MOVSD shape: elements 0-3 equal Amt1, elements 4-7 equal Amt2.
	TargetOpcode = X86ISD::MOVSD;
	CanBeSimplified = true;
	Amt2 = Amt->getOperand(4);
	for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
	CanBeSimplified = Amt1 == Amt->getOperand(i);
	for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
	CanBeSimplified = Amt2 == Amt->getOperand(j);
	}
	}

	if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
	isa<ConstantSDNode>(Amt2)) {
	// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
	MVT CastVT = MVT::v4i32;
	SDValue Splat1 =
	DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
	SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
	SDValue Splat2 =
	DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
	SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
	// The blend is done on v4i32 regardless of VT.
	SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
	SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
	if (TargetOpcode == X86ISD::MOVSD)
	return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
	BitCast2, {0, 1, 6, 7}));
	return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
	BitCast2, {0, 5, 6, 7}));
	}
	}

	// v4i32 Non Uniform Shifts.
	// If the shift amount is constant we can shift each lane using the SSE2
	// immediate shifts, else we need to zero-extend each lane to the lower i64
	// and shift using the SSE2 variable shifts.
	// The separate results can then be blended together.
	if (VT == MVT::v4i32) {
	unsigned Opc = Op.getOpcode();
	SDValue Amt0, Amt1, Amt2, Amt3;
	if (ConstantAmt) {
	// Splat each constant lane; the generic opcode is kept for these shifts.
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
	} else {
	// ISD::SHL is handled above but we include it here for completeness.
	switch (Opc) {
	default:
	llvm_unreachable("Unknown target vector shift node");
	case ISD::SHL:
	Opc = X86ISD::VSHL;
	break;
	case ISD::SRL:
	Opc = X86ISD::VSRL;
	break;
	case ISD::SRA:
	Opc = X86ISD::VSRA;
	break;
	}
	// The SSE2 shifts use the lower i64 as the same shift amount for
	// all lanes and the upper i64 is ignored. These shuffle masks
	// optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
	Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
	Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
	Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
	}

	// Shift the whole vector once per lane amount, then pick lane i from Ri.
	SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
	SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
	SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
	SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
	SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
	SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
	return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
	}

	// It's worth extending once and using the vXi16/vXi32 shifts for smaller
	// types, but without AVX512 the extra overheads to get from vXi8 to vXi32
	// make the existing SSE solution better.
	if ((Subtarget.hasInt256() && VT == MVT::v8i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i16) \|\|
	(Subtarget.hasAVX512() && VT == MVT::v16i8) \|\|
	(Subtarget.hasBWI() && VT == MVT::v32i8)) {
	// v32i8 widens to i16 elements; the other listed types widen to i32.
	MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
	MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
	// SRA needs the sign preserved in the widened value; SHL/SRL use zeros.
	unsigned ExtOpc =
	Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
	R = DAG.getNode(ExtOpc, dl, ExtVT, R);
	Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
	return DAG.getNode(ISD::TRUNCATE, dl, VT,
	DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
	}

	// vXi8 variable shifts: apply conditional shifts by 4, 2 and 1, each
	// selected by one bit of the amount moved into the sign-bit position.
	if (VT == MVT::v16i8 \|\|
	(VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) \|\|
	(VT == MVT::v64i8 && Subtarget.hasBWI())) {
	MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
	unsigned ShiftOpcode = Op->getOpcode();

	// Select V0 where the sign bit of the corresponding Sel element is set,
	// otherwise V1. SelVT is the type of the operands and of the result.
	auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
	if (VT.is512BitVector()) {
	// On AVX512BW targets we make use of the fact that VSELECT lowers
	// to a masked blend which selects bytes based just on the sign bit
	// extracted to a mask.
	MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	} else if (Subtarget.hasSSE41()) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	V0 = DAG.getBitcast(VT, V0);
	V1 = DAG.getBitcast(VT, V1);
	Sel = DAG.getBitcast(VT, Sel);
	return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we test for the sign bit by comparing to
	// zero - a negative value will set all bits of the lanes to true
	// and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
	SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
	return DAG.getSelect(dl, SelVT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 5;
	// We can safely do this using i16 shifts as we're only interested in
	// the 3 lower bits of each byte.
	Amt = DAG.getBitcast(ExtVT, Amt);
	Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
	Amt = DAG.getBitcast(VT, Amt);

	if (Op->getOpcode() == ISD::SHL \|\| Op->getOpcode() == ISD::SRL) {
	// r = VSELECT(r, shift(r, 4), a);
	SDValue M =
	DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	// (moves the next lower amount bit into the sign-bit position)
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(VT, Amt, M, R);
	return R;
	}

	if (Op->getOpcode() == ISD::SRA) {
	// For SRA we need to unpack each byte to the higher byte of a i16 vector
	// so we can correctly sign extend. We don't care what happens to the
	// lower byte.
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);

	// r = VSELECT(r, shift(r, 4), a);
	SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(4, dl, ExtVT));
	SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(4, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 2), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(2, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(2, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// a += a
	ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
	AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

	// r = VSELECT(r, shift(r, 1), a);
	MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
	DAG.getConstant(1, dl, ExtVT));
	MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
	DAG.getConstant(1, dl, ExtVT));
	RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
	RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

	// Logical shift the result back to the lower byte, leaving a zero upper
	// byte, meaning that we can safely pack with PACKUSWB.
	RLo =
	DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
	RHi =
	DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
	}
	}

	// v16i16 with hasInt256() (and no XOP): widen to v8i32 by interleaving with
	// zero. The amount goes in the low half of each i32 (zero above), the value
	// in the high half (zero below); after shifting, the result is moved back
	// down by 16 and packed.
	if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
	MVT ExtVT = MVT::v8i32;
	SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
	SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
	SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
	SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
	SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
	ALo = DAG.getBitcast(ExtVT, ALo);
	AHi = DAG.getBitcast(ExtVT, AHi);
	RLo = DAG.getBitcast(ExtVT, RLo);
	RHi = DAG.getBitcast(ExtVT, RHi);
	SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
	SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
	Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
	Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
	return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
	}

	// v8i16 variable shifts: conditional shifts by 8, 4, 2 and 1, each selected
	// by one bit of the amount moved into the sign-bit position.
	if (VT == MVT::v8i16) {
	unsigned ShiftOpcode = Op->getOpcode();

	// If we have a constant shift amount, the non-SSE41 path is best as
	// avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
	bool UseSSE41 = Subtarget.hasSSE41() &&
	!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

	// Select V0 where the sign bit of the corresponding Sel element is set,
	// otherwise V1.
	auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
	// On SSE41 targets we make use of the fact that VSELECT lowers
	// to PBLENDVB which selects bytes based just on the sign bit.
	if (UseSSE41) {
	MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
	V0 = DAG.getBitcast(ExtVT, V0);
	V1 = DAG.getBitcast(ExtVT, V1);
	Sel = DAG.getBitcast(ExtVT, Sel);
	return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
	}
	// On pre-SSE41 targets we splat the sign bit - a negative value will
	// set all bits of the lanes to true and VSELECT uses that in
	// its OR(AND(V0,C),AND(V1,~C)) lowering.
	SDValue C =
	DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
	return DAG.getSelect(dl, VT, C, V0, V1);
	};

	// Turn 'a' into a mask suitable for VSELECT: a = a << 12;
	if (UseSSE41) {
	// On SSE41 targets we need to replicate the shift mask in both
	// bytes for PBLENDVB.
	Amt = DAG.getNode(
	ISD::OR, dl, VT,
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
	DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
	} else {
	Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
	}

	// r = VSELECT(r, shift(r, 8), a);
	SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	// (moves the next lower amount bit into the sign-bit position)
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 4), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// r = VSELECT(r, shift(r, 2), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
	R = SignBitSelect(Amt, M, R);

	// a += a
	Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

	// return VSELECT(r, shift(r, 1), a);
	M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
	R = SignBitSelect(Amt, M, R);
	return R;
	}

	// Decompose 256-bit shifts into smaller 128-bit shifts.
	if (VT.is256BitVector())
	return Lower256IntArith(Op, DAG);

	return SDValue();
	}

	// Custom-lower a vector rotate.
	//
	// With hasAVX512(): a rotate whose amount is the same constant in every lane
	// (with no undef lanes) becomes X86ISD::VROTLI / X86ISD::VROTRI with the
	// amount reduced modulo the element width; anything else is returned
	// unchanged for the variable-rotate patterns to match.
	//
	// Otherwise XOP is required and only ISD::ROTL is expected: 256-bit vectors
	// are split via Lower256IntArith, a constant-splat amount becomes
	// X86ISD::VPROTI, and everything else becomes per-element X86ISD::VPROT.
	static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDLoc DL(Op);
	  SDValue R = Op.getOperand(0);
	  SDValue Amt = Op.getOperand(1);
	  unsigned Opcode = Op.getOpcode();
	  unsigned EltSizeInBits = VT.getScalarSizeInBits();

	  if (Subtarget.hasAVX512()) {
	    // Attempt to rotate by immediate.
	    APInt UndefElts;
	    SmallVector<APInt, 16> EltBits;
	    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
	      // Capture EltBits by reference: a by-value capture would copy the whole
	      // SmallVector of APInts just to compare against its first element.
	      if (!UndefElts &&
	          llvm::all_of(EltBits, [&EltBits](const APInt &V) {
	            return EltBits[0] == V;
	          })) {
	        // Named RotOpc (not Op) so the SDValue parameter is not shadowed.
	        unsigned RotOpc =
	            (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
	        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
	        return DAG.getNode(RotOpc, DL, VT, R,
	                           DAG.getConstant(RotateAmt, DL, MVT::i8));
	      }
	    }

	    // Else, fall-back on VPROLV/VPRORV.
	    return Op;
	  }

	  assert(VT.isVector() && "Custom lowering only for vector rotates!");
	  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
	  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

	  // XOP has 128-bit vector variable + immediate rotates.
	  // +ve/-ve Amt = rotate left/right.

	  // Split 256-bit integers.
	  if (VT.is256BitVector())
	    return Lower256IntArith(Op, DAG);

	  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

	  // Attempt to rotate by immediate.
	  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
	    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
	      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
	      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
	      return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
	                         DAG.getConstant(RotateAmt, DL, MVT::i8));
	    }
	  }

	  // Use general rotate by variable (per-element).
	  return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
	}

	/// Lower an "arithmetic with overflow" node (SADDO/UADDO/SSUBO/USUBO/SMULO/
	/// UMULO) into the flag-producing X86 arithmetic node plus a SETCC that reads
	/// the relevant condition from EFLAGS. The two results are returned together
	/// as a MERGE_VALUES node.
	static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
	  // The "brcond" lowering looks for this arithmetic + "setcc" combo and may
	  // remove the "setcc" instruction if the "setcc" has only one use.
	  SDNode *N = Op.getNode();
	  SDLoc DL(Op);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  unsigned BaseOp = 0;
	  X86::CondCode Cond;
	  switch (Op.getOpcode()) {
	  default:
	    llvm_unreachable("Unknown ovf instruction!");
	  case ISD::SADDO:
	    // An add of one will be selected as an INC. Note that INC doesn't
	    // set CF, so we can't do this for UADDO.
	    BaseOp = isOneConstant(RHS) ? X86ISD::INC : X86ISD::ADD;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UADDO:
	    BaseOp = X86ISD::ADD;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SSUBO:
	    // A subtract of one will be selected as a DEC. Note that DEC doesn't
	    // set CF, so we can't do this for USUBO.
	    BaseOp = isOneConstant(RHS) ? X86ISD::DEC : X86ISD::SUB;
	    Cond = X86::COND_O;
	    break;
	  case ISD::USUBO:
	    BaseOp = X86ISD::SUB;
	    Cond = X86::COND_B;
	    break;
	  case ISD::SMULO:
	    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
	    Cond = X86::COND_O;
	    break;
	  case ISD::UMULO: {
	    // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
	    if (N->getValueType(0) == MVT::i8) {
	      BaseOp = X86ISD::UMUL8;
	      Cond = X86::COND_O;
	      break;
	    }

	    // Non-i8 UMUL has three results; EFLAGS is result #2.
	    EVT ResVT = N->getValueType(0);
	    SDVTList MulVTs = DAG.getVTList(ResVT, ResVT, MVT::i32);
	    SDValue Mul = DAG.getNode(X86ISD::UMUL, DL, MulVTs, LHS, RHS);

	    SDValue Ovf = getSETCC(X86::COND_O, SDValue(Mul.getNode(), 2), DL, DAG);
	    if (N->getValueType(1) == MVT::i1)
	      Ovf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Ovf);

	    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Mul, Ovf);
	  }
	  }

	  // Common path: the arithmetic node also sets EFLAGS (result #1).
	  SDVTList ArithVTs = DAG.getVTList(N->getValueType(0), MVT::i32);
	  SDValue Arith = DAG.getNode(BaseOp, DL, ArithVTs, LHS, RHS);

	  SDValue Ovf = getSETCC(Cond, SDValue(Arith.getNode(), 1), DL, DAG);
	  if (N->getValueType(1) == MVT::i1)
	    Ovf = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Ovf);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Arith, Ovf);
	}

	/// Decide whether an atomic access of type \p MemType must be expanded to a
	/// cmpxchg8b/cmpxchg16b sequence.
	///
	/// Returns true for 64-bit types on non-64-bit subtargets and for 128-bit
	/// types when the subtarget reports cmpxchg16b; false for every other width
	/// (those are left alone to become __sync_fetch_and_... calls).
	bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
	  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

	  switch (OpWidth) {
	  case 64:
	    // FIXME this should be Subtarget.hasCmpxchg8b
	    return !Subtarget.is64Bit();
	  case 128:
	    return Subtarget.hasCmpxchg16b();
	  default:
	    return false;
	  }
	}

	/// An atomic store is expanded in IR exactly when the stored value's type
	/// needs cmpxchg8b/16b (see needsCmpXchgNb).
	bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
	  Type *StoredTy = SI->getValueOperand()->getType();
	  return needsCmpXchgNb(StoredTy);
	}

	// Atomic loads whose pointee type needs cmpxchg8b/16b are expanded to a
	// lock cmpxchg; every other atomic load is left untouched.
	// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
	  PointerType *PtrTy = cast<PointerType>(LI->getPointerOperandType());
	  if (needsCmpXchgNb(PtrTy->getElementType()))
	    return AtomicExpansionKind::CmpXChg;
	  return AtomicExpansionKind::None;
	}

	/// Choose how AtomicExpand should treat an atomicrmw instruction.
	///
	/// Returns CmpXChg when the operation has to become a compare-exchange loop
	/// and None when it is left as is.
	TargetLowering::AtomicExpansionKind
	X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
	  const unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();

	  // If the operand is too big, we must see if cmpxchg8/16b is available
	  // and default to library calls otherwise.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
	    if (needsCmpXchgNb(MemType))
	      return AtomicExpansionKind::CmpXChg;
	    return AtomicExpansionKind::None;
	  }

	  switch (AI->getOperation()) {
	  default:
	    llvm_unreachable("Unknown atomic operation");

	  case AtomicRMWInst::Xchg:
	  case AtomicRMWInst::Add:
	  case AtomicRMWInst::Sub:
	    // It's better to use xadd, xsub or xchg for these in all cases.
	    return AtomicExpansionKind::None;

	  case AtomicRMWInst::Or:
	  case AtomicRMWInst::And:
	  case AtomicRMWInst::Xor:
	    // If the atomicrmw's result isn't actually used, we can just add a "lock"
	    // prefix to a normal instruction for these operations.
	    if (AI->use_empty())
	      return AtomicExpansionKind::None;
	    return AtomicExpansionKind::CmpXChg;

	  case AtomicRMWInst::Nand:
	  case AtomicRMWInst::Max:
	  case AtomicRMWInst::Min:
	  case AtomicRMWInst::UMax:
	  case AtomicRMWInst::UMin:
	    // These always require a non-trivial set of data operations on x86. We
	    // must use a cmpxchg loop.
	    return AtomicExpansionKind::CmpXChg;
	  }
	}

	/// Replace an idempotent atomicrmw with "mfence; atomic load".
	///
	/// Returns the new load (after replacing all uses of \p AI and erasing it),
	/// or nullptr when the transformation is not performed: the access is wider
	/// than the native width, the sync scope is single-thread, or the subtarget
	/// has no mfence.
	LoadInst *
	X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
	  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
	  Type *MemType = AI->getType();
	  // Accesses larger than the native width are turned into cmpxchg/libcalls, so
	  // there is no benefit in turning such RMWs into loads, and it is actually
	  // harmful as it introduces a mfence.
	  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
	    return nullptr;

	  auto Builder = IRBuilder<>(AI);
	  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
	  auto SSID = AI->getSyncScopeID();
	  // We must restrict the ordering to avoid generating loads with Release or
	  // AcquireRelease orderings.
	  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
	  auto Ptr = AI->getPointerOperand();

	  // Before the load we need a fence. Here is an example lifted from
	  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
	  // is required:
	  // Thread 0:
	  //   x.store(1, relaxed);
	  //   r1 = y.fetch_add(0, release);
	  // Thread 1:
	  //   y.fetch_add(42, acquire);
	  //   r2 = x.load(relaxed);
	  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
	  // lowered to just a load without a fence. A mfence flushes the store buffer,
	  // making the optimization clearly correct.
	  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
	  // otherwise, we might be able to be more aggressive on relaxed idempotent
	  // rmw. In practice, they do not look useful, so we don't try to be
	  // especially clever.
	  if (SSID == SyncScope::SingleThread)
	    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
	    // the IR level, so we must wrap it in an intrinsic.
	    return nullptr;

	  if (!Subtarget.hasMFence())
	    // FIXME: it might make sense to use a locked operation here but on a
	    // different cache-line to prevent cache-line bouncing. In practice it
	    // is probably a small win, and x86 processors without mfence are rare
	    // enough that we do not bother.
	    return nullptr;

	  Function *MFence =
	      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
	  Builder.CreateCall(MFence, {});

	  // Finally we can emit the atomic load. CreateAlignedLoad takes the
	  // alignment in bytes, so convert the type size from bits; passing the bit
	  // count would claim an alignment eight times stronger than the natural
	  // alignment of the accessed object.
	  unsigned AlignInBytes = AI->getType()->getPrimitiveSizeInBits() / 8;
	  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AlignInBytes);
	  Loaded->setAtomic(Order, SSID);
	  AI->replaceAllUsesWith(Loaded);
	  AI->eraseFromParent();
	  return Loaded;
	}

	/// Lower ISD::ATOMIC_FENCE.
	///
	/// A sequentially-consistent, system-scope fence becomes MFENCE when the
	/// subtarget has it, and otherwise a locked OR of zero into the memory at
	/// ESP. Every other fence becomes X86ISD::MEMBARRIER.
	static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
	                                 SelectionDAG &DAG) {
	  SDLoc dl(Op);
	  SDValue Chain = Op.getOperand(0);
	  auto FenceOrdering = static_cast<AtomicOrdering>(
	      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
	  auto FenceSSID = static_cast<SyncScope::ID>(
	      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

	  // The only fence that needs an instruction is a sequentially-consistent
	  // cross-thread fence.
	  bool NeedsInstruction =
	      FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
	      FenceSSID == SyncScope::System;

	  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
	  if (!NeedsInstruction)
	    return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Chain);

	  if (Subtarget.hasMFence())
	    return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Chain);

	  // No MFENCE available: emit "lock or" of zero into the memory at ESP.
	  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
	  SDValue Ops[] = {
	      DAG.getRegister(X86::ESP, MVT::i32),    // Base
	      DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
	      DAG.getRegister(0, MVT::i32),           // Index
	      DAG.getTargetConstant(0, dl, MVT::i32), // Disp
	      DAG.getRegister(0, MVT::i32),           // Segment.
	      Zero,
	      Chain};
	  SDNode *LockedOr =
	      DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
	  return SDValue(LockedOr, 0);
	}

	/// Lower an atomic compare-and-swap with success flag for i8/i16/i32/i64.
	///
	/// Operand 2 is copied into the accumulator register matching the value
	/// type, an X86ISD::LCMPXCHG_DAG memory node is emitted, and the old value
	/// and EFLAGS are copied back out. The three results of \p Op are replaced
	/// in place (loaded value, COND_E success bit, chain), so an empty SDValue is
	/// returned.
	static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
	                             SelectionDAG &DAG) {
	  MVT T = Op.getSimpleValueType();
	  SDLoc DL(Op);

	  // Pick the accumulator register and the operand size in bytes.
	  unsigned AccReg = 0;
	  unsigned SizeInBytes = 0;
	  switch (T.SimpleTy) {
	  default:
	    llvm_unreachable("Invalid value type!");
	  case MVT::i8:
	    AccReg = X86::AL;
	    SizeInBytes = 1;
	    break;
	  case MVT::i16:
	    AccReg = X86::AX;
	    SizeInBytes = 2;
	    break;
	  case MVT::i32:
	    AccReg = X86::EAX;
	    SizeInBytes = 4;
	    break;
	  case MVT::i64:
	    assert(Subtarget.is64Bit() && "Node not type legal!");
	    AccReg = X86::RAX;
	    SizeInBytes = 8;
	    break;
	  }

	  // Operand 2 goes into the accumulator, glued to the cmpxchg.
	  SDValue CopyIn = DAG.getCopyToReg(Op.getOperand(0), DL, AccReg,
	                                    Op.getOperand(2), SDValue());
	  SDValue CmpXchgOps[] = {CopyIn.getValue(0), Op.getOperand(1),
	                          Op.getOperand(3),
	                          DAG.getTargetConstant(SizeInBytes, DL, MVT::i8),
	                          CopyIn.getValue(1)};
	  SDVTList CmpXchgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
	  SDValue CmpXchg = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL,
	                                            CmpXchgVTs, CmpXchgOps, T, MMO);

	  // Read back the accumulator and the flags.
	  SDValue CopyOut = DAG.getCopyFromReg(CmpXchg.getValue(0), DL, AccReg, T,
	                                       CmpXchg.getValue(1));
	  SDValue Flags = DAG.getCopyFromReg(CopyOut.getValue(1), DL, X86::EFLAGS,
	                                     MVT::i32, CopyOut.getValue(2));
	  SDValue Succeeded = getSETCC(X86::COND_E, Flags, DL, DAG);

	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), CopyOut);
	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Succeeded);
	  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), Flags.getValue(1));
	  return SDValue();
	}

	/// Custom-lower ISD::BITCAST.
	///
	/// For a v2i32/v4i16/v8i8/i64 source with an f64 destination, the source
	/// elements are collected into a build-vector of twice the element count
	/// (upper half undef), bitcast to v2f64, and element 0 is extracted. Any
	/// other destination for those sources returns an empty SDValue so the node
	/// is expanded.
	/// The remaining path returns \p Op as legal for i64 <=> vector and
	/// vector <=> vector casts, and an empty SDValue otherwise.
	static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
	  MVT DstVT = Op.getSimpleValueType();

	  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
	      SrcVT == MVT::i64) {
	    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	    if (DstVT != MVT::f64)
	      // This conversion needs to be expanded.
	      return SDValue();

	    SDValue Op0 = Op->getOperand(0);
	    SmallVector<SDValue, 16> Elts;
	    SDLoc dl(Op);
	    unsigned NumElts;
	    MVT SVT;
	    if (SrcVT.isVector()) {
	      NumElts = SrcVT.getVectorNumElements();
	      SVT = SrcVT.getVectorElementType();

	      // Widen the vector in input in the case of MVT::v2i32.
	      // Example: from MVT::v2i32 to MVT::v4i32.
	      for (unsigned i = 0, e = NumElts; i != e; ++i)
	        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
	                                   DAG.getIntPtrConstant(i, dl)));
	    } else {
	      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
	             "Unexpected source type in LowerBITCAST");
	      // Split the i64 into its two i32 halves.
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(0, dl)));
	      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
	                                 DAG.getIntPtrConstant(1, dl)));
	      NumElts = 2;
	      SVT = MVT::i32;
	    }
	    // Explicitly mark the extra elements as Undef.
	    Elts.append(NumElts, DAG.getUNDEF(SVT));

	    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
	    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
	    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
	                       DAG.getIntPtrConstant(0, dl));
	  }

	  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
	         Subtarget.hasMMX() && "Unexpected custom BITCAST");
	  assert((DstVT == MVT::i64 ||
	          (DstVT.isVector() && DstVT.getSizeInBits() == 64)) &&
	         "Unexpected custom BITCAST");
	  // i64 <=> MMX conversions are Legal.
	  if (SrcVT == MVT::i64 && DstVT.isVector())
	    return Op;
	  if (DstVT == MVT::i64 && SrcVT.isVector())
	    return Op;
	  // MMX <=> MMX conversions are Legal.
	  if (SrcVT.isVector() && DstVT.isVector())
	    return Op;
	  // All other conversions need to be expanded.
	  return SDValue();
	}

	/// Sum the bytes of \p V horizontally within each element of \p VT.
	///
	/// \p V must be an i8 vector and \p VT an integer vector of the same total
	/// size with elements wider than a byte. Each result element is the sum of
	/// the bytes of \p V that occupy that element's position.
	static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
	                                      const X86Subtarget &Subtarget,
	                                      SelectionDAG &DAG) {
	  SDLoc DL(V);
	  MVT ByteVecVT = V.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
	         "Expected value to have byte element type.");
	  assert(EltVT != MVT::i8 &&
	         "Horizontal byte sum only makes sense for wider elements!");
	  unsigned VecSize = VT.getSizeInBits();
	  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

	  // PSADBW instruction horizontally add all bytes and leave the result in i64
	  // chunks, thus directly computes the pop count for v2i64 and v4i64.
	  if (EltVT == MVT::i64) {
	    SDValue ZeroBytes = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
	    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	    SDValue Sad = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, ZeroBytes);
	    return DAG.getBitcast(VT, Sad);
	  }

	  if (EltVT == MVT::i32) {
	    // We unpack the low half and high half into i32s interleaved with zeros so
	    // that we can use PSADBW to horizontally sum them. The most useful part of
	    // this is that it lines up the results of two PSADBW instructions to be
	    // two v2i64 vectors which concatenated are the 4 population counts. We can
	    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
	    SDValue ZeroElts = getZeroVector(VT, Subtarget, DAG, DL);
	    SDValue AsElts = DAG.getBitcast(VT, V);
	    SDValue Lo = DAG.getNode(X86ISD::UNPCKL, DL, VT, AsElts, ZeroElts);
	    SDValue Hi = DAG.getNode(X86ISD::UNPCKH, DL, VT, AsElts, ZeroElts);

	    // Do the horizontal sums into two v2i64s.
	    SDValue ZeroBytes = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
	    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
	    Lo = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
	                     DAG.getBitcast(ByteVecVT, Lo), ZeroBytes);
	    Hi = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
	                     DAG.getBitcast(ByteVecVT, Hi), ZeroBytes);

	    // Merge them together.
	    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
	    SDValue Packed = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
	                                 DAG.getBitcast(ShortVecVT, Lo),
	                                 DAG.getBitcast(ShortVecVT, Hi));

	    return DAG.getBitcast(VT, Packed);
	  }

	  // The only element type left is i16.
	  assert(EltVT == MVT::i16 && "Unknown how to handle type");

	  // To obtain pop count for each i16 element starting from the pop count for
	  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
	  // right by 8. It is important to shift as i16s as i8 vector shift isn't
	  // directly supported.
	  SDValue Eight = DAG.getConstant(8, DL, VT);
	  SDValue ShiftedUp =
	      DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), Eight);
	  SDValue ByteSum =
	      DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, ShiftedUp),
	                  DAG.getBitcast(ByteVecVT, V));
	  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, ByteSum), Eight);
	}

	/// Vector population count using an in-register nibble lookup table.
	///
	/// \p Op is bitcast to a byte vector; each byte's low and high nibble index a
	/// 16-entry pop-count table through PSHUFB and the two lookups are added.
	/// For i8 elements that sum is returned directly; for wider elements it is
	/// passed to LowerHorizontalByteSum.
	static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
	                                        const X86Subtarget &Subtarget,
	                                        SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getVectorElementType();
	  unsigned VecSize = VT.getSizeInBits();

	  // Implement a lookup table in register by using an algorithm based on:
	  // http://wm.ite.pl/articles/sse-popcount.html
	  //
	  // The general idea is that every lower byte nibble in the input vector is an
	  // index into a in-register pre-computed pop count table. We then split up the
	  // input vector in two new ones: (1) a vector with only the shifted-right
	  // higher nibbles for each byte and (2) a vector with the lower nibbles (and
	  // masked out higher ones) for each byte. PSHUFB is used separately with both
	  // to index the in-register table. Next, both are added and the result is a
	  // i8 vector where each element contains the pop count for input byte.
	  //
	  // To obtain the pop count for elements != i8, we follow up with the same
	  // approach and use additional tricks as described below.
	  //
	  // LUT[n] is the number of set bits in the 4-bit value n.
	  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
	                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
	                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
	                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

	  int NumByteElts = VecSize / 8;
	  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
	  SDValue In = DAG.getBitcast(ByteVecVT, Op);
	  // Repeat the 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 64> LUTVec;
	  for (int i = 0; i < NumByteElts; ++i)
	    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
	  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
	  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

	  // High nibbles
	  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
	  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

	  // Low nibbles
	  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

	  // The input vector is used as the shuffle mask that index elements into the
	  // LUT. After counting low and high nibbles, add the vector to obtain the
	  // final pop count per i8 element.
	  SDValue HighPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
	  SDValue LowPopCnt =
	      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
	  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

	  if (EltVT == MVT::i8)
	    return PopCnt;

	  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
	}

	/// Vector population count for 128-bit vectors using shifts, masks and adds.
	///
	/// Produces the per-byte pop count of \p Op and, for elements wider than i8,
	/// finishes with LowerHorizontalByteSum.
	static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
	                                       const X86Subtarget &Subtarget,
	                                       SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitmath lowering supported.");

	  int VecSize = VT.getSizeInBits();
	  MVT EltVT = VT.getVectorElementType();
	  int Len = EltVT.getSizeInBits();

	  // This is the vectorized version of the "best" algorithm from
	  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
	  // with a minor tweak to use a series of adds + shifts instead of vector
	  // multiplications. Implemented for all integer vector types. We only use
	  // this when we don't have SSSE3 which allows a LUT-based lowering that is
	  // much faster, even faster than using native popcnt instructions.

	  // Shift every element of Val by the splat constant Amount.
	  auto ShiftBy = [&](unsigned OpCode, SDValue Val, int Amount) {
	    MVT ValVT = Val.getSimpleValueType();
	    SDValue AmountV = DAG.getConstant(Amount, DL, ValVT);
	    return DAG.getNode(OpCode, DL, ValVT, Val, AmountV);
	  };
	  // AND every element of Val with the constant Mask.
	  auto MaskWith = [&](SDValue Val, APInt Mask) {
	    MVT ValVT = Val.getSimpleValueType();
	    SDValue MaskV = DAG.getConstant(Mask, DL, ValVT);
	    return DAG.getNode(ISD::AND, DL, ValVT, Val, MaskV);
	  };
	  // Element-wide constant made of the given byte repeated.
	  auto ByteSplat = [&](unsigned Byte) {
	    return APInt::getSplat(Len, APInt(8, Byte));
	  };

	  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
	  // x86, so set the SRL type to have elements at least i16 wide. This is
	  // correct because all of our SRLs are followed immediately by a mask anyways
	  // that handles any bits that sneak into the high bits of the byte elements.
	  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

	  // Logical right shift performed in SrlVT, with the result viewed as VT.
	  auto SrlAsVT = [&](SDValue Val, int Amount) {
	    SDValue Wide = DAG.getBitcast(SrlVT, Val);
	    return DAG.getBitcast(VT, ShiftBy(ISD::SRL, Wide, Amount));
	  };

	  SDValue Cnt = Op;

	  // v = v - ((v >> 1) & 0x55555555...)
	  SDValue Shifted = SrlAsVT(Cnt, 1);
	  SDValue OddBits = MaskWith(Shifted, ByteSplat(0x55));
	  Cnt = DAG.getNode(ISD::SUB, DL, VT, Cnt, OddBits);

	  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
	  SDValue LowPairs = MaskWith(Cnt, ByteSplat(0x33));
	  Shifted = SrlAsVT(Cnt, 2);
	  SDValue HighPairs = MaskWith(Shifted, ByteSplat(0x33));
	  Cnt = DAG.getNode(ISD::ADD, DL, VT, LowPairs, HighPairs);

	  // v = (v + (v >> 4)) & 0x0F0F0F0F...
	  Shifted = SrlAsVT(Cnt, 4);
	  SDValue NibbleSum = DAG.getNode(ISD::ADD, DL, VT, Cnt, Shifted);
	  Cnt = MaskWith(NibbleSum, ByteSplat(0x0F));

	  // At this point, Cnt contains the byte-wise population count, and we are
	  // merely doing a horizontal sum if necessary to get the wider element
	  // counts.
	  if (EltVT == MVT::i8)
	    return Cnt;

	  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, VecSize / 8);
	  return LowerHorizontalByteSum(DAG.getBitcast(ByteVecVT, Cnt), VT, Subtarget,
	                                DAG);
	}

	// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
	// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
	//
	// Dispatches a 128/256/512-bit vector CTPOP to one of: a zero-extend +
	// CTPOP + truncate sequence (VPOPCNTDQ), the bitmath lowering (no SSSE3),
	// splitting into narrower operations, or the in-register LUT lowering.
	static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                                SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
	         "Unknown CTPOP type to handle");
	  SDLoc DL(Op.getNode());
	  SDValue Op0 = Op.getOperand(0);

	  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
	  if (Subtarget.hasVPOPCNTDQ()) {
	    if (VT == MVT::v8i16) {
	      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
	      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
	    }
	    if (VT == MVT::v16i8 || VT == MVT::v16i16) {
	      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
	      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
	      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
	    }
	  }

	  if (!Subtarget.hasSSSE3()) {
	    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
	    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
	    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
	  }

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Decompose 512-bit ops into smaller 256-bit ops.
	  if (VT.is512BitVector() && !Subtarget.hasBWI())
	    return Lower512IntUnary(Op, DAG);

	  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
	}

	/// Entry point for custom CTPOP lowering; only vector types are expected and
	/// they are forwarded to LowerVectorCTPOP.
	static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  assert(Op.getSimpleValueType().isVector() &&
	         "We only do custom lowering for vector population count.");
	  SDValue Lowered = LowerVectorCTPOP(Op, Subtarget, DAG);
	  return Lowered;
	}

	/// Lower ISD::BITREVERSE using the XOP VPPERM instruction.
	///
	/// Scalars are moved into a 128-bit vector, reversed there, and element 0 is
	/// extracted. 256-bit vectors are split via Lower256IntUnary. 128-bit vectors
	/// use one VPPERM whose mask both bit-reverses every byte and reverses the
	/// byte order within each element.
	static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  // For scalars, its still beneficial to transfer to/from the SIMD unit to
	  // perform the BITREVERSE.
	  if (!VT.isVector()) {
	    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
	    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
	    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  int NumElts = VT.getVectorNumElements();
	  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

	  // Decompose 256-bit ops into smaller 128-bit ops.
	  if (VT.is256BitVector())
	    return Lower256IntUnary(Op, DAG);

	  assert(VT.is128BitVector() &&
	         "Only 128-bit vector bitreverse lowering supported.");

	  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
	  // perform the BSWAP in the shuffle.
	  // Its best to shuffle using the second operand as this will implicitly allow
	  // memory folding for multiple vectors.
	  SmallVector<SDValue, 16> MaskElts;
	  for (int i = 0; i != NumElts; ++i) {
	    // Walk the bytes of element i from last to first to get the byte swap.
	    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
	      // Byte indices 16..31 select from the second source operand.
	      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
	      int PermuteByte = SourceByte | (2 << 5);
	      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
	    }
	  }

	  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
	  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
	  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
	                    Res, Mask);
	  return DAG.getBitcast(VT, Res);
	}

	/// Lower ISD::BITREVERSE.
	///
	/// XOP targets defer to LowerBITREVERSE_XOP. Otherwise SSSE3 is asserted and
	/// only byte vectors are accepted: each byte is split into nibbles, each
	/// nibble is looked up with PSHUFB in a table holding its bit-reversed value
	/// placed in the opposite nibble, and the two lookups are ORed together.
	static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
	                               SelectionDAG &DAG) {
	  if (Subtarget.hasXOP())
	    return LowerBITREVERSE_XOP(Op, DAG);

	  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

	  MVT VT = Op.getSimpleValueType();
	  SDValue In = Op.getOperand(0);
	  SDLoc DL(Op);

	  unsigned NumElts = VT.getVectorNumElements();
	  assert(VT.getScalarType() == MVT::i8 &&
	         "Only byte vector BITREVERSE supported");

	  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
	  if (VT.is256BitVector() && !Subtarget.hasInt256())
	    return Lower256IntUnary(Op, DAG);

	  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
	  // two nibbles and a PSHUFB lookup to find the bitreverse of each
	  // 0-15 value (moved to the other nibble).
	  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
	  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
	  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

	  // LoLUT[n]: bit-reverse of low nibble n, placed in the high nibble.
	  const int LoLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
	      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
	      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
	      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
	  // HiLUT[n]: bit-reverse of high nibble n, placed in the low nibble.
	  const int HiLUT[16] = {
	      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
	      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
	      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
	      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

	  // Repeat each 16-entry table across the whole byte vector.
	  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
	  for (unsigned i = 0; i < NumElts; ++i) {
	    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
	    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
	  }

	  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
	  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
	  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
	  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
	  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
	}

	/// Rebuild an ATOMIC_LOAD_{ADD,SUB,OR,XOR,AND} node as the matching
	/// X86ISD::L{ADD,SUB,OR,XOR,AND} memory-intrinsic node.
	///
	/// The new node takes the same three operands and memory operand as \p N and
	/// produces (i32, chain). Any other opcode is unreachable.
	static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
	  unsigned NewOpc = 0;
	  switch (N->getOpcode()) {
	  case ISD::ATOMIC_LOAD_ADD:
	    NewOpc = X86ISD::LADD;
	    break;
	  case ISD::ATOMIC_LOAD_SUB:
	    NewOpc = X86ISD::LSUB;
	    break;
	  case ISD::ATOMIC_LOAD_OR:
	    NewOpc = X86ISD::LOR;
	    break;
	  case ISD::ATOMIC_LOAD_XOR:
	    NewOpc = X86ISD::LXOR;
	    break;
	  case ISD::ATOMIC_LOAD_AND:
	    NewOpc = X86ISD::LAND;
	    break;
	  default:
	    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
	  }

	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
	  return DAG.getMemIntrinsicNode(
	      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
	      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
	      /*MemVT=*/N->getSimpleValueType(0), MMO);
	}

	/// Lower atomic_load_ops into LOCK-prefixed operations.
	///
	/// When the loaded value is unused, the node is replaced by the LOCK-prefixed
	/// form from lowerAtomicArithWithLOCK (only the chain is rewired) and an
	/// empty SDValue is returned. When it is used, ATOMIC_LOAD_SUB is rewritten
	/// as ATOMIC_LOAD_ADD of the negated operand and ATOMIC_LOAD_ADD is returned
	/// unchanged.
	static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
	                                const X86Subtarget &Subtarget) {
	  SDValue Chain = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  unsigned Opc = N->getOpcode();
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  // Result unused: a plain LOCK-prefixed instruction is enough.
	  if (!N->hasAnyUseOfValue(0)) {
	    SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
	    // RAUW the chain, but don't worry about the result, as it's unused.
	    assert(!N->hasAnyUseOfValue(0));
	    DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
	    return SDValue();
	  }

	  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
	  // can only be lowered when the result is unused. They should have already
	  // been transformed into a cmpxchg loop in AtomicExpand.

	  // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
	  // select LXADD if LOCK_SUB can't be selected.
	  if (Opc == ISD::ATOMIC_LOAD_SUB) {
	    AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
	    SDValue ZeroV = DAG.getConstant(0, DL, VT);
	    RHS = DAG.getNode(ISD::SUB, DL, VT, ZeroV, RHS);
	    return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS,
	                         AN->getMemOperand());
	  }

	  assert(Opc == ISD::ATOMIC_LOAD_ADD &&
	         "Used AtomicRMW ops other than Add should have been expanded!");
	  return N;
	}

	/// Lower ISD::ATOMIC_STORE.
	///
	/// A sequentially-consistent store, or a store whose memory type is not
	/// legal, is rewritten as an ATOMIC_SWAP and that node's chain result is
	/// returned. Every other atomic store is returned unchanged.
	static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
	  SDNode *Node = Op.getNode();
	  SDLoc dl(Node);
	  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

	  // Convert seq_cst store -> xchg
	  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
	  // FIXME: On 32-bit, store -> fist or movq would be more efficient
	  //        (The only way to get a 16-byte store is cmpxchg16b)
	  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
	  if (cast<AtomicSDNode>(Node)->getOrdering() ==
	          AtomicOrdering::SequentiallyConsistent ||
	      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
	    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
	                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
	                                 Node->getOperand(0),
	                                 Node->getOperand(1), Node->getOperand(2),
	                                 cast<AtomicSDNode>(Node)->getMemOperand());
	    // The swapped-out value is not needed; only the chain is returned.
	    return Swap.getValue(1);
	  }
	  // Other atomic stores have a simple pattern.
	  return Op;
	}

	/// Lower ISD::ADDCARRY / ISD::SUBCARRY to X86ISD::ADC / X86ISD::SBB.
	///
	/// The incoming carry operand is turned into EFLAGS by adding all-ones to it,
	/// the ADC/SBB consumes those flags, and the outgoing carry is read back with
	/// a COND_B SETCC. Returns an empty SDValue when the result type is not yet
	/// legal.
	static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
	  SDNode *N = Op.getNode();
	  MVT VT = N->getSimpleValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  SDVTList ResultVTs = DAG.getVTList(VT, MVT::i32);
	  SDLoc DL(N);

	  // Set the carry flag.
	  SDValue CarryIn = Op.getOperand(2);
	  EVT CarryVT = CarryIn.getValueType();
	  APInt AllOnes = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
	  SDValue CarryFlags =
	      DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), CarryIn,
	                  DAG.getConstant(AllOnes, DL, CarryVT));

	  unsigned ArithOpc = X86ISD::SBB;
	  if (Op.getOpcode() == ISD::ADDCARRY)
	    ArithOpc = X86ISD::ADC;
	  SDValue Result = DAG.getNode(ArithOpc, DL, ResultVTs, Op.getOperand(0),
	                               Op.getOperand(1), CarryFlags.getValue(1));

	  SDValue CarryOut = getSETCC(X86::COND_B, Result.getValue(1), DL, DAG);
	  if (N->getValueType(1) == MVT::i1)
	    CarryOut = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CarryOut);

	  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Result, CarryOut);
	}

	/// Lower FSINCOS on 64-bit Darwin to a call to __sincos_stret (f64) or
	/// __sincosf_stret (f32).
	///
	/// For f64 the call result is returned directly. For f32 the call returns a
	/// 4-element vector; elements 0 and 1 are extracted as sin and cos and merged
	/// into a two-result node.
	static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
	                            SelectionDAG &DAG) {
	  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

	  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
	  // which returns the values as { float, float } (in XMM0) or
	  // { double, double } (which is returned in XMM0, XMM1).
	  SDLoc dl(Op);
	  SDValue Arg = Op.getOperand(0);
	  EVT ArgVT = Arg.getValueType();
	  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

	  TargetLowering::ArgListTy Args;
	  TargetLowering::ArgListEntry Entry;

	  Entry.Node = Arg;
	  Entry.Ty = ArgTy;
	  Entry.IsSExt = false;
	  Entry.IsZExt = false;
	  Args.push_back(Entry);

	  bool isF64 = ArgVT == MVT::f64;
	  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
	  // the small struct {f32, f32} is returned in (eax, edx). For f64,
	  // the results are returned via SRet in memory.
	  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Callee =
	      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

	  // f64: a {double, double} struct; f32: a 4-element float vector.
	  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
	                      : (Type *)VectorType::get(ArgTy, 4);

	  TargetLowering::CallLoweringInfo CLI(DAG);
	  CLI.setDebugLoc(dl)
	      .setChain(DAG.getEntryNode())
	      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

	  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

	  if (isF64)
	    // Returned in xmm0 and xmm1.
	    return CallResult.first;

	  // Returned in bits 0:31 and 32:63 of xmm0.
	  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(0, dl));
	  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
	                               CallResult.first, DAG.getIntPtrConstant(1, dl));
	  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
	  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
	}

	/// Widen the vector \p InOp to the wider vector type \p NVT. Both must share
	/// the same element type. The added elements are zero when \p FillWithZeroes
	/// is set and undef otherwise. An input that already has type \p NVT is
	/// returned unchanged, and an undef input yields an undef of type \p NVT.
	static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
	                            bool FillWithZeroes = false) {
	  MVT SrcVT = InOp.getSimpleValueType();

	  // Nothing to do when the input already has the requested width.
	  if (SrcVT == NVT)
	    return InOp;

	  if (InOp.isUndef())
	    return DAG.getUNDEF(NVT);

	  assert(SrcVT.getVectorElementType() == NVT.getVectorElementType() &&
	         "input and widen element type must match");

	  unsigned SrcNumElts = SrcVT.getVectorNumElements();
	  const unsigned DstNumElts = NVT.getVectorNumElements();
	  assert(DstNumElts > SrcNumElts && DstNumElts % SrcNumElts == 0 &&
	         "Unexpected request for vector widening");

	  SDLoc DL(InOp);

	  // A two-operand concat whose upper half is undef (or all zeroes when we are
	  // zero-filling anyway) can be replaced by its lower half before widening.
	  if (InOp.getOpcode() == ISD::CONCAT_VECTORS && InOp.getNumOperands() == 2) {
	    SDValue Upper = InOp.getOperand(1);
	    bool UpperIsFiller =
	        Upper.isUndef() ||
	        (FillWithZeroes && ISD::isBuildVectorAllZeros(Upper.getNode()));
	    if (UpperIsFiller) {
	      InOp = InOp.getOperand(0);
	      SrcVT = InOp.getSimpleValueType();
	      SrcNumElts = SrcVT.getVectorNumElements();
	    }
	  }

	  SDNode *InNode = InOp.getNode();
	  bool IsConstantBuildVector =
	      ISD::isBuildVectorOfConstantSDNodes(InNode) ||
	      ISD::isBuildVectorOfConstantFPSDNodes(InNode);

	  // General case: insert the input at index 0 of a zero/undef wide vector.
	  if (!IsConstantBuildVector) {
	    SDValue Base;
	    if (FillWithZeroes)
	      Base = DAG.getConstant(0, DL, NVT);
	    else
	      Base = DAG.getUNDEF(NVT);
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, NVT, Base, InOp,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // Constant build_vector: rebuild it with the filler elements appended.
	  SmallVector<SDValue, 16> Elts;
	  for (unsigned Idx = 0; Idx != SrcNumElts; ++Idx)
	    Elts.push_back(InOp.getOperand(Idx));

	  EVT ScalarVT = InOp.getOperand(0).getValueType();
	  SDValue Filler;
	  if (FillWithZeroes)
	    Filler = DAG.getConstant(0, DL, ScalarVT);
	  else
	    Filler = DAG.getUNDEF(ScalarVT);

	  for (unsigned Remaining = DstNumElts - SrcNumElts; Remaining != 0;
	       --Remaining)
	    Elts.push_back(Filler);

	  return DAG.getBuildVector(NVT, DL, Elts);
	}

	/// Custom-lower a masked scatter (ISD::MSCATTER) for AVX-512.
	///
	/// A scatter node that already has two result values is returned as is.
	/// Otherwise a replacement scatter is built whose results are the i1-vector
	/// mask and the chain; all uses of \p Op are redirected to the new chain
	/// value, which is also returned. Before that the operands are adjusted:
	///  - a v2i32 memory type whose value was promoted to v2i64 is turned back
	///    into a v4i32 value, with index and mask widened to four elements;
	///  - without VLX, when neither data nor index is a 512-bit vector, either a
	///    v8i32 index is sign-extended to v8i64, or index, mask and data are all
	///    widened to eight elements.
	static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Subtarget.hasAVX512() &&
	"MGATHER/MSCATTER are supported on AVX-512 arch only");

	// X86 scatter kills mask register, so its type should be added to
	// the list of return values.
	// If the "scatter" has 2 return values, it is already handled.
	if (Op.getNode()->getNumValues() == 2)
	return Op;

	MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
	SDValue Src = N->getValue();
	MVT VT = Src.getSimpleValueType();
	assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
	SDLoc dl(Op);

	SDValue NewScatter;
	SDValue Index = N->getIndex();
	SDValue Mask = N->getMask();
	SDValue Chain = N->getChain();
	SDValue BasePtr = N->getBasePtr();
	MVT MemVT = N->getMemoryVT().getSimpleVT();
	// IndexVT and MaskVT keep the ORIGINAL operand types; they are not updated
	// when Index/Mask are rewritten below.
	MVT IndexVT = Index.getSimpleValueType();
	MVT MaskVT = Mask.getSimpleValueType();

	if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
	// The v2i32 value was promoted to v2i64.
	// Now we "redo" the type legalizer's work and widen the original
	// v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
	// with a shuffle.
	assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
	"Unexpected memory type");
	// Pick the low 32-bit half of each 64-bit element; upper lanes are undef.
	int ShuffleMask[] = {0, 2, -1, -1};
	Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
	DAG.getUNDEF(MVT::v4i32), ShuffleMask);
	// Now we have 4 elements instead of 2.
	// Expand the index.
	MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
	Index = ExtendToType(Index, NewIndexVT, DAG);

	// Expand the mask with zeroes
	// Mask may be <2 x i64> or <2 x i1> at this moment
	assert((MaskVT == MVT::v2i1 \|\| MaskVT == MVT::v2i64) &&
	"Unexpected mask type");
	MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
	Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
	VT = MVT::v4i32;
	}

	unsigned NumElts = VT.getVectorNumElements();
	if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	!Index.getSimpleValueType().is512BitVector()) {
	// AVX512F supports only 512-bit vectors: either the data or the index
	// should be 512 bits wide. If the original index type is v8i32, we just
	// sign-extend the index to v8i64.
	if (IndexVT == MVT::v8i32)
	// Just extend index
	Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	else {
	// The minimal number of elts in scatter is 8
	NumElts = 8;
	// Index
	MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
	// Use original index here, do not modify the index twice
	Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
	if (IndexVT.getScalarType() == MVT::i32)
	Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	// Mask
	// At this point we have promoted mask operand
	assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
	// Use the original mask here, do not modify the mask twice
	Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

	// The value that should be stored
	MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
	Src = ExtendToType(Src, NewVT, DAG);
	}
	}
	// If the mask is "wide" at this point - truncate it to i1 vector
	MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
	Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

	// The mask is killed by scatter, add it to the values
	SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
	SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
	NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
	N->getMemOperand());
	// Result #1 of the new node is the chain (MVT::Other); it replaces Op.
	DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
	return SDValue(NewScatter.getNode(), 1);
	}

	/// Custom-lower a masked load (ISD::MLOAD).
	///
	/// Non-expanding loads of at most four elements are returned unchanged.
	/// Every other load reaching this point is expected to be on an AVX-512
	/// target without VLX and narrower than 512 bits: pass-through value and
	/// mask are widened to a 512-bit element count, a wide masked load is
	/// emitted, and the original-width result is extracted from index 0.
	/// Returns the merged {value, chain} pair in that case.
	static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
	                          SelectionDAG &DAG) {
	  MaskedLoadSDNode *Load = cast<MaskedLoadSDNode>(Op.getNode());
	  MVT VT = Op.getSimpleValueType();
	  MVT EltVT = VT.getScalarType();
	  SDValue Mask = Load->getMask();
	  SDLoc DL(Op);

	  assert((!Load->isExpandingLoad() || Subtarget.hasAVX512()) &&
	         "Expanding masked load is supported on AVX-512 target only!");

	  assert((!Load->isExpandingLoad() || EltVT.getSizeInBits() >= 32) &&
	         "Expanding masked load is supported for 32 and 64-bit types only!");

	  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless
	  // of VLX. Expanding loads of these types fall through and are handled below.
	  const bool IsSmallPlainLoad =
	      !Load->isExpandingLoad() && VT.getVectorNumElements() <= 4;
	  if (IsSmallPlainLoad)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked load op.");

	  assert((EltVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (EltVT == MVT::i8 || EltVT == MVT::i16))) &&
	         "Unsupported masked load op.");

	  // This operation is legal for targets with VLX, but without VLX the
	  // vector has to be widened to 512 bits.
	  const unsigned WideNumElts = 512 / VT.getScalarSizeInBits();
	  MVT WideVT = MVT::getVectorVT(EltVT, WideNumElts);
	  SDValue PassThru = ExtendToType(Load->getSrc0(), WideVT, DAG);

	  // Mask element has to be i1.
	  MVT MaskEltVT = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltVT == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  // Widen the mask with zeroes, then narrow its elements to i1 if needed.
	  MVT WideMaskVT = MVT::getVectorVT(MaskEltVT, WideNumElts);
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltVT != MVT::i1) {
	    MVT WideBitMaskVT = MVT::getVectorVT(MVT::i1, WideNumElts);
	    Mask = DAG.getNode(ISD::TRUNCATE, DL, WideBitMaskVT, Mask);
	  }

	  SDValue WideLoad = DAG.getMaskedLoad(
	      WideVT, DL, Load->getChain(), Load->getBasePtr(), Mask, PassThru,
	      Load->getMemoryVT(), Load->getMemOperand(), Load->getExtensionType(),
	      Load->isExpandingLoad());

	  // Hand back the low VT-sized part of the wide result plus the new chain.
	  SDValue Extract =
	      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideLoad.getValue(0),
	                  DAG.getIntPtrConstant(0, DL));
	  SDValue Merged[] = {Extract, WideLoad.getValue(1)};
	  return DAG.getMergeValues(Merged, DL);
	}

	/// Custom-lower a masked store (ISD::MSTORE).
	///
	/// Non-compressing stores of at most four elements are returned unchanged.
	/// Every other store reaching this point is expected to be on an AVX-512
	/// target without VLX and narrower than 512 bits: the stored value and the
	/// mask (zero-filled) are widened to a 512-bit element count and a new
	/// masked store is emitted with the original memory type and operand.
	static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
	                           SelectionDAG &DAG) {
	  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
	  SDValue DataToStore = N->getValue();
	  MVT VT = DataToStore.getSimpleValueType();
	  MVT ScalarVT = VT.getScalarType();
	  SDValue Mask = N->getMask();
	  SDLoc dl(Op);

	  // These checks concern compressing stores; the messages name them as such.
	  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
	         "Compressing masked store is supported on AVX-512 target only!");

	  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
	         "Compressing masked store is supported for 32 and 64-bit types only!");

	  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
	  // VLX.
	  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
	    return Op;

	  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
	         "Cannot lower masked store op.");

	  assert((ScalarVT.getSizeInBits() >= 32 ||
	          (Subtarget.hasBWI() &&
	           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
	         "Unsupported masked store op.");

	  // This operation is legal for targets with VLX, but without
	  // VLX the vector should be widened to 512 bit
	  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
	  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

	  // Mask element has to be i1.
	  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
	  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
	         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

	  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

	  // The added data lanes may be undef; the added mask lanes are zero.
	  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
	  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
	  if (MaskEltTy != MVT::i1)
	    Mask = DAG.getNode(ISD::TRUNCATE, dl,
	                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
	  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
	                            Mask, N->getMemoryVT(), N->getMemOperand(),
	                            N->isTruncatingStore(), N->isCompressingStore());
	}

	/// Custom-lower a masked gather (ISD::MGATHER) for AVX-512.
	///
	/// Three situations are handled; anything else returns \p Op unchanged:
	///  - no VLX and neither data nor index is a 512-bit vector: an 8-element
	///    gather gets its index sign-extended to v8i64 in place; otherwise index,
	///    mask and pass-through are widened to 8 elements, a new gather is built
	///    and the original-width result is extracted from it;
	///  - VLX with a v2i32 memory type: an X86-specific gather producing v4i32
	///    is built and its result is sign-extended back to v2i64;
	///  - VLX with a v2f32 memory type: when mask and index are concats with a
	///    zero / undef upper half, an X86-specific gather using the narrower
	///    index is built.
	static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
	SelectionDAG &DAG) {
	assert(Subtarget.hasAVX512() &&
	"MGATHER/MSCATTER are supported on AVX-512 arch only");

	MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
	SDLoc dl(Op);
	MVT VT = Op.getSimpleValueType();
	SDValue Index = N->getIndex();
	SDValue Mask = N->getMask();
	SDValue Src0 = N->getValue();
	MVT IndexVT = Index.getSimpleValueType();
	MVT MaskVT = Mask.getSimpleValueType();

	unsigned NumElts = VT.getVectorNumElements();
	assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

	if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
	!Index.getSimpleValueType().is512BitVector()) {
	// AVX512F supports only 512-bit vectors: either the data or the index
	// should be 512 bits wide. If the vector contains 8 elements, we just
	// sign-extend the index.
	if (NumElts == 8) {
	Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
	// Keep operands 0-3 and swap in the extended index as the last operand,
	// updating the existing node rather than creating a new one.
	SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
	N->getOperand(3), Index };
	DAG.UpdateNodeOperands(N, Ops);
	return Op;
	}

	// Minimal number of elements in Gather
	NumElts = 8;
	// Index
	MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
	Index = ExtendToType(Index, NewIndexVT, DAG);
	if (IndexVT.getScalarType() == MVT::i32)
	Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

	// Mask
	MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
	// At this point we have promoted mask operand
	assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
	MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
	// Widen with zeroes, then narrow the mask elements to i1.
	Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
	Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

	// The pass-through value
	MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
	Src0 = ExtendToType(Src0, NewVT, DAG);

	SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
	SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
	N->getMemoryVT(), dl, Ops,
	N->getMemOperand());
	// Return the low VT-sized part of the widened result plus the new chain.
	SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
	NewGather.getValue(0),
	DAG.getIntPtrConstant(0, dl));
	SDValue RetOps[] = {Exract, NewGather.getValue(1)};
	return DAG.getMergeValues(RetOps, dl);
	}
	if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
	// There is a special case when the return type v2i32 is illegal and
	// the type legalizer extended it to v2i64. Without this conversion we end up
	// with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
	// In order to avoid this situation, we'll build an X86 specific Gather node
	// with index v2i64 and value type v4i32.
	assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
	"Unexpected type in masked gather");
	// Take the low 32-bit half of each 64-bit pass-through element.
	Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
	DAG.getBitcast(MVT::v4i32, Src0),
	DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
	// The mask should match the destination type. Extending mask with zeroes
	// is not necessary since instruction itself reads only two values from
	// memory.
	Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
	SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
	SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
	N->getMemOperand());

	// Sign-extend the v4i32 gather result back to the expected v2i64 type.
	SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
	NewGather.getValue(0), DAG);
	SDValue RetOps[] = { Sext, NewGather.getValue(1) };
	return DAG.getMergeValues(RetOps, dl);
	}
	if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
	// This transformation is for optimization only.
	// The type legalizer extended mask and index to 4 elements vector
	// in order to match requirements of the common gather node - same
	// vector width of index and value. X86 Gather node allows mismatch
	// of vector width in order to select more optimal instruction at the
	// end.
	assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
	"Unexpected type in masked gather");
	// Only applies when mask is concat(X, zeroes) and index is concat(Y, undef);
	// otherwise the node is left as it is.
	if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
	ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
	Index.getOpcode() == ISD::CONCAT_VECTORS &&
	Index.getOperand(1).isUndef()) {
	Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
	Index = Index.getOperand(0);
	} else
	return Op;
	SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
	SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
	DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
	N->getMemOperand());

	SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
	return DAG.getMergeValues(RetOps, dl);

	}
	return Op;
	}

	/// Lower a GC_TRANSITION_START node to an X86::NOOP machine node that
	/// produces MVT::Other and MVT::Glue results.
	SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
	                                                    SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  // Forward the first operand (presumably the chain) and, when the node has
	  // a glued node, its last operand as well.
	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Use the single location object for the new node instead of building a
	  // second, identical SDLoc inline.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Lower a GC_TRANSITION_END node to an X86::NOOP machine node that
	/// produces MVT::Other and MVT::Glue results.
	SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
	                                                  SelectionDAG &DAG) const {
	  // TODO: Eventually, the lowering of these nodes should be informed by or
	  // deferred to the GC strategy for the function in which they appear. For
	  // now, however, they must be lowered to something. Since they are logically
	  // no-ops in the case of a null GC strategy (or a GC strategy which does not
	  // require special handling for these nodes), lower them as literal NOOPs for
	  // the time being.
	  SmallVector<SDValue, 2> Ops;

	  // Forward the first operand (presumably the chain) and, when the node has
	  // a glued node, its last operand as well.
	  Ops.push_back(Op.getOperand(0));
	  if (Op->getGluedNode())
	    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

	  // Use the single location object for the new node instead of building a
	  // second, identical SDLoc inline.
	  SDLoc OpDL(Op);
	  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
	  SDValue NOOP(DAG.getMachineNode(X86::NOOP, OpDL, VTs, Ops), 0);

	  return NOOP;
	}

	/// Provide custom lowering hooks for some operations.
	///
	/// Dispatches on the opcode of \p Op to the matching lowering helper and
	/// returns whatever that helper returns. An opcode that is not listed here
	/// reaches llvm_unreachable.
	SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
	switch (Op.getOpcode()) {
	default: llvm_unreachable("Should not custom lower this!");
	// Atomics and bit counting.
	case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
	return LowerCMP_SWAP(Op, Subtarget, DAG);
	case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
	case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
	case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
	// Vector construction, shuffling and element/subvector access.
	case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
	case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
	case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
	case ISD::VSELECT: return LowerVSELECT(Op, DAG);
	case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
	case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
	case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
	case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
	case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
	// Addresses and symbols.
	case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
	case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
	case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
	case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
	case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
	case ISD::SHL_PARTS:
	case ISD::SRA_PARTS:
	case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
	// Conversions, extensions and truncation.
	case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
	case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
	case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
	case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
	case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
	case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
	case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
	case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
	// Floating-point sign manipulation.
	case ISD::FABS:
	case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
	case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
	case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
	// Comparisons, selects and branches.
	case ISD::SETCC: return LowerSETCC(Op, DAG);
	case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
	case ISD::SELECT: return LowerSELECT(Op, DAG);
	case ISD::BRCOND: return LowerBRCOND(Op, DAG);
	case ISD::JumpTable: return LowerJumpTable(Op, DAG);
	// Varargs, intrinsics, frame and exception-handling nodes.
	case ISD::VASTART: return LowerVASTART(Op, DAG);
	case ISD::VAARG: return LowerVAARG(Op, DAG);
	case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
	case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
	case ISD::INTRINSIC_VOID:
	case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
	case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
	case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
	case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
	case ISD::FRAME_TO_ARGS_OFFSET:
	return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
	case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
	case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
	case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
	case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
	case ISD::EH_SJLJ_SETUP_DISPATCH:
	return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
	case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
	case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
	case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
	// Integer arithmetic, shifts and rotates.
	case ISD::CTLZ:
	case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
	case ISD::CTTZ:
	case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
	case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
	case ISD::MULHS:
	case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
	case ISD::UMUL_LOHI:
	case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
	case ISD::ROTL:
	case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
	case ISD::SRA:
	case ISD::SRL:
	case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
	case ISD::SADDO:
	case ISD::UADDO:
	case ISD::SSUBO:
	case ISD::USUBO:
	case ISD::SMULO:
	case ISD::UMULO: return LowerXALUO(Op, DAG);
	case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
	case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
	case ISD::ADDCARRY:
	case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
	case ISD::ADD:
	case ISD::SUB: return LowerADD_SUB(Op, DAG);
	case ISD::SMAX:
	case ISD::SMIN:
	case ISD::UMAX:
	case ISD::UMIN: return LowerMINMAX(Op, DAG);
	case ISD::ABS: return LowerABS(Op, DAG);
	case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
	// Masked memory operations.
	case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
	case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
	case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
	case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
	// GC transitions and truncating stores.
	case ISD::GC_TRANSITION_START:
	return LowerGC_TRANSITION_START(Op, DAG);
	case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
	case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
	}
	}

	/// Run LowerOperation on \p N and append the replacement values to
	/// \p Results, one per original result of \p N (number and types must match
	/// the node's original return values exactly). When LowerOperation returns
	/// a null value, \p Results is left empty, which signals that the node is
	/// not to be custom lowered after all.
	void X86TargetLowering::LowerOperationWrapper(SDNode *N,
	                                              SmallVectorImpl<SDValue> &Results,
	                                              SelectionDAG &DAG) const {
	  SDValue Lowered = LowerOperation(SDValue(N, 0), DAG);

	  // Nothing was produced: leave Results untouched.
	  if (!Lowered.getNode())
	    return;

	  const unsigned NumOrigResults = N->getNumValues();
	  assert((NumOrigResults <= Lowered->getNumValues()) &&
	         "Lowering returned the wrong number of results!");

	  // Copy out exactly as many values as N had. The lowered node may carry
	  // more (LowerSINT_TO_FP for example); its trailing chain is dropped.
	  unsigned ResNo = 0;
	  while (ResNo != NumOrigResults) {
	    Results.push_back(Lowered.getValue(ResNo));
	    ++ResNo;
	  }
	}

	/// Replace a node with an illegal result type with a new node built out of
	/// custom code.
	void X86TargetLowering::ReplaceNodeResults(SDNode *N,
	SmallVectorImpl<SDValue>&Results,
	SelectionDAG &DAG) const {
	SDLoc dl(N);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	switch (N->getOpcode()) {
	default:
	llvm_unreachable("Do not know how to custom type legalize this operation!");
	case X86ISD::AVG: {
	// Legalize types for X86ISD::AVG by expanding vectors.
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

	auto InVT = N->getValueType(0);
	auto InVTSize = InVT.getSizeInBits();
	const unsigned RegSize =
	(InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
	assert((Subtarget.hasBWI() \|\| RegSize < 512) &&
	"512-bit vector requires AVX512BW");
	assert((Subtarget.hasAVX2() \|\| RegSize < 256) &&
	"256-bit vector requires AVX2");

	auto ElemVT = InVT.getVectorElementType();
	auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
	RegSize / ElemVT.getSizeInBits());
	assert(RegSize % InVT.getSizeInBits() == 0);
	unsigned NumConcat = RegSize / InVT.getSizeInBits();

	SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
	Ops[0] = N->getOperand(0);
	SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
	Ops[0] = N->getOperand(1);
	SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

	SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
	Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
	DAG.getIntPtrConstant(0, dl)));
	return;
	}
	// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
	case X86ISD::FMINC:
	case X86ISD::FMIN:
	case X86ISD::FMAXC:
	case X86ISD::FMAX: {
	EVT VT = N->getValueType(0);
	assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
	SDValue UNDEF = DAG.getUNDEF(VT);
	SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(0), UNDEF);
	SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
	N->getOperand(1), UNDEF);
	Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
	return;
	}
	case ISD::SDIV:
	case ISD::UDIV:
	case ISD::SREM:
	case ISD::UREM:
	case ISD::SDIVREM:
	case ISD::UDIVREM: {
	SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
	Results.push_back(V);
	return;
	}
	case ISD::FP_TO_SINT:
	case ISD::FP_TO_UINT: {
	bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

	if (N->getValueType(0) == MVT::v2i32) {
	assert((IsSigned \|\| Subtarget.hasAVX512()) &&
	"Can only handle signed conversion without AVX512");
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	SDValue Src = N->getOperand(0);
	if (Src.getValueType() == MVT::v2f64) {
	SDValue Idx = DAG.getIntPtrConstant(0, dl);
	SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
	: X86ISD::CVTTP2UI,
	dl, MVT::v4i32, Src);
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
	Results.push_back(Res);
	return;
	}
	if (Src.getValueType() == MVT::v2f32) {
	SDValue Idx = DAG.getIntPtrConstant(0, dl);
	SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
	DAG.getUNDEF(MVT::v2f32));
	Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
	: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
	Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
	Results.push_back(Res);
	return;
	}

	// The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
	// so early out here.
	return;
	}

	std::pair<SDValue,SDValue> Vals =
	FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /IsReplace=/ true);
	SDValue FIST = Vals.first, StackSlot = Vals.second;
	if (FIST.getNode()) {
	EVT VT = N->getValueType(0);
	// Return a load from the stack slot.
	if (StackSlot.getNode())
	Results.push_back(
	DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
	else
	Results.push_back(FIST);
	}
	return;
	}
	case ISD::SINT_TO_FP: {
	assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
	SDValue Src = N->getOperand(0);
	if (N->getValueType(0) != MVT::v2f32 \|\| Src.getValueType() != MVT::v2i64)
	return;
	Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
	return;
	}
	case ISD::UINT_TO_FP: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT VT = N->getValueType(0);
	if (VT != MVT::v2f32)
	return;
	SDValue Src = N->getOperand(0);
	EVT SrcVT = Src.getValueType();
	if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
	Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
	return;
	}
	if (SrcVT != MVT::v2i32)
	return;
	SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
	SDValue VBias =
	DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
	SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
	DAG.getBitcast(MVT::v2i64, VBias));
	Or = DAG.getBitcast(MVT::v2f64, Or);
	// TODO: Are there any fast-math-flags to propagate here?
	SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
	Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
	return;
	}
	case ISD::FP_ROUND: {
	if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
	return;
	SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
	Results.push_back(V);
	return;
	}
	case ISD::FP_EXTEND: {
	// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
	// No other ValueType for FP_EXTEND should reach this point.
	assert(N->getValueType(0) == MVT::v2f32 &&
	"Do not know how to legalize this Node");
	return;
	}
	case ISD::INTRINSIC_W_CHAIN: {
	unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
	switch (IntNo) {
	default : llvm_unreachable("Do not know how to custom type "
	"legalize this intrinsic operation!");
	case Intrinsic::x86_rdtsc:
	return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdtscp:
	return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
	Results);
	case Intrinsic::x86_rdpmc:
	return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

	case Intrinsic::x86_xgetbv:
	return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
	}
	}
	case ISD::INTRINSIC_WO_CHAIN: {
	if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
	Results.push_back(V);
	return;
	}
	case ISD::READCYCLECOUNTER: {
	return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
	Results);
	}
	case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
	EVT T = N->getValueType(0);
	assert((T == MVT::i64 \|\| T == MVT::i128) && "can only expand cmpxchg pair");
	bool Regs64bit = T == MVT::i128;
	MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
	SDValue cpInL, cpInH;
	cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(0, dl, HalfT));
	cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
	DAG.getConstant(1, dl, HalfT));
	cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	cpInL, SDValue());
	cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	cpInH, cpInL.getValue(1));
	SDValue swapInL, swapInH;
	swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(0, dl, HalfT));
	swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
	DAG.getConstant(1, dl, HalfT));
	swapInH =
	DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
	swapInH, cpInH.getValue(1));
	// If the current function needs the base pointer, RBX,
	// we shouldn't use cmpxchg directly.
	// Indeed the lowering of that instruction will clobber
	// that register and since RBX will be a reserved register
	// the register allocator will not make sure its value will
	// be properly saved and restored around this live-range.
	const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	SDValue Result;
	SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
	unsigned BasePtr = TRI->getBaseRegister();
	MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
	if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
	(BasePtr == X86::RBX \|\| BasePtr == X86::EBX)) {
	// ISel prefers the LCMPXCHG64 variant.
	// If that assert breaks, that means it is not the case anymore,
	// and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
	// not just EBX. This is a matter of accepting i64 input for that
	// pseudo, and restoring into the register of the right wide
	// in expand pseudo. Everything else should just work.
	assert(((Regs64bit == (BasePtr == X86::RBX)) \|\| BasePtr == X86::EBX) &&
	"Saving only half of the RBX");
	unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
	: X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
	SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX,
	HalfT, swapInH.getValue(1));
	SDValue Ops[] = {/Chain/ RBXSave.getValue(1), N->getOperand(1), swapInL,
	RBXSave,
	/Glue/ RBXSave.getValue(2)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	} else {
	unsigned Opcode =
	Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
	swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
	Regs64bit ? X86::RBX : X86::EBX, swapInL,
	swapInH.getValue(1));
	SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
	swapInL.getValue(1)};
	Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
	}
	SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
	Regs64bit ? X86::RAX : X86::EAX,
	HalfT, Result.getValue(1));
	SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
	Regs64bit ? X86::RDX : X86::EDX,
	HalfT, cpOutL.getValue(2));
	SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

	SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
	MVT::i32, cpOutH.getValue(2));
	SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
	Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

	Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
	Results.push_back(Success);
	Results.push_back(EFLAGS.getValue(1));
	return;
	}
	case ISD::ATOMIC_SWAP:
	case ISD::ATOMIC_LOAD_ADD:
	case ISD::ATOMIC_LOAD_SUB:
	case ISD::ATOMIC_LOAD_AND:
	case ISD::ATOMIC_LOAD_OR:
	case ISD::ATOMIC_LOAD_XOR:
	case ISD::ATOMIC_LOAD_NAND:
	case ISD::ATOMIC_LOAD_MIN:
	case ISD::ATOMIC_LOAD_MAX:
	case ISD::ATOMIC_LOAD_UMIN:
	case ISD::ATOMIC_LOAD_UMAX:
	case ISD::ATOMIC_LOAD: {
	// Delegate to generic TypeLegalization. Situations we can really handle
	// should have already been dealt with by AtomicExpandPass.cpp.
	break;
	}
	case ISD::BITCAST: {
	assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
	EVT DstVT = N->getValueType(0);
	EVT SrcVT = N->getOperand(0)->getValueType(0);

	if (SrcVT != MVT::f64 \|\|
	(DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
	return;

	unsigned NumElts = DstVT.getVectorNumElements();
	EVT SVT = DstVT.getVectorElementType();
	EVT WiderVT = EVT::getVectorVT(DAG.getContext(), SVT, NumElts 2);
	SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
	MVT::v2f64, N->getOperand(0));
	SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

	if (ExperimentalVectorWideningLegalization) {
	// If we are legalizing vectors by widening, we already have the desired
	// legal vector type, just return it.
	Results.push_back(ToVecInt);
	return;
	}

	SmallVector<SDValue, 8> Elts;
	for (unsigned i = 0, e = NumElts; i != e; ++i)
	Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
	ToVecInt, DAG.getIntPtrConstant(i, dl)));

	Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
	}
	}
	}

	const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
	switch ((X86ISD::NodeType)Opcode) {
	case X86ISD::FIRST_NUMBER: break;
	case X86ISD::BSF: return "X86ISD::BSF";
	case X86ISD::BSR: return "X86ISD::BSR";
	case X86ISD::SHLD: return "X86ISD::SHLD";
	case X86ISD::SHRD: return "X86ISD::SHRD";
	case X86ISD::FAND: return "X86ISD::FAND";
	case X86ISD::FANDN: return "X86ISD::FANDN";
	case X86ISD::FOR: return "X86ISD::FOR";
	case X86ISD::FXOR: return "X86ISD::FXOR";
	case X86ISD::FILD: return "X86ISD::FILD";
	case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
	case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
	case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
	case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
	case X86ISD::FLD: return "X86ISD::FLD";
	case X86ISD::FST: return "X86ISD::FST";
	case X86ISD::CALL: return "X86ISD::CALL";
	case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
	case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
	case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
	case X86ISD::BT: return "X86ISD::BT";
	case X86ISD::CMP: return "X86ISD::CMP";
	case X86ISD::COMI: return "X86ISD::COMI";
	case X86ISD::UCOMI: return "X86ISD::UCOMI";
	case X86ISD::CMPM: return "X86ISD::CMPM";
	case X86ISD::CMPMU: return "X86ISD::CMPMU";
	case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
	case X86ISD::SETCC: return "X86ISD::SETCC";
	case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
	case X86ISD::FSETCC: return "X86ISD::FSETCC";
	case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
	case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
	case X86ISD::CMOV: return "X86ISD::CMOV";
	case X86ISD::BRCOND: return "X86ISD::BRCOND";
	case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
	case X86ISD::IRET: return "X86ISD::IRET";
	case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
	case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
	case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
	case X86ISD::Wrapper: return "X86ISD::Wrapper";
	case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
	case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
	case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
	case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
	case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
	case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
	case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
	case X86ISD::PINSRB: return "X86ISD::PINSRB";
	case X86ISD::PINSRW: return "X86ISD::PINSRW";
	case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
	case X86ISD::ANDNP: return "X86ISD::ANDNP";
	case X86ISD::BLENDI: return "X86ISD::BLENDI";
	case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
	case X86ISD::ADDUS: return "X86ISD::ADDUS";
	case X86ISD::SUBUS: return "X86ISD::SUBUS";
	case X86ISD::HADD: return "X86ISD::HADD";
	case X86ISD::HSUB: return "X86ISD::HSUB";
	case X86ISD::FHADD: return "X86ISD::FHADD";
	case X86ISD::FHSUB: return "X86ISD::FHSUB";
	case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
	case X86ISD::FMAX: return "X86ISD::FMAX";
	case X86ISD::FMAXS: return "X86ISD::FMAXS";
	case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMAXS_RND: return "X86ISD::FMAX_RND";
	case X86ISD::FMIN: return "X86ISD::FMIN";
	case X86ISD::FMINS: return "X86ISD::FMINS";
	case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
	case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
	case X86ISD::FMAXC: return "X86ISD::FMAXC";
	case X86ISD::FMINC: return "X86ISD::FMINC";
	case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
	case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
	case X86ISD::FRCP: return "X86ISD::FRCP";
	case X86ISD::FRCPS: return "X86ISD::FRCPS";
	case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
	case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
	case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
	case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
	case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
	case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
	case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
	case X86ISD::EH_SJLJ_SETUP_DISPATCH:
	return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
	case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
	case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
	case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
	case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
	case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
	case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
	case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
	case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
	return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
	case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
	return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
	case X86ISD::LADD: return "X86ISD::LADD";
	case X86ISD::LSUB: return "X86ISD::LSUB";
	case X86ISD::LOR: return "X86ISD::LOR";
	case X86ISD::LXOR: return "X86ISD::LXOR";
	case X86ISD::LAND: return "X86ISD::LAND";
	case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
	case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
	case X86ISD::VZEXT: return "X86ISD::VZEXT";
	case X86ISD::VSEXT: return "X86ISD::VSEXT";
	case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
	case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
	case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
	case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
	case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
	case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
	case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
	case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
	case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
	case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
	case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
	case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
	case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
	case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
	case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
	case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
	case X86ISD::VSHL: return "X86ISD::VSHL";
	case X86ISD::VSRL: return "X86ISD::VSRL";
	case X86ISD::VSRA: return "X86ISD::VSRA";
	case X86ISD::VSHLI: return "X86ISD::VSHLI";
	case X86ISD::VSRLI: return "X86ISD::VSRLI";
	case X86ISD::VSRAI: return "X86ISD::VSRAI";
	case X86ISD::VSRAV: return "X86ISD::VSRAV";
	case X86ISD::VROTLI: return "X86ISD::VROTLI";
	case X86ISD::VROTRI: return "X86ISD::VROTRI";
	case X86ISD::VPPERM: return "X86ISD::VPPERM";
	case X86ISD::CMPP: return "X86ISD::CMPP";
	case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
	case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
	case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
	case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
	case X86ISD::ADD: return "X86ISD::ADD";
	case X86ISD::SUB: return "X86ISD::SUB";
	case X86ISD::ADC: return "X86ISD::ADC";
	case X86ISD::SBB: return "X86ISD::SBB";
	case X86ISD::SMUL: return "X86ISD::SMUL";
	case X86ISD::UMUL: return "X86ISD::UMUL";
	case X86ISD::SMUL8: return "X86ISD::SMUL8";
	case X86ISD::UMUL8: return "X86ISD::UMUL8";
	case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
	case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
	case X86ISD::INC: return "X86ISD::INC";
	case X86ISD::DEC: return "X86ISD::DEC";
	case X86ISD::OR: return "X86ISD::OR";
	case X86ISD::XOR: return "X86ISD::XOR";
	case X86ISD::AND: return "X86ISD::AND";
	case X86ISD::BEXTR: return "X86ISD::BEXTR";
	case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
	case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
	case X86ISD::PTEST: return "X86ISD::PTEST";
	case X86ISD::TESTP: return "X86ISD::TESTP";
	case X86ISD::TESTM: return "X86ISD::TESTM";
	case X86ISD::TESTNM: return "X86ISD::TESTNM";
	case X86ISD::KORTEST: return "X86ISD::KORTEST";
	case X86ISD::KTEST: return "X86ISD::KTEST";
	case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
	case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
	case X86ISD::PACKSS: return "X86ISD::PACKSS";
	case X86ISD::PACKUS: return "X86ISD::PACKUS";
	case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
	case X86ISD::VALIGN: return "X86ISD::VALIGN";
	case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
	case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
	case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
	case X86ISD::SHUFP: return "X86ISD::SHUFP";
	case X86ISD::SHUF128: return "X86ISD::SHUF128";
	case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
	case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
	case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
	case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
	case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
	case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
	case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
	case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
	case X86ISD::MOVSD: return "X86ISD::MOVSD";
	case X86ISD::MOVSS: return "X86ISD::MOVSS";
	case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
	case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
	case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
	case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
	case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
	case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
	case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
	case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
	case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
	case X86ISD::VPERMV: return "X86ISD::VPERMV";
	case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
	case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
	case X86ISD::VPERMI: return "X86ISD::VPERMI";
	case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
	case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
	case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
	case X86ISD::VRANGE: return "X86ISD::VRANGE";
	case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
	case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
	case X86ISD::PSADBW: return "X86ISD::PSADBW";
	case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
	case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
	case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
	case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
	case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
	case X86ISD::MFENCE: return "X86ISD::MFENCE";
	case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
	case X86ISD::SAHF: return "X86ISD::SAHF";
	case X86ISD::RDRAND: return "X86ISD::RDRAND";
	case X86ISD::RDSEED: return "X86ISD::RDSEED";
	case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
	case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
	case X86ISD::VPROT: return "X86ISD::VPROT";
	case X86ISD::VPROTI: return "X86ISD::VPROTI";
	case X86ISD::VPSHA: return "X86ISD::VPSHA";
	case X86ISD::VPSHL: return "X86ISD::VPSHL";
	case X86ISD::VPCOM: return "X86ISD::VPCOM";
	case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
	case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
	case X86ISD::FMADD: return "X86ISD::FMADD";
	case X86ISD::FMSUB: return "X86ISD::FMSUB";
	case X86ISD::FNMADD: return "X86ISD::FNMADD";
	case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
	case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
	case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
	case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
	case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
	case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
	case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
	case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
	case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
	case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
	case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
	case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
	case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
	case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
	case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
	case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
	case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
	case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
	case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
	case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
	case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
	case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
	case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
	case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
	case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
	case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
	case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
	case X86ISD::XTEST: return "X86ISD::XTEST";
	case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
	case X86ISD::EXPAND: return "X86ISD::EXPAND";
	case X86ISD::SELECT: return "X86ISD::SELECT";
	case X86ISD::SELECTS: return "X86ISD::SELECTS";
	case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
	case X86ISD::RCP28: return "X86ISD::RCP28";
	case X86ISD::RCP28S: return "X86ISD::RCP28S";
	case X86ISD::EXP2: return "X86ISD::EXP2";
	case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
	case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
	case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
	case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
	case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
	case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
	case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
	case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
	case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
	case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
	case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
	case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
	case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
	case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
	case X86ISD::SCALEF: return "X86ISD::SCALEF";
	case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
	case X86ISD::ADDS: return "X86ISD::ADDS";
	case X86ISD::SUBS: return "X86ISD::SUBS";
	case X86ISD::AVG: return "X86ISD::AVG";
	case X86ISD::MULHRS: return "X86ISD::MULHRS";
	case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
	case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
	case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
	case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
	case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
	case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
	case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
	case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
	case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
	case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
	case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
	case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
	case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
	case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
	case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
	case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
	case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
	case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
	case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
	case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
	case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
	case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
	case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
	case X86ISD::LWPINS: return "X86ISD::LWPINS";
	case X86ISD::MGATHER: return "X86ISD::MGATHER";
	}
	return nullptr;
	}

	/// Return true if the addressing mode represented by AM is legal for this
	/// target, for a load/store of the specified type.
	///
	/// The checks, in order: the displacement must suit the code model; a
	/// global base must be foldable (no stub load, no clash with a PIC base
	/// register, and no offset/scale when RIP-relative addressing is needed);
	/// and the scale must be one x86 can encode. Ty, DL and AS are not
	/// consulted.
	bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
	                                              const AddrMode &AM, Type *Ty,
	                                              unsigned AS) const {
	  // X86 supports extremely general addressing modes.
	  CodeModel::Model M = getTargetMachine().getCodeModel();

	  // X86 allows a sign-extended 32-bit immediate field as a displacement.
	  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
	    return false;

	  if (AM.BaseGV) {
	    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

	    // If a reference to this global requires an extra load, we can't fold it.
	    if (isGlobalStubReference(GVFlags))
	      return false;

	    // If BaseGV requires a register for the PIC base, we cannot also have a
	    // BaseReg specified.
	    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
	      return false;

	    // If lower 4G is not available, then we must use rip-relative addressing.
	    if ((M != CodeModel::Small || isPositionIndependent()) &&
	        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
	      return false;
	  }

	  switch (AM.Scale) {
	  case 0:
	  case 1:
	  case 2:
	  case 4:
	  case 8:
	    // These scales always work.
	    break;
	  case 3:
	  case 5:
	  case 9:
	    // These scales are formed with basereg+scalereg. Only accept if there is
	    // no basereg yet.
	    if (AM.HasBaseReg)
	      return false;
	    break;
	  default: // Other stuff never works.
	    return false;
	  }

	  return true;
	}

	/// Return true if shifting a vector of type Ty by a scalar amount is
	/// significantly cheaper than shifting it by a general vector amount.
	bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
	  unsigned Bits = Ty->getScalarSizeInBits();

	  // 8-bit shifts are always expensive, but versions with a scalar amount aren't
	  // particularly cheaper than those without.
	  if (Bits == 8)
	    return false;

	  // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
	  // variable shifts just as cheap as scalar ones.
	  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
	    return false;

	  // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
	  // fully general vector.
	  return true;
	}

	bool X86TargetLowering::isTruncateFree(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;
	unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
	unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
	return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::allowTruncateForTailCall(Type Ty1, Type Ty2) const {
	if (!Ty1->isIntegerTy() \|\| !Ty2->isIntegerTy())
	return false;

	if (!isTypeLegal(EVT::getEVT(Ty1)))
	return false;

	assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

	// Assuming the caller doesn't have a zeroext or signext return parameter,
	// truncation all the way down to i1 is valid.
	return true;
	}

	/// An integer-compare immediate is legal when it fits in a signed 32-bit
	/// field.
	bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// An add immediate is legal when it fits in a signed 32-bit field.
	bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
	  // Negated immediates are covered as well, since sub can be used for them.
	  const bool FitsInSImm32 = isInt<32>(Imm);
	  return FitsInSImm32;
	}

	/// Return true if truncating integer value type VT1 to integer value type
	/// VT2 is free, i.e. both are integers and VT2 is strictly narrower.
	bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
	  if (!VT1.isInteger() || !VT2.isInteger())
	    return false;
	  unsigned NumBits1 = VT1.getSizeInBits();
	  unsigned NumBits2 = VT2.getSizeInBits();
	  return NumBits1 > NumBits2;
	}

	bool X86TargetLowering::isZExtFree(Type Ty1, Type Ty2) const {
	// x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
	return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
	}

	/// Return true if zero-extending VT1 to VT2 is free. This holds only for
	/// i32 -> i64 on a 64-bit subtarget.
	bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
	  // Only x86-64 zero-extends 32-bit results into 64-bit registers implicitly.
	  if (!Subtarget.is64Bit())
	    return false;
	  const bool From32 = (VT1 == MVT::i32);
	  const bool To64 = (VT2 == MVT::i64);
	  return From32 && To64;
	}

	/// Return true if zero-extending the DAG value Val to VT2 is free. Besides
	/// the type-only rule, a LOAD of a simple i8/i16/i32 value extended to a
	/// simple integer type is free.
	bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
	  EVT VT1 = Val.getValueType();
	  if (isZExtFree(VT1, VT2))
	    return true;

	  if (Val.getOpcode() != ISD::LOAD)
	    return false;

	  if (!VT1.isSimple() || !VT1.isInteger() ||
	      !VT2.isSimple() || !VT2.isInteger())
	    return false;

	  switch (VT1.getSimpleVT().SimpleTy) {
	  default: break;
	  case MVT::i8:
	  case MVT::i16:
	  case MVT::i32:
	    // X86 has 8, 16, and 32-bit zero-extending loads.
	    return true;
	  }

	  return false;
	}

	/// Vector extending loads are reported as desirable unconditionally; the
	/// argument is ignored.
	bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const {
	  return true;
	}

	/// Return true if a fused multiply-add is faster than separate fmul and
	/// fadd for VT. Requires an FMA-capable subtarget and a simple f32 or f64
	/// scalar element type.
	bool
	X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
	  if (!Subtarget.hasAnyFMA())
	    return false;

	  // Decide on the element type so vectors and scalars are treated alike.
	  EVT EltVT = VT.getScalarType();
	  if (!EltVT.isSimple())
	    return false;

	  MVT::SimpleValueType EltTy = EltVT.getSimpleVT().SimpleTy;
	  return EltTy == MVT::f32 || EltTy == MVT::f64;
	}

	/// Return true if narrowing an operation from VT1 to VT2 is profitable.
	/// Every narrowing is considered profitable except i32 -> i16.
	bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
	  // i16 instructions are longer (0x66 prefix) and potentially slower.
	  if (VT1 == MVT::i32 && VT2 == MVT::i16)
	    return false;
	  return true;
	}

	/// Targets can use this to indicate that they only support some
	/// VECTOR_SHUFFLE operations, those with specific masks.
	/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
	/// are assumed to be legal.
	bool
	X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
	EVT VT) const {
	if (!VT.isSimple())
	return false;

	// Not for i1 vectors
	if (VT.getSimpleVT().getScalarType() == MVT::i1)
	return false;

	// Very little shuffling can be done for 64-bit vectors right now.
	if (VT.getSimpleVT().getSizeInBits() == 64)
	return false;

	// We only care that the types being shuffled are legal. The lowering can
	// handle any possible shuffle mask that results.
	return isTypeLegal(VT.getSimpleVT());
	}

	/// Clear masks get no special treatment: the result is whatever
	/// isShuffleMaskLegal reports for the same mask and type.
	bool
	X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
	                                          EVT VT) const {
	  const bool Legal = isShuffleMaskLegal(Mask, VT);
	  return Legal;
	}

	//===----------------------------------------------------------------------===//
	// X86 Scheduler Hooks
	//===----------------------------------------------------------------------===//

	/// Utility function to emit xbegin specifying the start of an RTM region.
	///
	/// Splits MBB at MI into thisMBB / mainMBB / fallMBB / sinkMBB, erases the
	/// pseudo MI, and returns sinkMBB, which receives the instructions that
	/// followed MI together with MBB's original successors.
	static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
	                                     const TargetInstrInfo *TII) {
	  DebugLoc DL = MI.getDebugLoc();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // For the v = xbegin(), we generate
	  //
	  // thisMBB:
	  //  xbegin fallMBB
	  //
	  // mainMBB:
	  //  s0 = -1
	  //
	  // fallMBB:
	  //  eax = # XABORT_DEF
	  //  s1 = eax
	  //
	  // sinkMBB:
	  //  v = phi(s0/mainMBB, s1/fallMBB)

	  MachineBasicBlock *thisMBB = MBB;
	  MachineFunction *MF = MBB->getParent();
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  // Layout order after MBB is mainMBB, fallMBB, sinkMBB.
	  MF->insert(I, mainMBB);
	  MF->insert(I, fallMBB);
	  MF->insert(I, sinkMBB);

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  unsigned DstReg = MI.getOperand(0).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned fallDstReg = MRI.createVirtualRegister(RC);

	  // thisMBB:
	  //  xbegin fallMBB
	  //  # fallthrough to mainMBB
	  //  # abortion to fallMBB
	  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(fallMBB);

	  // mainMBB:
	  //  mainDstReg := -1
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
	  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  mainMBB->addSuccessor(sinkMBB);

	  // fallMBB:
	  //  ; pseudo instruction to model hardware's definition from XABORT
	  //  EAX := XABORT_DEF
	  //  fallDstReg := EAX
	  // No branch is emitted here; sinkMBB is the next block in layout.
	  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
	  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
	      .addReg(X86::EAX);
	  fallMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  //  DstReg := phi(mainDstReg/mainMBB, fallDstReg/fallMBB)
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(fallDstReg).addMBB(fallMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
	// or XMM0_V32I8 in AVX all of this code can be replaced with that
	// in the .td file.
	//
	// Expand a (V)PCMP{I,E}STRM128{REG,MEM} pseudo: emit the matching real
	// instruction with the pseudo's explicit inputs, then copy XMM0 into the
	// pseudo's result register. The pseudo is erased and BB is returned.
	static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
	  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
	  case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
	  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
	  case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
	  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
	  case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
	  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  // Operand 0 is the result; forward the rest, skipping implicit registers.
	  unsigned NumArgs = MI.getNumOperands();
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::XMM0);

	  MI.eraseFromParent();
	  return BB;
	}

	// FIXME: Custom handling because TableGen doesn't support multiple implicit
	// defs in an instruction pattern
	//
	// Expand a (V)PCMP{I,E}STRI{REG,MEM} pseudo: emit the matching real
	// instruction with the pseudo's explicit inputs, then copy ECX into the
	// pseudo's result register. The pseudo is erased and BB is returned.
	static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
	                                       const TargetInstrInfo *TII) {
	  unsigned Opc;
	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("illegal opcode!");
	  case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
	  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
	  case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
	  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
	  case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
	  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
	  case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
	  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
	  }

	  DebugLoc dl = MI.getDebugLoc();
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

	  unsigned NumArgs = MI.getNumOperands(); // remove the results
	  for (unsigned i = 1; i < NumArgs; ++i) {
	    MachineOperand &Op = MI.getOperand(i);
	    if (!(Op.isReg() && Op.isImplicit()))
	      MIB.add(Op);
	  }
	  if (MI.hasOneMemOperand())
	    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::ECX);

	  MI.eraseFromParent();
	  return BB;
	}

	/// Expand the WRPKRU pseudo: copy its operand 0 into EAX, zero ECX and
	/// EDX, emit WRPKRUr, erase the pseudo and return BB.
	static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert input VAL into EAX
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
	      .addReg(MI.getOperand(0).getReg());
	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert zero to EDX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);

	  // insert WRPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand the RDPKRU pseudo: zero ECX, emit RDPKRUr, copy EAX into the
	/// pseudo's operand-0 register, erase the pseudo and return BB.
	static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

	  // insert zero to ECX
	  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

	  // insert RDPKRU instruction
	  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
	      .addReg(X86::EAX);

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand a monitor-style pseudo. The first X86::AddrNumOperands operands
	/// of MI form an address that is materialised with LEA into RAX (64-bit)
	/// or EAX; the next two register operands are copied into ECX and EDX.
	/// Then the operand-less instruction Opc is emitted, the pseudo is erased
	/// and BB is returned.
	static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
	                                      const X86Subtarget &Subtarget,
	                                      unsigned Opc) {
	  DebugLoc dl = MI.getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX, other two args into ECX, EDX.
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI.getOperand(i));

	  unsigned ValOps = X86::AddrNumOperands;
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
	      .addReg(MI.getOperand(ValOps).getReg());
	  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
	      .addReg(MI.getOperand(ValOps + 1).getReg());

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(Opc));

	  MI.eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}

	/// Expand the CLZERO pseudo. The first X86::AddrNumOperands operands of MI
	/// form an address that is materialised with LEA into RAX (64-bit) or EAX.
	/// Then the operand-less CLZEROr is emitted, the pseudo is erased and BB
	/// is returned.
	static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
	                                     const X86Subtarget &Subtarget) {
	  DebugLoc dl = MI->getDebugLoc();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  // Address into RAX/EAX
	  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
	  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
	  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
	  for (int i = 0; i < X86::AddrNumOperands; ++i)
	    MIB.add(MI->getOperand(i));

	  // The instruction doesn't actually take any operands though.
	  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

	  MI->eraseFromParent(); // The pseudo is gone now.
	  return BB;
	}



	/// Custom inserter for the VAARG_64 pseudo: expands va_arg on x86-64 into
	/// explicit loads and stores on the va_list. When ArgMode selects gp_offset or
	/// fp_offset the block is split and a branch chooses between the register
	/// save area and the overflow area; otherwise everything stays in MBB.
	///
	/// \param MI  The VAARG_64 pseudo (10 operands, one memoperand); erased here.
	/// \param MBB The block containing \p MI.
	/// \returns endMBB: \p MBB itself when no branch is created, otherwise the
	///          new block that received the instructions following \p MI.
	MachineBasicBlock *
	X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
	MachineBasicBlock *MBB) const {
	// Emit va_arg instruction on X86-64.

	// Operands to this pseudo-instruction:
	//   0  ) Output  : destination address (reg)
	//   1-5) Input   : va_list address (addr, i64mem)
	//   6  ) ArgSize : Size (in bytes) of vararg type
	//   7  ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
	//   8  ) Align   : Alignment of type
	//   9  ) EFLAGS (implicit-def)

	assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
	static_assert(X86::AddrNumOperands == 5,
	"VAARG_64 assumes 5 address operands");

	unsigned DestReg = MI.getOperand(0).getReg();
	MachineOperand &Base = MI.getOperand(1);
	MachineOperand &Scale = MI.getOperand(2);
	MachineOperand &Index = MI.getOperand(3);
	MachineOperand &Disp = MI.getOperand(4);
	MachineOperand &Segment = MI.getOperand(5);
	unsigned ArgSize = MI.getOperand(6).getImm();
	unsigned ArgMode = MI.getOperand(7).getImm();
	unsigned Align = MI.getOperand(8).getImm();

	// Memory Reference
	// The pseudo's single memoperand is re-attached to every va_list access
	// emitted below.
	assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
	MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	// Machine Information
	const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
	const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
	const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
	DebugLoc DL = MI.getDebugLoc();

	// struct va_list {
	//   i32   gp_offset
	//   i32   fp_offset
	//   i64   overflow_area (address)
	//   i64   reg_save_area (address)
	// }
	// sizeof(va_list) = 24
	// alignment(va_list) = 8

	unsigned TotalNumIntRegs = 6;
	unsigned TotalNumXMMRegs = 8;
	bool UseGPOffset = (ArgMode == 1);
	bool UseFPOffset = (ArgMode == 2);
	// Upper bound of the offset field in use: 6*8 = 48 for gp_offset,
	// 48 + 8*16 = 176 for fp_offset.
	unsigned MaxOffset = TotalNumIntRegs * 8 +
	(UseFPOffset ? TotalNumXMMRegs * 16 : 0);

	/* Align ArgSize to a multiple of 8 */
	unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
	bool NeedsAlign = (Align > 8);

	MachineBasicBlock *thisMBB = MBB;
	MachineBasicBlock *overflowMBB;
	MachineBasicBlock *offsetMBB;
	MachineBasicBlock *endMBB;

	unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
	unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
	unsigned OffsetReg = 0;

	if (!UseGPOffset && !UseFPOffset) {
	// If we only pull from the overflow region, we don't create a branch.
	// We don't need to alter control flow.
	// NOTE(review): overflowMBB aliases thisMBB here, so the
	// BuildMI(overflowMBB, ...) calls further down append to the end of the
	// original block rather than at MI's position - confirm this is intended
	// (or that ArgMode == 0 is never produced).
	OffsetDestReg = 0; // unused
	OverflowDestReg = DestReg;

	offsetMBB = nullptr;
	overflowMBB = thisMBB;
	endMBB = thisMBB;
	} else {
	// First emit code to check if gp_offset (or fp_offset) is below the bound.
	// If so, pull the argument from reg_save_area. (branch to offsetMBB)
	// If not, pull from overflow_area. (branch to overflowMBB)
	//
	//       thisMBB
	//         |     .
	//         |        .
	//     offsetMBB   overflowMBB
	//         |        .
	//         |     .
	//        endMBB

	// Registers for the PHI in endMBB
	OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
	OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

	const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	MachineFunction *MF = MBB->getParent();
	overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	MachineFunction::iterator MBBIter = ++MBB->getIterator();

	// Insert the new basic blocks
	// Layout order after MBB: offsetMBB, overflowMBB, endMBB.
	MF->insert(MBBIter, offsetMBB);
	MF->insert(MBBIter, overflowMBB);
	MF->insert(MBBIter, endMBB);

	// Transfer the remainder of MBB and its successor edges to endMBB.
	endMBB->splice(endMBB->begin(), thisMBB,
	std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
	endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

	// Make offsetMBB and overflowMBB successors of thisMBB
	thisMBB->addSuccessor(offsetMBB);
	thisMBB->addSuccessor(overflowMBB);

	// endMBB is a successor of both offsetMBB and overflowMBB
	offsetMBB->addSuccessor(endMBB);
	overflowMBB->addSuccessor(endMBB);

	// Load the offset value into a register
	// (gp_offset lives at displacement 0, fp_offset at displacement 4).
	OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Check if there is enough room left to pull this argument.
	// Comparing against MaxOffset + 8 - ArgSizeA8 presumably stands in for
	// "offset + ArgSizeA8 > MaxOffset" on the assumption that offsets are
	// multiples of 8 - TODO confirm.
	BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
	.addReg(OffsetReg)
	.addImm(MaxOffset + 8 - ArgSizeA8);

	// Branch to "overflowMBB" if offset >= max
	// Fall through to "offsetMBB" otherwise
	BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
	.addMBB(overflowMBB);
	}

	// In offsetMBB, emit code to use the reg_save_area.
	if (offsetMBB) {
	assert(OffsetReg != 0);

	// Read the reg_save_area address.
	unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 16)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// Zero-extend the offset
	unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
	.addImm(0)
	.addReg(OffsetReg)
	.addImm(X86::sub_32bit);

	// Add the offset to the reg_save_area to get the final address.
	BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
	.addReg(OffsetReg64)
	.addReg(RegSaveReg);

	// Compute the offset for the next argument
	// (advance by 16 bytes when using fp_offset, 8 when using gp_offset).
	unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
	BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
	.addReg(OffsetReg)
	.addImm(UseFPOffset ? 16 : 8);

	// Store it back into the va_list.
	BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, UseFPOffset ? 4 : 0)
	.add(Segment)
	.addReg(NextOffsetReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// Jump to endMBB
	BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
	.addMBB(endMBB);
	}

	//
	// Emit code to use overflow area
	//

	// Load the overflow_area address into a register.
	unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we need to align it, do so. Otherwise, just copy the address
	// to OverflowDestReg.
	if (NeedsAlign) {
	// Align the overflow address
	assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
	unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

	// aligned_addr = (addr + (align-1)) & ~(align-1)
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
	.addReg(OverflowAddrReg)
	.addImm(Align-1);

	BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
	.addReg(TmpReg)
	.addImm(~(uint64_t)(Align-1));
	} else {
	BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
	.addReg(OverflowAddrReg);
	}

	// Compute the next overflow address after this argument.
	// (the overflow address should be kept 8-byte aligned)
	unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
	BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
	.addReg(OverflowDestReg)
	.addImm(ArgSizeA8);

	// Store the new overflow address.
	BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
	.add(Base)
	.add(Scale)
	.add(Index)
	.addDisp(Disp, 8)
	.add(Segment)
	.addReg(NextAddrReg)
	.setMemRefs(MMOBegin, MMOEnd);

	// If we branched, emit the PHI to the front of endMBB.
	// (Without a branch, OverflowDestReg already is DestReg, so no PHI.)
	if (offsetMBB) {
	BuildMI(*endMBB, endMBB->begin(), DL,
	TII->get(X86::PHI), DestReg)
	.addReg(OffsetDestReg).addMBB(offsetMBB)
	.addReg(OverflowDestReg).addMBB(overflowMBB);
	}

	// Erase the pseudo instruction
	MI.eraseFromParent();

	return endMBB;
	}

	/// Custom inserter for the va_start XMM-save pseudo: splits \p MBB and emits
	/// a block that stores every XMM argument register to the register save
	/// area, optionally guarded by a test of the count register.
	///
	/// Pseudo operands as read here:
	///   0    : count register (tested for zero unless the CC is Win64)
	///   1    : frame index of the register save area (immediate)
	///   2    : byte offset of the first XMM slot (immediate)
	///   3..N : XMM registers to store; the final operand is expected to be
	///          EFLAGS and is not stored.
	///
	/// \param MI  The pseudo instruction; erased before returning.
	/// \param MBB The block containing \p MI.
	/// \returns The new block that received the instructions following \p MI.
	MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
	    MachineInstr &MI, MachineBasicBlock *MBB) const {
	  // Emit code to save XMM registers to the stack. The ABI says that the
	  // number of registers to save is given in %al, so it's theoretically
	  // possible to do an indirect jump trick to avoid saving all of them,
	  // however this code takes a simpler approach and just executes all
	  // of the stores if %al is non-zero. It's less code, and it's probably
	  // easier on the hardware branch predictor, and stores aren't all that
	  // expensive anyway.

	  // Create the new basic blocks. One block contains all the XMM stores,
	  // and one block is the final destination regardless of whether any
	  // stores were performed.
	  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
	  MachineFunction *F = MBB->getParent();
	  MachineFunction::iterator MBBIter = ++MBB->getIterator();
	  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(MBBIter, XMMSaveMBB);
	  F->insert(MBBIter, EndMBB);

	  // Transfer the remainder of MBB and its successor edges to EndMBB.
	  EndMBB->splice(EndMBB->begin(), MBB,
	                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // The original block will now fall through to the XMM save block.
	  MBB->addSuccessor(XMMSaveMBB);
	  // The XMMSaveMBB will fall through to the end block.
	  XMMSaveMBB->addSuccessor(EndMBB);

	  // Now add the instructions.
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  unsigned CountReg = MI.getOperand(0).getReg();
	  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
	  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

	  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
	    // If %al is 0, branch around the XMM save block.
	    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
	    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
	    MBB->addSuccessor(EndMBB);
	  }

	  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
	  // that was just emitted, but clearly shouldn't be "saved".
	  assert((MI.getNumOperands() <= 3 ||
	          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
	          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
	         "Expected last argument to be EFLAGS");
	  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
	  // In the XMM save block, save all the XMM argument registers.
	  // Operand i is stored to slot (i - 3); each slot is 16 bytes wide.
	  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
	    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
	    MachineMemOperand *MMO = F->getMachineMemOperand(
	        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
	        MachineMemOperand::MOStore,
	        /*Size=*/16, /*Align=*/16);
	    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
	        .addFrameIndex(RegSaveFrameIndex)
	        .addImm(/*Scale=*/1)
	        .addReg(/*IndexReg=*/0)
	        .addImm(/*Disp=*/Offset)
	        .addReg(/*Segment=*/0)
	        .addReg(MI.getOperand(i).getReg())
	        .addMemOperand(MMO);
	  }

	  MI.eraseFromParent(); // The pseudo instruction is gone now.

	  return EndMBB;
	}

	// ISel may leave the EFLAGS use on SelectItr without a kill marker when
	// EFLAGS had several users and it could not tell which one was last.
	// Decide here whether SelectItr is in fact the last reader: if so, add the
	// kill marker and return true; if EFLAGS is read again later in BB, or is
	// live into any successor of BB, leave the instruction alone and return
	// false.
	static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
	                                     MachineBasicBlock* BB,
	                                     const TargetRegisterInfo* TRI) {
	  const MachineBasicBlock::iterator BlockEnd = BB->end();
	  MachineBasicBlock::iterator Cursor = std::next(SelectItr);
	  bool SawRedefinition = false;

	  // Walk the instructions after SelectItr until EFLAGS is read (still live)
	  // or overwritten (dead from SelectItr onwards).
	  while (Cursor != BlockEnd && !SawRedefinition) {
	    const MachineInstr &Inst = *Cursor;
	    if (Inst.readsRegister(X86::EFLAGS))
	      return false;
	    if (Inst.definesRegister(X86::EFLAGS))
	      SawRedefinition = true; // Should have kill-flag - update below.
	    else
	      ++Cursor;
	  }

	  // Ran off the end of the block without seeing a def: EFLAGS is only dead
	  // if no successor expects it as a live-in.
	  if (!SawRedefinition) {
	    MachineBasicBlock::succ_iterator SuccIt = BB->succ_begin();
	    const MachineBasicBlock::succ_iterator SuccEnd = BB->succ_end();
	    for (; SuccIt != SuccEnd; ++SuccIt) {
	      if ((*SuccIt)->isLiveIn(X86::EFLAGS))
	        return false;
	    }
	  }

	  // Either a later def was found or EFLAGS is not live out of BB, so the
	  // select is the last user and gets the kill flag.
	  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
	  return true;
	}

	// Tells whether MI is one of the CMOV pseudo-opcodes that may be grouped
	// with neighbouring CMOV pseudos and lowered together into one basic block
	// guarded by a single conditional jump.
	static bool isCMOVPseudo(MachineInstr &MI) {
	  bool Cascadable = false;

	  switch (MI.getOpcode()) {
	  default:
	    break;

	  // General-purpose register pseudos.
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  // Scalar floating-point pseudos.
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  // Vector pseudos.
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V16F32:
	  // Mask pseudos.
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    Cascadable = true;
	    break;
	  }

	  return Cascadable;
	}

	/// Custom inserter for CMOV/SELECT pseudo instructions: replaces the pseudo
	/// (and, where possible, a run of neighbouring pseudos) with a conditional
	/// branch around an empty block plus PHI nodes at the join point.
	///
	/// \param MI The first pseudo to lower; it and any pseudos lowered together
	///           with it are erased.
	/// \param BB The block containing \p MI.
	/// \returns sinkMBB, the new join block that received the rest of \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
	                                     MachineBasicBlock *BB) const {
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  // To "insert" a SELECT_CC instruction, we actually have to insert the
	  // diamond control-flow pattern.  The incoming instruction knows the
	  // destination vreg to set, the condition code register to branch on, the
	  // true/false values to select between, and a branch opcode to use.
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();
	  MachineFunction::iterator It = ++BB->getIterator();

	  //  thisMBB:
	  //  ...
	  //   TrueVal = ...
	  //   cmpTY ccX, r1, r2
	  //   bCC copy1MBB
	  //   fallthrough --> copy0MBB
	  MachineBasicBlock *thisMBB = BB;
	  MachineFunction *F = BB->getParent();

	  // This code lowers all pseudo-CMOV instructions. Generally it lowers these
	  // as described above, by inserting a BB, and then making a PHI at the join
	  // point to select the true and false operands of the CMOV in the PHI.
	  //
	  // The code also handles two different cases of multiple CMOV opcodes
	  // in a row.
	  //
	  // Case 1:
	  // In this case, there are multiple CMOVs in a row, all which are based on
	  // the same condition setting (or the exact opposite condition setting).
	  // In this case we can lower all the CMOVs using a single inserted BB, and
	  // then make a number of PHIs at the join point to model the CMOVs. The only
	  // trickiness here, is that in a case like:
	  //
	  // t2 = CMOV cond1 t1, f1
	  // t3 = CMOV cond1 t2, f2
	  //
	  // when rewriting this into PHIs, we have to perform some renaming on the
	  // temps since you cannot have a PHI operand refer to a PHI result earlier
	  // in the same block.  The "simple" but wrong lowering would be:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t2(BB1), f2(BB2)
	  //
	  // but clearly t2 is not defined in BB1, so that is incorrect. The proper
	  // renaming is to note that on the path through BB1, t2 is really just a
	  // copy of t1, and do that renaming, properly generating:
	  //
	  // t2 = PHI t1(BB1), f1(BB2)
	  // t3 = PHI t1(BB1), f2(BB2)
	  //
	  // Case 2, we lower cascaded CMOVs such as
	  //
	  //   (CMOV (CMOV F, T, cc1), T, cc2)
	  //
	  // to two successive branches.  For that, we look for another CMOV as the
	  // following instruction.
	  //
	  // Without this, we would add a PHI between the two jumps, which ends up
	  // creating a few copies all around. For instance, for
	  //
	  //    (sitofp (zext (fcmp une)))
	  //
	  // we would generate:
	  //
	  //         ucomiss %xmm1, %xmm0
	  //         movss  <1.0f>, %xmm0
	  //         movaps  %xmm0, %xmm1
	  //         jne     .LBB5_2
	  //         xorps   %xmm1, %xmm1
	  // .LBB5_2:
	  //         jp      .LBB5_4
	  //         movaps  %xmm1, %xmm0
	  // .LBB5_4:
	  //         retq
	  //
	  // because this custom-inserter would have generated:
	  //
	  //   A
	  //   | \ .
	  //   |  B
	  //   | /
	  //   C
	  //   | \ .
	  //   |  D
	  //   | /
	  //   E
	  //
	  // A: X = ...; Y = ...
	  // B: empty
	  // C: Z = PHI [X, A], [Y, B]
	  // D: empty
	  // E: PHI [X, C], [Z, D]
	  //
	  // If we lower both CMOVs in a single step, we can instead generate:
	  //
	  //   A
	  //   | \ .
	  //   |  C
	  //   | /|
	  //   |/ |
	  //   |  |
	  //   |  D
	  //   | /
	  //   E
	  //
	  // A: X = ...; Y = ...
	  // D: empty
	  // E: PHI [X, A], [X, C], [Y, D]
	  //
	  // Which, in our sitofp/fcmp example, gives us something like:
	  //
	  //         ucomiss %xmm1, %xmm0
	  //         movss  <1.0f>, %xmm0
	  //         jne     .LBB5_4
	  //         jp      .LBB5_4
	  //         xorps   %xmm0, %xmm0
	  // .LBB5_4:
	  //         retq
	  //
	  MachineInstr *CascadedCMOV = nullptr;
	  MachineInstr *LastCMOV = &MI;
	  X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
	  X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
	  MachineBasicBlock::iterator NextMIIt =
	      std::next(MachineBasicBlock::iterator(MI));

	  // Check for case 1, where there are multiple CMOVs with the same condition
	  // first.  Of the two cases of multiple CMOV lowerings, case 1 reduces the
	  // number of jumps the most.

	  if (isCMOVPseudo(MI)) {
	    // See if we have a string of CMOVS with the same condition.
	    while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
	           (NextMIIt->getOperand(3).getImm() == CC ||
	            NextMIIt->getOperand(3).getImm() == OppCC)) {
	      LastCMOV = &*NextMIIt;
	      ++NextMIIt;
	    }
	  }

	  // This checks for case 2, but only do this if we didn't already find
	  // case 1, as indicated by LastCMOV == MI.
	  if (LastCMOV == &MI && NextMIIt != BB->end() &&
	      NextMIIt->getOpcode() == MI.getOpcode() &&
	      NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
	      NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
	      NextMIIt->getOperand(1).isKill()) {
	    CascadedCMOV = &*NextMIIt;
	  }

	  MachineBasicBlock *jcc1MBB = nullptr;

	  // If we have a cascaded CMOV, we lower it to two successive branches to
	  // the same block.  EFLAGS is used by both, so mark it as live in the second.
	  if (CascadedCMOV) {
	    jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
	    F->insert(It, jcc1MBB);
	    jcc1MBB->addLiveIn(X86::EFLAGS);
	  }

	  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
	  F->insert(It, copy0MBB);
	  F->insert(It, sinkMBB);

	  // If the EFLAGS register isn't dead in the terminator, then claim that it's
	  // live into the sink and copy blocks.
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

	  MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
	  if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
	      !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
	    copy0MBB->addLiveIn(X86::EFLAGS);
	    sinkMBB->addLiveIn(X86::EFLAGS);
	  }

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), BB,
	                  std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add the true and fallthrough blocks as its successors.
	  if (CascadedCMOV) {
	    // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
	    BB->addSuccessor(jcc1MBB);

	    // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
	    // jump to the sinkMBB.
	    jcc1MBB->addSuccessor(copy0MBB);
	    jcc1MBB->addSuccessor(sinkMBB);
	  } else {
	    BB->addSuccessor(copy0MBB);
	  }

	  // The true block target of the first (or only) branch is always sinkMBB.
	  BB->addSuccessor(sinkMBB);

	  // Create the conditional branch instruction.
	  unsigned Opc = X86::GetCondBranchFromCond(CC);
	  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);

	  if (CascadedCMOV) {
	    unsigned Opc2 = X86::GetCondBranchFromCond(
	        (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
	    BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
	  }

	  //  copy0MBB:
	  //   %FalseValue = ...
	  //   # fallthrough to sinkMBB
	  copy0MBB->addSuccessor(sinkMBB);

	  //  sinkMBB:
	  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
	  //  ...
	  MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
	  MachineBasicBlock::iterator MIItEnd =
	      std::next(MachineBasicBlock::iterator(LastCMOV));
	  MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
	  // Maps each PHI destination created so far to the (copy0MBB, thisMBB)
	  // incoming registers that were used for it.
	  DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
	  MachineInstrBuilder MIB;

	  // As we are creating the PHIs, we have to be careful if there is more than
	  // one.  Later CMOVs may reference the results of earlier CMOVs, but later
	  // PHIs have to reference the individual true/false inputs from earlier PHIs.
	  // That also means that PHI construction must work forward from earlier to
	  // later, and that the code must maintain a mapping from earlier PHI's
	  // destination registers, and the registers that went into the PHI.

	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
	    unsigned DestReg = MIIt->getOperand(0).getReg();
	    unsigned Op1Reg = MIIt->getOperand(1).getReg();
	    unsigned Op2Reg = MIIt->getOperand(2).getReg();

	    // If this CMOV we are generating is the opposite condition from
	    // the jump we generated, then we have to swap the operands for the
	    // PHI that is going to be generated.
	    if (MIIt->getOperand(3).getImm() == OppCC)
	      std::swap(Op1Reg, Op2Reg);

	    if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
	      Op1Reg = RegRewriteTable[Op1Reg].first;

	    if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
	      Op2Reg = RegRewriteTable[Op2Reg].second;

	    MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
	                  TII->get(X86::PHI), DestReg)
	          .addReg(Op1Reg).addMBB(copy0MBB)
	          .addReg(Op2Reg).addMBB(thisMBB);

	    // Add this PHI to the rewrite table.
	    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
	  }

	  // If we have a cascaded CMOV, the second Jcc provides the same incoming
	  // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
	  if (CascadedCMOV) {
	    MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
	    // Copy the PHI result to the register defined by the second CMOV.
	    BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
	            DL, TII->get(TargetOpcode::COPY),
	            CascadedCMOV->getOperand(0).getReg())
	        .addReg(MI.getOperand(0).getReg());
	    CascadedCMOV->eraseFromParent();
	  }

	  // Now remove the CMOV(s).
	  // Post-increment keeps the iterator valid while the current CMOV is erased.
	  for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
	    (MIIt++)->eraseFromParent();

	  return sinkMBB;
	}

	/// Lowers the RELEASE_FADD32mr / RELEASE_FADD64mr pseudos, which stand for
	///     a.store(reg OP a.load(acquire), release)
	/// into a load-and-add followed by a store through the same address:
	///     OPss (%gpr), %xmm
	///     movss %xmm, (%gpr)
	/// (or the sd forms for the 64-bit pseudo). The pseudo is erased and \p BB is
	/// returned unchanged as the continuation block.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  // Pick the arithmetic and store opcodes matching the pseudo's width.
	  unsigned ArithOpc, StoreOpc;
	  const unsigned PseudoOpc = MI.getOpcode();
	  if (PseudoOpc == X86::RELEASE_FADD32mr) {
	    ArithOpc = X86::ADDSSrm;
	    StoreOpc = X86::MOVSSmr;
	  } else if (PseudoOpc == X86::RELEASE_FADD64mr) {
	    ArithOpc = X86::ADDSDrm;
	    StoreOpc = X86::MOVSDmr;
	  } else {
	    llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
	  }

	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();

	  // The value operand sits right after the address operands.
	  const unsigned SrcReg = MI.getOperand(X86::AddrNumOperands).getReg();
	  const unsigned ResultReg =
	      MRI.createVirtualRegister(MRI.getRegClass(SrcReg));

	  // ResultReg = SrcReg OP [address]
	  MachineInstrBuilder Arith =
	      BuildMI(*BB, MI, DL, TII->get(ArithOpc), ResultReg).addReg(SrcReg);
	  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx) {
	    MachineOperand &AddrOp = MI.getOperand(Idx);
	    // The address is consumed a second time by the store below, so no
	    // register in it may be marked killed here.
	    if (AddrOp.isReg())
	      AddrOp.setIsKill(false);
	    Arith.add(AddrOp);
	  }

	  // [address] = ResultReg
	  MachineInstrBuilder Store = BuildMI(*BB, MI, DL, TII->get(StoreOpc));
	  for (int Idx = 0; Idx < X86::AddrNumOperands; ++Idx)
	    Store.add(MI.getOperand(Idx));
	  Store.addReg(ResultReg, RegState::Kill);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	/// Custom inserter for the segmented-stack alloca pseudo. Splits \p BB so
	/// that the allocation is satisfied either by bumping the stack pointer
	/// (when the current stacklet has room) or by calling
	/// __morestack_allocate_stack_space, and merges both results with a PHI.
	///
	/// \param MI Pseudo instruction: operand 0 is the result register, operand 1
	///           the requested size register. Erased before returning.
	/// \param BB The block containing \p MI.
	/// \returns continueMBB, the new block that received the rest of \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
	                                        MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();
	  const BasicBlock *LLVM_BB = BB->getBasicBlock();

	  assert(MF->shouldSplitStack());

	  const bool Is64Bit = Subtarget.is64Bit();
	  const bool IsLP64 = Subtarget.isTarget64BitLP64();

	  // Segment register and offset of the stack-limit slot that the size
	  // check below compares against.
	  const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
	  const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;

	  // BB:
	  //  ... [Till the alloca]
	  // If stacklet is not large enough, jump to mallocMBB
	  //
	  // bumpMBB:
	  //  Allocate by subtracting from RSP
	  //  Jump to continueMBB
	  //
	  // mallocMBB:
	  //  Allocate by call to runtime
	  //
	  // continueMBB:
	  //  ...
	  //  [rest of original BB]
	  //

	  MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
	  MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);

	  MachineRegisterInfo &MRI = MF->getRegInfo();
	  const TargetRegisterClass *AddrRegClass =
	      getRegClassFor(getPointerTy(MF->getDataLayout()));

	  unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
	           tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
	           SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
	           sizeVReg = MI.getOperand(1).getReg(),
	           physSPReg =
	               (IsLP64 || Subtarget.isTargetNaCl64()) ? X86::RSP : X86::ESP;

	  MachineFunction::iterator MBBIter = ++BB->getIterator();

	  MF->insert(MBBIter, bumpMBB);
	  MF->insert(MBBIter, mallocMBB);
	  MF->insert(MBBIter, continueMBB);

	  // Everything after the pseudo, and BB's successor edges, move to
	  // continueMBB.
	  continueMBB->splice(continueMBB->begin(), BB,
	                      std::next(MachineBasicBlock::iterator(MI)), BB->end());
	  continueMBB->transferSuccessorsAndUpdatePHIs(BB);

	  // Add code to the main basic block to check if the stack limit has been hit,
	  // and if so, jump to mallocMBB otherwise to bumpMBB.
	  BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
	    .addReg(tmpSPVReg).addReg(sizeVReg);
	  BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
	    .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
	    .addReg(SPLimitVReg);
	  BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);

	  // bumpMBB simply decreases the stack pointer, since we know the current
	  // stacklet has enough space.
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
	    .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
	    .addReg(SPLimitVReg);
	  BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Calls into a routine in libgcc to allocate more space from the heap.
	  const uint32_t *RegMask =
	      Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
	  if (IsLP64) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
	      .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	      .addExternalSymbol("__morestack_allocate_stack_space")
	      .addRegMask(RegMask)
	      .addReg(X86::RDI, RegState::Implicit)
	      .addReg(X86::RAX, RegState::ImplicitDefine);
	  } else if (Is64Bit) {
	    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
	      .addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
	      .addExternalSymbol("__morestack_allocate_stack_space")
	      .addRegMask(RegMask)
	      .addReg(X86::EDI, RegState::Implicit)
	      .addReg(X86::EAX, RegState::ImplicitDefine);
	  } else {
	    // 32-bit: the size is passed on the stack. The 12-byte adjustment plus
	    // the 4-byte push are undone by the 16-byte ADD after the call.
	    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
	      .addImm(12);
	    BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
	    BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
	      .addExternalSymbol("__morestack_allocate_stack_space")
	      .addRegMask(RegMask)
	      .addReg(X86::EAX, RegState::ImplicitDefine);
	  }

	  if (!Is64Bit)
	    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
	      .addImm(16);

	  BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
	    .addReg(IsLP64 ? X86::RAX : X86::EAX);
	  BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);

	  // Set up the CFG correctly.
	  BB->addSuccessor(bumpMBB);
	  BB->addSuccessor(mallocMBB);
	  mallocMBB->addSuccessor(continueMBB);
	  bumpMBB->addSuccessor(continueMBB);

	  // Take care of the PHI nodes.
	  BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
	          MI.getOperand(0).getReg())
	      .addReg(mallocPtrVReg)
	      .addMBB(mallocMBB)
	      .addReg(bumpSPPtrVReg)
	      .addMBB(bumpMBB);

	  // Delete the original pseudo instruction.
	  MI.eraseFromParent();

	  // And we're done.
	  return continueMBB;
	}

	/// Custom inserter for the catchret pseudo. On 32-bit targets the branch is
	/// redirected through a freshly created block that emits EH_RESTORE and then
	/// jumps to the original destination; on other targets nothing is changed.
	/// The pseudo itself is kept and \p BB is returned in both cases.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  MachineFunction *ParentFn = BB->getParent();

	  assert(!isAsynchronousEHPersonality(
	             classifyEHPersonality(ParentFn->getFunction()->getPersonalityFn())) &&
	         "SEH does not use catchret!");

	  // Manual stack pointer restoration is only a concern for 32-bit EH.
	  if (!Subtarget.is32Bit())
	    return BB;

	  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	  DebugLoc Loc = MI.getDebugLoc();
	  // Remember the real destination before the operand is retargeted below.
	  MachineBasicBlock *ReturnDest = MI.getOperand(0).getMBB();

	  // For C++ EH, route the return through a new block that holds the restore
	  // code and reaches the return destination with an ordinary JMP_4.
	  MachineBasicBlock *RestoreMBB =
	      ParentFn->CreateMachineBasicBlock(BB->getBasicBlock());
	  assert(BB->succ_size() == 1);
	  ParentFn->insert(std::next(BB->getIterator()), RestoreMBB);
	  RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
	  BB->addSuccessor(RestoreMBB);
	  MI.getOperand(0).setMBB(RestoreMBB);

	  // RestoreMBB has no instructions yet, so appending yields
	  // EH_RESTORE followed by the jump.
	  BuildMI(RestoreMBB, Loc, InstrInfo.get(X86::EH_RESTORE));
	  BuildMI(RestoreMBB, Loc, InstrInfo.get(X86::JMP_4)).addMBB(ReturnDest);
	  return BB;
	}

	/// Custom inserter for the catchpad pseudo. The pseudo is always erased; for
	/// an asynchronous (SEH) personality on a 32-bit target an EH_RESTORE is
	/// emitted in its place first. Returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
	                                       MachineBasicBlock *BB) const {
	  const Constant *Personality =
	      BB->getParent()->getFunction()->getPersonalityFn();
	  const bool UsesSEH =
	      isAsynchronousEHPersonality(classifyEHPersonality(Personality));

	  // Special handling for catchpad is needed for 32-bit SEH only.
	  const bool NeedsRestore = UsesSEH && Subtarget.is32Bit();
	  if (NeedsRestore) {
	    const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	    BuildMI(*BB, MI, MI.getDebugLoc(), InstrInfo.get(X86::EH_RESTORE));
	  }

	  MI.eraseFromParent();
	  return BB;
	}

	/// Custom inserter for TLSADDR: brackets the instruction with call-frame
	/// setup and destroy markers, i.e. produces
	///     adjust_stackdown -> TLSADDR -> adjust_stackup.
	/// TLSADDR only turns into a call inside MC, so without these markers
	/// shrink-wrapping could move the prologue/epilogue past it. The original
	/// instruction is deliberately kept; \p BB is returned.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  const TargetInstrInfo &InstrInfo = *Subtarget.getInstrInfo();
	  MachineFunction &ParentFn = *BB->getParent();
	  DebugLoc Loc = MI.getDebugLoc();
	  const MachineBasicBlock::iterator Here(MI);

	  // CALLSEQ_START goes immediately in front of the instruction.
	  MachineInstrBuilder FrameSetup =
	      BuildMI(ParentFn, Loc, InstrInfo.get(InstrInfo.getCallFrameSetupOpcode()))
	          .addImm(0)
	          .addImm(0)
	          .addImm(0);
	  BB->insert(Here, FrameSetup);

	  // CALLSEQ_END goes immediately behind it. MI is not erased because the
	  // original instruction has to stay in place.
	  MachineInstrBuilder FrameDestroy =
	      BuildMI(ParentFn, Loc,
	              InstrInfo.get(InstrInfo.getCallFrameDestroyOpcode()))
	          .addImm(0)
	          .addImm(0);
	  BB->insertAfter(Here, FrameDestroy);

	  return BB;
	}

	/// Custom inserter for the Darwin TLS call pseudo. Loads the value found at
	/// the global named by operand 3 into RDI (x86-64) or EAX (32-bit), emits an
	/// indirect call through that register, and erases the pseudo. The result
	/// ends up in the normal return register. Returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
	                                      MachineBasicBlock *BB) const {
	  MachineFunction *ParentFn = BB->getParent();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
	  assert(MI.getOperand(3).isGlobal() && "This should be a global");

	  const bool Is64Bit = Subtarget.is64Bit();
	  const MachineOperand &GlobalOp = MI.getOperand(3);

	  // Register mask for the lowered call.
	  // FIXME: The 32-bit calls have non-standard calling conventions. Use a
	  // proper register mask.
	  const uint32_t *RegMask =
	      Is64Bit
	          ? Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask()
	          : Subtarget.getRegisterInfo()->getCallPreservedMask(*ParentFn,
	                                                              CallingConv::C);

	  // The three lowerings differ only in opcodes, the register holding the
	  // loaded pointer, the return register and the base register of the load.
	  unsigned LoadOpc, CallOpc, PtrReg, RetReg, BaseReg;
	  if (Is64Bit) {
	    LoadOpc = X86::MOV64rm;
	    CallOpc = X86::CALL64m;
	    PtrReg = X86::RDI;
	    RetReg = X86::RAX;
	    BaseReg = X86::RIP;
	  } else {
	    LoadOpc = X86::MOV32rm;
	    CallOpc = X86::CALL32m;
	    PtrReg = X86::EAX;
	    RetReg = X86::EAX;
	    // Position-independent code addresses the global off the global base
	    // register; otherwise no base register is used.
	    if (isPositionIndependent())
	      BaseReg = TII->getGlobalBaseReg(ParentFn);
	    else
	      BaseReg = 0;
	  }

	  // PtrReg = load [BaseReg + global]
	  BuildMI(*BB, MI, DL, TII->get(LoadOpc), PtrReg)
	      .addReg(BaseReg)
	      .addImm(0)
	      .addReg(0)
	      .addGlobalAddress(GlobalOp.getGlobal(), 0, GlobalOp.getTargetFlags())
	      .addReg(0);

	  // call *[PtrReg]
	  MachineInstrBuilder Call = BuildMI(*BB, MI, DL, TII->get(CallOpc));
	  addDirectMem(Call, PtrReg);
	  Call.addReg(RetReg, RegState::ImplicitDefine).addRegMask(RegMask);

	  MI.eraseFromParent(); // The pseudo instruction is gone now.
	  return BB;
	}

	/// Expand an EH_SjLj_SetJmp pseudo into explicit control flow.
	///
	/// Operand 0 of \p MI is the i32 result register; the following
	/// X86::AddrNumOperands operands form the address of the jump buffer.
	/// The address of a new "restore" block is stored at buf[LabelOffset],
	/// an EH_SjLj_Setup marker is emitted, and the result is produced by a PHI
	/// of 0 (direct path) and 1 (restore path). Returns the block that
	/// receives the remainder of \p MBB.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
	                                    MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  const BasicBlock *BB = MBB->getBasicBlock();
	  MachineFunction::iterator I = ++MBB->getIterator();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  // Operand 0 is the destination; the memory operand starts right after it.
	  unsigned CurOp = 0;
	  unsigned DstReg = MI.getOperand(CurOp++).getReg();
	  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
	  assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
	  (void)TRI; // Only used by the assert above.
	  unsigned mainDstReg = MRI.createVirtualRegister(RC);
	  unsigned restoreDstReg = MRI.createVirtualRegister(RC);

	  unsigned MemOpndSlot = CurOp;

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  // For v = setjmp(buf), we generate
	  //
	  // thisMBB:
	  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
	  //  SjLjSetup restoreMBB
	  //
	  // mainMBB:
	  //  v_main = 0
	  //
	  // sinkMBB:
	  //  v = phi(main, restore)
	  //
	  // restoreMBB:
	  //  if base pointer being used, load it from frame
	  //  v_restore = 1

	  MachineBasicBlock *thisMBB = MBB;
	  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
	  MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
	  MF->insert(I, mainMBB);
	  MF->insert(I, sinkMBB);
	  MF->push_back(restoreMBB);
	  restoreMBB->setHasAddressTaken();

	  MachineInstrBuilder MIB;

	  // Transfer the remainder of BB and its successor edges to sinkMBB.
	  sinkMBB->splice(sinkMBB->begin(), MBB,
	                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
	  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

	  // thisMBB:
	  unsigned PtrStoreOpc = 0;
	  unsigned LabelReg = 0;
	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  // Prepare IP either in reg or imm.
	  if (!UseImmLabel) {
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
	    const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
	    LabelReg = MRI.createVirtualRegister(PtrRC);
	    if (Subtarget.is64Bit()) {
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
	                .addReg(X86::RIP)
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB)
	                .addReg(0);
	    } else {
	      // getGlobalBaseReg() is declared on X86InstrInfo, so view TII through
	      // a pointer to the derived class.
	      const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
	      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
	                .addReg(XII->getGlobalBaseReg(MF))
	                .addImm(0)
	                .addReg(0)
	                .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
	                .addReg(0);
	    }
	  } else
	    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  // Store IP
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
	  for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	    // Copy the buffer address, bumping only the displacement so that the
	    // store hits buf[LabelOffset].
	    if (i == X86::AddrDisp)
	      MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
	    else
	      MIB.add(MI.getOperand(MemOpndSlot + i));
	  }
	  if (!UseImmLabel)
	    MIB.addReg(LabelReg);
	  else
	    MIB.addMBB(restoreMBB);
	  MIB.setMemRefs(MMOBegin, MMOEnd);
	  // Setup
	  MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
	            .addMBB(restoreMBB);

	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  MIB.addRegMask(RegInfo->getNoPreservedMask());
	  thisMBB->addSuccessor(mainMBB);
	  thisMBB->addSuccessor(restoreMBB);

	  // mainMBB:
	  //  EAX = 0
	  BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
	  mainMBB->addSuccessor(sinkMBB);

	  // sinkMBB:
	  BuildMI(*sinkMBB, sinkMBB->begin(), DL,
	          TII->get(X86::PHI), DstReg)
	      .addReg(mainDstReg).addMBB(mainMBB)
	      .addReg(restoreDstReg).addMBB(restoreMBB);

	  // restoreMBB:
	  if (RegInfo->hasBasePointer(*MF)) {
	    const bool Uses64BitFramePtr =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);
	    unsigned FramePtr = RegInfo->getFrameRegister(*MF);
	    unsigned BasePtr = RegInfo->getBaseRegister();
	    unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
	                 FramePtr, true, X86FI->getRestoreBasePointerOffset())
	        .setMIFlag(MachineInstr::FrameSetup);
	  }
	  BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
	  BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
	  restoreMBB->addSuccessor(sinkMBB);

	  MI.eraseFromParent();
	  return sinkMBB;
	}

	/// Expand an EH_SjLj_LongJmp pseudo. The first X86::AddrNumOperands
	/// operands of \p MI address the jump buffer; the frame pointer is reloaded
	/// from buf[0], the target IP from buf[LabelOffset] and the stack pointer
	/// from buf[SPOffset], followed by an indirect jump to the loaded IP.
	/// The pseudo is erased and \p MBB is returned.
	MachineBasicBlock *
	X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
	                                     MachineBasicBlock *MBB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo &MRI = MF->getRegInfo();

	  // Memory Reference
	  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
	  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) &&
	         "Invalid Pointer Size!");

	  const TargetRegisterClass *RC =
	      (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	  unsigned Tmp = MRI.createVirtualRegister(RC);
	  // Since FP is only updated here but NOT referenced, it's treated as GPR.
	  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
	  unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
	  unsigned SP = RegInfo->getStackRegister();

	  const int64_t LabelOffset = 1 * PVT.getStoreSize();
	  const int64_t SPOffset = 2 * PVT.getStoreSize();

	  unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
	  unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;

	  // Emit "DestReg = load [buf + Offset]" before MI, reusing MI's address
	  // operands and memory references. A zero offset copies the displacement
	  // operand unchanged.
	  auto EmitReload = [&](unsigned DestReg, int64_t Offset) {
	    MachineInstrBuilder Load =
	        BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), DestReg);
	    for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
	      if (i == X86::AddrDisp && Offset != 0)
	        Load.addDisp(MI.getOperand(i), Offset);
	      else
	        Load.add(MI.getOperand(i));
	    }
	    Load.setMemRefs(MMOBegin, MMOEnd);
	  };

	  // Reload FP
	  EmitReload(FP, 0);
	  // Reload IP
	  EmitReload(Tmp, LabelOffset);
	  // Reload SP
	  EmitReload(SP, SPOffset);
	  // Jump
	  BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);

	  MI.eraseFromParent();
	  return MBB;
	}

	/// Emit, before \p MI in \p MBB, a store of the address of \p DispatchBB
	/// into the function context object at frame index \p FI.
	///
	/// With the small code model and non-PIC code the block address is stored
	/// as an immediate; otherwise it is first materialized into a virtual
	/// register with LEA and then stored.
	void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
	                                               MachineBasicBlock *MBB,
	                                               MachineBasicBlock *DispatchBB,
	                                               int FI) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = MBB->getParent();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();

	  MVT PVT = getPointerTy(MF->getDataLayout());
	  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

	  unsigned Op = 0;
	  unsigned VR = 0;

	  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
	                     !isPositionIndependent();

	  if (UseImmLabel) {
	    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
	  } else {
	    const TargetRegisterClass *TRC =
	        (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
	    VR = MRI->createVirtualRegister(TRC);
	    Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;

	    if (Subtarget.is64Bit())
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
	          .addReg(X86::RIP)
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB)
	          .addReg(0);
	    else
	      BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
	          .addReg(0) /* TII->getGlobalBaseReg(MF) */
	          .addImm(1)
	          .addReg(0)
	          .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
	          .addReg(0);
	  }

	  // The target slot is __jbuf[1] of the SjLj function context
	  //   { i8* prev; i32 call_site; i32 data[4]; i8* personality; i8* lsda;
	  //     i8* jbuf[5]; }
	  // which lives at byte offset 36 with 4-byte pointers but at 56 with
	  // 8-byte pointers (pointer fields are 8-byte aligned), so the offset must
	  // follow the pointer width rather than being fixed at 36.
	  const int ReturnSlotOffset = Subtarget.is64Bit() ? 56 : 36;

	  MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
	  addFrameReference(MIB, FI, ReturnSlotOffset);
	  if (UseImmLabel)
	    MIB.addMBB(DispatchBB);
	  else
	    MIB.addReg(VR);
	}

	/// Expand Int_eh_sjlj_setup_dispatch: build the SjLj dispatch machinery.
	///
	/// Collects every landing pad by call-site number, creates a dispatch block
	/// (the single remaining EH pad), a continuation block that jumps through a
	/// jump table indexed by the call-site value loaded from the function
	/// context, and a trap block for out-of-range values. All invoke blocks are
	/// rewired to the dispatch block, and their calls are marked as clobbering
	/// the callee-saved registers. Erases \p MI and returns \p BB.
	MachineBasicBlock *
	X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
	                                         MachineBasicBlock *BB) const {
	  DebugLoc DL = MI.getDebugLoc();
	  MachineFunction *MF = BB->getParent();
	  MachineFrameInfo &MFI = MF->getFrameInfo();
	  MachineRegisterInfo *MRI = &MF->getRegInfo();
	  const X86InstrInfo *TII = Subtarget.getInstrInfo();
	  int FI = MFI.getFunctionContextIndex();

	  // Get a mapping of the call site numbers to all of the landing pads they're
	  // associated with.
	  DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
	  unsigned MaxCSNum = 0;
	  for (auto &MBB : *MF) {
	    if (!MBB.isEHPad())
	      continue;

	    // The first non-debug instruction of a pad is expected to be its
	    // EH_LABEL; its symbol identifies the pad.
	    MCSymbol *Sym = nullptr;
	    for (const auto &PadMI : MBB) {
	      if (PadMI.isDebugValue())
	        continue;

	      assert(PadMI.isEHLabel() && "expected EH_LABEL");
	      Sym = PadMI.getOperand(0).getMCSymbol();
	      break;
	    }

	    if (!MF->hasCallSiteLandingPad(Sym))
	      continue;

	    for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
	      CallSiteNumToLPad[CSI].push_back(&MBB);
	      MaxCSNum = std::max(MaxCSNum, CSI);
	    }
	  }

	  // Get an ordered list of the machine basic blocks for the jump table.
	  std::vector<MachineBasicBlock *> LPadList;
	  SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
	  LPadList.reserve(CallSiteNumToLPad.size());

	  for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
	    for (auto &LP : CallSiteNumToLPad[CSI]) {
	      LPadList.push_back(LP);
	      InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
	    }
	  }

	  assert(!LPadList.empty() &&
	         "No landing pad destinations for the dispatch jump table!");

	  // Create the MBBs for the dispatch code.

	  // Shove the dispatch's address into the return slot in the function context.
	  MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
	  DispatchBB->setIsEHPad(true);

	  MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
	  BuildMI(TrapBB, DL, TII->get(X86::TRAP));
	  DispatchBB->addSuccessor(TrapBB);

	  MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
	  DispatchBB->addSuccessor(DispContBB);

	  // Insert MBBs.
	  MF->push_back(DispatchBB);
	  MF->push_back(DispContBB);
	  MF->push_back(TrapBB);

	  // Insert code into the entry block that creates and registers the function
	  // context.
	  SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);

	  // Create the jump table and associated information
	  MachineJumpTableInfo *JTI =
	      MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
	  unsigned MJTI = JTI->createJumpTableIndex(LPadList);

	  const X86RegisterInfo &RI = TII->getRegisterInfo();
	  // Add a register mask with no preserved registers.  This results in all
	  // registers being marked as clobbered.
	  if (RI.hasBasePointer(*MF)) {
	    const bool FPIs64Bit =
	        Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
	    X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
	    X86FI->setRestoreBasePointer(MF);

	    unsigned FP = RI.getFrameRegister(*MF);
	    unsigned BP = RI.getBaseRegister();
	    unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
	    addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
	                 X86FI->getRestoreBasePointerOffset())
	        .addRegMask(RI.getNoPreservedMask());
	  } else {
	    BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
	        .addRegMask(RI.getNoPreservedMask());
	  }

	  // Load the call-site value from the function context. It is the i32 field
	  // that follows the leading pointer field, so it sits at byte offset 4 with
	  // 4-byte pointers and at 8 with 8-byte pointers.
	  const int CallSiteOffset = Subtarget.is64Bit() ? 8 : 4;
	  unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	  addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
	                    CallSiteOffset);
	  BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
	      .addReg(IReg)
	      .addImm(LPadList.size());
	  BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);

	  unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
	  BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
	      .addReg(IReg)
	      .addImm(1);
	  BuildMI(DispContBB, DL,
	          TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
	      .addReg(0)
	      .addImm(Subtarget.is64Bit() ? 8 : 4)
	      .addReg(JReg)
	      .addJumpTableIndex(MJTI)
	      .addReg(0);

	  // Add the jump table entries as successors to the MBB.
	  SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
	  for (auto &LP : LPadList)
	    if (SeenMBBs.insert(LP).second)
	      DispContBB->addSuccessor(LP);

	  // N.B. the order the invoke BBs are processed in doesn't matter here.
	  SmallVector<MachineBasicBlock *, 64> MBBLPads;
	  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
	  for (MachineBasicBlock *MBB : InvokeBBs) {
	    // Remove the landing pad successor from the invoke block and replace it
	    // with the new dispatch block.
	    // Keep a copy of Successors since it's modified inside the loop.
	    SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
	                                                   MBB->succ_rend());
	    // FIXME: Avoid quadratic complexity.
	    for (auto MBBS : Successors) {
	      if (MBBS->isEHPad()) {
	        MBB->removeSuccessor(MBBS);
	        MBBLPads.push_back(MBBS);
	      }
	    }

	    MBB->addSuccessor(DispatchBB);

	    // Find the invoke call and mark all of the callee-saved registers as
	    // 'implicit defined' so that they're spilled.  This prevents code from
	    // moving instructions to before the EH block, where they will never be
	    // executed.
	    for (auto &II : reverse(*MBB)) {
	      if (!II.isCall())
	        continue;

	      // Registers the call already mentions must not be added again.
	      DenseMap<unsigned, bool> DefRegs;
	      for (auto &MOp : II.operands())
	        if (MOp.isReg())
	          DefRegs[MOp.getReg()] = true;

	      MachineInstrBuilder MIB(*MF, &II);
	      // SavedRegs is a zero-terminated list.
	      for (unsigned Idx = 0; SavedRegs[Idx]; ++Idx) {
	        unsigned Reg = SavedRegs[Idx];
	        if (!DefRegs[Reg])
	          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
	      }

	      break;
	    }
	  }

	  // Mark all former landing pads as non-landing pads.  The dispatch is the only
	  // landing pad now.
	  for (auto &LP : MBBLPads)
	    LP->setIsEHPad(false);

	  // The instruction is gone now.
	  MI.eraseFromParent();
	  return BB;
	}

	/// Dispatch a pseudo instruction flagged for custom insertion to the
	/// routine that expands it. Returns the basic block in which code
	/// following \p MI should be emitted (this may differ from \p BB when the
	/// expansion splits the block). Opcodes not listed here are a programming
	/// error.
	MachineBasicBlock *
	X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
	                                               MachineBasicBlock *BB) const {
	  MachineFunction *MF = BB->getParent();
	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  DebugLoc DL = MI.getDebugLoc();

	  switch (MI.getOpcode()) {
	  default: llvm_unreachable("Unexpected instr type to insert");
	  case X86::TAILJMPd64:
	  case X86::TAILJMPr64:
	  case X86::TAILJMPm64:
	  case X86::TAILJMPr64_REX:
	  case X86::TAILJMPm64_REX:
	    llvm_unreachable("TAILJMP64 would not be touched here.");
	  case X86::TCRETURNdi64:
	  case X86::TCRETURNri64:
	  case X86::TCRETURNmi64:
	    return BB;
	  case X86::TLS_addr32:
	  case X86::TLS_addr64:
	  case X86::TLS_base_addr32:
	  case X86::TLS_base_addr64:
	    return EmitLoweredTLSAddr(MI, BB);
	  case X86::CATCHRET:
	    return EmitLoweredCatchRet(MI, BB);
	  case X86::CATCHPAD:
	    return EmitLoweredCatchPad(MI, BB);
	  case X86::SEG_ALLOCA_32:
	  case X86::SEG_ALLOCA_64:
	    return EmitLoweredSegAlloca(MI, BB);
	  case X86::TLSCall_32:
	  case X86::TLSCall_64:
	    return EmitLoweredTLSCall(MI, BB);
	  case X86::CMOV_FR32:
	  case X86::CMOV_FR64:
	  case X86::CMOV_FR128:
	  case X86::CMOV_GR8:
	  case X86::CMOV_GR16:
	  case X86::CMOV_GR32:
	  case X86::CMOV_RFP32:
	  case X86::CMOV_RFP64:
	  case X86::CMOV_RFP80:
	  case X86::CMOV_V2F64:
	  case X86::CMOV_V2I64:
	  case X86::CMOV_V4F32:
	  case X86::CMOV_V4F64:
	  case X86::CMOV_V4I64:
	  case X86::CMOV_V16F32:
	  case X86::CMOV_V8F32:
	  case X86::CMOV_V8F64:
	  case X86::CMOV_V8I64:
	  case X86::CMOV_V8I1:
	  case X86::CMOV_V16I1:
	  case X86::CMOV_V32I1:
	  case X86::CMOV_V64I1:
	    return EmitLoweredSelect(MI, BB);

	  case X86::RDFLAGS32:
	  case X86::RDFLAGS64: {
	    unsigned PushF =
	        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
	    unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
	    // Keep a pointer to the PUSHF so one of its operands can be adjusted.
	    MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
	    // Permit reads of the FLAGS register without it being defined.
	    // This intrinsic exists to read external processor state in flags, such as
	    // the trap flag, interrupt flag, and direction flag, none of which are
	    // modeled by the backend.
	    Push->getOperand(2).setIsUndef();
	    BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::WRFLAGS32:
	  case X86::WRFLAGS64: {
	    unsigned Push =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
	    unsigned PopF =
	        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
	    BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
	    BuildMI(*BB, MI, DL, TII->get(PopF));

	    MI.eraseFromParent(); // The pseudo is gone now.
	    return BB;
	  }

	  case X86::RELEASE_FADD32mr:
	  case X86::RELEASE_FADD64mr:
	    return EmitLoweredAtomicFP(MI, BB);

	  case X86::FP32_TO_INT16_IN_MEM:
	  case X86::FP32_TO_INT32_IN_MEM:
	  case X86::FP32_TO_INT64_IN_MEM:
	  case X86::FP64_TO_INT16_IN_MEM:
	  case X86::FP64_TO_INT32_IN_MEM:
	  case X86::FP64_TO_INT64_IN_MEM:
	  case X86::FP80_TO_INT16_IN_MEM:
	  case X86::FP80_TO_INT32_IN_MEM:
	  case X86::FP80_TO_INT64_IN_MEM: {
	    // Change the floating point control register to use "round towards zero"
	    // mode when truncating to an integer value.
	    int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FNSTCW16m)), CWFrameIdx);

	    // Load the old value of the high byte of the control word...
	    unsigned OldCW =
	        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
	                      CWFrameIdx);

	    // Set the high part to be round to zero...
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
	        .addImm(0xC7F);

	    // Reload the modified control word now...
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    // Restore the memory image of control word to original value
	    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
	        .addReg(OldCW);

	    // Get the X86 opcode to use.
	    unsigned Opc;
	    switch (MI.getOpcode()) {
	    default: llvm_unreachable("illegal opcode!");
	    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
	    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
	    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
	    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
	    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
	    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
	    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
	    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
	    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
	    }

	    // Store to the pseudo's memory operand; the value to convert is the
	    // register operand that follows the address operands.
	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
	        .addReg(MI.getOperand(X86::AddrNumOperands).getReg());

	    // Reload the original control word now.
	    addFrameReference(BuildMI(*BB, MI, DL,
	                              TII->get(X86::FLDCW16m)), CWFrameIdx);

	    MI.eraseFromParent(); // The pseudo instruction is gone now.
	    return BB;
	  }
	  // String/text processing lowering.
	  case X86::PCMPISTRM128REG:
	  case X86::VPCMPISTRM128REG:
	  case X86::PCMPISTRM128MEM:
	  case X86::VPCMPISTRM128MEM:
	  case X86::PCMPESTRM128REG:
	  case X86::VPCMPESTRM128REG:
	  case X86::PCMPESTRM128MEM:
	  case X86::VPCMPESTRM128MEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());

	  // String/text processing lowering.
	  case X86::PCMPISTRIREG:
	  case X86::VPCMPISTRIREG:
	  case X86::PCMPISTRIMEM:
	  case X86::VPCMPISTRIMEM:
	  case X86::PCMPESTRIREG:
	  case X86::VPCMPESTRIREG:
	  case X86::PCMPESTRIMEM:
	  case X86::VPCMPESTRIMEM:
	    assert(Subtarget.hasSSE42() &&
	           "Target must have SSE4.2 or AVX features enabled");
	    return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());

	  // Thread synchronization.
	  case X86::MONITOR:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
	  case X86::MONITORX:
	    return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);

	  // Cache line zero
	  case X86::CLZERO:
	    return emitClzero(&MI, BB, Subtarget);

	  // PKU feature
	  case X86::WRPKRU:
	    return emitWRPKRU(MI, BB, Subtarget);
	  case X86::RDPKRU:
	    return emitRDPKRU(MI, BB, Subtarget);
	  // xbegin
	  case X86::XBEGIN:
	    return emitXBegin(MI, BB, Subtarget.getInstrInfo());

	  case X86::VASTART_SAVE_XMM_REGS:
	    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);

	  case X86::VAARG_64:
	    return EmitVAARG64WithCustomInserter(MI, BB);

	  case X86::EH_SjLj_SetJmp32:
	  case X86::EH_SjLj_SetJmp64:
	    return emitEHSjLjSetJmp(MI, BB);

	  case X86::EH_SjLj_LongJmp32:
	  case X86::EH_SjLj_LongJmp64:
	    return emitEHSjLjLongJmp(MI, BB);

	  case X86::Int_eh_sjlj_setup_dispatch:
	    return EmitSjLjDispatchBlock(MI, BB);

	  case TargetOpcode::STATEPOINT:
	    // As an implementation detail, STATEPOINT shares the STACKMAP format at
	    // this point in the process.  We diverge later.
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::STACKMAP:
	  case TargetOpcode::PATCHPOINT:
	    return emitPatchPoint(MI, BB);

	  case TargetOpcode::PATCHABLE_EVENT_CALL:
	    // Do nothing here, handle in xray instrumentation pass.
	    return BB;

	  case X86::LCMPXCHG8B: {
	    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	    // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
	    // requires a memory operand. If it happens that current architecture is
	    // i686 and for current function we need a base pointer
	    // - which is ESI for i686 - register allocator would not be able to
	    // allocate registers for an address in form of X(%reg, %reg, Y)
	    // - there never would be enough unreserved registers during regalloc
	    // (without the need for base ptr the only option would be X(%edi, %esi, Y).
	    // We are giving a hand to register allocator by precomputing the address in
	    // a new vreg using LEA.

	    // If it is not i686 or there is no base pointer - nothing to do here.
	    if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
	      return BB;

	    // Even though this code does not necessarily needs the base pointer to
	    // be ESI, we check for that. The reason: if this assert fails, there are
	    // some changes happened in the compiler base pointer handling, which most
	    // probably have to be addressed somehow here.
	    assert(TRI->getBaseRegister() == X86::ESI &&
	           "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
	           "base pointer in mind");

	    MachineRegisterInfo &MRI = MF->getRegInfo();
	    MVT SPTy = getPointerTy(MF->getDataLayout());
	    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
	    unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);

	    X86AddressMode AM = getAddressFromInstr(&MI, 0);
	    // Regalloc does not need any help when the memory operand of CMPXCHG8B
	    // does not use index register.
	    if (AM.IndexReg == X86::NoRegister)
	      return BB;

	    // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
	    // four operand definitions that are E[ABCD] registers. We skip them and
	    // then insert the LEA.
	    MachineBasicBlock::iterator MBBI(MI);
	    while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
	           MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
	      --MBBI;
	    // Insert the LEA before the instruction MBBI stopped on.
	    addFullAddress(
	        BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);

	    setDirectAddressInInstr(&MI, 0, computedAddrVReg);

	    return BB;
	  }
	  case X86::LCMPXCHG16B:
	    return BB;
	  case X86::LCMPXCHG8B_SAVE_EBX:
	  case X86::LCMPXCHG16B_SAVE_RBX: {
	    // Make sure the register these pseudos save is a live-in of the block.
	    unsigned BasePtr =
	        MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
	    if (!BB->isLiveIn(BasePtr))
	      BB->addLiveIn(BasePtr);
	    return BB;
	  }
	  }
	}

	//===----------------------------------------------------------------------===//
	// X86 Optimization Hooks
	//===----------------------------------------------------------------------===//

	/// Compute the bits of \p Op that are known to be zero or one and report
	/// them in \p Known. Only target-specific and intrinsic opcodes may be
	/// passed in (asserted). \p Known is reset first, so opcodes that are not
	/// handled yield "nothing known".
	void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
	                                                      KnownBits &Known,
	                                                      const APInt &DemandedElts,
	                                                      const SelectionDAG &DAG,
	                                                      unsigned Depth) const {
	  unsigned BitWidth = Known.getBitWidth();
	  unsigned Opc = Op.getOpcode();
	  EVT VT = Op.getValueType();
	  assert((Opc >= ISD::BUILTIN_OP_END ||
	          Opc == ISD::INTRINSIC_WO_CHAIN ||
	          Opc == ISD::INTRINSIC_W_CHAIN ||
	          Opc == ISD::INTRINSIC_VOID) &&
	         "Should use MaskedValueIsZero if you don't know whether Op"
	         " is a target node!");

	  Known.resetAll();
	  switch (Opc) {
	  default: break;
	  case X86ISD::ADD:
	  case X86ISD::SUB:
	  case X86ISD::ADC:
	  case X86ISD::SBB:
	  case X86ISD::SMUL:
	  case X86ISD::UMUL:
	  case X86ISD::INC:
	  case X86ISD::DEC:
	  case X86ISD::OR:
	  case X86ISD::XOR:
	  case X86ISD::AND:
	    // These nodes' second result is a boolean.
	    if (Op.getResNo() == 0)
	      break;
	    LLVM_FALLTHROUGH;
	  case X86ISD::SETCC:
	    // A boolean result: every bit above bit 0 is zero.
	    Known.Zero.setBitsFrom(1);
	    break;
	  case X86ISD::MOVMSK: {
	    // Bits at or above the source's element count are known zero.
	    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
	    Known.Zero.setBitsFrom(NumLoBits);
	    break;
	  }
	  case X86ISD::VSHLI:
	  case X86ISD::VSRLI: {
	    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	      // Shifting by the element width or more leaves nothing but zeros.
	      if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
	        Known.setAllZero();
	        break;
	      }

	      DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
	      unsigned ShAmt = ShiftImm->getZExtValue();
	      if (Opc == X86ISD::VSHLI) {
	        Known.Zero <<= ShAmt;
	        Known.One <<= ShAmt;
	        // Low bits are known zero.
	        Known.Zero.setLowBits(ShAmt);
	      } else {
	        Known.Zero.lshrInPlace(ShAmt);
	        Known.One.lshrInPlace(ShAmt);
	        // High bits are known zero.
	        Known.Zero.setHighBits(ShAmt);
	      }
	    }
	    break;
	  }
	  case X86ISD::VZEXT: {
	    SDValue N0 = Op.getOperand(0);
	    unsigned NumElts = VT.getVectorNumElements();

	    EVT SrcVT = N0.getValueType();
	    unsigned InNumElts = SrcVT.getVectorNumElements();
	    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
	    assert(InNumElts >= NumElts && "Illegal VZEXT input");

	    // Query the source at its own element width, demanding only the low
	    // NumElts elements, then widen the result and mark the extension bits
	    // as zero.
	    Known = KnownBits(InBitWidth);
	    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
	    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
	    Known = Known.zext(BitWidth);
	    Known.Zero.setBitsFrom(InBitWidth);
	    break;
	  }
	  }
	}

	/// Return the number of leading bits of each element of \p Op that are
	/// known to equal its sign bit. The result is always in [1, VTBits], where
	/// VTBits is the scalar width of \p Op; opcodes that are not handled
	/// return the conservative answer 1.
	unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
	    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
	    unsigned Depth) const {
	  unsigned VTBits = Op.getScalarValueSizeInBits();
	  unsigned Opcode = Op.getOpcode();
	  switch (Opcode) {
	  case X86ISD::SETCC_CARRY:
	    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
	    return VTBits;

	  case X86ISD::VSEXT: {
	    // Sign extension adds one sign bit per bit of widening.
	    SDValue Src = Op.getOperand(0);
	    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
	    Tmp += VTBits - Src.getScalarValueSizeInBits();
	    return Tmp;
	  }

	  case X86ISD::VSHLI: {
	    SDValue Src = Op.getOperand(0);
	    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
	    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    if (ShiftVal.uge(VTBits))
	      return VTBits; // Shifted all bits out --> zero.
	    if (ShiftVal.uge(Tmp))
	      return 1; // Shifted all sign bits out --> unknown.
	    return Tmp - ShiftVal.getZExtValue();
	  }

	  case X86ISD::VSRAI: {
	    SDValue Src = Op.getOperand(0);
	    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
	    // An arithmetic right shift by (VTBits - 1) or more fills the element
	    // with copies of the sign bit. Handling this first also keeps the
	    // addition below from wrapping: summing inside the immediate's own
	    // (narrow) APInt width could overflow for large shift amounts and
	    // produce a bogus result, even 0.
	    if (ShiftVal.uge(VTBits - 1))
	      return VTBits; // Sign splat.
	    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
	    unsigned Total = unsigned(ShiftVal.getZExtValue()) + Tmp;
	    return Total >= VTBits ? VTBits : Total;
	  }

	  case X86ISD::PCMPGT:
	  case X86ISD::PCMPEQ:
	  case X86ISD::CMPP:
	  case X86ISD::VPCOM:
	  case X86ISD::VPCOMU:
	    // Vector compares return zero/all-bits result values.
	    return VTBits;
	  }

	  // Fallback case.
	  return 1;
	}

	/// Recognize "GlobalAddress + offset". When \p N is an X86ISD::Wrapper
	/// whose operand is a global address node, report the global in \p GA and
	/// its offset in \p Offset and return true. Every other node is handed to
	/// the generic TargetLowering implementation.
	bool X86TargetLowering::isGAPlusOffset(SDNode *N,
	                                       const GlobalValue* &GA,
	                                       int64_t &Offset) const {
	  if (N->getOpcode() != X86ISD::Wrapper)
	    return TargetLowering::isGAPlusOffset(N, GA, Offset);

	  const auto *Wrapped = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
	  if (!Wrapped)
	    return TargetLowering::isGAPlusOffset(N, GA, Offset);

	  GA = Wrapped->getGlobal();
	  Offset = Wrapped->getOffset();
	  return true;
	}

	// Attempt to match a combined shuffle mask against supported unary shuffle
	// instructions.
	//
	// On success returns true and sets Shuffle to the matched opcode and
	// SrcVT/DstVT to the types to use for its input and result. The
	// zero-extension match may also replace V1 with a subvector of itself.
	// Returns false when nothing matches.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                    bool AllowFloatDomain, bool AllowIntDomain,
	                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget,
	                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

	  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
	  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
	    unsigned MaxScale = 64 / MaskEltSize;
	    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
	      // A zero extension by Scale keeps source element i at index i * Scale
	      // and has undef/zero in the Scale - 1 slots that follow it.
	      bool Match = true;
	      unsigned NumDstElts = NumMaskElts / Scale;
	      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
	        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
	        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
	      }
	      if (Match) {
	        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
	        SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
	        if (SrcVT != MaskVT)
	          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
	        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
	        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
	        Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
	                                  : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
	        return true;
	      }
	    }
	  }

	  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
	  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
	      isUndefOrEqual(Mask[0], 0) &&
	      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
	    Shuffle = X86ISD::VZEXT_MOVL;
	    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
	    return true;
	  }

	  // Check if we have SSE3 which will let us use MOVDDUP etc. The
	  // instructions are no slower than UNPCKLPD but has the option to
	  // fold the input operand into even an unaligned memory load.
	  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
	    if (isTargetShuffleEquivalent(Mask, {0, 0})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v2f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v4f32;
	      return true;
	    }
	  }

	  if (MaskVT.is256BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v4f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v8f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v8f32;
	      return true;
	    }
	  }

	  if (MaskVT.is512BitVector() && AllowFloatDomain) {
	    assert(Subtarget.hasAVX512() &&
	           "AVX512 required for 512-bit vector shuffles");
	    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
	      Shuffle = X86ISD::MOVDDUP;
	      SrcVT = DstVT = MVT::v8f64;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(
	            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
	      Shuffle = X86ISD::MOVSLDUP;
	      SrcVT = DstVT = MVT::v16f32;
	      return true;
	    }
	    if (isTargetShuffleEquivalent(
	            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
	      Shuffle = X86ISD::MOVSHDUP;
	      SrcVT = DstVT = MVT::v16f32;
	      return true;
	    }
	  }

	  // Attempt to match against broadcast-from-vector.
	  if (Subtarget.hasAVX2()) {
	    // A broadcast selects element 0 for every result element.
	    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
	    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
	      SrcVT = DstVT = MaskVT;
	      Shuffle = X86ISD::VBROADCAST;
	      return true;
	    }
	  }

	  return false;
	}

	// Attempt to match a combined shuffle mask against supported unary immediate
	// permute instructions.
	//
	// MaskVT is the effective vector type of the shuffle and Mask is the combined
	// shuffle mask (SM_SentinelUndef / SM_SentinelZero entries are permitted).
	// Zeroable is only forwarded to the shift matcher. AllowFloatDomain and
	// AllowIntDomain select which instruction domains may be chosen.
	//
	// Returns true on a match, with Shuffle set to the opcode to emit, ShuffleVT
	// to the type the node must be created with and PermuteImm to its immediate
	// operand. Treat the outputs as unspecified when false is returned.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                           const APInt &Zeroable,
	                                           bool AllowFloatDomain,
	                                           bool AllowIntDomain,
	                                           const X86Subtarget &Subtarget,
	                                           unsigned &Shuffle, MVT &ShuffleVT,
	                                           unsigned &PermuteImm) {
	  unsigned NumMaskElts = Mask.size();
	  unsigned InputSizeInBits = MaskVT.getSizeInBits();
	  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
	  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

	  // None of the permutes below (other than the shift matcher, which is given
	  // Zeroable) can produce zero elements.
	  bool ContainsZeros =
	      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
	  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
	    // Check for lane crossing permutes.
	    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
	      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
	      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
	        Shuffle = X86ISD::VPERMI;
	        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
	        PermuteImm = getV4X86ShuffleImm(Mask);
	        return true;
	      }
	      // For 512-bit vectors the same immediate is applied to both 256-bit
	      // halves, so the mask must repeat across them.
	      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
	        SmallVector<int, 4> RepeatedMask;
	        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
	          Shuffle = X86ISD::VPERMI;
	          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
	          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
	          return true;
	        }
	      }
	    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
	      // VPERMILPD can permute with a non-repeating shuffle.
	      Shuffle = X86ISD::VPERMILPI;
	      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
	      PermuteImm = 0;
	      // Bit i of the immediate picks the low or high element of the pair
	      // that element i lives in; undef elements leave their bit clear.
	      for (int i = 0, e = Mask.size(); i != e; ++i) {
	        int M = Mask[i];
	        if (M == SM_SentinelUndef)
	          continue;
	        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
	        PermuteImm |= (M & 1) << i;
	      }
	      return true;
	    }
	  }

	  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
	  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
	  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
	  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
	      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Narrow the repeated mask to create 32-bit element permutes.
	      SmallVector<int, 4> WordMask = RepeatedMask;
	      if (MaskScalarSizeInBits == 64)
	        scaleShuffleMask(2, RepeatedMask, WordMask);

	      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
	      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
	      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
	      PermuteImm = getV4X86ShuffleImm(WordMask);
	      return true;
	    }
	  }

	  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
	  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
	    SmallVector<int, 4> RepeatedMask;
	    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
	      // Inspect the per-lane repeated mask rather than the first 8 entries
	      // of Mask: lane 0 may be undef where a later lane is defined, and the
	      // single immediate has to honour every lane.
	      ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4);
	      ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4);

	      // PSHUFLW: permute lower 4 elements only.
	      if (isUndefOrInRange(LoMask, 0, 4) &&
	          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
	        Shuffle = X86ISD::PSHUFLW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(LoMask);
	        return true;
	      }

	      // PSHUFHW: permute upper 4 elements only.
	      if (isUndefOrInRange(HiMask, 4, 8) &&
	          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
	        // Offset the HiMask so that we can create the shuffle immediate.
	        int OffsetHiMask[4];
	        for (int i = 0; i != 4; ++i)
	          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

	        Shuffle = X86ISD::PSHUFHW;
	        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
	        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
	        return true;
	      }
	    }
	  }

	  // Attempt to match against byte/bit shifts.
	  // FIXME: Add 512-bit support.
	  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
	                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
	    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
	                                             MaskScalarSizeInBits, Mask,
	                                             0, Zeroable, Subtarget);
	    // Only a strictly positive result is accepted as a match.
	    if (0 < ShiftAmt) {
	      PermuteImm = (unsigned)ShiftAmt;
	      return true;
	    }
	  }

	  return false;
	}

	// Try to recognise a combined shuffle mask as a two-operand shuffle that
	// takes no immediate (MOVLHPS/MOVHLPS/MOVSD/MOVSS or UNPCKL/UNPCKH).
	//
	// On success returns true with Shuffle and ShuffleVT describing the node to
	// build; V1/V2 may have been rewritten (duplicated or swapped) to the
	// operands that node expects. IsUnary is passed through to the UNPCK
	// matcher. AllowIntDomain is currently not consulted here.
	// TODO: Investigate sharing more of this with shuffle lowering.
	static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                     bool AllowFloatDomain, bool AllowIntDomain,
	                                     SDValue &V1, SDValue &V2, SDLoc &DL,
	                                     SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget,
	                                     unsigned &Shuffle, MVT &ShuffleVT,
	                                     bool IsUnary) {
	  const unsigned ScalarBits = MaskVT.getScalarSizeInBits();
	  const bool Is128Bit = MaskVT.is128BitVector();
	  const bool Is256Bit = MaskVT.is256BitVector();

	  if (Is128Bit) {
	    // The half-register moves are float-domain only; both read V1 twice.
	    if (AllowFloatDomain) {
	      if (isTargetShuffleEquivalent(Mask, {0, 0})) {
	        V2 = V1;
	        Shuffle = X86ISD::MOVLHPS;
	        ShuffleVT = MVT::v4f32;
	        return true;
	      }
	      if (isTargetShuffleEquivalent(Mask, {1, 1})) {
	        V2 = V1;
	        Shuffle = X86ISD::MOVHLPS;
	        ShuffleVT = MVT::v4f32;
	        return true;
	      }
	    }

	    // The scalar moves are used in the float domain, or in any domain on
	    // targets without SSE4.1.
	    const bool AllowScalarMove = AllowFloatDomain || !Subtarget.hasSSE41();
	    if (AllowScalarMove) {
	      if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2()) {
	        std::swap(V1, V2);
	        Shuffle = X86ISD::MOVSD;
	        ShuffleVT = MaskVT;
	        return true;
	      }
	      if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
	        Shuffle = X86ISD::MOVSS;
	        ShuffleVT = MaskVT;
	        return true;
	      }
	    }
	  }

	  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle,
	  // provided the subtarget has an unpack for this vector width.
	  const bool UnpackAvailable =
	      (MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
	      (Is128Bit && Subtarget.hasSSE2()) ||
	      (Is256Bit && 32 <= ScalarBits && Subtarget.hasAVX()) ||
	      (Is256Bit && Subtarget.hasAVX2()) ||
	      (MaskVT.is512BitVector() && Subtarget.hasAVX512());
	  if (!UnpackAvailable)
	    return false;

	  if (!matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
	                                   DAG, Subtarget))
	    return false;

	  // Without AVX2 a 256-bit unpack is emitted with a floating point type.
	  if (Is256Bit && !Subtarget.hasAVX2()) {
	    if (ScalarBits == 32)
	      ShuffleVT = MVT::v8f32;
	    else
	      ShuffleVT = MVT::v4f64;
	  } else {
	    ShuffleVT = MaskVT;
	  }
	  return true;
	}

	// Try to recognise a combined shuffle mask as a two-operand shuffle that
	// takes an 8-bit immediate. The candidates are tried in this order:
	// PALIGNR, BLENDI, INSERTPS, SHUFPD, SHUFPS.
	//
	// On success returns true with Shuffle, ShuffleVT and PermuteImm describing
	// the node to build; V1/V2 may have been replaced by the operands that node
	// expects (including zero or undef vectors). Zeroable is consulted for
	// INSERTPS only.
	static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
	                                            const APInt &Zeroable,
	                                            bool AllowFloatDomain,
	                                            bool AllowIntDomain,
	                                            SDValue &V1, SDValue &V2, SDLoc &DL,
	                                            SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget,
	                                            unsigned &Shuffle, MVT &ShuffleVT,
	                                            unsigned &PermuteImm) {
	  const unsigned MaskLen = Mask.size();
	  const unsigned ScalarBits = MaskVT.getScalarSizeInBits();
	  const bool Is128Bit = MaskVT.is128BitVector();
	  const bool Is256Bit = MaskVT.is256BitVector();
	  const bool Is512Bit = MaskVT.is512BitVector();

	  // Attempt to match against PALIGNR byte rotate.
	  const bool ByteRotateLegal =
	      AllowIntDomain && ((Is128Bit && Subtarget.hasSSSE3()) ||
	                         (Is256Bit && Subtarget.hasAVX2()));
	  if (ByteRotateLegal) {
	    int Rotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
	    if (Rotation > 0) {
	      Shuffle = X86ISD::PALIGNR;
	      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
	      PermuteImm = Rotation;
	      return true;
	    }
	  }

	  // Attempt to combine to X86ISD::BLENDI.
	  const bool BlendLegal =
	      (MaskLen <= 8 && ((Subtarget.hasSSE41() && Is128Bit) ||
	                        (Subtarget.hasAVX() && Is256Bit))) ||
	      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2());
	  if (BlendLegal) {
	    uint64_t BlendBits = 0;
	    bool ZeroV1 = false, ZeroV2 = false;
	    SmallVector<int, 8> BlendShuffleMask(Mask.begin(), Mask.end());
	    if (matchVectorShuffleAsBlend(V1, V2, BlendShuffleMask, ZeroV1, ZeroV2,
	                                  BlendBits)) {
	      if (MaskVT == MVT::v16i16) {
	        // We can only use v16i16 PBLENDW if the lanes are repeated. If they
	        // are not, fall through to the remaining matchers below.
	        SmallVector<int, 8> LaneMask;
	        if (isRepeatedTargetShuffleMask(128, MaskVT, BlendShuffleMask,
	                                        LaneMask)) {
	          assert(LaneMask.size() == 8 &&
	                 "Repeated mask size doesn't match!");
	          // Bit i is set when lane element i is taken from the second input.
	          PermuteImm = 0;
	          for (int i = 0; i < 8; ++i) {
	            if (LaneMask[i] >= 8)
	              PermuteImm |= 1 << i;
	          }
	          if (ZeroV1)
	            V1 = getZeroVector(MaskVT, Subtarget, DAG, DL);
	          if (ZeroV2)
	            V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
	          Shuffle = X86ISD::BLENDI;
	          ShuffleVT = MaskVT;
	          return true;
	        }
	      } else {
	        // Determine a type compatible with X86ISD::BLENDI.
	        MVT BlendVT = MaskVT;
	        if (Subtarget.hasAVX2()) {
	          if (BlendVT == MVT::v4i64)
	            BlendVT = MVT::v8i32;
	          else if (BlendVT == MVT::v2i64)
	            BlendVT = MVT::v4i32;
	        } else if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32) {
	          BlendVT = MVT::v8i16;
	        } else if (BlendVT == MVT::v4i64) {
	          BlendVT = MVT::v4f64;
	        } else if (BlendVT == MVT::v8i32) {
	          BlendVT = MVT::v8f32;
	        }

	        // For integer blends, rescale the blend bits to the element size of
	        // the chosen type and rebuild the vector type from that scale.
	        if (!BlendVT.isFloatingPoint()) {
	          int Scale = ScalarBits / BlendVT.getScalarSizeInBits();
	          BlendBits = scaleVectorShuffleBlendMask(BlendBits, MaskLen, Scale);
	          MVT ScaledEltVT = MVT::getIntegerVT(ScalarBits / Scale);
	          BlendVT = MVT::getVectorVT(ScaledEltVT, MaskLen * Scale);
	        }
	        ShuffleVT = BlendVT;

	        if (ZeroV1)
	          V1 = getZeroVector(MaskVT, Subtarget, DAG, DL);
	        if (ZeroV2)
	          V2 = getZeroVector(MaskVT, Subtarget, DAG, DL);
	        PermuteImm = (unsigned)BlendBits;
	        Shuffle = X86ISD::BLENDI;
	        return true;
	      }
	    }
	  }

	  // Attempt to combine to INSERTPS.
	  if (AllowFloatDomain && ScalarBits == 32 && Subtarget.hasSSE41() &&
	      Is128Bit && Zeroable.getBoolValue() &&
	      matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
	    Shuffle = X86ISD::INSERTPS;
	    ShuffleVT = MVT::v4f32;
	    return true;
	  }

	  // Attempt to combine to SHUFPD.
	  const bool ShufPDLegal =
	      AllowFloatDomain && ScalarBits == 64 &&
	      ((Is128Bit && Subtarget.hasSSE2()) || (Is256Bit && Subtarget.hasAVX()) ||
	       (Is512Bit && Subtarget.hasAVX512()));
	  if (ShufPDLegal &&
	      matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
	    Shuffle = X86ISD::SHUFP;
	    ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
	    return true;
	  }

	  // Attempt to combine to SHUFPS.
	  const bool ShufPSLegal =
	      AllowFloatDomain && ScalarBits == 32 &&
	      ((Is128Bit && Subtarget.hasSSE1()) || (Is256Bit && Subtarget.hasAVX()) ||
	       (Is512Bit && Subtarget.hasAVX512()));
	  if (!ShufPSLegal)
	    return false;

	  SmallVector<int, 4> LaneMask;
	  if (!isRepeatedTargetShuffleMask(128, MaskVT, Mask, LaneMask))
	    return false;

	  // Classify one half (two elements) of the repeated mask: entirely undef,
	  // undef/zero, drawn only from V1, or drawn only from V2. Yields the vector
	  // to use as that SHUFPS operand and writes the two selectors, or yields an
	  // empty SDValue if the half mixes sources.
	  auto PickHalfSource = [&](unsigned Base, int &Sel0, int &Sel1) -> SDValue {
	    const int E0 = LaneMask[Base];
	    const int E1 = LaneMask[Base + 1];

	    if (isUndefInRange(LaneMask, Base, 2))
	      return DAG.getUNDEF(MaskVT);

	    if (isUndefOrZeroInRange(LaneMask, Base, 2)) {
	      Sel0 = (SM_SentinelUndef == E0 ? -1 : 0);
	      Sel1 = (SM_SentinelUndef == E1 ? -1 : 1);
	      return getZeroVector(MaskVT, Subtarget, DAG, DL);
	    }

	    const bool FromV1 = isUndefOrInRange(E0, 0, 4) && isUndefOrInRange(E1, 0, 4);
	    const bool FromV2 = isUndefOrInRange(E0, 4, 8) && isUndefOrInRange(E1, 4, 8);
	    if (FromV1 || FromV2) {
	      Sel0 = (SM_SentinelUndef == E0 ? -1 : E0 & 3);
	      Sel1 = (SM_SentinelUndef == E1 ? -1 : E1 & 3);
	      return FromV1 ? V1 : V2;
	    }

	    return SDValue();
	  };

	  int Selectors[4] = {-1, -1, -1, -1};
	  SDValue LoSrc = PickHalfSource(0, Selectors[0], Selectors[1]);
	  SDValue HiSrc = PickHalfSource(2, Selectors[2], Selectors[3]);
	  if (!LoSrc || !HiSrc)
	    return false;

	  V1 = LoSrc;
	  V2 = HiSrc;
	  Shuffle = X86ISD::SHUFP;
	  ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
	  PermuteImm = getV4X86ShuffleImm(Selectors);
	  return true;
	}

	/// \brief Combine an arbitrary chain of shuffles into a single instruction if
	/// possible.
	///
	/// This is the leaf of the recursive combine below. When we have found some
	/// chain of single-use x86 shuffle instructions and accumulated the combined
	/// shuffle mask represented by them, this will try to pattern match that mask
	/// into either a single instruction if there is a special purpose instruction
	/// for this operation, or into a PSHUFB instruction which is a fully general
	/// instruction but should only be used to replace chains over a certain depth.
	///
	/// \param Inputs One or two source vectors entering the chain (asserted).
	/// \param Root The node that is replaced through DCI.CombineTo on success.
	/// \param BaseMask The accumulated shuffle mask; must be non-empty (asserted).
	/// \param Depth Depth of the combined chain. A match at Depth == 1 whose
	///        opcode equals Root's is rejected as a no-op, the variable-mask
	///        forms at the end require Depth >= 2 (most of them Depth >= 3 or
	///        HasVariableMask), and Depth > 3 permits crossing between the
	///        float and integer domains.
	/// \param HasVariableMask Whether the chain involved a variable mask shuffle;
	///        relaxes the depth requirement of the variable-mask forms.
	/// \returns true if Root was replaced (every 'return true' below follows a
	///          DCI.CombineTo call), false if nothing was changed for Root.
	static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
	ArrayRef<int> BaseMask, int Depth,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
	assert((Inputs.size() == 1 \|\| Inputs.size() == 2) &&
	"Unexpected number of shuffle inputs!");

	// Find the inputs that enter the chain. Note that multiple uses are OK
	// here, we're not going to remove the operands we find.
	// For a unary shuffle the second operand is an undef of V1's type.
	bool UnaryShuffle = (Inputs.size() == 1);
	SDValue V1 = peekThroughBitcasts(Inputs[0]);
	SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
	: peekThroughBitcasts(Inputs[1]));

	MVT VT1 = V1.getSimpleValueType();
	MVT VT2 = V2.getSimpleValueType();
	MVT RootVT = Root.getSimpleValueType();
	assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
	VT2.getSizeInBits() == RootVT.getSizeInBits() &&
	"Vector size mismatch");

	SDLoc DL(Root);
	SDValue Res;

	// A single-element mask must be the identity (asserted), so Root is simply
	// a bitcast of V1.
	unsigned NumBaseMaskElts = BaseMask.size();
	if (NumBaseMaskElts == 1) {
	assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
	/AddTo/ true);
	return true;
	}

	unsigned RootSizeInBits = RootVT.getSizeInBits();
	unsigned NumRootElts = RootVT.getVectorNumElements();
	unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
	// Use the float domain if either input is floating point, and always for
	// 256-bit roots on targets without AVX2.
	bool FloatDomain = VT1.isFloatingPoint() \|\| VT2.isFloatingPoint() \|\|
	(RootVT.is256BitVector() && !Subtarget.hasAVX2());

	// Don't combine if we are a AVX512/EVEX target and the mask element size
	// is different from the root element size - this would prevent writemasks
	// from being reused.
	// TODO - this currently prevents all lane shuffles from occurring.
	// TODO - check for writemasks usage instead of always preventing combining.
	// TODO - attempt to narrow Mask back to writemask size.
	bool IsEVEXShuffle =
	RootSizeInBits == 512 \|\| (Subtarget.hasVLX() && RootSizeInBits >= 128);
	if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
	return false;

	// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

	// Handle 128-bit lane shuffles of 256-bit vectors.
	// TODO - this should support binary shuffles.
	if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
	!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
	return false; // Nothing to do!
	MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
	// Each 128-bit half gets a 4-bit selector: the lane index (0 or 1) for a
	// real element, or 0x8 for any negative (sentinel) mask value.
	unsigned PermMask = 0;
	PermMask \|= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
	PermMask \|= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
	DAG.getUNDEF(ShuffleVT),
	DAG.getConstant(PermMask, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// For masks that have been widened to 128-bit elements or more,
	// narrow back down to 64-bit elements.
	SmallVector<int, 64> Mask;
	if (BaseMaskEltSizeInBits > 64) {
	assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
	int MaskScale = BaseMaskEltSizeInBits / 64;
	scaleShuffleMask(MaskScale, BaseMask, Mask);
	} else {
	Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
	}

	unsigned NumMaskElts = Mask.size();
	unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

	// Determine the effective mask value type.
	// Elements narrower than 32 bits are always treated as integers.
	FloatDomain &= (32 <= MaskEltSizeInBits);
	MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
	: MVT::getIntegerVT(MaskEltSizeInBits);
	MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

	// Only allow legal mask types.
	if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
	return false;

	// Attempt to match the mask against known shuffle patterns.
	// These are outputs filled in by the match* helpers below.
	MVT ShuffleSrcVT, ShuffleVT;
	unsigned Shuffle, PermuteImm;

	// Which shuffle domains are permitted?
	// Permit domain crossing at higher combine depths.
	// The integer domain is additionally ruled out for 256-bit masks without
	// AVX2.
	bool AllowFloatDomain = FloatDomain \|\| (Depth > 3);
	bool AllowIntDomain = (!FloatDomain \|\| (Depth > 3)) &&
	(!MaskVT.is256BitVector() \|\| Subtarget.hasAVX2());

	// Determine zeroable mask elements.
	// Bit i is set when Mask[i] is undef or zero.
	APInt Zeroable(NumMaskElts, 0);
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (isUndefOrZero(Mask[i]))
	Zeroable.setBit(i);

	if (UnaryShuffle) {
	// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
	// directly if we don't shuffle the lower element and we shuffle the upper
	// (zero) elements within themselves.
	if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
	(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
	// Scale is the number of mask elements covering one element of the load.
	unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
	ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
	if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
	isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
	/AddTo/ true);
	return true;
	}
	}

	// Unary shuffle with no immediate operand.
	if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
	ShuffleVT)) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return false; // Nothing to do!
	if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
	return false; // AVX512 Writemask clash.
	Res = DAG.getBitcast(ShuffleSrcVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// Unary shuffle with an i8 immediate operand.
	if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, Subtarget, Shuffle,
	ShuffleVT, PermuteImm)) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return false; // Nothing to do!
	if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
	return false; // AVX512 Writemask clash.
	Res = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}
	}

	// Binary shuffle with no immediate operand. This is attempted for unary
	// shuffles too (UnaryShuffle is passed through); V1/V2 are taken by
	// reference and may be rewritten by the matcher.
	if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
	V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
	UnaryShuffle)) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return false; // Nothing to do!
	if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
	return false; // AVX512 Writemask clash.
	V1 = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(ShuffleVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// Binary shuffle with an i8 immediate operand.
	if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
	AllowIntDomain, V1, V2, DL, DAG,
	Subtarget, Shuffle, ShuffleVT,
	PermuteImm)) {
	if (Depth == 1 && Root.getOpcode() == Shuffle)
	return false; // Nothing to do!
	if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
	return false; // AVX512 Writemask clash.
	V1 = DAG.getBitcast(ShuffleVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(ShuffleVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
	DAG.getConstant(PermuteImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// Typically from here on, we need an integer version of MaskVT.
	MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
	IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

	// Annoyingly, SSE4A instructions don't map into the above match helpers.
	if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
	// Both EXTRQI and INSERTQI take the bit length and bit index as two
	// separate i8 constant operands.
	uint64_t BitLen, BitIdx;
	if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
	Zeroable)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
	return false; // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
	if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
	return false; // Nothing to do!
	V1 = DAG.getBitcast(IntMaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(IntMaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
	DAG.getConstant(BitLen, DL, MVT::i8),
	DAG.getConstant(BitIdx, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}
	}

	// Don't try to re-form single instruction chains under any circumstances now
	// that we've done encoding canonicalization for them.
	if (Depth < 2)
	return false;

	// Everything below builds a shuffle whose mask is a constant vector operand.
	bool MaskContainsZeros =
	any_of(Mask, [](int M) { return M == SM_SentinelZero; });

	// Lane-crossing masks are only handled by the VPERMV/VPERMV3 forms in this
	// block; if none applies the combine fails.
	if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
	// If we have a single input lane-crossing shuffle then lower to VPERMV.
	if (UnaryShuffle && (Depth >= 3 \|\| HasVariableMask) && !MaskContainsZeros &&
	((Subtarget.hasAVX2() &&
	(MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	// Note the operand order: the index vector comes before the source.
	Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
	// vector as the second source.
	if (UnaryShuffle && (Depth >= 3 \|\| HasVariableMask) &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	// Adjust shuffle mask - replace SM_SentinelZero with second source index.
	for (unsigned i = 0; i != NumMaskElts; ++i)
	if (Mask[i] == SM_SentinelZero)
	Mask[i] = NumMaskElts + i;

	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
	DCI.AddToWorklist(Zero.getNode());
	// Operand order: first source, index vector, second source.
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
	if ((Depth >= 3 \|\| HasVariableMask) && !MaskContainsZeros &&
	((Subtarget.hasAVX512() &&
	(MaskVT == MVT::v8f64 \|\| MaskVT == MVT::v8i64 \|\|
	MaskVT == MVT::v16f32 \|\| MaskVT == MVT::v16i32)) \|\|
	(Subtarget.hasVLX() &&
	(MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4i64 \|\|
	MaskVT == MVT::v8f32 \|\| MaskVT == MVT::v8i32)) \|\|
	(Subtarget.hasBWI() && MaskVT == MVT::v32i16) \|\|
	(Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) \|\|
	(Subtarget.hasVBMI() && MaskVT == MVT::v64i8) \|\|
	(Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
	SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPermMask.getNode());
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}
	return false;
	}

	// See if we can combine a single input shuffle with zeros to a bit-mask,
	// which is much simpler than any shuffle.
	if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 \|\| HasVariableMask) &&
	isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
	DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
	// Build a per-element constant: all-ones where the element is kept, zero
	// where the mask is SM_SentinelZero, and undef where the mask is undef.
	APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
	APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
	APInt UndefElts(NumMaskElts, 0);
	SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	}
	if (M == SM_SentinelZero)
	continue;
	EltBits[i] = AllOnes;
	}
	SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
	DCI.AddToWorklist(BitMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	// Use the AND node that matches the domain of MaskVT.
	unsigned AndOpcode =
	FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
	Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// If we have a single input shuffle with different shuffle patterns in
	// the 128-bit lanes use the variable mask to VPERMILPS.
	// TODO Combine other mask types at higher depths.
	if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
	((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) \|\|
	(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
	// Each index is reduced modulo 4, i.e. to a position within its 128-bit
	// lane of four 32-bit elements; undef mask entries stay undef.
	SmallVector<SDValue, 16> VPermIdx;
	for (int M : Mask) {
	SDValue Idx =
	M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
	VPermIdx.push_back(Idx);
	}
	SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
	DCI.AddToWorklist(VPermMask.getNode());
	Res = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(Res.getNode());
	Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
	// to VPERMIL2PD/VPERMIL2PS.
	if ((Depth >= 3 \|\| HasVariableMask) && Subtarget.hasXOP() &&
	(MaskVT == MVT::v2f64 \|\| MaskVT == MVT::v4f64 \|\| MaskVT == MVT::v4f32 \|\|
	MaskVT == MVT::v8f32)) {
	// VPERMIL2 Operation.
	// Bits[3] - Match Bit.
	// Bits[2:1] - (Per Lane) PD Shuffle Mask.
	// Bits[2:0] - (Per Lane) PS Shuffle Mask.
	unsigned NumLanes = MaskVT.getSizeInBits() / 128;
	unsigned NumEltsPerLane = NumMaskElts / NumLanes;
	SmallVector<int, 8> VPerm2Idx;
	// The immediate is 0 unless a zero element is present, in which case it
	// is 2 and the zeroed elements use selector 8 (the match bit).
	unsigned M2ZImm = 0;
	for (int M : Mask) {
	if (M == SM_SentinelUndef) {
	VPerm2Idx.push_back(-1);
	continue;
	}
	if (M == SM_SentinelZero) {
	M2ZImm = 2;
	VPerm2Idx.push_back(8);
	continue;
	}
	// In-lane element index, offset by one lane's worth of elements when
	// the element comes from the second input (M >= NumMaskElts).
	int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
	// 64-bit selectors live in Bits[2:1], so shift them up by one.
	Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
	VPerm2Idx.push_back(Index);
	}
	V1 = DAG.getBitcast(MaskVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(MaskVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
	DCI.AddToWorklist(VPerm2MaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
	DAG.getConstant(M2ZImm, DL, MVT::i8));
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// If we have 3 or more shuffle instructions or a chain involving a variable
	// mask, we can replace them with a single PSHUFB instruction profitably.
	// Intel's manuals suggest only using PSHUFB if doing so replacing 5
	// instructions, but in practice PSHUFB tends to be very fast so we're
	// more aggressive.
	if (UnaryShuffle && (Depth >= 3 \|\| HasVariableMask) &&
	((RootVT.is128BitVector() && Subtarget.hasSSSE3()) \|\|
	(RootVT.is256BitVector() && Subtarget.hasAVX2()) \|\|
	(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
	// Expand every mask element into Ratio consecutive byte selectors.
	// Undef elements become undef bytes and SM_SentinelZero elements are
	// encoded as 255.
	SmallVector<SDValue, 16> PSHUFBMask;
	int NumBytes = RootVT.getSizeInBits() / 8;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	// The byte selected must lie in the same 16-byte lane as the result byte.
	assert ((M / 16) == (i / 16) && "Lane crossing detected");
	PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
	Res = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(Res.getNode());
	SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
	DCI.AddToWorklist(PSHUFBMaskOp.getNode());
	Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// With XOP, if we have a 128-bit binary input shuffle we can always combine
	// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
	// slower than PSHUFB on targets that support both.
	if ((Depth >= 3 \|\| HasVariableMask) && RootVT.is128BitVector() &&
	Subtarget.hasXOP()) {
	// VPPERM Mask Operation
	// Bits[4:0] - Byte Index (0 - 31)
	// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
	SmallVector<SDValue, 16> VPPERMMask;
	int NumBytes = 16;
	int Ratio = NumBytes / NumMaskElts;
	for (int i = 0; i < NumBytes; ++i) {
	int M = Mask[i / Ratio];
	if (M == SM_SentinelUndef) {
	VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
	continue;
	}
	if (M == SM_SentinelZero) {
	// 128 is permute operation 4 (ZERO) placed in Bits[7:5].
	VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
	continue;
	}
	M = Ratio * M + i % Ratio;
	VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
	}
	MVT ByteVT = MVT::v16i8;
	V1 = DAG.getBitcast(ByteVT, V1);
	DCI.AddToWorklist(V1.getNode());
	V2 = DAG.getBitcast(ByteVT, V2);
	DCI.AddToWorklist(V2.getNode());
	SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
	DCI.AddToWorklist(VPPERMMaskOp.getNode());
	Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
	DCI.AddToWorklist(Res.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
	/AddTo/ true);
	return true;
	}

	// Failed to find any combines.
	return false;
	}

	// Attempt to constant fold all of the constant source ops.
	// Returns true if the entire shuffle is folded to a constant.
	// TODO: Extend this to merge multiple constant Ops and update the mask.
	static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
	ArrayRef<int> Mask, SDValue Root,
	bool HasVariableMask, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	MVT VT = Root.getSimpleValueType();

	unsigned SizeInBits = VT.getSizeInBits();
	unsigned NumMaskElts = Mask.size();
	unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
	unsigned NumOps = Ops.size();

	// Extract constant bits from each source op.
	bool OneUseConstantOp = false;
	SmallVector<APInt, 16> UndefEltsOps(NumOps);
	SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
	for (unsigned i = 0; i != NumOps; ++i) {
	SDValue SrcOp = Ops[i];
	OneUseConstantOp \|= SrcOp.hasOneUse();
	if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
	RawBitsOps[i]))
	return false;
	}

	// Only fold if at least one of the constants is only used once or
	// the combined shuffle has included a variable mask shuffle, this
	// is to avoid constant pool bloat.
	if (!OneUseConstantOp && !HasVariableMask)
	return false;

	// Shuffle the constant bits according to the mask.
	APInt UndefElts(NumMaskElts, 0);
	APInt ZeroElts(NumMaskElts, 0);
	APInt ConstantElts(NumMaskElts, 0);
	SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
	APInt::getNullValue(MaskSizeInBits));
	for (unsigned i = 0; i != NumMaskElts; ++i) {
	int M = Mask[i];
	if (M == SM_SentinelUndef) {
	UndefElts.setBit(i);
	continue;
	} else if (M == SM_SentinelZero) {
	ZeroElts.setBit(i);
	continue;
	}
	assert(0 <= M && M < (int)(NumMaskElts * NumOps));

	unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
	unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

	auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
	if (SrcUndefElts[SrcMaskIdx]) {
	UndefElts.setBit(i);
	continue;
	}

	auto &SrcEltBits = RawBitsOps[SrcOpIdx];
	APInt &Bits = SrcEltBits[SrcMaskIdx];
	if (!Bits) {
	ZeroElts.setBit(i);
	continue;
	}

	ConstantElts.setBit(i);
	ConstantBitData[i] = Bits;
	}
	assert((UndefElts \| ZeroElts \| ConstantElts).isAllOnesValue());

	// Create the constant data.
	MVT MaskSVT;
	if (VT.isFloatingPoint() && (MaskSizeInBits == 32 \|\| MaskSizeInBits == 64))
	MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
	else
	MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

	MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

	SDLoc DL(Root);
	SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
	DCI.AddToWorklist(CstOp.getNode());
	DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
	return true;
	}

	/// \brief Fully generic combining of x86 shuffle instructions.
	///
	/// This should be the last combine run over the x86 shuffle instructions. Once
	/// they have been fully optimized, this will recursively consider all chains
	/// of single-use shuffle instructions, build a generic model of the cumulative
	/// shuffle operation, and check for simpler instructions which implement this
	/// operation. We use this primarily for two purposes:
	///
	/// 1) Collapse generic shuffles to specialized single instructions when
	/// equivalent. In most cases, this is just an encoding size win, but
	/// sometimes we will collapse multiple generic shuffles into a single
	/// special-purpose shuffle.
	/// 2) Look for sequences of shuffle instructions with 3 or more total
	/// instructions, and replace them with the slightly more expensive SSSE3
	/// PSHUFB instruction if available. We do this as the last combining step
	/// to ensure we avoid using PSHUFB if we can implement the shuffle with
	/// a suitable short sequence of other instructions. The PSHUFB will either
	/// use a register or have to read from memory and so is slightly (but only
	/// slightly) more expensive than the other shuffle instructions.
	///
	/// Because this is inherently a quadratic operation (for each shuffle in
	/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
	/// This should never be an issue in practice as the shuffle lowering doesn't
	/// produce sequences of more than 8 instructions.
	///
	/// FIXME: We will currently miss some cases where the redundant shuffling
	/// would simplify under the threshold for PSHUFB formation because of
	/// combine-ordering. To fix this, we should do the redundant instruction
	/// combining in this recursive walk.
	///
	/// \param SrcOps Source operands accumulated so far for the combined shuffle.
	/// \param SrcOpIndex Index into \p SrcOps of the operand inspected by this
	/// call; its own shuffle mask (if any) is merged into \p RootMask.
	/// \param Root Node that gets replaced through \p DCI when a combine succeeds.
	/// \param RootMask Accumulated mask. Negative entries are passed through as
	/// zero/undef sentinels; other entries index into the concatenation of
	/// \p SrcOps (operand index times mask width plus element index).
	/// \param SrcNodes Shuffle nodes already merged on the way here; used to allow
	/// recursing into operands whose only users are those nodes.
	/// \param Depth Current recursion depth; the walk stops once it exceeds 8.
	/// \param HasVariableMask Whether a variable-mask target shuffle has been seen
	/// so far; updated here from the opcode of the inspected operand.
	/// \returns true if one of the combines attempted here (or in a recursive
	/// call) reported success, false otherwise.
	static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
	int SrcOpIndex, SDValue Root,
	ArrayRef<int> RootMask,
	ArrayRef<const SDNode*> SrcNodes,
	int Depth, bool HasVariableMask,
	SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	// Bound the depth of our recursive combine because this is ultimately
	// quadratic in nature.
	if (Depth > 8)
	return false;

	// Directly rip through bitcasts to find the underlying operand.
	SDValue Op = SrcOps[SrcOpIndex];
	Op = peekThroughOneUseBitcasts(Op);

	MVT VT = Op.getSimpleValueType();
	if (!VT.isVector())
	return false; // Bail if we hit a non-vector.

	assert(Root.getSimpleValueType().isVector() &&
	"Shuffles operate on vector types!");
	assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
	"Can only combine shuffles of the same vector register size.");

	// Extract target shuffle mask and resolve sentinels and inputs.
	SmallVector<int, 64> OpMask;
	SmallVector<SDValue, 2> OpInputs;
	if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
	return false;

	// Input0/Input1 are left as empty SDValues when Op has fewer inputs.
	assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
	SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
	SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

	// Add the inputs to the Ops list, avoiding duplicates.
	SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

	// Look for existing entries that are the same value modulo bitcasts. If
	// several entries match, the last matching index wins.
	int InputIdx0 = -1, InputIdx1 = -1;
	for (int i = 0, e = Ops.size(); i < e; ++i) {
	SDValue BC = peekThroughBitcasts(Ops[i]);
	if (Input0 && BC == peekThroughBitcasts(Input0))
	InputIdx0 = i;
	if (Input1 && BC == peekThroughBitcasts(Input1))
	InputIdx1 = i;
	}

	// A new first input takes over the slot of the operand being expanded; a
	// new second input is appended at the end of the list.
	if (Input0 && InputIdx0 < 0) {
	InputIdx0 = SrcOpIndex;
	Ops[SrcOpIndex] = Input0;
	}
	if (Input1 && InputIdx1 < 0) {
	InputIdx1 = Ops.size();
	Ops.push_back(Input1);
	}

	assert(((RootMask.size() > OpMask.size() &&
	RootMask.size() % OpMask.size() == 0) \|\|
	(OpMask.size() > RootMask.size() &&
	OpMask.size() % RootMask.size() == 0) \|\|
	OpMask.size() == RootMask.size()) &&
	"The smaller number of elements must divide the larger.");

	// This function can be performance-critical, so we rely on the power-of-2
	// knowledge that we have about the mask sizes to replace div/rem ops with
	// bit-masks and shifts.
	assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
	unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

	// The merged mask uses the finer of the two granularities. RootRatio and
	// OpRatio are the factors by which the root mask, respectively the op mask,
	// must be scaled up to reach that width; at most one of them exceeds 1.
	unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
	unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
	unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
	assert((RootRatio == 1 \|\| OpRatio == 1) &&
	"Must not have a ratio for both incoming and op masks!");

	assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
	assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
	unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
	unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

	SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

	// Merge this shuffle operation's mask into our accumulated mask. Note that
	// this shuffle's mask will be the first applied to the input, followed by the
	// root mask to get us all the way to the root value arrangement. The reason
	// for this order is that we are recursing up the operation chain.
	for (unsigned i = 0; i < MaskWidth; ++i) {
	unsigned RootIdx = i >> RootRatioLog2;
	if (RootMask[RootIdx] < 0) {
	// This is a zero or undef lane, we're done.
	Mask[i] = RootMask[RootIdx];
	continue;
	}

	// Scale the root mask entry up to MaskWidth granularity, keeping the
	// sub-element offset of lane i.
	unsigned RootMaskedIdx =
	RootRatio == 1
	? RootMask[RootIdx]
	: (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

	// Just insert the scaled root mask value if it references an input other
	// than the SrcOp we're currently inserting.
	if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) \|\|
	(((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
	Mask[i] = RootMaskedIdx;
	continue;
	}

	// Reduce to an index within the current operand, then to an index into
	// that operand's own (possibly coarser) mask.
	RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
	unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
	if (OpMask[OpIdx] < 0) {
	// The incoming lanes are zero or undef, it doesn't matter which ones we
	// are using.
	Mask[i] = OpMask[OpIdx];
	continue;
	}

	// Ok, we have non-zero lanes, map them through to one of the Op's inputs.
	unsigned OpMaskedIdx =
	OpRatio == 1
	? OpMask[OpIdx]
	: (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

	// Op mask entries below OpMask.size() select the op's first input, the rest
	// select its second input; rebase the element index onto that input's slot
	// in Ops.
	OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
	if (OpMask[OpIdx] < (int)OpMask.size()) {
	assert(0 <= InputIdx0 && "Unknown target shuffle input");
	OpMaskedIdx += InputIdx0 * MaskWidth;
	} else {
	assert(0 <= InputIdx1 && "Unknown target shuffle input");
	OpMaskedIdx += InputIdx1 * MaskWidth;
	}

	Mask[i] = OpMaskedIdx;
	}

	// Handle the all undef/zero cases early.
	if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
	DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
	return true;
	}
	if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
	// TODO - should we handle the mixed zero/undef case as well? Just returning
	// a zero mask will lose information on undef elements possibly reducing
	// future combine possibilities.
	DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
	Subtarget, DAG, SDLoc(Root)));
	return true;
	}

	// Remove unused shuffle source ops.
	resolveTargetShuffleInputsAndMask(Ops, Mask);
	assert(!Ops.empty() && "Shuffle with no inputs detected");

	HasVariableMask \|= isTargetShuffleVariableMask(Op.getOpcode());

	// Update the list of shuffle nodes that have been combined so far.
	SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
	SrcNodes.end());
	CombinedNodes.push_back(Op.getNode());

	// See if we can recurse into each shuffle source op (if it's a target
	// shuffle). The source op should only be combined if it either has a
	// single use (i.e. current Op) or all its users have already been combined.
	for (int i = 0, e = Ops.size(); i < e; ++i)
	if (Ops[i].getNode()->hasOneUse() \|\|
	SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
	if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
	Depth + 1, HasVariableMask, DAG, DCI,
	Subtarget))
	return true;

	// Attempt to constant fold all of the constant source ops.
	if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
	Subtarget))
	return true;

	// We can only combine unary and binary shuffle mask cases.
	if (Ops.size() > 2)
	return false;

	// Minor canonicalization of the accumulated shuffle mask to make it easier
	// to match below. All this does is detect masks with sequential pairs of
	// elements, and shrink them to the half-width mask. It does this in a loop
	// so it will reduce the size of the mask to the minimal width mask which
	// performs an equivalent shuffle.
	SmallVector<int, 64> WidenedMask;
	while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
	Mask = std::move(WidenedMask);
	}

	// Canonicalization of binary shuffle masks to improve pattern matching by
	// commuting the inputs.
	if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
	ShuffleVectorSDNode::commuteMask(Mask);
	std::swap(Ops[0], Ops[1]);
	}

	// Finally try to match the accumulated mask against a single instruction.
	return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
	DCI, Subtarget);
	}

	/// \brief Get the PSHUF-style mask from a PSHUF node.
	///
	/// Thin wrapper around getTargetShuffleMask that eases forming 4-element
	/// PSHUF-style masks which can be reused with such instructions. \p N is
	/// expected to be a PSHUFD, PSHUFLW or PSHUFHW node; any other opcode is
	/// treated as unreachable.
	static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
	  const MVT VT = N.getSimpleValueType();

	  SmallVector<int, 4> Mask;
	  SmallVector<SDValue, 2> Ops;
	  bool IsUnary;
	  bool HaveMask =
	      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
	  (void)HaveMask;
	  assert(HaveMask);

	  // For vectors wider than 128 bits only the low 128-bit lane of the mask
	  // matters. Verify (in debug builds) that the upper lanes repeat it, then
	  // drop them.
	  const unsigned VecBits = VT.getSizeInBits();
	  if (VecBits > 128) {
	    int LaneElts = 128 / VT.getScalarSizeInBits();
	#ifndef NDEBUG
	    for (int Lane = 1, NumLanes = VecBits / 128; Lane < NumLanes; ++Lane)
	      for (int Elt = 0; Elt < LaneElts; ++Elt)
	        assert(Mask[Elt] == Mask[Lane * LaneElts + Elt] - (LaneElts * Lane) &&
	               "Mask doesn't repeat in high 128-bit lanes!");
	#endif
	    Mask.resize(LaneElts);
	  }

	  const unsigned Opc = N.getOpcode();

	  // Dword shuffle: the (lane-truncated) mask is already in PSHUF form.
	  if (Opc == X86ISD::PSHUFD)
	    return Mask;

	  // Low-half word shuffle: keep the first four entries.
	  if (Opc == X86ISD::PSHUFLW) {
	    Mask.resize(4);
	    return Mask;
	  }

	  // High-half word shuffle: drop the first four entries and rebase the
	  // remaining indices so that they start at zero.
	  if (Opc == X86ISD::PSHUFHW) {
	    Mask.erase(Mask.begin(), Mask.begin() + 4);
	    for (int &Idx : Mask)
	      Idx -= 4;
	    return Mask;
	  }

	  llvm_unreachable("No valid shuffle instruction found!");
	}

	/// \brief Search for a combinable shuffle across a chain ending in pshufd.
	///
	/// We walk up the chain and look for a combinable shuffle, skipping over
	/// shuffles that we could hoist this shuffle's transformation past without
	/// altering anything.
	///
	/// \param N The PSHUFD node ending the chain (asserted).
	/// \param Mask The 4-entry mask of \p N. When a combinable shuffle is found,
	/// it is rewritten in place by mapping each entry through that shuffle's mask.
	/// \returns The value of a freshly built chain intended to replace \p N, or an
	/// empty SDValue if nothing was combined.
	static SDValue
	combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
	SelectionDAG &DAG) {
	assert(N.getOpcode() == X86ISD::PSHUFD &&
	"Called with something other than an x86 128-bit half shuffle!");
	SDLoc DL(N);

	// Walk up a single-use chain looking for a combinable shuffle. Keep a stack
	// of the shuffles in the chain so that we can form a fresh chain to replace
	// this one.
	SmallVector<SDValue, 8> Chain;
	SDValue V = N.getOperand(0);
	// Inside the switch, `continue` advances to the next operand up the chain,
	// while `break` leaves the switch and then (via the trailing break) the loop,
	// with V pointing at the shuffle to combine with.
	for (; V.hasOneUse(); V = V.getOperand(0)) {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing combined!

	case ISD::BITCAST:
	// Skip bitcasts as we always know the type for the target specific
	// instructions.
	continue;

	case X86ISD::PSHUFD:
	// Found another dword shuffle.
	break;

	case X86ISD::PSHUFLW:
	// Check that the low words (being shuffled) are the identity in the
	// dword shuffle, and the high words are self-contained.
	if (Mask[0] != 0 \|\| Mask[1] != 1 \|\|
	!(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
	return SDValue();

	// Remember the skipped shuffle so it can be re-created around the combined
	// node later.
	Chain.push_back(V);
	continue;

	case X86ISD::PSHUFHW:
	// Check that the high words (being shuffled) are the identity in the
	// dword shuffle, and the low words are self-contained.
	if (Mask[2] != 2 \|\| Mask[3] != 3 \|\|
	!(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
	return SDValue();

	Chain.push_back(V);
	continue;

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	// For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
	// shuffle into a preceding word shuffle.
	if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
	V.getSimpleValueType().getVectorElementType() != MVT::i16)
	return SDValue();

	// Search for a half-shuffle which we can combine with.
	unsigned CombineOp =
	V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
	// Both unpack operands must be the same value, and the unpack must be the
	// only user of that value.
	if (V.getOperand(0) != V.getOperand(1) \|\|
	!V->isOnlyUserOf(V.getOperand(0).getNode()))
	return SDValue();
	Chain.push_back(V);
	V = V.getOperand(0);
	// Inner walk above the unpack: `continue` re-checks V.hasOneUse() and keeps
	// walking; the `break` after the inner switch leaves the do-while with V
	// being the matching half-shuffle.
	do {
	switch (V.getOpcode()) {
	default:
	return SDValue(); // Nothing to combine.

	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	if (V.getOpcode() == CombineOp)
	break;

	// A half-shuffle of the other half is recorded and skipped over.
	Chain.push_back(V);

	LLVM_FALLTHROUGH;
	case ISD::BITCAST:
	V = V.getOperand(0);
	continue;
	}
	break;
	} while (V.hasOneUse());
	break;
	}
	// Break out of the loop if we break out of the switch.
	break;
	}

	if (!V.hasOneUse())
	// We fell out of the loop without finding a viable combining instruction.
	return SDValue();

	// Merge this node's mask and our incoming mask.
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	for (int &M : Mask)
	M = VMask[M];
	// Re-create the found shuffle on its original input with the merged mask.
	V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
	getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	// Rebuild the chain around this new shuffle.
	// Nodes were pushed while walking up from N, so popping from the back
	// re-creates them starting with the one nearest the new shuffle.
	while (!Chain.empty()) {
	SDValue W = Chain.pop_back_val();

	// Insert a bitcast when the recorded node consumed a different type.
	if (V.getValueType() != W.getOperand(0).getValueType())
	V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

	switch (W.getOpcode()) {
	default:
	llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

	case X86ISD::UNPCKL:
	case X86ISD::UNPCKH:
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
	break;

	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	// Reuse the recorded shuffle's immediate operand unchanged.
	V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
	break;
	}
	}
	if (V.getValueType() != N.getValueType())
	V = DAG.getBitcast(N.getValueType(), V);

	// Return the new chain to replace N.
	return V;
	}

	/// \brief Fold a pshuflw/pshufhw into an earlier shuffle of the same half.
	///
	/// Starting at the operand of \p N, walk up the single-use chain, stepping
	/// over bitcasts and over half shuffles of the other half, until a half
	/// shuffle with the same opcode as \p N is found. \p N is then replaced by
	/// its operand and the found shuffle is replaced by one using the merged
	/// mask.
	///
	/// \p Mask is the 4-entry mask of \p N; on success it is rewritten in place
	/// with the merged mask. Returns true if \p N was combined away, false if no
	/// suitable shuffle was found (in which case nothing is modified).
	static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
	                                        SelectionDAG &DAG,
	                                        TargetLowering::DAGCombinerInfo &DCI) {
	  assert(
	      (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
	      "Called with something other than an x86 128-bit half shuffle!");
	  SDLoc DL(N);
	  const unsigned CombineOpcode = N.getOpcode();

	  // Climb the single-use chain until we hit a shuffle of the same half.
	  SDValue Match = N.getOperand(0);
	  for (;;) {
	    // Running into a multi-use value means there is nothing to combine with.
	    if (!Match.hasOneUse())
	      return false;

	    const unsigned Opc = Match.getOpcode();
	    if (Opc != ISD::BITCAST) {
	      // Anything other than a bitcast or a half shuffle blocks the search.
	      if (Opc != X86ISD::PSHUFLW && Opc != X86ISD::PSHUFHW)
	        return false;
	      if (Opc == CombineOpcode)
	        break;
	      // Otherwise this shuffles the other half, which is a no-op for us.
	    }
	    Match = Match.getOperand(0);
	  }

	  // Drop the bottom node; its shuffle gets accumulated into the match.
	  DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);

	  // Compose the incoming mask with the mask of the matched shuffle.
	  SmallVector<int, 4> InnerMask = getPSHUFShuffleMask(Match);
	  for (int &Idx : Mask)
	    Idx = InnerMask[Idx];
	  SDValue Merged =
	      DAG.getNode(Match.getOpcode(), DL, MVT::v8i16, Match.getOperand(0),
	                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

	  // Unless the merged node is the very same value as the matched one,
	  // replace the matched shuffle and update all users so that the chain is
	  // re-evaluated.
	  if (Merged != Match) {
	    DCI.CombineTo(Match.getNode(), Merged, /*AddTo*/ true);
	  }

	  return true;
	}

	/// \brief Try to combine x86 target specific shuffles.
	///
	/// Dispatches on the opcode of \p N. UNPCKL, BLENDI, MOVSD/MOVSS and INSERTPS
	/// are handled completely inside the first switch and return from there. The
	/// PSHUFD/PSHUFLW/PSHUFHW cases only extract the 4-entry mask and continue
	/// with the mask-based simplifications that follow the switch.
	///
	/// \returns A value to use in place of \p N, or an empty SDValue when either
	/// nothing was changed or the replacement was already performed through
	/// \p DCI.
	static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	MVT VT = N.getSimpleValueType();
	SmallVector<int, 4> Mask;

	unsigned Opcode = N.getOpcode();
	switch (Opcode) {
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	Mask = getPSHUFShuffleMask(N);
	assert(Mask.size() == 4);
	break;
	case X86ISD::UNPCKL: {
	auto Op0 = N.getOperand(0);
	auto Op1 = N.getOperand(1);
	unsigned Opcode0 = Op0.getOpcode();
	unsigned Opcode1 = Op1.getOpcode();

	// Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
	// X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
	// TODO: Add other horizontal operations as required.
	if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
	return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));

	// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
	// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
	// moves upper half elements into the lower half part. For example:
	//
	// t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
	// undef:v16i8
	// t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
	//
	// will be combined to:
	//
	// t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1

	// This is only for 128-bit vectors. From SSE4.1 onward this combine may not
	// happen due to advanced instructions.
	if (!VT.is128BitVector())
	return SDValue();

	if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
	// NOTE: this local Mask shadows the function-level Mask declared above.
	ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();

	// Expected mask: the low half selects the upper-half elements
	// NumElts/2 .. NumElts-1 in order; the high half stays -1.
	unsigned NumElts = VT.getVectorNumElements();
	SmallVector<int, 8> ExpectedMask(NumElts, -1);
	std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
	NumElts / 2);

	auto ShufOp = Op1.getOperand(0);
	if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
	return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
	}
	return SDValue();
	}
	case X86ISD::BLENDI: {
	SDValue V0 = N->getOperand(0);
	SDValue V1 = N->getOperand(1);
	assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
	"Unexpected input vector types");

	// Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
	// operands and changing the mask to 1. This saves us a bunch of
	// pattern-matching possibilities related to scalar math ops in SSE/AVX.
	// x86InstrInfo knows how to commute this back after instruction selection
	// if it would help register allocation.

	// TODO: If optimizing for size or a processor that doesn't suffer from
	// partial register update stalls, this should be transformed into a MOVSD
	// instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.

	// The swap is skipped when V0 is a foldable load (per isShuffleFoldableLoad).
	if (VT == MVT::v2f64)
	if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
	if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
	SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
	return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
	}

	return SDValue();
	}
	case X86ISD::MOVSD:
	case X86ISD::MOVSS: {
	SDValue V0 = peekThroughBitcasts(N->getOperand(0));
	SDValue V1 = peekThroughBitcasts(N->getOperand(1));
	bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
	bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
	if (isZero0 && isZero1)
	return SDValue();

	// We often lower to MOVSD/MOVSS from integer as well as native float
	// types; remove unnecessary domain-crossing bitcasts if we can to make it
	// easier to combine shuffles later on. We've already accounted for the
	// domain switching cost when we decided to lower with it.
	bool isFloat = VT.isFloatingPoint();
	bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
	bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
	// Switch domain only when each input either is in the other domain (after
	// peeking through bitcasts) or is an all-zeros build vector.
	if ((isFloat != isFloat0 \|\| isZero0) && (isFloat != isFloat1 \|\| isZero1)) {
	MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
	: (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
	V0 = DAG.getBitcast(NewVT, V0);
	V1 = DAG.getBitcast(NewVT, V1);
	return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
	}

	return SDValue();
	}
	case X86ISD::INSERTPS: {
	assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
	SDValue Op0 = N.getOperand(0);
	SDValue Op1 = N.getOperand(1);
	SDValue Op2 = N.getOperand(2);
	// Decode the immediate: bits [7:6] are the source element index, bits [5:4]
	// the destination element index and bits [3:0] the zero mask.
	unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
	unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
	unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
	unsigned ZeroMask = InsertPSMask & 0xF;

	// If we zero out all elements from Op0 then we don't need to reference it.
	if (((ZeroMask \| (1u << DstIdx)) == 0xF) && !Op0.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// If we zero out the element from Op1 then we don't need to reference it.
	if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	// Attempt to merge insertps Op1 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask1;
	SmallVector<SDValue, 2> Ops1;
	if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
	int M = TargetMask1[SrcIdx];
	if (isUndefOrZero(M)) {
	// Zero/UNDEF insertion - zero out element and remove dependency.
	InsertPSMask \|= (1u << DstIdx);
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}
	// Update insertps mask srcidx and reference the source input directly.
	assert(0 <= M && M < 8 && "Shuffle index out of range");
	InsertPSMask = (InsertPSMask & 0x3f) \| ((M & 0x3) << 6);
	// Indices 0-3 refer to the inner shuffle's first input, 4-7 to its second.
	Op1 = Ops1[M < 4 ? 0 : 1];
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));
	}

	// Attempt to merge insertps Op0 with an inner target shuffle node.
	SmallVector<int, 8> TargetMask0;
	SmallVector<SDValue, 2> Ops0;
	if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
	return SDValue();

	// Updated records whether the immediate or Op0 changed; UseInput00 and
	// UseInput01 record which inputs of the inner shuffle are still referenced.
	bool Updated = false;
	bool UseInput00 = false;
	bool UseInput01 = false;
	for (int i = 0; i != 4; ++i) {
	int M = TargetMask0[i];
	if ((InsertPSMask & (1u << i)) \|\| (i == (int)DstIdx)) {
	// No change if element is already zero or the inserted element.
	continue;
	} else if (isUndefOrZero(M)) {
	// If the target mask is undef/zero then we must zero the element.
	InsertPSMask \|= (1u << i);
	Updated = true;
	continue;
	}

	// The input vector element must be inline.
	if (M != i && M != (i + 4))
	return SDValue();

	// Determine which inputs of the target shuffle we're using.
	UseInput00 \|= (0 <= M && M < 4);
	UseInput01 \|= (4 <= M);
	}

	// If we're not using both inputs of the target shuffle then use the
	// referenced input directly.
	if (UseInput00 && !UseInput01) {
	Updated = true;
	Op0 = Ops0[0];
	} else if (!UseInput00 && UseInput01) {
	Updated = true;
	Op0 = Ops0[1];
	}

	if (Updated)
	return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
	DAG.getConstant(InsertPSMask, DL, MVT::i8));

	return SDValue();
	}
	default:
	return SDValue();
	}

	// Only the PSHUFD/PSHUFLW/PSHUFHW cases reach this point; Mask holds their
	// 4-entry mask.
	// Nuke no-op shuffles that show up after combining.
	if (isNoopShuffleMask(Mask))
	return DCI.CombineTo(N.getNode(), N.getOperand(0), /AddTo/ true);

	// Look for simplifications involving one or two shuffle instructions.
	SDValue V = N.getOperand(0);
	switch (N.getOpcode()) {
	default:
	break;
	case X86ISD::PSHUFLW:
	case X86ISD::PSHUFHW:
	assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");

	if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
	return SDValue(); // We combined away this shuffle, so we're done.

	// See if this reduces to a PSHUFD which is no more expensive and can
	// combine with more operations. Note that it has to at least flip the
	// dwords as otherwise it would have been removed as a no-op.
	if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
	// Build a dword mask that is the identity except for swapping the two
	// dwords of the half being shuffled (dwords 0/1 for PSHUFLW, 2/3 for
	// PSHUFHW).
	int DMask[] = {0, 1, 2, 3};
	int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
	DMask[DOffset + 0] = DOffset + 1;
	DMask[DOffset + 1] = DOffset + 0;
	MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
	V = DAG.getBitcast(DVT, V);
	DCI.AddToWorklist(V.getNode());
	V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
	getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
	DCI.AddToWorklist(V.getNode());
	return DAG.getBitcast(VT, V);
	}

	// Look for shuffle patterns which can be implemented as a single unpack.
	// FIXME: This doesn't handle the location of the PSHUFD generically, and
	// only works when we have a PSHUFD followed by two half-shuffles.
	if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
	(V.getOpcode() == X86ISD::PSHUFLW \|\|
	V.getOpcode() == X86ISD::PSHUFHW) &&
	V.getOpcode() != N.getOpcode() &&
	V.hasOneUse()) {
	SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
	if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
	SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
	SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
	// N and V shuffle opposite halves (checked above), so the two offsets
	// differ and the loop below fills all eight WordMask entries.
	int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
	int WordMask[8];
	for (int i = 0; i < 4; ++i) {
	WordMask[i + NOffset] = Mask[i] + NOffset;
	WordMask[i + VOffset] = VMask[i] + VOffset;
	}
	// Map the word mask through the DWord mask.
	int MappedMask[8];
	for (int i = 0; i < 8; ++i)
	MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
	if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) \|\|
	makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
	// We can replace all three shuffles with an unpack.
	V = DAG.getBitcast(VT, D.getOperand(0));
	DCI.AddToWorklist(V.getNode());
	return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
	: X86ISD::UNPCKH,
	DL, VT, V, V);
	}
	}
	}

	break;

	case X86ISD::PSHUFD:
	if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
	return NewN;

	break;
	}

	return SDValue();
	}

	/// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
	/// operation. On success the operands of the ADDSUB operation are written to
	/// \p Opnd0 and \p Opnd1; on failure both are left untouched.
	///
	/// The match is done directly on the abstract vector shuffle node so it is
	/// easier to match generically: \p N must be a VECTOR_SHUFFLE that blends a
	/// single-use FSUB and a single-use FADD of the same two operands, taking
	/// the even result lanes from the FSUB and the odd result lanes from the
	/// FADD.
	static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
	                     SDValue &Opnd0, SDValue &Opnd1) {

	  // The result type must be one handled at the available feature level.
	  const EVT VT = N->getValueType(0);
	  const bool TypeOkForSSE3 =
	      Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64);
	  const bool TypeOkForAVX =
	      Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64);
	  const bool TypeOkForAVX512 =
	      Subtarget.hasAVX512() && (VT == MVT::v16f32 || VT == MVT::v8f64);
	  if (!TypeOkForSSE3 && !TypeOkForAVX && !TypeOkForAVX512)
	    return false;

	  // We only handle target-independent shuffles.
	  // FIXME: It would be easy and harmless to use the target shuffle mask
	  // extraction tool to support more.
	  if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
	  SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());

	  SDValue SubNode = N->getOperand(0);
	  SDValue AddNode = N->getOperand(1);

	  // Canonicalize so that the FSUB is the first shuffle input and the FADD
	  // the second, commuting the mask if they arrive the other way around.
	  const unsigned Opc0 = SubNode.getOpcode();
	  const unsigned Opc1 = AddNode.getOpcode();
	  if (Opc0 == ISD::FADD && Opc1 == ISD::FSUB) {
	    ShuffleVectorSDNode::commuteMask(Mask);
	    std::swap(SubNode, AddNode);
	  } else if (!(Opc0 == ISD::FSUB && Opc1 == ISD::FADD)) {
	    return false;
	  }

	  // If there are other uses of these operations we can't fold them.
	  if (!(SubNode->hasOneUse() && AddNode->hasOneUse()))
	    return false;

	  // Both operations must work on the same operands; the FADD operands may
	  // appear in either order.
	  SDValue LHS = SubNode->getOperand(0);
	  SDValue RHS = SubNode->getOperand(1);
	  const bool SameOrder =
	      AddNode->getOperand(0) == LHS && AddNode->getOperand(1) == RHS;
	  const bool Commuted =
	      AddNode->getOperand(0) == RHS && AddNode->getOperand(1) == LHS;
	  if (!SameOrder && !Commuted)
	    return false;

	  // The blend must line up in the expected pattern for one of the supported
	  // element counts (2, 4, 8 or 16).
	  const bool IsAddSubBlend =
	      isShuffleEquivalent(SubNode, AddNode, Mask, {0, 3}) ||
	      isShuffleEquivalent(SubNode, AddNode, Mask, {0, 5, 2, 7}) ||
	      isShuffleEquivalent(SubNode, AddNode, Mask,
	                          {0, 9, 2, 11, 4, 13, 6, 15}) ||
	      isShuffleEquivalent(SubNode, AddNode, Mask,
	                          {0, 17, 2, 19, 4, 21, 6, 23,
	                           8, 25, 10, 27, 12, 29, 14, 31});
	  if (!IsAddSubBlend)
	    return false;

	  Opnd0 = LHS;
	  Opnd1 = RHS;
	  return true;
	}

	/// \brief Try to combine a shuffle into a target-specific add-sub or
	/// mul-add-sub node.
	///
	/// Returns an X86ISD::FMADDSUB node when isFMAddSub accepts the operands
	/// recognized by isAddSub, otherwise an X86ISD::ADDSUB node for non-512-bit
	/// types. Returns an empty SDValue when \p N is not an add-sub shuffle or
	/// when only a 512-bit ADDSUB would be possible.
	static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
	                                                const X86Subtarget &Subtarget,
	                                                SelectionDAG &DAG) {
	  SDValue Opnd0, Opnd1;
	  const bool MatchedAddSub = isAddSub(N, Subtarget, Opnd0, Opnd1);
	  if (!MatchedAddSub)
	    return SDValue();

	  SDLoc DL(N);
	  const EVT ResultVT = N->getValueType(0);

	  // Prefer the fused multiply-add-sub form when it applies.
	  SDValue Opnd2;
	  if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) {
	    return DAG.getNode(X86ISD::FMADDSUB, DL, ResultVT, Opnd0, Opnd1, Opnd2);
	  }

	  // Do not generate X86ISD::ADDSUB node for 512-bit types even though
	  // the ADDSUB idiom has been successfully recognized. There are no known
	  // X86 targets with 512-bit ADDSUB instructions!
	  return ResultVT.is512BitVector()
	             ? SDValue()
	             : DAG.getNode(X86ISD::ADDSUB, DL, ResultVT, Opnd0, Opnd1);
	}

	// Rewrite a two-source shuffle whose sources are both (concat_vectors X, undef)
	// -- each X being half as wide as the result -- into a single-source shuffle
	// of (concat_vectors X0, X1). AVX2 has VPERMD/Q, so the single-source form is
	// preferable. Returns an empty SDValue when the pattern does not apply.
	static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
	                                           const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX2())
	    return SDValue();

	  ShuffleVectorSDNode *Shuf = dyn_cast<ShuffleVectorSDNode>(N);
	  if (!Shuf)
	    return SDValue();

	  // Only 128/256-bit vectors of 32/64-bit elements are of interest.
	  EVT VT = N->getValueType(0);
	  const bool IsSupportedWidth = VT.is128BitVector() || VT.is256BitVector();
	  if (!IsSupportedWidth)
	    return SDValue();

	  EVT EltVT = VT.getVectorElementType();
	  const bool IsSupportedElt = EltVT == MVT::i32 || EltVT == MVT::i64 ||
	                              EltVT == MVT::f32 || EltVT == MVT::f64;
	  if (!IsSupportedElt)
	    return SDValue();

	  // Both shuffle inputs must be a two-operand concat whose upper half is undef.
	  auto IsConcatWithUndef = [](SDValue V) {
	    return V.getOpcode() == ISD::CONCAT_VECTORS && V.getNumOperands() == 2 &&
	           V.getOperand(1).isUndef();
	  };
	  SDValue Src0 = N->getOperand(0);
	  SDValue Src1 = N->getOperand(1);
	  if (!IsConcatWithUndef(Src0) || !IsConcatWithUndef(Src1))
	    return SDValue();

	  // Build the new mask: indices into the first source are kept as they are,
	  // while indices into the second source move down by half a vector because
	  // they no longer have to skip the first source's undef half.
	  const int NumElts = VT.getVectorNumElements();
	  const int HalfElts = NumElts / 2;
	  SmallVector<int, 8> NewMask;
	  for (int M : Shuf->getMask()) {
	    if (M >= NumElts)
	      M -= HalfElts;
	    NewMask.push_back(M);
	  }

	  SDLoc DL(N);
	  SDValue Merged = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src0.getOperand(0),
	                               Src1.getOperand(0));
	  return DAG.getVectorShuffle(VT, DL, Merged, DAG.getUNDEF(VT), NewMask);
	}

	/// DAG combine entry point for shuffle nodes.
	///
	/// The following folds are attempted, in order:
	///  1. a blend of FADD/FSUB into ADDSUB / FMADDSUB (only if VT is legal);
	///  2. shuffle(bitcast(binop A, B)) into shuffle(binop(bitcast A, bitcast B));
	///  3. a shuffle of consecutive scalar loads into one wide load;
	///  4. a shuffle of two (concat X, undef) inputs into a single-source shuffle;
	///  5. for target shuffle opcodes, combineTargetShuffle followed by the
	///     recursive x86 shuffle combiner.
	///
	/// \returns the replacement value, or an empty SDValue when nothing was folded
	/// here (this includes the case where the recursive combiner succeeded, since
	/// it replaces N itself via CombineTo).
	static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI,
	                              const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  // If we have legalized the vector types, look for blends of FADD and FSUB
	  // nodes that we can fuse into an ADDSUB node.
	  if (TLI.isTypeLegal(VT))
	    if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
	      return AddSub;

	  // During Type Legalization, when promoting illegal vector types,
	  // the backend might introduce new shuffle dag nodes and bitcasts.
	  //
	  // This code performs the following transformation:
	  // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
	  //       (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
	  //
	  // We do this only if both the bitcast and the BINOP dag nodes have
	  // one use. Also, perform this transformation only if the new binary
	  // operation is legal. This is to avoid introducing dag nodes that
	  // potentially need to be further expanded (or custom lowered) into a
	  // less optimal sequence of dag nodes.
	  if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
	      N->getOpcode() == ISD::VECTOR_SHUFFLE &&
	      N->getOperand(0).getOpcode() == ISD::BITCAST &&
	      N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
	    SDValue N0 = N->getOperand(0);
	    SDValue N1 = N->getOperand(1);

	    SDValue BC0 = N0.getOperand(0);
	    EVT SVT = BC0.getValueType();
	    unsigned Opcode = BC0.getOpcode();
	    unsigned NumElts = VT.getVectorNumElements();

	    // The bitcast source must have exactly half as many (twice as wide)
	    // elements as the shuffle result.
	    if (BC0.hasOneUse() && SVT.isVector() &&
	        SVT.getVectorNumElements() * 2 == NumElts &&
	        TLI.isOperationLegal(Opcode, VT)) {
	      bool CanFold = false;
	      switch (Opcode) {
	      default : break;
	      case ISD::ADD:
	      case ISD::SUB:
	      case ISD::MUL:
	        // isOperationLegal lies for integer ops on floating point types.
	        CanFold = VT.isInteger();
	        break;
	      case ISD::FADD:
	      case ISD::FSUB:
	      case ISD::FMUL:
	        // isOperationLegal lies for floating point ops on integer types.
	        CanFold = VT.isFloatingPoint();
	        break;
	      }

	      // The mask must select every even element into the low half of the
	      // result and leave the whole upper half undefined.
	      unsigned SVTNumElts = SVT.getVectorNumElements();
	      ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
	      for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
	        CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
	      for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
	        CanFold = SVOp->getMaskElt(i) < 0;

	      if (CanFold) {
	        SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
	        SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
	        SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
	        return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
	      }
	    }
	  }

	  // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
	  // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
	  // consecutive, non-overlapping, and in the right order.
	  // Elts stays empty unless every result element resolves to a scalar.
	  SmallVector<SDValue, 16> Elts;
	  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
	    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
	      Elts.push_back(Elt);
	      continue;
	    }
	    Elts.clear();
	    break;
	  }

	  if (Elts.size() == VT.getVectorNumElements())
	    if (SDValue LD =
	            EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
	      return LD;

	  // For AVX2, we sometimes want to combine
	  // (vector_shuffle <mask> (concat_vectors t1, undef)
	  //                        (concat_vectors t2, undef))
	  // Into:
	  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
	  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
	  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
	    return ShufConcat;

	  if (isTargetShuffle(N->getOpcode())) {
	    SDValue Op(N, 0);
	    if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
	      return Shuffle;

	    // Try recursively combining arbitrary sequences of x86 shuffle
	    // instructions into higher-order shuffles. We do this after combining
	    // specific PSHUF instruction sequences into their minimal form so that we
	    // can evaluate how many specialized shuffle instructions are involved in
	    // a particular chain.
	    SmallVector<int, 1> NonceMask; // Just a placeholder.
	    NonceMask.push_back(0);
	    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
	                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
	                                      DCI, Subtarget))
	      return SDValue(); // This routine will use CombineTo to replace N.
	  }

	  return SDValue();
	}

	/// Check if a vector extract from a target-specific shuffle of a load can be
	/// folded into a single element load.
	/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
	/// shuffles have been custom lowered so we need to handle those here.
	///
	/// \param N   The EXTRACT_VECTOR_ELT node (operand 0 is the vector, operand 1
	///            the element index).
	/// \param DAG Used to build replacement nodes.
	/// \param DCI Supplies the combine phase; nothing is done before op
	///            legalization.
	/// \returns a zero constant or undef when the selected mask entry is one of
	/// the sentinels, otherwise an extract from a rebuilt generic vector_shuffle
	/// (which the generic combiner is expected to finish), or an empty SDValue if
	/// any precondition fails.
	static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
	                                                TargetLowering::DAGCombinerInfo &DCI) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue InVec = N->getOperand(0);
	  SDValue EltNo = N->getOperand(1);
	  EVT EltVT = N->getValueType(0);

	  if (!isa<ConstantSDNode>(EltNo))
	    return SDValue();

	  EVT OriginalVT = InVec.getValueType();

	  // Peek through bitcasts, don't duplicate a load with other uses.
	  InVec = peekThroughOneUseBitcasts(InVec);

	  // The element index is only meaningful for the peeked-through vector if the
	  // element count did not change.
	  EVT CurrentVT = InVec.getValueType();
	  if (!CurrentVT.isVector() ||
	      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
	    return SDValue();

	  if (!isTargetShuffle(InVec.getOpcode()))
	    return SDValue();

	  // Don't duplicate a load with other uses.
	  if (!InVec.hasOneUse())
	    return SDValue();

	  SmallVector<int, 16> ShuffleMask;
	  SmallVector<SDValue, 2> ShuffleOps;
	  bool UnaryShuffle;
	  if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
	                            ShuffleOps, ShuffleMask, UnaryShuffle))
	    return SDValue();

	  // Select the input vector, guarding against out of range extract vector.
	  // The index is compared unsigned and with >= so that neither
	  // EltIdx == NumElems nor a huge constant (which would turn negative if
	  // narrowed to int) can index outside ShuffleMask.
	  unsigned NumElems = CurrentVT.getVectorNumElements();
	  uint64_t EltIdx = cast<ConstantSDNode>(EltNo)->getZExtValue();
	  int Idx = (EltIdx >= NumElems) ? SM_SentinelUndef : ShuffleMask[EltIdx];

	  if (Idx == SM_SentinelZero)
	    return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
	                             : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
	  if (Idx == SM_SentinelUndef)
	    return DAG.getUNDEF(EltVT);

	  assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
	  SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
	                                         : ShuffleOps[1];

	  // If inputs to shuffle are the same for both ops, then allow 2 uses
	  unsigned AllowedUses =
	      (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;

	  if (LdNode.getOpcode() == ISD::BITCAST) {
	    // Don't duplicate a load with other uses.
	    if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
	      return SDValue();

	    AllowedUses = 1; // only allow 1 load use if we have a bitcast
	    LdNode = LdNode.getOperand(0);
	  }

	  if (!ISD::isNormalLoad(LdNode.getNode()))
	    return SDValue();

	  // cast<> asserts on failure instead of returning null, so no null check is
	  // needed on LN0.
	  LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);

	  if (!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
	    return SDValue();

	  // If there's a bitcast before the shuffle, check if the load type and
	  // alignment is valid.
	  unsigned Align = LN0->getAlignment();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
	      EltVT.getTypeForEVT(*DAG.getContext()));

	  if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
	    return SDValue();

	  // All checks match so transform back to vector_shuffle so that DAG combiner
	  // can finish the job
	  SDLoc dl(N);

	  // Create shuffle node taking into account the case that its a unary shuffle
	  SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
	  Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
	                                 ShuffleMask);
	  Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
	                     EltNo);
	}

	// Try to match patterns such as
	//   (i16 bitcast (v16i1 x))
	// ->
	//   (i16 movmsk (v16i8 sext (v16i1 x)))
	// before the illegal vector is scalarized on subtargets that don't have legal
	// vxi1 types.
	//
	// Returns the MOVMSK result zero-extended or truncated to the bitcast's type,
	// or an empty SDValue when the pattern is not handled.
	static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
	                                  const X86Subtarget &Subtarget) {
	  SDValue Src = BitCast.getOperand(0);
	  EVT ResVT = BitCast.getValueType();
	  EVT BoolVecVT = Src->getValueType(0);

	  if (!(ResVT.isScalarInteger() && BoolVecVT.isSimple()))
	    return SDValue();

	  // With AVX512 vxi1 types are legal and we prefer using k-regs.
	  // MOVMSK is supported in SSE2 or later.
	  if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
	    return SDValue();

	  // True when the mask comes from a setcc on 256-bit operands and the
	  // subtarget reports hasInt256().
	  auto IsWideCompare = [&]() {
	    return Src->getOpcode() == ISD::SETCC &&
	           Src->getOperand(0)->getValueType(0).is256BitVector() &&
	           Subtarget.hasInt256();
	  };

	  // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
	  // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
	  // v8i16 and v16i16.
	  // For these two cases, we can shuffle the upper element bytes to a
	  // consecutive sequence at the start of the vector and treat the results as
	  // v16i8 or v32i8, and for v8i16 (treated as v16i8) this is the preferable
	  // solution. However, for v16i16 this is not the case, because the shuffle is
	  // expensive, so we avoid sign-extending to this type entirely.
	  // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
	  // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
	  MVT SExtVT;
	  MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
	  switch (BoolVecVT.getSimpleVT().SimpleTy) {
	  case MVT::v2i1:
	    SExtVT = MVT::v2i64;
	    FPCastVT = MVT::v2f64;
	    break;
	  case MVT::v4i1:
	    // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
	    // sign-extend to a 256-bit operation to avoid truncation.
	    if (IsWideCompare()) {
	      SExtVT = MVT::v4i64;
	      FPCastVT = MVT::v4f64;
	    } else {
	      SExtVT = MVT::v4i32;
	      FPCastVT = MVT::v4f32;
	    }
	    break;
	  case MVT::v8i1:
	    // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
	    // sign-extend to a 256-bit operation to match the compare.
	    // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
	    // 256-bit because the shuffle is cheaper than sign extending the result
	    // of the compare.
	    if (IsWideCompare()) {
	      SExtVT = MVT::v8i32;
	      FPCastVT = MVT::v8f32;
	    } else {
	      SExtVT = MVT::v8i16;
	    }
	    break;
	  case MVT::v16i1:
	    // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
	    // it is not profitable to sign-extend to 256-bit because this will
	    // require an extra cross-lane shuffle which is more expensive than
	    // truncating the result of the compare to 128-bits.
	    SExtVT = MVT::v16i8;
	    break;
	  case MVT::v32i1:
	    // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
	    if (!Subtarget.hasInt256())
	      return SDValue();
	    SExtVT = MVT::v32i8;
	    break;
	  default:
	    return SDValue();
	  }

	  SDLoc DL(BitCast);
	  SDValue Mask = DAG.getSExtOrTrunc(Src, DL, SExtVT);
	  if (SExtVT != MVT::v8i16) {
	    assert(SExtVT.getScalarType() != MVT::i16 &&
	           "Vectors of i16 must be shuffled");
	  } else {
	    // There is no i16 MOVMSK: move one byte of every i16 lane into the low
	    // eight bytes and continue as v16i8.
	    Mask = DAG.getBitcast(MVT::v16i8, Mask);
	    Mask = DAG.getVectorShuffle(
	        MVT::v16i8, DL, Mask, DAG.getUNDEF(MVT::v16i8),
	        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
	  }

	  if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
	    Mask = DAG.getBitcast(FPCastVT, Mask);

	  SDValue MoveMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Mask);
	  return DAG.getZExtOrTrunc(MoveMask, DL, ResVT);
	}

	/// DAG combine for ISD::BITCAST nodes.
	///
	/// Handles, in order: the vXi1 -> MOVMSK pattern (before legalization only),
	/// several bitcasts to x86mmx, and bitcasts of an integer AND/OR/XOR with one
	/// bitcasted f32/f64 operand, which become X86ISD::FAND/FOR/FXOR.
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI,
	                              const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // Try to match patterns such as
	  //   (i16 bitcast (v16i1 x))
	  // ->
	  //   (i16 movmsk (v16i8 sext (v16i1 x)))
	  // before the setcc result is scalarized on subtargets that don't have legal
	  // vxi1 types.
	  if (DCI.isBeforeLegalize()) {
	    SDValue MoveMask = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget);
	    if (MoveMask)
	      return MoveMask;
	  }

	  const unsigned SrcOpc = Src.getOpcode();

	  // Since MMX types are special and don't usually play with other vector
	  // types, it's better to handle them early to be sure we emit efficient code
	  // by avoiding store-load conversions.
	  if (VT == MVT::x86mmx) {
	    // Bitcast of (build_vector i32 x, 0) to x86mmx: move x into the low word.
	    if (SrcOpc == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32 &&
	        isNullConstant(Src.getOperand(1))) {
	      SDValue Low = Src->getOperand(0);
	      if (Low.getValueType() == MVT::i32)
	        return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(Low), VT, Low);
	    }

	    // Bitcast of element 0 / subvector 0 extracted from a 128-bit vector.
	    const bool IsExtract = SrcOpc == ISD::EXTRACT_VECTOR_ELT ||
	                           SrcOpc == ISD::EXTRACT_SUBVECTOR;
	    if (IsExtract && isNullConstant(Src.getOperand(1))) {
	      SDValue Vec = Src->getOperand(0);
	      if (Vec.getValueType().is128BitVector())
	        return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(Vec), VT,
	                           DAG.getBitcast(MVT::v2i64, Vec));
	    }

	    // Bitcast of a v2i32 FP_TO_SINT: widen with undef, then MOVDQ2Q.
	    if (SrcVT == MVT::v2i32 && SrcOpc == ISD::FP_TO_SINT) {
	      SDLoc DL(Src);
	      SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, Src,
	                                 DAG.getUNDEF(MVT::v2i32));
	      return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
	                         DAG.getBitcast(MVT::v2i64, Wide));
	    }
	  }

	  // Convert a bitcasted integer logic operation that has one bitcasted
	  // floating-point operand into a floating-point logic operation. This may
	  // create a load of a constant, but that is cheaper than materializing the
	  // constant in an integer register and transferring it to an SSE register or
	  // transferring the SSE operand to integer register and back.
	  unsigned FPOpcode;
	  if (SrcOpc == ISD::AND)
	    FPOpcode = X86ISD::FAND;
	  else if (SrcOpc == ISD::OR)
	    FPOpcode = X86ISD::FOR;
	  else if (SrcOpc == ISD::XOR)
	    FPOpcode = X86ISD::FXOR;
	  else
	    return SDValue();

	  const bool IsF32WithSSE1 = Subtarget.hasSSE1() && VT == MVT::f32;
	  const bool IsF64WithSSE2 = Subtarget.hasSSE2() && VT == MVT::f64;
	  if (!IsF32WithSSE1 && !IsF64WithSSE2)
	    return SDValue();

	  SDValue LogicOp0 = Src.getOperand(0);
	  SDValue LogicOp1 = Src.getOperand(1);
	  SDLoc LogicDL(Src);

	  // Both folds below need the logic op to have a single use.
	  if (!Src.hasOneUse())
	    return SDValue();

	  // Folds logic(bitcast(X), Y) into logic'(X, bitcast(Y)) when 'Casted' is a
	  // one-use bitcast of a non-constant value that already has type VT.
	  auto FoldThroughBitcast = [&](SDValue Casted, SDValue Other) {
	    if (Casted.getOpcode() != ISD::BITCAST || !Casted.hasOneUse())
	      return SDValue();
	    SDValue Inner = Casted.getOperand(0);
	    if (Inner.getValueType() != VT || isa<ConstantSDNode>(Inner))
	      return SDValue();
	    SDValue CastedOther = DAG.getBitcast(VT, Other);
	    return DAG.getNode(FPOpcode, LogicDL, VT, Inner, CastedOther);
	  };

	  // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
	  if (SDValue Folded = FoldThroughBitcast(LogicOp0, LogicOp1))
	    return Folded;

	  // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
	  return FoldThroughBitcast(LogicOp1, LogicOp0);
	}

	// Match a binop + shuffle pyramid that represents a horizontal reduction over
	// the elements of a vector.
	//
	// Extract must be an EXTRACT_VECTOR_ELT of element 0, and BinOp is the opcode
	// expected at every level of the pyramid.
	// Returns the vector that is being reduced on, or SDValue() if a reduction
	// was not matched.
	static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
	  // The pattern must end in an extract from index 0.
	  const bool ExtractsLaneZero =
	      Extract->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	      isNullConstant(Extract->getOperand(1));
	  if (!ExtractsLaneZero)
	    return SDValue();

	  SDValue Cur = Extract->getOperand(0);
	  const unsigned NumStages =
	      Log2_32(Cur.getValueType().getVectorNumElements());

	  // Walking up from the extract, every stage should look like:
	  //   %s = shufflevector <8 x i32> %op, <8 x i32> undef,
	  //                      <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
	  //                                 i32 undef, i32 undef, i32 undef, i32 undef>
	  //   %a = binop <8 x i32> %op, %s
	  // with a mask that depends on the stage. E.g. a 3-stage pyramid uses, from
	  // the widest level down to the one next to the extract:
	  //   <4,5,6,7,u,u,u,u>
	  //   <2,3,u,u,u,u,u,u>
	  //   <1,u,u,u,u,u,u,u>
	  for (unsigned Stage = 0; Stage != NumStages; ++Stage) {
	    if (Cur.getOpcode() != BinOp)
	      return SDValue();

	    // One operand has to be the shuffle; the first operand is tried first.
	    SDValue LHS = Cur.getOperand(0);
	    SDValue RHS = Cur.getOperand(1);
	    ShuffleVectorSDNode *Shuf = dyn_cast<ShuffleVectorSDNode>(LHS.getNode());
	    if (Shuf) {
	      Cur = RHS;
	    } else {
	      Shuf = dyn_cast<ShuffleVectorSDNode>(RHS.getNode());
	      Cur = LHS;
	    }

	    // The shuffle must read from the binop's other operand.
	    if (!Shuf || Shuf->getOperand(0) != Cur)
	      return SDValue();

	    // At this stage the low (1 << Stage) lanes must come from the lanes
	    // directly above them.
	    const int HalfWidth = 1 << Stage;
	    for (int Lane = 0; Lane != HalfWidth; ++Lane)
	      if (Shuf->getMaskElt(Lane) != HalfWidth + Lane)
	        return SDValue();
	  }

	  return Cur;
	}

	// Given a select, detect the following pattern:
	// 1: %2 = zext <N x i8> %0 to <N x i32>
	// 2: %3 = zext <N x i8> %1 to <N x i32>
	// 3: %4 = sub nsw <N x i32> %2, %3
	// 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
	// 5: %6 = sub nsw <N x i32> zeroinitializer, %4
	// 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
	// This is useful as it is the input into a SAD pattern.
	static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
	SDValue &Op1) {
	// Check the condition of the select instruction is greater-than.
	SDValue SetCC = Select->getOperand(0);
	if (SetCC.getOpcode() != ISD::SETCC)
	return false;
	ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
	if (CC != ISD::SETGT && CC != ISD::SETLT)
	return false;

	SDValue SelectOp1 = Select->getOperand(1);
	SDValue SelectOp2 = Select->getOperand(2);

	// The following instructions assume SelectOp1 is the subtraction operand
	// and SelectOp2 is the negation operand.
	// In the case of SETLT this is the other way around.
	if (CC == ISD::SETLT)
	std::swap(SelectOp1, SelectOp2);

	// The second operand of the select should be the negation of the first
	// operand, which is implemented as 0 - SelectOp1.
	if (!(SelectOp2.getOpcode() == ISD::SUB &&
	ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
	SelectOp2.getOperand(1) == SelectOp1))
	return false;

	// The first operand of SetCC is the first operand of the select, which is the
	// difference between the two input vectors.
	if (SetCC.getOperand(0) != SelectOp1)
	return false;

	// In SetLT case, The second operand of the comparison can be either 1 or 0.
	APInt SplatVal;
	if ((CC == ISD::SETLT) &&
	!((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
	SplatVal == 1) \|\|
	(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
	return false;

	// In SetGT case, The second operand of the comparison can be either -1 or 0.
	if ((CC == ISD::SETGT) &&
	!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) \|\|
	ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
	return false;

	// The first operand of the select is the difference between the two input
	// vectors.
	if (SelectOp1.getOpcode() != ISD::SUB)
	return false;

	Op0 = SelectOp1.getOperand(0);
	Op1 = SelectOp1.getOperand(1);

	// Check if the operands of the sub are zero-extended from vectors of i8.
	if (Op0.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 \|\|
	Op1.getOpcode() != ISD::ZERO_EXTEND \|\|
	Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
	return false;

	return true;
	}

	// Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
	// to these zexts. The result is a vector of i64 that is as wide as the chosen
	// register size (at least 128 bits).
	static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
	                            const SDValue &Zext1, const SDLoc &DL) {
	  // Pick the register width for the PSADBW: at least 128 bits, or the width
	  // of the i8 input vector if that is larger.
	  EVT NarrowVT = Zext0.getOperand(0).getValueType();
	  const unsigned NarrowBits = NarrowVT.getSizeInBits();
	  const unsigned RegSize = std::max(128u, NarrowBits);
	  MVT WideVT = MVT::getVectorVT(MVT::i8, RegSize / 8);

	  // "Zero-extend" the i8 vectors. This is not a per-element zext; the input
	  // becomes the lowest part of the register and all other parts are zero.
	  const unsigned NumParts = RegSize / NarrowBits;
	  SmallVector<SDValue, 16> Parts(NumParts, DAG.getConstant(0, DL, NarrowVT));
	  auto PadToRegister = [&](const SDValue &Zext) {
	    Parts[0] = Zext.getOperand(0);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Parts);
	  };
	  SDValue SadOp0 = PadToRegister(Zext0);
	  SDValue SadOp1 = PadToRegister(Zext1);

	  // Actually build the SAD, which has one i64 element per 64 bits.
	  MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
	  return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
	}

	// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
	//
	// Extract is the extract node that terminates the reduction. Returns an
	// all-ones / zero value of the extract's type that is selected by comparing
	// the MOVMSK result, or an empty SDValue if the pattern is not matched.
	static SDValue combineHorizontalPredicateResult(SDNode *Extract,
	                                                SelectionDAG &DAG,
	                                                const X86Subtarget &Subtarget) {
	  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
	  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
	    return SDValue();

	  EVT ExtractVT = Extract->getValueType(0);
	  unsigned BitWidth = ExtractVT.getSizeInBits();
	  const bool IsSupportedScalar = ExtractVT == MVT::i64 ||
	                                 ExtractVT == MVT::i32 ||
	                                 ExtractVT == MVT::i16 || ExtractVT == MVT::i8;
	  if (!IsSupportedScalar)
	    return SDValue();

	  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
	  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
	    SDValue Match = matchBinOpReduction(Extract, Op);
	    if (!Match)
	      continue;

	    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
	    // which we can't support here for now.
	    if (Match.getScalarValueSizeInBits() != BitWidth)
	      continue;

	    // 128-bit vectors are always accepted. 256-bit vectors need AVX2, or AVX
	    // when the elements are at least 32 bits wide (v16i16/v32i8 go through
	    // PMOVMSKB, which requires AVX2).
	    const unsigned VecBits = Match.getValueSizeInBits();
	    bool IsSupportedSize = VecBits == 128;
	    if (VecBits == 256)
	      IsSupportedSize =
	          (Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2();
	    if (!IsSupportedSize)
	      return SDValue();

	    // Don't bother performing this for 2-element vectors.
	    if (Match.getValueType().getVectorNumElements() <= 2)
	      return SDValue();

	    // Check that we are extracting a reduction of all sign bits.
	    if (DAG.ComputeNumSignBits(Match) != BitWidth)
	      return SDValue();

	    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
	    const bool UseFPMask = BitWidth == 64 || BitWidth == 32;
	    MVT MaskVT =
	        UseFPMask ? MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
	                                     VecBits / BitWidth)
	                  : MVT::getVectorVT(MVT::i8, VecBits / 8);

	    // any_of -> MOVMSK != 0
	    // all_of -> MOVMSK == ((1 << NumElts) - 1)
	    const bool IsAnyOf = Op == ISD::OR;
	    APInt CompareBits =
	        IsAnyOf ? APInt::getNullValue(32)
	                : APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
	    ISD::CondCode CondCode =
	        IsAnyOf ? ISD::CondCode::SETNE : ISD::CondCode::SETEQ;

	    // Perform the select as i32/i64 and then truncate to avoid partial
	    // register stalls.
	    unsigned ResWidth = std::max(BitWidth, 32u);
	    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
	    SDLoc DL(Extract);
	    SDValue Zero = DAG.getConstant(0, DL, ResVT);
	    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
	    SDValue MaskVec = DAG.getBitcast(MaskVT, Match);
	    SDValue MoveMask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, MaskVec);
	    SDValue Expected = DAG.getConstant(CompareBits, DL, MVT::i32);
	    SDValue Selected =
	        DAG.getSelectCC(DL, MoveMask, Expected, Ones, Zero, CondCode);
	    return DAG.getSExtOrTrunc(Selected, DL, ExtractVT);
	  }

	  return SDValue();
	}

	/// Detect a horizontal add reduction over the absolute differences of two
	/// zero-extended i8 vectors and replace it with an X86ISD::PSADBW based
	/// sequence.
	///
	/// \param Extract The extract node that terminates the reduction.
	/// \returns an extract (at the original index) from the SAD result bitcast to
	/// a vector of the extract's type, or an empty SDValue if there is no match.
	static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  // PSADBW is only supported on SSE2 and up.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // The vector we extract from must have simple type with elements wider than
	  // 16 bits.
	  EVT VT = Extract->getOperand(0).getValueType();
	  if (!VT.isSimple() || VT.getVectorElementType().getSizeInBits() <= 16)
	    return SDValue();

	  const unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);

	  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
	  // TODO: We should be able to handle larger vectors by splitting them before
	  // feeding them into several SADs, and then reducing over those.
	  if (RegSize / VT.getVectorNumElements() < 8)
	    return SDValue();

	  // Match shuffle + add pyramid.
	  SDValue Root = matchBinOpReduction(Extract, ISD::ADD);

	  // The operand is expected to be zero extended from i8 (verified in
	  // detectZextAbsDiff). To reach i64 and above an additional any/zero/sign
	  // extend is expected. A zero extend from 32 bits has no mathematical effect
	  // on the result, and a sign extend behaves like a zero extend here because
	  // the sign bit is zero, so the extend can simply be skipped.
	  if (Root) {
	    const unsigned RootOpc = Root.getOpcode();
	    if (RootOpc == ISD::SIGN_EXTEND || RootOpc == ISD::ZERO_EXTEND ||
	        RootOpc == ISD::ANY_EXTEND)
	      Root = Root.getOperand(0);
	  }

	  // If there was a match, we want Root to be a select that is the root of an
	  // abs-diff pattern.
	  if (!Root || Root.getOpcode() != ISD::VSELECT)
	    return SDValue();

	  // Check whether we have an abs-diff pattern feeding into the select.
	  SDValue Zext0, Zext1;
	  if (!detectZextAbsDiff(Root, Zext0, Zext1))
	    return SDValue();

	  // Create the SAD instruction.
	  SDLoc DL(Extract);
	  SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
	  MVT SadVT = SAD.getSimpleValueType();

	  // If the original vector was wider than 8 elements, sum over the results
	  // in the SAD vector with a shuffle + add pyramid, halving the number of
	  // lanes that are folded in at every step.
	  const unsigned Stages = Log2_32(VT.getVectorNumElements());
	  if (Stages > 3) {
	    const unsigned SadElems = SadVT.getVectorNumElements();
	    for (unsigned Half = 1u << (Stages - 4); Half != 0; Half >>= 1) {
	      SmallVector<int, 16> Mask(SadElems, -1);
	      for (unsigned Lane = 0; Lane != Half; ++Lane)
	        Mask[Lane] = Half + Lane;

	      SDValue Shuffle =
	          DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
	      SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
	    }
	  }

	  // Reinterpret the SAD vector with the extract's scalar type as element type
	  // and extract at the original index.
	  MVT ScalarVT = Extract->getSimpleValueType(0);
	  const unsigned ScalarBits = ScalarVT.getSizeInBits();
	  MVT ResVT = MVT::getVectorVT(ScalarVT, SadVT.getSizeInBits() / ScalarBits);
	  SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
	  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SAD,
	                     Extract->getOperand(1));
	}

	// Attempt to peek through a target shuffle and extract the scalar from the
	// source.
	//
	// N is the extract node (operand 0 is the vector, operand 1 the index).
	// Nothing is done before op legalization, for i1 vectors, or for non-constant
	// indices. Returns undef / zero when the selected mask entry is a sentinel, an
	// extract (or PEXTRW / PEXTRB) from the resolved shuffle input when the source
	// type and SSE level permit it, and an empty SDValue otherwise.
	static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
	                                         TargetLowering::DAGCombinerInfo &DCI,
	                                         const X86Subtarget &Subtarget) {
	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue Src = N->getOperand(0);
	  SDValue Idx = N->getOperand(1);

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcSVT = SrcVT.getVectorElementType();
	  unsigned NumSrcElts = SrcVT.getVectorNumElements();

	  // Don't attempt this for boolean mask vectors or unknown extraction indices.
	  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
	    return SDValue();

	  // Resolve the target shuffle inputs and mask.
	  SmallVector<int, 16> Mask;
	  SmallVector<SDValue, 2> Ops;
	  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
	    return SDValue();

	  // Attempt to narrow/widen the shuffle mask to the correct size.
	  if (Mask.size() != NumSrcElts) {
	    if ((NumSrcElts % Mask.size()) == 0) {
	      SmallVector<int, 16> ScaledMask;
	      int Scale = NumSrcElts / Mask.size();
	      scaleShuffleMask(Scale, Mask, ScaledMask);
	      Mask = std::move(ScaledMask);
	    } else if ((Mask.size() % NumSrcElts) == 0) {
	      SmallVector<int, 16> WidenedMask;
	      while (Mask.size() > NumSrcElts &&
	             canWidenShuffleElements(Mask, WidenedMask))
	        Mask = std::move(WidenedMask);
	      // TODO - investigate support for wider shuffle masks with known upper
	      // undef/zero elements for implicit zero-extension.
	    }
	  }

	  // Check if narrowing/widening failed.
	  if (Mask.size() != NumSrcElts)
	    return SDValue();

	  // A constant extraction index is not guaranteed to be in range; leave such
	  // nodes alone instead of reading past the end of the mask.
	  uint64_t ExtractIdx = N->getConstantOperandVal(1);
	  if (ExtractIdx >= Mask.size())
	    return SDValue();

	  int SrcIdx = Mask[ExtractIdx];
	  SDLoc dl(N);

	  // If the shuffle source element is undef/zero then we can just accept it.
	  if (SrcIdx == SM_SentinelUndef)
	    return DAG.getUNDEF(VT);

	  if (SrcIdx == SM_SentinelZero)
	    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
	                                : DAG.getConstant(0, dl, VT);

	  // Split the mask entry into (which shuffle input, element in that input).
	  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
	  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
	  SrcIdx = SrcIdx % Mask.size();

	  // We can only extract other elements from 128-bit vectors and in certain
	  // circumstances, depending on SSE-level.
	  // TODO: Investigate using extract_subvector for larger vectors.
	  // TODO: Investigate float/double extraction if it will be just stored.
	  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
	      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
	    assert(SrcSVT == VT && "Unexpected extraction type");
	    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
	                       DAG.getIntPtrConstant(SrcIdx, dl));
	  }

	  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
	      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
	    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
	           "Unexpected extraction type");
	    // The extract node is created with an i32 result; the AssertZext records
	    // that only the low SrcSVT bits of it can be non-zero.
	    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
	    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
	                                DAG.getIntPtrConstant(SrcIdx, dl));
	    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
	                                 DAG.getValueType(SrcSVT));
	    return DAG.getZExtOrTrunc(Assert, dl, VT);
	  }

	  return SDValue();
	}

	/// Detect vector gather/scatter index generation and convert it from being a
	/// bunch of shuffles and extracts into a somewhat faster sequence.
	/// For i686, the best sequence is apparently storing the value and loading
	/// scalars back, while for x64 we should use 64-bit extracts and shifts.
	///
	/// Operand 0 of \p N is read as the input vector and operand 1 as the element
	/// index. Before the gather/scatter rewrite a number of cheaper folds are
	/// tried in order: shuffle-into-load, extract-through-shuffle, the two MMX
	/// bitcast patterns, constant folding of an i1 extract from a bitcast
	/// constant, the SAD pattern and the horizontal-predicate (MOVMSK) pattern.
	///
	/// \returns the replacement value for \p N, or an empty SDValue when no
	/// value is handed back to the caller. Note that the v4i32 rewrite at the end
	/// replaces the extract uses in place and also returns an empty SDValue.
	static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
	                                       TargetLowering::DAGCombinerInfo &DCI,
	                                       const X86Subtarget &Subtarget) {
	  if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
	    return NewOp;

	  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
	    return NewOp;

	  SDValue InputVector = N->getOperand(0);
	  SDValue EltIdx = N->getOperand(1);

	  EVT SrcVT = InputVector.getValueType();
	  EVT VT = N->getValueType(0);
	  SDLoc dl(InputVector);

	  // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getBitcast(VT, InputVector);
	  }

	  // Detect mmx to i32 conversion through a v2i32 elt extract.
	  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
	      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
	    SDValue MMXSrc = InputVector.getOperand(0);

	    // The bitcast source is a direct mmx result.
	    if (MMXSrc.getValueType() == MVT::x86mmx)
	      return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
	  }

	  // Extracting a constant-indexed i1 from a bitcast scalar constant folds to
	  // the corresponding bit of that constant.
	  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
	      isa<ConstantSDNode>(EltIdx) &&
	      isa<ConstantSDNode>(InputVector.getOperand(0))) {
	    uint64_t ExtractedElt = N->getConstantOperandVal(1);
	    uint64_t InputValue = InputVector.getConstantOperandVal(0);
	    uint64_t Res = (InputValue >> ExtractedElt) & 1;
	    return DAG.getConstant(Res, dl, MVT::i1);
	  }

	  // Check whether this extract is the root of a sum of absolute differences
	  // pattern. This has to be done here because we really want it to happen
	  // pre-legalization,
	  if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
	    return SAD;

	  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
	  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
	    return Cmp;

	  // Only operate on vectors of 4 elements, where the alternative shuffling
	  // gets to be more expensive.
	  if (SrcVT != MVT::v4i32)
	    return SDValue();

	  // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
	  // single use which is a sign-extend or zero-extend, and all elements are
	  // used.
	  SmallVector<SDNode *, 4> Uses;
	  unsigned ExtractedElements = 0;
	  for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
	                            UE = InputVector.getNode()->use_end();
	       UI != UE; ++UI) {
	    if (UI.getUse().getResNo() != InputVector.getResNo())
	      return SDValue();

	    SDNode *Extract = *UI;
	    if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
	      return SDValue();

	    if (Extract->getValueType(0) != MVT::i32)
	      return SDValue();
	    if (!Extract->hasOneUse())
	      return SDValue();
	    if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
	        Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
	      return SDValue();
	    if (!isa<ConstantSDNode>(Extract->getOperand(1)))
	      return SDValue();

	    // An out-of-range constant index would make the shift below undefined
	    // and would later index past the end of Vals[]; give up instead.
	    uint64_t ExtractIdx = Extract->getConstantOperandVal(1);
	    if (ExtractIdx >= 4)
	      return SDValue();

	    // Record which element was extracted.
	    ExtractedElements |= 1u << ExtractIdx;
	    Uses.push_back(Extract);
	  }

	  // If not all the elements were used, this may not be worthwhile.
	  if (ExtractedElements != 15)
	    return SDValue();

	  // Ok, we've now decided to do the transformation.
	  // If 64-bit shifts are legal, use the extract-shift sequence,
	  // otherwise bounce the vector off the cache.
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  SDValue Vals[4];

	  if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
	    // View the v4i32 as v2i64: each 64-bit half carries two elements, the
	    // low one recovered by truncation and the high one by shifting right 32.
	    SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
	    auto &DL = DAG.getDataLayout();
	    EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
	    SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                     DAG.getConstant(0, dl, VecIdxTy));
	    SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
	                                  DAG.getConstant(1, dl, VecIdxTy));

	    SDValue ShAmt = DAG.getConstant(
	        32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
	    Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
	    Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	                          DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
	    Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
	    Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
	                          DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
	  } else {
	    // Store the value to a temporary stack slot.
	    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
	    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
	                              MachinePointerInfo());

	    EVT ElementType = SrcVT.getVectorElementType();
	    unsigned EltSize = ElementType.getSizeInBits() / 8;

	    // Replace each use (extract) with a load of the appropriate element.
	    for (unsigned i = 0; i < 4; ++i) {
	      uint64_t Offset = EltSize * i;
	      auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
	      SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);

	      SDValue ScalarAddr =
	          DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);

	      // Load the scalar.
	      Vals[i] =
	          DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
	    }
	  }

	  // Replace the extracts
	  for (SDNode *Extract : Uses) {
	    // In range: every recorded index was checked to be < 4 above.
	    uint64_t IdxVal = Extract->getConstantOperandVal(1);
	    DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
	  }

	  // The replacement was made in place; don't return anything.
	  return SDValue();
	}

	// Combine entry point that only attempts the extract-through-shuffle fold.
	//
	// TODO - merge with combineExtractVectorElt once it can handle the implicit
	// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
	// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
	// combineBasicSADPattern.
	// NOTE(review): the nodes named above are the insert opcodes; the extract
	// counterparts may have been intended - confirm.
	static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
	                                           TargetLowering::DAGCombinerInfo &DCI,
	                                           const X86Subtarget &Subtarget) {
	  // Forward everything unchanged; an empty SDValue means "no fold".
	  SDValue Folded = combineExtractWithShuffle(N, DAG, DCI, Subtarget);
	  return Folded;
	}

	/// Try to turn a vector select whose true or false operand is an all-ones or
	/// all-zeros build vector into a plain bitwise operation on the condition.
	///
	/// Only ISD::VSELECT nodes are handled; anything else yields an empty
	/// SDValue. \returns the simplified value, or an empty SDValue when no
	/// simplification applies.
	static SDValue
	combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
	                                 TargetLowering::DAGCombinerInfo &DCI,
	                                 const X86Subtarget &Subtarget) {
	  SDValue Cond = N->getOperand(0);
	  SDValue TrueVal = N->getOperand(1);
	  SDValue FalseVal = N->getOperand(2);
	  const EVT ResVT = TrueVal.getValueType();
	  const EVT CondVT = Cond.getValueType();
	  SDLoc DL(N);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  if (N->getOpcode() != ISD::VSELECT)
	    return SDValue();

	  assert(CondVT.isVector() && "Vector select expects a vector selector!");

	  // Remember whether the *true* operand is a zero vector; it is needed both
	  // for the AVX-512 mask case below and for the condition inversion later.
	  const bool TrueIsZeros = ISD::isBuildVectorAllZeros(TrueVal.getNode());

	  // AVX-512 only: with a vXi1 condition and a zero true operand, negate the
	  // condition (xor with all-ones) and exchange the two select operands:
	  //   vselect cond, 0, X  ==  vselect not(cond), X, 0
	  if (TrueIsZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
	      CondVT.getVectorElementType() == MVT::i1) {
	    SDValue AllOnes = DAG.getAllOnesConstant(DL, CondVT);
	    SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond, AllOnes);
	    return DAG.getSelect(DL, ResVT, NotCond, FalseVal, TrueVal);
	  }

	  // The condition can only act as a bitwise mask when its elements are as
	  // wide as the selected elements, i.e. it was already promoted from
	  // <N x i1>. Compare sizes rather than types so that floating-point selects
	  // are not excluded.
	  if (CondVT.getScalarSizeInBits() != ResVT.getScalarSizeInBits())
	    return SDValue();

	  bool TrueIsOnes = ISD::isBuildVectorAllOnes(TrueVal.getNode());
	  bool FalseIsZeros = ISD::isBuildVectorAllZeros(FalseVal.getNode());

	  // If neither operand is in the convenient position, see whether inverting
	  // a (promoted) SETCC condition puts one there.
	  const bool NeitherInPlace = !TrueIsOnes && !FalseIsZeros;
	  if (NeitherInPlace && Cond.getOpcode() == ISD::SETCC &&
	      TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ResVT) ==
	          CondVT) {
	    const bool FalseIsOnes = ISD::isBuildVectorAllOnes(FalseVal.getNode());

	    if (TrueIsZeros || FalseIsOnes) {
	      SDValue CCNode = Cond.getOperand(2);
	      const bool IntegerCompare =
	          Cond.getOperand(0).getValueType().isInteger();
	      ISD::CondCode InvCC = ISD::getSetCCInverse(
	          cast<CondCodeSDNode>(CCNode)->get(), IntegerCompare);
	      Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
	                          InvCC);
	      // The operands trade places, so the flags describing them do too.
	      std::swap(TrueVal, FalseVal);
	      TrueIsOnes = FalseIsOnes;
	      FalseIsZeros = TrueIsZeros;
	    }
	  }

	  // vselect Cond, 111..., 000... -> Cond
	  if (TrueIsOnes && FalseIsZeros)
	    return DAG.getBitcast(ResVT, Cond);

	  // Once type legalization has run, do not create logic ops on an illegal
	  // condition type.
	  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
	    return SDValue();

	  // vselect Cond, 111..., X -> or Cond, X
	  if (TrueIsOnes) {
	    SDValue FalseAsCond = DAG.getBitcast(CondVT, FalseVal);
	    SDValue Merged = DAG.getNode(ISD::OR, DL, CondVT, Cond, FalseAsCond);
	    return DAG.getBitcast(ResVT, Merged);
	  }

	  // vselect Cond, X, 000... -> and Cond, X
	  if (FalseIsZeros) {
	    SDValue TrueAsCond = DAG.getBitcast(CondVT, TrueVal);
	    SDValue Masked = DAG.getNode(ISD::AND, DL, CondVT, Cond, TrueAsCond);
	    return DAG.getBitcast(ResVT, Masked);
	  }

	  return SDValue();
	}

	/// Fold a select between two integer constants into arithmetic on the
	/// zero-extended condition: a shift when the constants are {pow2, 0}, or a
	/// multiply/add by a small LEA-friendly difference for i32/i64 results.
	///
	/// \returns the arithmetic replacement, or an empty SDValue if neither form
	/// applies.
	static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
	  SDValue Cond = N->getOperand(0);
	  SDValue LHS = N->getOperand(1);
	  SDValue RHS = N->getOperand(2);
	  SDLoc DL(N);

	  auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
	  auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
	  if (!(TrueC && FalseC))
	    return SDValue();

	  // Don't do this for crazy integer types.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
	    return SDValue();

	  // A condition counts as cheaply invertible when it is a setcc, or an xor
	  // whose second operand is a constant.
	  auto IsCheaplyInvertible = [](SDValue C) {
	    if (C.getOpcode() == ISD::SETCC)
	      return true;
	    return C.getOpcode() == ISD::XOR &&
	           isa<ConstantSDNode>(C.getOperand(1));
	  };

	  // Canonicalize so that TrueC holds the (unsigned) larger constant whenever
	  // the condition can be flipped cheaply; the flip itself is deferred until
	  // one of the folds below actually fires.
	  bool NeedsCondInvert = false;
	  if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
	      IsCheaplyInvertible(Cond)) {
	    NeedsCondInvert = true;
	    std::swap(TrueC, FalseC);
	  }

	  // Applies the deferred inversion by xor-ing the condition with 1.
	  auto InvertCondIfNeeded = [&]() {
	    if (!NeedsCondInvert)
	      return;
	    Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
	                       DAG.getConstant(1, DL, Cond.getValueType()));
	  };

	  // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
	  if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	    InvertCondIfNeeded();

	    // Zero extend the condition if needed.
	    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

	    unsigned Log2 = TrueC->getAPIntValue().logBase2();
	    SDValue ShiftBy = DAG.getConstant(Log2, DL, MVT::i8);
	    return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, ShiftBy);
	  }

	  // Optimize cases that will turn into an LEA instruction. This requires
	  // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
	    return SDValue();

	  uint64_t RawDelta = TrueC->getZExtValue() - FalseC->getZExtValue();
	  if (N->getValueType(0) == MVT::i32)
	    RawDelta = (unsigned)RawDelta;

	  bool IsFastMultiplier;
	  switch (RawDelta) {
	  case 1: // result = add base, cond
	  case 2: // result = lea base( , cond*2)
	  case 3: // result = lea base(cond, cond*2)
	  case 4: // result = lea base( , cond*4)
	  case 5: // result = lea base(cond, cond*4)
	  case 8: // result = lea base( , cond*8)
	  case 9: // result = lea base(cond, cond*8)
	    IsFastMultiplier = true;
	    break;
	  default:
	    IsFastMultiplier = false;
	    break;
	  }
	  if (!IsFastMultiplier)
	    return SDValue();

	  // Recompute the difference at the constants' own bit width for the DAG.
	  APInt Scale = TrueC->getAPIntValue() - FalseC->getAPIntValue();
	  InvertCondIfNeeded();

	  // Zero extend the condition if needed.
	  Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);

	  // Scale the condition by the difference.
	  if (Scale != 1)
	    Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	                       DAG.getConstant(Scale, DL, Cond.getValueType()));

	  // Add the base if non-zero.
	  if (FalseC->getAPIntValue() != 0)
	    Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	                       SDValue(FalseC, 0));
	  return Cond;
	}

	// Given a bitcast of a single-use operation that can equally be expressed on
	// the bitcast's destination type, move the bitcast onto the operation's
	// inputs and re-create the operation in the destination type. This allows
	// more opportunities for pattern matching masked instructions. This is called
	// when we know that the operation is used as one of the inputs of a vselect.
	//
	// Returns true when OrigOp was replaced (through DCI.CombineTo), false when
	// nothing was changed.
	static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
	                                      TargetLowering::DAGCombinerInfo &DCI) {
	  // Make sure we have a bitcast.
	  if (OrigOp.getOpcode() != ISD::BITCAST)
	    return false;

	  SDValue Inner = OrigOp.getOperand(0);

	  // If the operation is used by anything other than the bitcast, we shouldn't
	  // do this combine as that would replicate the operation.
	  if (!Inner.hasOneUse())
	    return false;

	  MVT DstVT = OrigOp.getSimpleValueType();
	  MVT DstEltVT = DstVT.getVectorElementType();
	  SDLoc DL(Inner.getNode());

	  // Bitcast both vector inputs of a three-operand shuffle to DstVT, rebuild
	  // the shuffle there and substitute it for the original bitcast.
	  auto RetypeShuffle = [&](unsigned NewOpc, SDValue In0, SDValue In1,
	                           SDValue Ctrl) {
	    In0 = DAG.getBitcast(DstVT, In0);
	    DCI.AddToWorklist(In0.getNode());
	    In1 = DAG.getBitcast(DstVT, In1);
	    DCI.AddToWorklist(In1.getNode());
	    SDValue Repl = DAG.getNode(NewOpc, DL, DstVT, In0, In1, Ctrl);
	    DCI.CombineTo(OrigOp.getNode(), Repl);
	    return true;
	  };

	  unsigned Opcode = Inner.getOpcode();
	  switch (Opcode) {
	  case X86ISD::PALIGNR:
	    // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
	    if (!DstVT.is128BitVector())
	      return false;
	    Opcode = X86ISD::VALIGN;
	    LLVM_FALLTHROUGH;
	  case X86ISD::VALIGN: {
	    if (!(DstEltVT == MVT::i32 || DstEltVT == MVT::i64))
	      return false;
	    uint64_t Imm = cast<ConstantSDNode>(Inner.getOperand(2))->getZExtValue();
	    MVT SrcEltVT = Inner.getSimpleValueType().getVectorElementType();
	    // Express the shift in bits, then re-express it in destination elements.
	    unsigned ShiftBits = Imm * SrcEltVT.getSizeInBits();
	    unsigned DstEltBits = DstEltVT.getSizeInBits();
	    // Make sure we can represent the same shift with the new VT.
	    if (ShiftBits % DstEltBits)
	      return false;
	    Imm = ShiftBits / DstEltBits;
	    SDValue NewImm = DAG.getConstant(Imm, DL, MVT::i8);
	    return RetypeShuffle(Opcode, Inner.getOperand(0), Inner.getOperand(1),
	                         NewImm);
	  }
	  case X86ISD::SHUF128: {
	    if (DstEltVT.getSizeInBits() != 32 && DstEltVT.getSizeInBits() != 64)
	      return false;
	    // Only change element size, not type.
	    if (DstVT.isInteger() != Inner.getSimpleValueType().isInteger())
	      return false;
	    return RetypeShuffle(Opcode, Inner.getOperand(0), Inner.getOperand(1),
	                         Inner.getOperand(2));
	  }
	  case ISD::INSERT_SUBVECTOR: {
	    unsigned DstEltBits = DstEltVT.getSizeInBits();
	    if (!(DstEltBits == 32 || DstEltBits == 64))
	      return false;
	    MVT SrcEltVT = Inner.getSimpleValueType().getVectorElementType();
	    // Only change element size, not type.
	    if (DstEltVT.isInteger() != SrcEltVT.isInteger())
	      return false;
	    // Rescale the insertion index from source to destination elements.
	    uint64_t Idx = cast<ConstantSDNode>(Inner.getOperand(2))->getZExtValue();
	    Idx = (Idx * SrcEltVT.getSizeInBits()) / DstEltBits;
	    SDValue Base = DAG.getBitcast(DstVT, Inner.getOperand(0));
	    DCI.AddToWorklist(Base.getNode());
	    // The inserted value needs to be bitcasted to a smaller vector with the
	    // same element type.
	    SDValue Sub = Inner.getOperand(1);
	    MVT SubVT = MVT::getVectorVT(
	        DstEltVT, Sub.getSimpleValueType().getSizeInBits() / DstEltBits);
	    Sub = DAG.getBitcast(SubVT, Sub);
	    DCI.AddToWorklist(Sub.getNode());
	    SDValue Repl = DAG.getNode(Opcode, DL, DstVT, Base, Sub,
	                               DAG.getIntPtrConstant(Idx, DL));
	    DCI.CombineTo(OrigOp.getNode(), Repl);
	    return true;
	  }
	  case ISD::EXTRACT_SUBVECTOR: {
	    unsigned DstEltBits = DstEltVT.getSizeInBits();
	    if (!(DstEltBits == 32 || DstEltBits == 64))
	      return false;
	    MVT SrcEltVT = Inner.getSimpleValueType().getVectorElementType();
	    // Only change element size, not type.
	    if (DstEltVT.isInteger() != SrcEltVT.isInteger())
	      return false;
	    // Rescale the extraction index from source to destination elements.
	    uint64_t Idx = cast<ConstantSDNode>(Inner.getOperand(1))->getZExtValue();
	    Idx = (Idx * SrcEltVT.getSizeInBits()) / DstEltBits;
	    // The source needs to be bitcasted to a larger vector with the same
	    // element type.
	    SDValue Wide = Inner.getOperand(0);
	    MVT WideVT = MVT::getVectorVT(
	        DstEltVT, Wide.getSimpleValueType().getSizeInBits() / DstEltBits);
	    Wide = DAG.getBitcast(WideVT, Wide);
	    DCI.AddToWorklist(Wide.getNode());
	    SDValue Repl =
	        DAG.getNode(Opcode, DL, DstVT, Wide, DAG.getIntPtrConstant(Idx, DL));
	    DCI.CombineTo(OrigOp.getNode(), Repl);
	    return true;
	  }
	  case X86ISD::SUBV_BROADCAST: {
	    unsigned DstEltBits = DstEltVT.getSizeInBits();
	    if (!(DstEltBits == 32 || DstEltBits == 64))
	      return false;
	    // Only change element size, not type.
	    if (DstVT.isInteger() != Inner.getSimpleValueType().isInteger())
	      return false;
	    // Retype the broadcast source to the destination element type.
	    SDValue Sub = Inner.getOperand(0);
	    MVT SubVT = MVT::getVectorVT(
	        DstEltVT, Sub.getSimpleValueType().getSizeInBits() / DstEltBits);
	    Sub = DAG.getBitcast(SubVT, Sub);
	    DCI.AddToWorklist(Sub.getNode());
	    SDValue Repl = DAG.getNode(Opcode, DL, DstVT, Sub);
	    DCI.CombineTo(OrigOp.getNode(), Repl);
	    return true;
	  }
	  }

	  return false;
	}

	/// Do target-specific dag combines on SELECT and VSELECT nodes.
	///
	/// The folds below are attempted in source order and the first one that
	/// applies wins:
	///  - floating-point x CC y ? x : y (or reversed arms) -> X86ISD::FMIN/FMAX,
	///  - AVX-512 vXi1-conditioned i8/i16 selects without BWI+VLX -> select on a
	///    sign-extended condition,
	///  - select of two constants -> arithmetic (combineSelectOfTwoConstants),
	///  - scalar max/min canonicalization of SETLT/SETGT to SETLE/SETGE,
	///  - VSELECT patterns -> X86ISD::SUBUS,
	///  - VSELECT with all-ones/all-zeros operand -> bitwise logic,
	///  - dynamic VSELECT -> X86ISD::SHRUNKBLEND after simplifying the condition,
	///  - pushing bitcasts through VSELECT operands for masked instructions.
	///
	/// \returns the replacement value, SDValue(N, 0) when an operand of N was
	/// rewritten by combineBitcastForMaskedOp, or an empty SDValue otherwise
	/// (including the SHRUNKBLEND paths, which replace uses directly).
	static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);
	SDValue Cond = N->getOperand(0);
	// Get the LHS/RHS of the select.
	SDValue LHS = N->getOperand(1);
	SDValue RHS = N->getOperand(2);
	EVT VT = LHS.getValueType();
	EVT CondVT = Cond.getValueType();
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// If we have SSE[12] support, try to form min/max nodes. SSE min/max
	// instructions match the semantics of the common C idiom x<y?x:y but not
	// x<=y?x:y, because of how they handle negative zero (which can be
	// ignored in unsafe-math mode).
	// We also try to create v2f32 min/max nodes, which we later widen to v4f32.
	if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
	VT != MVT::f80 && VT != MVT::f128 &&
	(TLI.isTypeLegal(VT) \|\| VT == MVT::v2f32) &&
	(Subtarget.hasSSE2() \|\|
	(Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	// Stays 0 unless one of the cases below picks FMIN or FMAX.
	unsigned Opcode = 0;
	// Check for x CC y ? x : y.
	if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	switch (CC) {
	default: break;
	case ISD::SETULT:
	// Converting this to a min would handle NaNs incorrectly, and swapping
	// the operands would cause it to handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS)) {
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!(DAG.isKnownNeverZero(LHS) \|\| DAG.isKnownNeverZero(RHS)))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETOLE:
	// Converting this to a min would handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	break;
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETULE:
	// Converting this to a min would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOLT:
	case ISD::SETLT:
	case ISD::SETLE:
	Opcode = X86ISD::FMIN;
	break;

	case ISD::SETOGE:
	// Converting this to a max would handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
	break;
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETUGT:
	// Converting this to a max would handle NaNs incorrectly, and swapping
	// the operands would cause it to handle comparisons between positive
	// and negative zero incorrectly.
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS)) {
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!(DAG.isKnownNeverZero(LHS) \|\| DAG.isKnownNeverZero(RHS)))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETUGE:
	// Converting this to a max would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOGT:
	case ISD::SETGT:
	case ISD::SETGE:
	Opcode = X86ISD::FMAX;
	break;
	}
	// Check for x CC y ? y : x -- a min/max with reversed arms.
	} else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
	DAG.isEqualTo(RHS, Cond.getOperand(0))) {
	switch (CC) {
	default: break;
	case ISD::SETOGE:
	// Converting this to a min would handle comparisons between positive
	// and negative zero incorrectly, and swapping the operands would
	// cause it to handle NaNs incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!(DAG.isKnownNeverZero(LHS) \|\| DAG.isKnownNeverZero(RHS))) {
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETUGT:
	// Converting this to a min would handle NaNs incorrectly.
	// NOTE(review): unlike the mirrored SETULT case below, this bail-out is
	// additionally gated on !UnsafeFPMath - confirm the asymmetry is
	// intended.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	(!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS)))
	break;
	Opcode = X86ISD::FMIN;
	break;
	case ISD::SETUGE:
	// Converting this to a min would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOGT:
	case ISD::SETGT:
	case ISD::SETGE:
	Opcode = X86ISD::FMIN;
	break;

	case ISD::SETULT:
	// Converting this to a max would handle NaNs incorrectly.
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS))
	break;
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETOLE:
	// Converting this to a max would handle comparisons between positive
	// and negative zero incorrectly, and swapping the operands would
	// cause it to handle NaNs incorrectly.
	if (!DAG.getTarget().Options.UnsafeFPMath &&
	!DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
	if (!DAG.isKnownNeverNaN(LHS) \|\| !DAG.isKnownNeverNaN(RHS))
	break;
	std::swap(LHS, RHS);
	}
	Opcode = X86ISD::FMAX;
	break;
	case ISD::SETULE:
	// Converting this to a max would handle both negative zeros and NaNs
	// incorrectly, but we can swap the operands to fix both.
	std::swap(LHS, RHS);
	LLVM_FALLTHROUGH;
	case ISD::SETOLT:
	case ISD::SETLT:
	case ISD::SETLE:
	Opcode = X86ISD::FMAX;
	break;
	}
	}

	// LHS/RHS may have been swapped above; they are used in their current order.
	if (Opcode)
	return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
	}

	// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
	// lowering on KNL. In this case we convert it to
	// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
	// The same situation for all 128 and 256-bit vectors of i8 and i16.
	// Since SKX these selects have a proper lowering.
	if (Subtarget.hasAVX512() && CondVT.isVector() &&
	CondVT.getVectorElementType() == MVT::i1 &&
	(VT.is128BitVector() \|\| VT.is256BitVector()) &&
	(VT.getVectorElementType() == MVT::i8 \|\|
	VT.getVectorElementType() == MVT::i16) &&
	!(Subtarget.hasBWI() && Subtarget.hasVLX())) {
	Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
	DCI.AddToWorklist(Cond.getNode());
	return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
	}

	if (SDValue V = combineSelectOfTwoConstants(N, DAG))
	return V;

	// Canonicalize max and min:
	// (x > y) ? x : y -> (x >= y) ? x : y
	// (x < y) ? x : y -> (x <= y) ? x : y
	// This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
	// the need for an extra compare
	// against zero. e.g.
	// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
	// subl %esi, %edi
	// testl %edi, %edi
	// movl $0, %eax
	// cmovgl %edi, %eax
	// =>
	// xorl %eax, %eax
	// subl %esi, %edi
	// cmovsl %eax, %edi
	if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
	DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
	DAG.isEqualTo(RHS, Cond.getOperand(1))) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
	switch (CC) {
	default: break;
	case ISD::SETLT:
	case ISD::SETGT: {
	ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
	Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
	Cond.getOperand(0), Cond.getOperand(1), NewCC);
	return DAG.getSelect(DL, VT, Cond, LHS, RHS);
	}
	}
	}

	// Early exit check
	if (!TLI.isTypeLegal(VT))
	return SDValue();

	// Match VSELECTs into subs with unsigned saturation.
	if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
	// psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
	((Subtarget.hasSSE2() && (VT == MVT::v16i8 \|\| VT == MVT::v8i16)) \|\|
	(Subtarget.hasAVX2() && (VT == MVT::v32i8 \|\| VT == MVT::v16i16)))) {
	ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

	// Check if one of the arms of the VSELECT is a zero vector. If it's on the
	// left side invert the predicate to simplify logic below.
	// Other remains a null SDValue when neither arm is a zero vector.
	SDValue Other;
	if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
	Other = RHS;
	CC = ISD::getSetCCInverse(CC, true);
	} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
	Other = LHS;
	}

	// The non-zero arm must be a two-operand node whose first operand is also
	// the first operand of the compare.
	if (Other.getNode() && Other->getNumOperands() == 2 &&
	DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
	SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
	SDValue CondRHS = Cond->getOperand(1);

	// Look for a general sub with unsigned saturation first.
	// x >= y ? x-y : 0 --> subus x, y
	// x > y ? x-y : 0 --> subus x, y
	if ((CC == ISD::SETUGE \|\| CC == ISD::SETUGT) &&
	Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
	return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);

	if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
	if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
	if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
	if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
	// If the RHS is a constant we have to reverse the const
	// canonicalization.
	// x > C-1 ? x+-C : 0 --> subus x, C
	if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
	CondRHSConst->getAPIntValue() ==
	(-OpRHSConst->getAPIntValue() - 1))
	return DAG.getNode(
	X86ISD::SUBUS, DL, VT, OpLHS,
	DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));

	// Another special case: If C was a sign bit, the sub has been
	// canonicalized into a xor.
	// FIXME: Would it be better to use computeKnownBits to determine
	// whether it's safe to decanonicalize the xor?
	// x s< 0 ? x^C : 0 --> subus x, C
	if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
	ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
	OpRHSConst->getAPIntValue().isSignMask())
	// Note that we have to rebuild the RHS constant here to ensure we
	// don't rely on particular values of undef lanes.
	return DAG.getNode(
	X86ISD::SUBUS, DL, VT, OpLHS,
	DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
	}
	}
	}

	if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
	return V;

	// If this is a dynamic select (non-constant condition) and we can match
	// this node with one of the variable blend instructions, restructure the
	// condition so that blends can use the high (sign) bit of each element and
	// use SimplifyDemandedBits to simplify the condition operand.
	if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
	!DCI.isBeforeLegalize() &&
	!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
	unsigned BitWidth = Cond.getScalarValueSizeInBits();

	// Don't optimize vector selects that map to mask-registers.
	if (BitWidth == 1)
	return SDValue();

	// We can only handle the cases where VSELECT is directly legal on the
	// subtarget. We custom lower VSELECT nodes with constant conditions and
	// this makes it hard to see whether a dynamic VSELECT will correctly
	// lower, so we both check the operation's status and explicitly handle the
	// cases where a dynamic blend will fail even though a constant-condition
	// blend could be custom lowered.
	// FIXME: We should find a better way to handle this class of problems.
	// Potentially, we should combine constant-condition vselect nodes
	// pre-legalization into shuffles and not mark as many types as custom
	// lowered.
	if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
	return SDValue();
	// FIXME: We don't support i16-element blends currently. We could and
	// should support them by making all the bits in the condition be set
	// rather than just the high bit and using an i8-element blend.
	if (VT.getVectorElementType() == MVT::i16)
	return SDValue();
	// Dynamic blending was only available from SSE4.1 onward.
	if (VT.is128BitVector() && !Subtarget.hasSSE41())
	return SDValue();
	// Byte blends are only available in AVX2
	if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
	return SDValue();

	assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
	// Only the sign bit of each condition element is demanded.
	APInt DemandedMask(APInt::getSignMask(BitWidth));
	KnownBits Known;
	// NOTE(review): the '-'/'+' prefixed lines below are markers of the change
	// shown in this diff: both flags passed to TargetLoweringOpt are negated.
	// Presumably the constructor expects "types/operations are legal" flags
	// rather than "before legalize" flags - confirm against its declaration.
	- TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
	- DCI.isBeforeLegalizeOps());
	+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	+ !DCI.isBeforeLegalizeOps());
	if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) \|\|
	TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
	// If we changed the computation somewhere in the DAG, this change will
	// affect all users of Cond. Make sure it is fine and update all the nodes
	// so that we do not use the generic VSELECT anymore. Otherwise, we may
	// perform wrong optimizations as we messed with the actual expectation
	// for the vector boolean values.
	if (Cond != TLO.Old) {
	// Check all uses of the condition operand to check whether it will be
	// consumed by non-BLEND instructions. Those may require that all bits
	// are set properly.
	for (SDNode *U : Cond->uses()) {
	// TODO: Add other opcodes eventually lowered into BLEND.
	if (U->getOpcode() != ISD::VSELECT)
	return SDValue();
	}

	// Update all users of the condition before committing the change, so
	// that the VSELECT optimizations that expect the correct vector boolean
	// value will not be triggered.
	for (SDNode *U : Cond->uses()) {
	SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
	U->getValueType(0), Cond, U->getOperand(1),
	U->getOperand(2));
	DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
	}
	DCI.CommitTargetLoweringOpt(TLO);
	return SDValue();
	}
	// Only Cond (rather than other nodes in the computation chain) was
	// changed. Change the condition just for N to keep the opportunity to
	// optimize all other users their own way.
	SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
	DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
	return SDValue();
	}
	}

	// Look for vselects with LHS/RHS being bitcasted from an operation that
	// can be executed on another type. Push the bitcast to the inputs of
	// the operation. This exposes opportunities for using masking instructions.
	// combineBitcastForMaskedOp replaces the operand itself via DCI.CombineTo, so
	// N is handed back unchanged, presumably to signal that the DAG was updated.
	if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
	CondVT.getVectorElementType() == MVT::i1) {
	if (combineBitcastForMaskedOp(LHS, DAG, DCI))
	return SDValue(N, 0);
	if (combineBitcastForMaskedOp(RHS, DAG, DCI))
	return SDValue(N, 0);
	}

	return SDValue();
	}

	/// Combine:
	/// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
	/// to:
	/// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
	/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
	/// Note that this is only legal for some op/cc combinations.
	///
	/// \param Cmp the flag-producing compare (X86ISD::CMP, or an X86ISD::SUB
	///            whose value result is unused).
	/// \param CC  the condition code consuming \p Cmp; rewritten in place when
	///            the combine succeeds, left untouched otherwise.
	/// \returns the value produced by lowerAtomicArithWithLOCK on success, or an
	/// empty SDValue if the pattern does not match.
	static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
	                                       SelectionDAG &DAG) {
	  // This combine only operates on CMP-like nodes.
	  if (!(Cmp.getOpcode() == X86ISD::CMP ||
	        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
	    return SDValue();

	  // Can't replace the cmp if it has more uses than the one we're looking at.
	  // FIXME: We would like to be able to handle this, but would need to make sure
	  // all uses were updated.
	  if (!Cmp.hasOneUse())
	    return SDValue();

	  // This only applies to variations of the common case:
	  // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
	  // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
	  // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
	  // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
	  // Using the proper condcodes (see below), overflow is checked for.

	  // FIXME: We can generalize both constraints:
	  // - XOR/OR/AND (if they were made to survive AtomicExpand)
	  // - addends other than +1/-1
	  // if the result is compared.

	  SDValue CmpLHS = Cmp.getOperand(0);
	  SDValue CmpRHS = Cmp.getOperand(1);

	  if (!CmpLHS.hasOneUse())
	    return SDValue();

	  // Only comparisons against the constant zero are handled.
	  if (!isNullConstant(CmpRHS))
	    return SDValue();

	  const unsigned Opc = CmpLHS.getOpcode();

	  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
	    return SDValue();

	  SDValue OpRHS = CmpLHS.getOperand(2);
	  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
	  if (!OpRHSC)
	    return SDValue();

	  // Normalize "sub C" to "add -C" so a single addend describes both opcodes.
	  APInt Addend = OpRHSC->getAPIntValue();
	  if (Opc == ISD::ATOMIC_LOAD_SUB)
	    Addend = -Addend;

	  // Test for -1 with isAllOnesValue(): comparing an APInt against the integer
	  // literal -1 goes through uint64_t and only matches 64-bit wide values, so
	  // the decrement forms would never fire for narrower atomics.
	  const bool IsInc = Addend == 1;
	  const bool IsDec = Addend.isAllOnesValue();

	  if (CC == X86::COND_S && IsInc)
	    CC = X86::COND_LE;
	  else if (CC == X86::COND_NS && IsInc)
	    CC = X86::COND_G;
	  else if (CC == X86::COND_G && IsDec)
	    CC = X86::COND_GE;
	  else if (CC == X86::COND_LE && IsDec)
	    CC = X86::COND_L;
	  else
	    return SDValue();

	  SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
	  // The loaded value's only user is the compare being replaced, so it becomes
	  // undef; result 1 of the atomic op is rerouted to result 1 of the LOCKed op.
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
	                                DAG.getUNDEF(CmpLHS.getValueType()));
	  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
	  return LockOp;
	}

	// Recognize a CMP-like node that merely re-tests a boolean produced from
	// EFLAGS, and hand back the original EFLAGS producer together with the
	// condition code that should be applied to it directly.
	//
	// Patterns that keep the condition:
	//   (Op (CMP (SETCC Cond EFLAGS) 1) EQ)
	//   (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
	//     -> (Op EFLAGS Cond)
	//
	// Patterns that invert the condition:
	//   (Op (CMP (SETCC Cond EFLAGS) 0) EQ)
	//   (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
	//     -> (Op EFLAGS !Cond)
	//
	// Op is the user of the flags, e.g. BRCOND or CMOV.
	//
	// \p CC is read as the condition applied to \p Cmp and, on success, is
	// overwritten with the condition to apply to the returned flags value. An
	// empty SDValue is returned (and CC left untouched) when nothing matches.
	static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
	  // Accept X86ISD::CMP, or an X86ISD::SUB whose arithmetic result is unused.
	  const unsigned CmpOpc = Cmp.getOpcode();
	  const bool IsCmpLike =
	      CmpOpc == X86ISD::CMP ||
	      (CmpOpc == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0));
	  if (!IsCmpLike)
	    return SDValue();

	  // Only equality tests treat the compared value as a boolean.
	  if (!(CC == X86::COND_E || CC == X86::COND_NE))
	    return SDValue();

	  // One operand must be a constant; the other is the candidate boolean.
	  SDValue LHS = Cmp.getOperand(0);
	  SDValue RHS = Cmp.getOperand(1);

	  SDValue BoolSrc;
	  const ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(LHS);
	  if (Imm) {
	    BoolSrc = RHS;
	  } else {
	    Imm = dyn_cast<ConstantSDNode>(RHS);
	    if (!Imm)
	      return SDValue(); // Neither operand is a constant.
	    BoolSrc = LHS;
	  }

	  // The constant has to be exactly 0 or 1.
	  const uint64_t ImmVal = Imm->getZExtValue();
	  if (ImmVal > 1)
	    return SDValue();
	  const bool ComparesWithOne = (ImmVal == 1);

	  // "== 0" and "!= 1" both ask for the negated condition.
	  bool Invert = (CC == X86::COND_E) != ComparesWithOne;

	  // Peel (zext x), (trunc x) and (and x, 1) wrappers off the boolean.
	  bool MaskedToBit0 = false;
	  for (;;) {
	    const unsigned Opc = BoolSrc.getOpcode();
	    if (Opc == ISD::ZERO_EXTEND || Opc == ISD::TRUNCATE) {
	      BoolSrc = BoolSrc.getOperand(0);
	      continue;
	    }
	    if (Opc != ISD::AND)
	      break;

	    // Look through the AND only when one side is the constant 1. If both
	    // sides are 1, operand 0 is the one that is kept.
	    unsigned KeepIdx;
	    if (isOneConstant(BoolSrc.getOperand(1)))
	      KeepIdx = 0;
	    else if (isOneConstant(BoolSrc.getOperand(0)))
	      KeepIdx = 1;
	    else
	      break;
	    BoolSrc = BoolSrc.getOperand(KeepIdx);
	    MaskedToBit0 = true;
	  }

	  const unsigned SrcOpc = BoolSrc.getOpcode();

	  if (SrcOpc == X86ISD::SETCC_CARRY || SrcOpc == X86ISD::SETCC) {
	    if (SrcOpc == X86ISD::SETCC_CARRY) {
	      // SETCC_CARRY produces R = CF ? ~0 : 0, so a comparison against 1 is
	      // only meaningful when the value was first reduced with 'and ..., 1'.
	      if (ComparesWithOne && !MaskedToBit0)
	        return SDValue();
	      assert(X86::CondCode(BoolSrc.getConstantOperandVal(0)) == X86::COND_B &&
	             "Invalid use of SETCC_CARRY!");
	    }
	    // Reuse the producer's condition code, negated if required.
	    CC = X86::CondCode(BoolSrc.getConstantOperandVal(0));
	    if (Invert)
	      CC = X86::GetOppositeBranchCondition(CC);
	    return BoolSrc.getOperand(1);
	  }

	  if (SrcOpc != X86ISD::CMOV)
	    return SDValue();

	  // CMOV selecting between the canonical boolean constants 0 and 1.
	  ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(BoolSrc.getOperand(0));
	  ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(BoolSrc.getOperand(1));
	  // The true value must be a constant.
	  if (!TVal)
	    return SDValue();

	  // Value produced on the false path; treated as 0 for rdrand/rdseed.
	  uint64_t FalseBit = 0;
	  if (FVal) {
	    FalseBit = FVal->getZExtValue();
	    if (FalseBit > 1)
	      return SDValue();
	  } else {
	    // A non-constant false value is tolerated only for result 0 of
	    // RDRAND/RDSEED (optionally behind a zext/trunc), where 0 is set if the
	    // false condition is found.
	    SDValue Op = BoolSrc.getOperand(0);
	    if (Op.getOpcode() == ISD::ZERO_EXTEND ||
	        Op.getOpcode() == ISD::TRUNCATE)
	      Op = Op.getOperand(0);
	    const bool IsRandomOp = Op.getOpcode() == X86ISD::RDRAND ||
	                            Op.getOpcode() == X86ISD::RDSEED;
	    if (!IsRandomOp || Op.getResNo() != 0)
	      return SDValue();
	  }

	  // The true value must be the boolean complement of the false value.
	  if (TVal->getZExtValue() != (FalseBit ^ 1))
	    return SDValue();

	  // A CMOV yielding 1 on the false path encodes the negated condition.
	  if (FalseBit == 1)
	    Invert = !Invert;

	  CC = X86::CondCode(BoolSrc.getConstantOperandVal(2));
	  if (Invert)
	    CC = X86::GetOppositeBranchCondition(CC);
	  return BoolSrc.getOperand(3);
	}

	/// Detect a condition that is the AND or OR of two X86ISD::SETCC nodes
	/// reading the same EFLAGS value. Accepted shapes:
	///   (X86or (X86setcc) (X86setcc))
	///   (X86cmp (and (X86setcc) (X86setcc)), 0)
	///
	/// On success returns true and fills \p CC0 / \p CC1 with the two condition
	/// codes and \p Flags with the shared flags operand. \p isAnd is written as
	/// soon as the logic opcode has been inspected (true for AND, false
	/// otherwise), even if the function subsequently returns false.
	static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
	                                           X86::CondCode &CC1, SDValue &Flags,
	                                           bool &isAnd) {
	  // Look through a comparison against zero.
	  if (Cond->getOpcode() == X86ISD::CMP) {
	    if (!isNullConstant(Cond->getOperand(1)))
	      return false;

	    Cond = Cond->getOperand(0);
	  }

	  const unsigned LogicOpc = Cond->getOpcode();
	  isAnd = (LogicOpc == ISD::AND || LogicOpc == X86ISD::AND);
	  const bool IsOr = (LogicOpc == ISD::OR || LogicOpc == X86ISD::OR);
	  if (!isAnd && !IsOr)
	    return false;

	  SDValue LHS = Cond->getOperand(0);
	  SDValue RHS = Cond->getOperand(1);

	  // Both inputs must be SETCC nodes...
	  if (LHS.getOpcode() != X86ISD::SETCC)
	    return false;
	  if (RHS.getOpcode() != X86ISD::SETCC)
	    return false;
	  // ...that test the very same flags value.
	  if (LHS->getOperand(1) != RHS->getOperand(1))
	    return false;

	  CC0 = (X86::CondCode)LHS->getConstantOperandVal(0);
	  CC1 = (X86::CondCode)RHS->getConstantOperandVal(0);
	  Flags = LHS->getOperand(1);
	  return true;
	}

	/// Try to replace the EFLAGS definition \p EFLAGS, as consumed under
	/// condition code \p CC, with a simpler flags value. \p CC may be rewritten
	/// to match the returned value, and uses of chain values may be replaced.
	/// The boolean-test fold is attempted first, then the atomic-arithmetic fold;
	/// an empty SDValue means neither applied.
	static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
	                                  SelectionDAG &DAG) {
	  SDValue Simplified = checkBoolTestSetCCCombine(EFLAGS, CC);
	  if (!Simplified)
	    Simplified = combineSetCCAtomicArith(EFLAGS, CC, DAG);
	  return Simplified;
	}

	/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
	///
	/// Operand layout as read below: operand 0 is the value produced when the
	/// condition is false, operand 1 the value produced when it is true,
	/// operand 2 the X86 condition code and operand 3 the flags-producing node.
	///
	/// The folds are attempted in this order:
	///   1. BSR/BSF of a known-non-zero value tested with E/NE.
	///   2. Simplification of the flags operand via combineSetCCEFLAGS.
	///   3. Select between two integer constants -> setcc + shift/add/mul.
	///   4. (after legalization) replace a constant arm by the compared register.
	///   5. AND/OR of two setccs on the same flags -> two chained CMOVs.
	///
	/// Returns the replacement value, or an empty SDValue when nothing applies.
	/// \p Subtarget is not referenced in this function body.
	static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	SDLoc DL(N);

	// If the flag operand isn't dead, don't touch this CMOV.
	if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
	return SDValue();

	SDValue FalseOp = N->getOperand(0);
	SDValue TrueOp = N->getOperand(1);
	X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
	SDValue Cond = N->getOperand(3);

	if (CC == X86::COND_E \|\| CC == X86::COND_NE) {
	switch (Cond.getOpcode()) {
	default: break;
	case X86ISD::BSR:
	case X86ISD::BSF:
	// If operand of BSR / BSF are proven never zero, then ZF cannot be set,
	// so the outcome of the E/NE test is known and one arm can be returned.
	if (DAG.isKnownNeverZero(Cond.getOperand(0)))
	return (CC == X86::COND_E) ? FalseOp : TrueOp;
	}
	}

	// Try to simplify the EFLAGS and condition code operands.
	// We can't always do this as FCMOV only supports a subset of X86 cond.
	// Note: combineSetCCEFLAGS takes CC by reference and may rewrite it even
	// when the f80 check below rejects the new CMOV.
	if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
	if (FalseOp.getValueType() != MVT::f80 \|\| hasFPCMov(CC)) {
	SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
	Flags};
	return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
	}
	}

	// If this is a select between two integer constants, try to do some
	// optimizations. Note that the operands are ordered the opposite of SELECT
	// operands.
	if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
	if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
	// Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
	// larger than FalseC (the false value). The comparison is unsigned, and
	// the condition code is inverted to compensate for the swap.
	if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
	CC = X86::GetOppositeBranchCondition(CC);
	std::swap(TrueC, FalseC);
	std::swap(TrueOp, FalseOp);
	}

	// Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
	// This is efficient for any integer data type (including i8/i16) and
	// shift amount.
	if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
	Cond = getSETCC(CC, Cond, DL, DAG);

	// Zero extend the condition if needed.
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

	unsigned ShAmt = TrueC->getAPIntValue().logBase2();
	Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
	DAG.getConstant(ShAmt, DL, MVT::i8));
	// When N has two results, both must be supplied; the second (flags)
	// result was checked to be unused at the top of this function.
	if (N->getNumValues() == 2) // Dead flag value?
	return DCI.CombineTo(N, Cond, SDValue());
	return Cond;
	}

	// Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
	// for any integer data type, including i8/i16.
	if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
	Cond = getSETCC(CC, Cond, DL, DAG);

	// Zero extend the condition if needed.
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
	FalseC->getValueType(0), Cond);
	Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	SDValue(FalseC, 0));

	if (N->getNumValues() == 2) // Dead flag value?
	return DCI.CombineTo(N, Cond, SDValue());
	return Cond;
	}

	// Optimize cases that will turn into an LEA instruction. This requires
	// an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
	if (N->getValueType(0) == MVT::i32 \|\| N->getValueType(0) == MVT::i64) {
	// Difference between the two constants; truncated to 32 bits for i32 so
	// that wrap-around in the 64-bit subtraction is discarded.
	uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
	if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

	bool isFastMultiplier = false;
	if (Diff < 10) {
	switch ((unsigned char)Diff) {
	default: break;
	case 1: // result = add base, cond
	case 2: // result = lea base( , cond*2)
	case 3: // result = lea base(cond, cond*2)
	case 4: // result = lea base( , cond*4)
	case 5: // result = lea base(cond, cond*4)
	case 8: // result = lea base( , cond*8)
	case 9: // result = lea base(cond, cond*8)
	isFastMultiplier = true;
	break;
	}
	}

	if (isFastMultiplier) {
	// NOTE: this APInt 'Diff' shadows the uint64_t 'Diff' declared above; it
	// is recomputed at the constants' own bit width for use with getConstant.
	APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
	Cond = getSETCC(CC, Cond, DL ,DAG);
	// Zero extend the condition if needed.
	Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
	Cond);
	// Scale the condition by the difference.
	if (Diff != 1)
	Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
	DAG.getConstant(Diff, DL, Cond.getValueType()));

	// Add the base if non-zero.
	if (FalseC->getAPIntValue() != 0)
	Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
	SDValue(FalseC, 0));
	if (N->getNumValues() == 2) // Dead flag value?
	return DCI.CombineTo(N, Cond, SDValue());
	return Cond;
	}
	}
	}
	}

	// Handle these cases:
	// (select (x != c), e, c) -> (select (x != c), e, x),
	// (select (x == c), c, e) -> (select (x == c), x, e)
	// where the c is an integer constant, and the "select" is the combination
	// of CMOV and CMP.
	//
	// The rationale for this change is that the conditional-move from a constant
	// needs two instructions, however, conditional-move from a register needs
	// only one instruction.
	//
	// CAVEAT: By replacing a constant with a symbolic value, it may obscure
	// some instruction-combining opportunities. This opt needs to be
	// postponed as late as possible.
	//
	if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
	// the DCI.xxxx conditions are provided to postpone the optimization as
	// late as possible.

	// The compare must have a constant right-hand side and a non-constant
	// left-hand side; the tests below compare node pointers, so the CMOV arm
	// has to be the very same constant node the compare uses.
	ConstantSDNode *CmpAgainst = nullptr;
	if ((Cond.getOpcode() == X86ISD::CMP \|\| Cond.getOpcode() == X86ISD::SUB) &&
	(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
	!isa<ConstantSDNode>(Cond.getOperand(0))) {

	// Turn the NE form into the E form so only one case is handled below.
	if (CC == X86::COND_NE &&
	CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
	CC = X86::GetOppositeBranchCondition(CC);
	std::swap(TrueOp, FalseOp);
	}

	if (CC == X86::COND_E &&
	CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
	SDValue Ops[] = { FalseOp, Cond.getOperand(0),
	DAG.getConstant(CC, DL, MVT::i8), Cond };
	return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
	}
	}
	}

	// Fold and/or of setcc's to double CMOV:
	// (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
	// (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
	//
	// This combine lets us generate:
	// cmovcc1 (jcc1 if we don't have CMOV)
	// cmovcc2 (same)
	// instead of:
	// setcc1
	// setcc2
	// and/or
	// cmovne (jne if we don't have CMOV)
	// When we can't use the CMOV instruction, it might increase branch
	// mispredicts.
	// When we can use CMOV, or when there is no mispredict, this improves
	// throughput and reduces register pressure.
	//
	if (CC == X86::COND_NE) {
	SDValue Flags;
	X86::CondCode CC0, CC1;
	bool isAndSetCC;
	if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
	// The AND form is handled as the OR form of the negated conditions with
	// the two arms exchanged.
	if (isAndSetCC) {
	std::swap(FalseOp, TrueOp);
	CC0 = X86::GetOppositeBranchCondition(CC0);
	CC1 = X86::GetOppositeBranchCondition(CC1);
	}

	SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
	Flags};
	SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
	SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
	SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
	// Redirect users of N's second result to the second result of the new
	// outer CMOV.
	DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
	return CMOV;
	}
	}

	return SDValue();
	}

	/// Different mul shrinking modes. The name encodes the signedness (S/U) and
	/// the element width (8/16 bits) both multiply operands are known to fit in.
	enum ShrinkMode {
	  MULS8,  ///< Operands fit a signed 8-bit range.
	  MULU8,  ///< Operands fit an unsigned 8-bit range.
	  MULS16, ///< Operands fit a signed 16-bit range.
	  MULU16  ///< Operands fit an unsigned 16-bit range.
	};

	/// Decide whether the vector multiply \p N, whose elements are 32 bits wide,
	/// only needs 8- or 16-bit arithmetic. For each operand the number of known
	/// sign bits and whether it is known non-negative are determined; the
	/// weakest operand then selects the shrink mode written to \p Mode.
	/// Returns false (leaving \p Mode untouched) when no mode applies.
	static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
	  EVT VT = N->getOperand(0).getValueType();
	  if (VT.getScalarSizeInBits() != 32)
	    return false;

	  assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");

	  // Running summary across both operands.
	  unsigned MinSignBits = ~0u;
	  bool AllPositive = true;

	  for (unsigned OpNo = 0; OpNo != 2; ++OpNo) {
	    SDValue Opd = N->getOperand(OpNo);
	    const unsigned Opc = Opd.getOpcode();
	    unsigned OpSignBits;
	    bool OpPositive = false;

	    if (Opc == ISD::ANY_EXTEND) {
	      // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so derive the
	      // count from the source element width instead: for an any-extend it
	      // is safe to assume an appropriate number of leading sign/zero bits.
	      EVT SrcEltVT = Opd.getOperand(0).getValueType().getVectorElementType();
	      if (SrcEltVT == MVT::i8)
	        OpSignBits = 25;
	      else if (SrcEltVT == MVT::i16)
	        OpSignBits = 17;
	      else
	        return false;
	      OpPositive = true;
	    } else if (Opc == ISD::BUILD_VECTOR) {
	      // Every defined element must be an integer constant; find the
	      // tightest range that covers all of them.
	      OpSignBits = 32;
	      OpPositive = true;
	      for (const SDValue &Elt : Opd.getNode()->op_values()) {
	        if (Elt.isUndef())
	          continue;
	        auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	        if (!EltC)
	          return false;
	        APInt EltVal = EltC->getAPIntValue();
	        if (EltVal.isNegative())
	          OpPositive = false;
	        OpSignBits = std::min(OpSignBits, EltVal.getNumSignBits());
	      }
	    } else {
	      OpSignBits = DAG.ComputeNumSignBits(Opd);
	      if (Opc == ISD::ZERO_EXTEND)
	        OpPositive = true;
	    }

	    MinSignBits = std::min(MinSignBits, OpSignBits);
	    AllPositive = AllPositive && OpPositive;
	  }

	  // Ranges -128 ~ 127: MULS8.
	  if (MinSignBits >= 25) {
	    Mode = MULS8;
	    return true;
	  }
	  // Ranges 0 ~ 255: MULU8.
	  if (AllPositive && MinSignBits >= 24) {
	    Mode = MULU8;
	    return true;
	  }
	  // Ranges -32768 ~ 32767: MULS16.
	  if (MinSignBits >= 17) {
	    Mode = MULS16;
	    return true;
	  }
	  // Ranges 0 ~ 65535: MULU16.
	  if (AllPositive && MinSignBits >= 16) {
	    Mode = MULU16;
	    return true;
	  }
	  return false;
	}

	/// When the operands of vector mul are extended from smaller size values,
	/// like i8 and i16, the type of mul may be shrunk to generate more
	/// efficient code. Two typical patterns are handled:
	/// Pattern1:
	///   %2 = sext/zext <N x i8> %1 to <N x i32>
	///   %4 = sext/zext <N x i8> %3 to <N x i32>
	///   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///   %5 = mul <N x i32> %2, %4
	///
	/// Pattern2:
	///   %2 = zext/sext <N x i16> %1 to <N x i32>
	///   %4 = zext/sext <N x i16> %3 to <N x i32>
	///   or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
	///   %5 = mul <N x i32> %2, %4
	///
	/// There are four mul shrinking modes:
	/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
	/// generate pmullw+sext32 for it (MULS8 mode).
	/// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
	/// 0 to 255, and the scalar value range of %4 is also 0 to 255,
	/// generate pmullw+zext32 for it (MULU8 mode).
	/// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
	/// generate pmullw+pmulhw for it (MULS16 mode).
	/// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
	/// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
	/// generate pmullw+pmulhuw for it (MULU16 mode).
	///
	/// Returns the replacement value, or an empty SDValue when the transform is
	/// not legal, not profitable, or not applicable to the operand shapes.
	static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  // Check for legality
	  // pmullw/pmulhw are not supported by SSE.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  // Check for profitability
	  // pmulld is supported since SSE41. It is better to use pmulld
	  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
	  // the expansion.
	  bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
	  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
	    return SDValue();

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(N, DAG, Mode))
	    return SDValue();

	  SDLoc DL(N);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getOperand(0).getValueType();
	  const unsigned NumElts = VT.getVectorNumElements();
	  unsigned RegSize = 128;
	  MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
	  const bool Is8BitMode = (Mode == MULU8 || Mode == MULS8);
	  const bool FillsRegister = NumElts >= OpsVT.getVectorNumElements();

	  // The wide 16-bit path below splits the result into two halves of
	  // NumElts / 2 i32 elements each and re-joins them with CONCAT_VECTORS. With
	  // an odd element count the halves would not cover the vector (and the
	  // bitcasts would change size), so give up before creating any node.
	  if (FillsRegister && !Is8BitMode && (NumElts % 2) != 0)
	    return SDValue();

	  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
	  // Shrink the operands of mul.
	  SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
	  SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);

	  if (FillsRegister) {
	    // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
	    // lower part is needed.
	    SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
	    if (Is8BitMode)
	      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
	                         DL, VT, MulLo);

	    MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
	    // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
	    // the higher part is also needed.
	    SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	                                ReducedVT, NewN0, NewN1);

	    // Repack the lower part and higher part result of mul into a wider
	    // result.
	    // Generate shuffle functioning as punpcklwd.
	    SmallVector<int, 16> ShuffleMask(NumElts);
	    for (unsigned i = 0; i < NumElts / 2; i++) {
	      ShuffleMask[2 * i] = i;
	      ShuffleMask[2 * i + 1] = i + NumElts;
	    }
	    SDValue ResLo =
	        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	    ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
	    // Generate shuffle functioning as punpckhwd.
	    for (unsigned i = 0; i < NumElts / 2; i++) {
	      ShuffleMask[2 * i] = i + NumElts / 2;
	      ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
	    }
	    SDValue ResHi =
	        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
	    ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
	  }

	  // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
	  // to legalize the mul explicitly because implicit legalization for type
	  // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
	  // instructions which will not exist when we explicitly legalize it by
	  // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
	  // <4 x i16> undef).
	  //
	  // Legalize the operands of mul.
	  // FIXME: We may be able to handle non-concatenated vectors by insertion.
	  unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
	  if ((RegSize % ReducedSizeInBits) != 0)
	    return SDValue();

	  // Pad each narrowed operand with undef sub-vectors up to a full register.
	  SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
	                               DAG.getUNDEF(ReducedVT));
	  Ops[0] = NewN0;
	  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
	  Ops[0] = NewN1;
	  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);

	  if (Is8BitMode) {
	    // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
	    // part is needed.
	    SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);

	    // convert the type of mul result to VT.
	    MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	    SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
	                                            : ISD::SIGN_EXTEND_VECTOR_INREG,
	                              DL, ResVT, Mul);
	    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  // Generate the lower and higher part of mul: pmullw and pmulhw/pmulhuw.
	  // For MULU16/MULS16, both parts are needed.
	  SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
	  SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
	                              OpsVT, NewN0, NewN1);

	  // Repack the lower part and higher part result of mul into a wider
	  // result. Make sure the type of mul result is VT.
	  MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
	  SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
	  Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
	  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
	                     DAG.getIntPtrConstant(0, DL));
	}

	/// Expand a multiply of N's first operand by one of a fixed set of constants
	/// (11, 13, 14, 19, 21, 22, 23, 26, 28, 29, 30) into an X86ISD::MUL_IMM by
	/// 3, 5 or 9 combined with a shift or second multiply and one or two
	/// adds/subs of the original operand.
	///
	/// \param MulAmt the constant multiplier.
	/// \param N      the multiply node; only its operand 0 is read.
	/// \param VT     the value type of the nodes to create.
	/// \param DL     the debug location attached to the created nodes.
	/// \returns the replacement value, or an empty SDValue for any other MulAmt.
	static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
	                                 EVT VT, SDLoc DL) {

	  // Builds ((x * Mult) << Shift) +/- x.
	  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(Mult, DL, VT));
	    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
	                         DAG.getConstant(Shift, DL, MVT::i8));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  // Builds ((x * 9) * 3) +/- x.
	  auto combineMulMulAddOrSub = [&](bool isAdd) {
	    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
	                                 DAG.getConstant(9, DL, VT));
	    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
	    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
	                         N->getOperand(0));
	    return Result;
	  };

	  switch (MulAmt) {
	  default:
	    break;
	  case 11:
	    // mul x, 11 => add ((shl (mul x, 5), 1), x)
	    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
	  case 21:
	    // mul x, 21 => add ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
	  case 22:
	    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
	  case 19:
	    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
	    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
	  case 13:
	    // mul x, 13 => add ((shl (mul x, 3), 2), x)
	    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
	  case 23:
	    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
	    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
	  case 14:
	    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
	  case 26:
	    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ false);
	  case 28:
	    // mul x, 28 => add ((mul (mul x, 9), 3), x)
	    return combineMulMulAddOrSub(/*isAdd*/ true);
	  case 29:
	    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
	    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
	                       combineMulMulAddOrSub(/*isAdd*/ true));
	  case 30:
	    // mul x, 30 => sub (sub ((shl x, 5), x), x)
	    return DAG.getNode(
	        ISD::SUB, DL, VT,
	        DAG.getNode(ISD::SUB, DL, VT,
	                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
	                                DAG.getConstant(5, DL, MVT::i8)),
	                    N->getOperand(0)),
	        N->getOperand(0));
	  }
	  return SDValue();
	}

	/// Replace a scalar i32/i64 multiply by a constant with a short sequence of
	/// cheaper operations (e.g. LEA + SHL, LEA + LEA, or shift plus add/sub).
	/// Vector multiplies seen before legalization are forwarded to
	/// reduceVMULWidth instead.
	///
	/// When a replacement is built it is installed through DCI.CombineTo and an
	/// empty SDValue is returned; an empty SDValue is also returned when nothing
	/// was changed.
	static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (DCI.isBeforeLegalize() && VT.isVector())
	    return reduceVMULWidth(N, DAG, Subtarget);

	  if (!MulConstantOptimization)
	    return SDValue();

	  // An imul is usually smaller than the alternative sequence.
	  if (DAG.getMachineFunction().getFunction()->optForMinSize())
	    return SDValue();

	  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
	    return SDValue();

	  if (!(VT == MVT::i64 || VT == MVT::i32))
	    return SDValue();

	  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
	  if (!C)
	    return SDValue();

	  // Multipliers 3, 5, 9 and powers of two are left alone.
	  auto IsShiftOrLeaFactor = [](uint64_t Amt) {
	    return isPowerOf2_64(Amt) || Amt == 3 || Amt == 5 || Amt == 9;
	  };

	  uint64_t MulAmt = C->getZExtValue();
	  if (IsShiftOrLeaFactor(MulAmt))
	    return SDValue();

	  // Split MulAmt as FirstAmt * SecondAmt, preferring the largest of 9/5/3.
	  static const uint64_t LeaFactors[] = {9, 5, 3};
	  uint64_t FirstAmt = 0;
	  uint64_t SecondAmt = 0;
	  for (uint64_t Factor : LeaFactors) {
	    if ((MulAmt % Factor) != 0)
	      continue;
	    FirstAmt = Factor;
	    SecondAmt = MulAmt / Factor;
	    break;
	  }

	  SDLoc DL(N);
	  SDValue X = N->getOperand(0);

	  // Multiply V by Amt using a shift for powers of two and MUL_IMM otherwise.
	  auto ScaleBy = [&](SDValue V, uint64_t Amt) {
	    if (isPowerOf2_64(Amt))
	      return DAG.getNode(ISD::SHL, DL, VT, V,
	                         DAG.getConstant(Log2_64(Amt), DL, MVT::i8));
	    return DAG.getNode(X86ISD::MUL_IMM, DL, VT, V,
	                       DAG.getConstant(Amt, DL, VT));
	  };

	  SDValue NewMul;
	  if (SecondAmt && IsShiftOrLeaFactor(SecondAmt)) {
	    // If the second multiplier is pow2, issue it first. We want the multiply
	    // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
	    // is an add.
	    const bool LoneUseIsAdd =
	        N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD;
	    if (isPowerOf2_64(SecondAmt) && !LoneUseIsAdd)
	      std::swap(FirstAmt, SecondAmt);

	    NewMul = ScaleBy(X, FirstAmt);
	    NewMul = ScaleBy(NewMul, SecondAmt);
	  } else if (!Subtarget.slowLEA()) {
	    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
	  }

	  if (!NewMul) {
	    assert(MulAmt != 0 &&
	           MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
	           "Both cases that could cause potential overflows should have "
	           "already been handled.");
	    int64_t SignMulAmt = C->getSExtValue();
	    const bool InSafeRange = SignMulAmt != INT64_MIN &&
	                             SignMulAmt != INT64_MAX &&
	                             SignMulAmt != -INT64_MAX;
	    if (InSafeRange) {
	      // Work on the magnitude and negate the result afterwards if needed.
	      const bool NeedsNegate = !(SignMulAmt > 0);
	      const int64_t AbsAmt = NeedsNegate ? -SignMulAmt : SignMulAmt;
	      const bool IsPow2PlusOne = isPowerOf2_64(AbsAmt - 1);
	      const bool IsPow2MinusOne = isPowerOf2_64(AbsAmt + 1);

	      if (IsPow2PlusOne) {
	        // (mul x, 2^N + 1) => (add (shl x, N), x)
	        SDValue Shl =
	            DAG.getNode(ISD::SHL, DL, VT, X,
	                        DAG.getConstant(Log2_64(AbsAmt - 1), DL, MVT::i8));
	        NewMul = DAG.getNode(ISD::ADD, DL, VT, X, Shl);
	      } else if (IsPow2MinusOne) {
	        // (mul x, 2^N - 1) => (sub (shl x, N), x)
	        SDValue Shl =
	            DAG.getNode(ISD::SHL, DL, VT, X,
	                        DAG.getConstant(Log2_64(AbsAmt + 1), DL, MVT::i8));
	        NewMul = DAG.getNode(ISD::SUB, DL, VT, Shl, X);
	      }

	      // To negate, subtract the number from zero.
	      if ((IsPow2PlusOne || IsPow2MinusOne) && NeedsNegate)
	        NewMul =
	            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
	    }
	  }

	  if (NewMul)
	    // Do not add new nodes to DAG combiner worklist.
	    DCI.CombineTo(N, NewMul, false);

	  return SDValue();
	}

	/// Target combines for ISD::SHL:
	///  - (shl (and setcc_c, c1), c2) -> (and setcc_c, (c1 << c2)) for scalar
	///    integers, valid because setcc_c is all zeros or all ones;
	///  - (shl V, splat 1) -> (add V, V) for vectors.
	/// Returns an empty SDValue when neither applies.
	static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
	  SDValue Val = N->getOperand(0);
	  SDValue Amt = N->getOperand(1);
	  EVT VT = Val.getValueType();

	  auto *AmtC = dyn_cast<ConstantSDNode>(Amt);
	  const bool IsScalarInt = VT.isInteger() && !VT.isVector();

	  if (IsScalarInt && AmtC && Val.getOpcode() == ISD::AND &&
	      Val.getOperand(1).getOpcode() == ISD::Constant) {
	    SDValue Src = Val.getOperand(0);
	    APInt Mask = cast<ConstantSDNode>(Val.getOperand(1))->getAPIntValue();
	    Mask <<= AmtC->getAPIntValue();

	    // Widened setcc_c values can be handled too, provided the shifted mask
	    // is checked so the rewrite stays semantics preserving. With a zero (or
	    // any) extension the transform is unsafe if c1 << c2 reaches beyond the
	    // width of the underlying setcc_c. Example:
	    //   zext(setcc_c)                 -> i32 0x0000FFFF
	    //   c1                            -> i32 0x0000FFFF
	    //   c2                            -> i32 0x00000001
	    //   (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
	    //   (and setcc_c, (c1 << c2))     -> i32 0x0000FFFE
	    const unsigned SrcOpc = Src.getOpcode();
	    const bool IsExtend = SrcOpc == ISD::SIGN_EXTEND ||
	                          SrcOpc == ISD::ZERO_EXTEND ||
	                          SrcOpc == ISD::ANY_EXTEND;
	    bool MaskIsSafe = false;
	    if (SrcOpc == X86ISD::SETCC_CARRY) {
	      MaskIsSafe = true;
	    } else if (IsExtend &&
	               Src.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
	      // Sign extension keeps the all-ones pattern; the other extensions
	      // require the mask to fit within the narrow source width.
	      MaskIsSafe = SrcOpc == ISD::SIGN_EXTEND ||
	                   Mask.isIntN(Src.getOperand(0).getValueSizeInBits());
	    }

	    if (MaskIsSafe && Mask != 0) {
	      SDLoc DL(N);
	      return DAG.getNode(ISD::AND, DL, VT, Src, DAG.getConstant(Mask, DL, VT));
	    }
	  }

	  // Hardware support for vector shifts is sparse which makes us scalarize the
	  // vector operations in many cases. Also, on sandybridge ADD is faster than
	  // shl.
	  // (shl V, 1) -> add V,V
	  auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt);
	  if (!AmtBV)
	    return SDValue();
	  auto *SplatC = AmtBV->getConstantSplatNode();
	  if (!SplatC)
	    return SDValue();

	  assert(Val.getValueType().isVector() && "Invalid vector shift type");
	  // Every element is shifted by one. In many cases there is no hardware
	  // support for this operation, so express it as an ADD of two values.
	  if (SplatC->getAPIntValue() == 1)
	    return DAG.getNode(ISD::ADD, SDLoc(N), VT, Val, Val);

	  return SDValue();
	}

	/// Target combine for ISD::SRA on scalar integers:
	///   fold (sra (shl a, [56,48,32,24,16]), SarConst)
	///   into (shl (sext_inreg a), [56,48,32,24,16] - SarConst)   or
	///   into (sra (sext_inreg a), SarConst - [56,48,32,24,16])
	///   depending on the sign of (SarConst - [56,48,32,24,16]); when the
	///   difference is zero the sext_inreg alone is returned.
	///
	/// sexts in X86 are MOVs. The MOVs have the same code size as the SHIFTs
	/// they replace (only SHIFT by 1 has lower code size), but they have two
	/// advantages over a SHIFT:
	///   1. MOVs can write to a register that differs from the source.
	///   2. MOVs accept memory operands.
	static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
	  SDValue Shl = N->getOperand(0);
	  SDValue SarAmt = N->getOperand(1);
	  EVT VT = Shl.getValueType();
	  unsigned BitWidth = VT.getSizeInBits();

	  // Scalar integer, constant sra amount.
	  if (!VT.isInteger() || VT.isVector())
	    return SDValue();
	  if (SarAmt.getOpcode() != ISD::Constant)
	    return SDValue();
	  // Single-use shl by a constant as the shifted value.
	  if (Shl.getOpcode() != ISD::SHL || !Shl.hasOneUse())
	    return SDValue();
	  if (Shl.getOperand(1).getOpcode() != ISD::Constant)
	    return SDValue();

	  SDValue Src = Shl.getOperand(0);
	  APInt ShlConst = cast<ConstantSDNode>(Shl.getOperand(1))->getAPIntValue();
	  APInt SarConst = cast<ConstantSDNode>(SarAmt)->getAPIntValue();
	  EVT AmtVT = SarAmt.getValueType();

	  if (SarConst.isNegative())
	    return SDValue();

	  for (MVT SVT : MVT::integer_valuetypes()) {
	    unsigned NarrowBits = SVT.getSizeInBits();
	    // Keep only types with a corresponding sext/zext whose width matches
	    // the shl amount, i.e. ShlConst is one of [56,48,32,24,16].
	    const bool Matches = NarrowBits >= 8 && NarrowBits <= 64 &&
	                         ShlConst == BitWidth - NarrowBits;
	    if (!Matches)
	      continue;

	    SDLoc DL(N);
	    SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Src,
	                               DAG.getValueType(SVT));

	    // Shift still owed once the shl has been absorbed by the sign extension.
	    APInt Residual = SarConst - (BitWidth - NarrowBits);
	    if (Residual == 0)
	      return SExt;
	    if (Residual.isNegative())
	      return DAG.getNode(ISD::SHL, DL, VT, SExt,
	                         DAG.getConstant(-Residual, DL, AmtVT));
	    return DAG.getNode(ISD::SRA, DL, VT, SExt,
	                       DAG.getConstant(Residual, DL, AmtVT));
	  }
	  return SDValue();
	}

	/// \brief Fold a vector logical shift to an all-zeros vector when the shift
	/// amount is a constant splat that is greater than or equal to the element
	/// size in bits. Only v2i64/v4i32/v8i16 are considered, plus
	/// v4i64/v8i32/v16i16 when the subtarget reports hasInt256().
	/// Returns an empty SDValue otherwise.
	static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);

	  const bool Is128BitTy =
	      VT == MVT::v2i64 || VT == MVT::v4i32 || VT == MVT::v8i16;
	  const bool Is256BitTy =
	      Subtarget.hasInt256() &&
	      (VT == MVT::v4i64 || VT == MVT::v8i32 || VT == MVT::v16i16);
	  if (!Is128BitTy && !Is256BitTy)
	    return SDValue();

	  // The amount has to be a build_vector splatting a single constant.
	  auto *AmtBV = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
	  if (!AmtBV)
	    return SDValue();
	  auto *AmtSplat = AmtBV->getConstantSplatNode();
	  if (!AmtSplat)
	    return SDValue();

	  MVT SimpleVT = VT.getSimpleVT();
	  unsigned EltBits = SimpleVT.getScalarSizeInBits();

	  // SSE2/AVX2 logical shifts always return a vector of 0s if the shift
	  // amount is bigger than or equal to the element size. The constant shift
	  // amount will be encoded as a 8-bit immediate, hence the truncation.
	  if (!AmtSplat->getAPIntValue().trunc(8).uge(EltBits))
	    return SDValue();

	  return getZeroVector(SimpleVT, Subtarget, DAG, SDLoc(N));
	}

	/// Dispatch the generic shift combines: SHL nodes go through combineShiftLeft,
	/// SRA nodes through combineShiftRightAlgebraic, and every non-SRA (logical)
	/// shift is additionally offered to performShiftToAllZeros. Returns an empty
	/// SDValue when none of the helpers produced a replacement.
	static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  const unsigned Opc = N->getOpcode();

	  // Opcode-specific combine first.
	  SDValue Folded;
	  switch (Opc) {
	  case ISD::SHL:
	    Folded = combineShiftLeft(N, DAG);
	    break;
	  case ISD::SRA:
	    Folded = combineShiftRightAlgebraic(N, DAG);
	    break;
	  default:
	    break;
	  }
	  if (Folded)
	    return Folded;

	  // Arithmetic shifts never fold to zero; only logical shifts are tried.
	  if (Opc == ISD::SRA)
	    return SDValue();

	  // Try to fold this logical shift into a zero vector.
	  return performShiftToAllZeros(N, DAG, Subtarget);
	}

	/// Combine an X86ISD::VSHLI / X86ISD::VSRLI / X86ISD::VSRAI node, i.e. a
	/// vector shift whose amount (operand 1) is a constant.
	///
	/// The folds are attempted in this order:
	///  - out-of-range logical shifts become a zero vector, out-of-range
	///    arithmetic shifts are clamped to (element bits - 1);
	///  - a shift by zero returns the shifted operand;
	///  - a shift of an all-zeros build vector returns a zero vector;
	///  - (VSRLI (VSRAI X, Y), bits-1) drops the inner VSRAI;
	///  - logical shifts by a multiple of 8 are handed to the recursive shuffle
	///    combiner;
	///  - shifts of constant vectors are constant folded.
	/// Returns the replacement value, or an empty SDValue when there is nothing
	/// for the caller to substitute.
	static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	assert((X86ISD::VSHLI == Opcode \|\| X86ISD::VSRAI == Opcode \|\|
	X86ISD::VSRLI == Opcode) &&
	"Unexpected shift opcode");
	bool LogicalShift = X86ISD::VSHLI == Opcode \|\| X86ISD::VSRLI == Opcode;
	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	unsigned NumBitsPerElt = VT.getScalarSizeInBits();
	assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
	"Unexpected value type");

	// Out of range logical bit shifts are guaranteed to be zero.
	// Out of range arithmetic bit shifts splat the sign bit.
	// The amount is compared after being resized to 8 bits.
	APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
	if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
	if (LogicalShift)
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
	else
	// Clamp to the largest in-range amount; the folds below use this value.
	ShiftVal = NumBitsPerElt - 1;
	}

	// Shift N0 by zero -> N0.
	if (!ShiftVal)
	return N0;

	// Shift zero -> zero.
	if (ISD::isBuildVectorAllZeros(N0.getNode()))
	return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));

	// fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
	// This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
	// TODO - support other sra opcodes as needed.
	if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
	N0.getOpcode() == X86ISD::VSRAI)
	return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);

	// We can decode 'whole byte' logical bit shifts as shuffles.
	if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
	SDValue Op(N, 0);
	SmallVector<int, 1> NonceMask; // Just a placeholder.
	NonceMask.push_back(0);
	if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
	/Depth/ 1, /HasVarMask/ false, DAG,
	DCI, Subtarget))
	return SDValue(); // This routine will use CombineTo to replace N.
	}

	// Constant Folding.
	// Only attempted when this node is the sole user of N0 and N0's bits can be
	// extracted as per-element constants.
	APInt UndefElts;
	SmallVector<APInt, 32> EltBits;
	if (N->isOnlyUserOf(N0.getNode()) &&
	getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
	assert(EltBits.size() == VT.getVectorNumElements() &&
	"Unexpected shift value type");
	unsigned ShiftImm = ShiftVal.getZExtValue();
	// Apply the shift to every extracted element, matching the node's opcode.
	for (APInt &Elt : EltBits) {
	if (X86ISD::VSHLI == Opcode)
	Elt <<= ShiftImm;
	else if (X86ISD::VSRAI == Opcode)
	Elt.ashrInPlace(ShiftImm);
	else
	Elt.lshrInPlace(ShiftImm);
	}
	return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
	}

	return SDValue();
	}

	/// Offer a PINSRB (v16i8) or PINSRW (v8i16) node to the recursive shuffle
	/// combiner. Always returns an empty SDValue; the result of the recursive
	/// combine is not inspected here.
	static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
	                                   TargetLowering::DAGCombinerInfo &DCI,
	                                   const X86Subtarget &Subtarget) {
	  assert(((N->getOpcode() == X86ISD::PINSRB &&
	           N->getValueType(0) == MVT::v16i8) ||
	          (N->getOpcode() == X86ISD::PINSRW &&
	           N->getValueType(0) == MVT::v8i16)) &&
	         "Unexpected vector insertion");

	  // Attempt to combine PINSRB/PINSRW patterns to a shuffle. The root mask is
	  // just a single-element placeholder.
	  SDValue Root(N, 0);
	  SmallVector<int, 1> PlaceholderMask(1, 0);
	  combineX86ShufflesRecursively({Root}, 0, Root, PlaceholderMask, {},
	                                /*Depth*/ 1, /*HasVarMask*/ false, DAG, DCI,
	                                Subtarget);
	  return SDValue();
	}

	/// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
	/// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
	/// OR -> CMPNEQSS.
	///
	/// The rewrite is only done when the subtarget has SSE2, the compared values
	/// are f32 or f64, and no user of \p N is classified as wanting flags.
	/// Returns the i8 replacement value (or the VEXTRACT of an FSETCCM on AVX-512
	/// subtargets), or an empty SDValue if the pattern does not match.
	static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned opcode;

	// SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
	// we're requiring SSE2 for both.
	if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDValue CMP0 = N0->getOperand(1);
	SDValue CMP1 = N1->getOperand(1);
	SDLoc DL(N);

	// The SETCCs should both refer to the same CMP.
	if (CMP0.getOpcode() != X86ISD::CMP \|\| CMP0 != CMP1)
	return SDValue();

	SDValue CMP00 = CMP0->getOperand(0);
	SDValue CMP01 = CMP0->getOperand(1);
	EVT VT = CMP00.getValueType();

	if (VT == MVT::f32 \|\| VT == MVT::f64) {
	bool ExpectingFlags = false;
	// Check for any users that want flags:
	// Only CopyToReg and the integer extensions are accepted; every other
	// user opcode (including the explicitly listed branches and select) is
	// treated as expecting flags and stops the scan.
	for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
	!ExpectingFlags && UI != UE; ++UI)
	switch (UI->getOpcode()) {
	default:
	case ISD::BR_CC:
	case ISD::BRCOND:
	case ISD::SELECT:
	ExpectingFlags = true;
	break;
	case ISD::CopyToReg:
	case ISD::SIGN_EXTEND:
	case ISD::ZERO_EXTEND:
	case ISD::ANY_EXTEND:
	break;
	}

	if (!ExpectingFlags) {
	enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
	enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);

	// Canonicalize so that an E/NE condition, if present on the second
	// setcc, ends up in cc0.
	if (cc1 == X86::COND_E \|\| cc1 == X86::COND_NE) {
	X86::CondCode tmp = cc0;
	cc0 = cc1;
	cc1 = tmp;
	}

	// Only (E, NP) and (NE, P) pairs are rewritten.
	if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) \|\|
	(cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
	// FIXME: need symbolic constants for these magic numbers.
	// See X86ATTInstPrinter.cpp:printSSECC().
	unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
	// With AVX-512 the compare produces a v1i1 mask that is extracted
	// directly into the result type.
	if (Subtarget.hasAVX512()) {
	SDValue FSetCC =
	DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
	DAG.getConstant(x86cc, DL, MVT::i8));
	return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
	FSetCC, DAG.getIntPtrConstant(0, DL));
	}
	SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
	CMP00.getValueType(), CMP00, CMP01,
	DAG.getConstant(x86cc, DL,
	MVT::i8));

	bool is64BitFP = (CMP00.getValueType() == MVT::f64);
	MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;

	if (is64BitFP && !Subtarget.is64Bit()) {
	// On a 32-bit target, we cannot bitcast the 64-bit float to a
	// 64-bit integer, since that's not a legal type. Since
	// OnesOrZeroesF is all ones or all zeroes, we don't need all the
	// bits, but can do this little dance to extract the lowest 32 bits
	// and work with those going forward.
	SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
	OnesOrZeroesF);
	SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
	OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
	Vector32, DAG.getIntPtrConstant(0, DL));
	IntVT = MVT::i32;
	}

	// Reinterpret the FP mask as an integer, keep only bit 0, and narrow
	// the result to i8.
	SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
	SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
	DAG.getConstant(1, DL, IntVT));
	SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
	ANDed);
	return OneBitOfTruth;
	}
	}
	}
	}
	return SDValue();
	}

	/// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
	///
	/// Either operand of the AND may be the inverted one. Only v2i64, v4i64 and
	/// v8i64 results are handled; anything else yields an empty SDValue.
	static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == ISD::AND);

	  EVT VT = N->getValueType(0);
	  if (!(VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64))
	    return SDValue();

	  // True when V is an XOR whose second operand is an all-ones build vector.
	  auto IsInvertedValue = [](SDValue V) {
	    return V.getOpcode() == ISD::XOR &&
	           ISD::isBuildVectorAllOnes(V.getOperand(1).getNode());
	  };

	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  SDLoc DL(N);

	  // The left operand is checked first, then the right one.
	  if (IsInvertedValue(LHS))
	    return DAG.getNode(X86ISD::ANDNP, DL, VT, LHS.getOperand(0), RHS);
	  if (IsInvertedValue(RHS))
	    return DAG.getNode(X86ISD::ANDNP, DL, VT, RHS.getOperand(0), LHS);

	  return SDValue();
	}

	// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
	// register. In most cases we actually compare or select YMM-sized registers
	// and mixing the two types creates horrible code. This method optimizes
	// some of the transition sequences.
	//
	// N is an ANY_EXTEND / ZERO_EXTEND / SIGN_EXTEND producing a 256-bit vector
	// from a 128-bit XOR/AND/OR whose left operand is a TRUNCATE from the result
	// type and whose right operand is either another TRUNCATE or a constant splat
	// build vector. The logic op is re-created on the wide type and the extension
	// is re-expressed on top of it. Returns an empty SDValue when the pattern
	// does not match.
	static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	EVT VT = N->getValueType(0);
	if (!VT.is256BitVector())
	return SDValue();

	assert((N->getOpcode() == ISD::ANY_EXTEND \|\|
	N->getOpcode() == ISD::ZERO_EXTEND \|\|
	N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");

	SDValue Narrow = N->getOperand(0);
	EVT NarrowVT = Narrow->getValueType(0);
	if (!NarrowVT.is128BitVector())
	return SDValue();

	// Only bitwise logic ops are widened.
	if (Narrow->getOpcode() != ISD::XOR &&
	Narrow->getOpcode() != ISD::AND &&
	Narrow->getOpcode() != ISD::OR)
	return SDValue();

	SDValue N0 = Narrow->getOperand(0);
	SDValue N1 = Narrow->getOperand(1);
	SDLoc DL(Narrow);

	// The Left side has to be a trunc.
	if (N0.getOpcode() != ISD::TRUNCATE)
	return SDValue();

	// The type of the truncated inputs.
	// It has to be exactly the type the extension produces.
	EVT WideVT = N0->getOperand(0)->getValueType(0);
	if (WideVT != VT)
	return SDValue();

	// The right side has to be a 'trunc' or a constant vector.
	bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
	ConstantSDNode *RHSConstSplat = nullptr;
	if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
	RHSConstSplat = RHSBV->getConstantSplatNode();
	if (!RHSTrunc && !RHSConstSplat)
	return SDValue();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// The logic op must be legal (or promotable) on the wide type.
	if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
	return SDValue();

	// Set N0 and N1 to hold the inputs to the new wide operation.
	N0 = N0->getOperand(0);
	if (RHSConstSplat) {
	// Zero-extend the splatted scalar to the wide element type and splat it
	// again across the wide vector.
	N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
	SDValue(RHSConstSplat, 0));
	N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
	} else if (RHSTrunc) {
	N1 = N1->getOperand(0);
	}

	// Generate the wide operation.
	SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
	unsigned Opcode = N->getOpcode();
	switch (Opcode) {
	case ISD::ANY_EXTEND:
	return Op;
	case ISD::ZERO_EXTEND: {
	// Keep only the low NarrowVT-element-sized bits of each wide element.
	unsigned InBits = NarrowVT.getScalarSizeInBits();
	APInt Mask = APInt::getAllOnesValue(InBits);
	Mask = Mask.zext(VT.getScalarSizeInBits());
	return DAG.getNode(ISD::AND, DL, VT,
	Op, DAG.getConstant(Mask, DL, VT));
	}
	case ISD::SIGN_EXTEND:
	return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
	Op, DAG.getValueType(NarrowVT));
	default:
	llvm_unreachable("Unexpected opcode");
	}
	}

	/// If both input operands of a logic op are being cast from floating point
	/// types, try to convert this into a floating point logic node to avoid
	/// unnecessary moves from SSE to integer registers.
	///
	/// \p N must be an ISD::AND, ISD::OR or ISD::XOR node (asserted). The fold is
	/// done for i32 results when the subtarget has SSE1 and for i64 results when
	/// it has SSE2, and only when both bitcast sources have the same scalar
	/// floating point type. Returns a bitcast of the new FAND/FOR/FXOR node, or an
	/// empty SDValue when the pattern does not match.
	static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  unsigned FPOpcode = ISD::DELETED_NODE;
	  if (N->getOpcode() == ISD::AND)
	    FPOpcode = X86ISD::FAND;
	  else if (N->getOpcode() == ISD::OR)
	    FPOpcode = X86ISD::FOR;
	  else if (N->getOpcode() == ISD::XOR)
	    FPOpcode = X86ISD::FXOR;

	  assert(FPOpcode != ISD::DELETED_NODE &&
	         "Unexpected input node for FP logic conversion");

	  EVT VT = N->getValueType(0);
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  SDLoc DL(N);

	  // Both operands must be bitcasts.
	  if (N0.getOpcode() != ISD::BITCAST || N1.getOpcode() != ISD::BITCAST)
	    return SDValue();

	  // The integer type must have a same-sized scalar FP type on this subtarget.
	  if (!((Subtarget.hasSSE1() && VT == MVT::i32) ||
	        (Subtarget.hasSSE2() && VT == MVT::i64)))
	    return SDValue();

	  SDValue N00 = N0.getOperand(0);
	  SDValue N10 = N1.getOperand(0);
	  EVT N00Type = N00.getValueType();
	  EVT N10Type = N10.getValueType();

	  // Ensure that both sources have the same scalar FP type. isFloatingPoint()
	  // alone also accepts FP vectors (e.g. a v2f32 bitcast to i64), and the FP
	  // logic node is built with N00Type for both operands, so mismatched or
	  // vector source types must be rejected here.
	  if (N00Type != N10Type || N00Type.isVector() || !N00Type.isFloatingPoint())
	    return SDValue();

	  SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
	  return DAG.getBitcast(VT, FPLogic);
	}

	/// Replace an 'and' of a zero/all-bits vector with a splatted low-bits mask
	/// (Mask == 1 for the x86 lowering of a SETCC + ZEXT) by a logical shift
	/// right, which avoids loading the vector constant mask value.
	///
	/// Both operands are inspected through bitcasts. Returns the shift bitcast
	/// back to the type of \p N, or an empty SDValue when the fold does not apply.
	static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  SDValue Src = peekThroughBitcasts(N->getOperand(0));
	  SDValue MaskOp = peekThroughBitcasts(N->getOperand(1));
	  EVT SrcVT = Src.getValueType();
	  EVT MaskVT = MaskOp.getValueType();

	  // Both sides must share one simple integer type.
	  if (SrcVT != MaskVT || !SrcVT.isSimple() || !SrcVT.isInteger())
	    return SDValue();

	  // The mask has to be a constant splat of contiguous low bits.
	  APInt MaskBits;
	  if (!ISD::isConstantSplatVector(MaskOp.getNode(), MaskBits))
	    return SDValue();
	  if (!MaskBits.isMask())
	    return SDValue();

	  if (!SupportedVectorShiftWithImm(SrcVT.getSimpleVT(), Subtarget, ISD::SRL))
	    return SDValue();

	  // Every element of the source must be all sign bits (all-zeros or all-ones).
	  const unsigned EltBits = SrcVT.getScalarSizeInBits();
	  if (DAG.ComputeNumSignBits(Src) != EltBits)
	    return SDValue();

	  // Shifting right by (EltBits - kept bits) leaves exactly the masked bits.
	  SDLoc DL(N);
	  const unsigned NumKeptBits = MaskBits.countTrailingOnes();
	  SDValue Amount = DAG.getConstant(EltBits - NumKeptBits, DL, MVT::i8);
	  SDValue Shifted = DAG.getNode(X86ISD::VSRLI, DL, SrcVT, Src, Amount);
	  return DAG.getBitcast(N->getValueType(0), Shifted);
	}

	/// Top-level combine for ISD::AND nodes. Nothing is done before operation
	/// legalization. Afterwards the helpers are tried in order (compare-equal
	/// rewrite, FP logic conversion, ANDNP formation, mask-to-shift), then the
	/// recursive shuffle combiner for vectors with byte-multiple elements, and
	/// finally BEXTR formation for i32/i64 shift-and-mask patterns.
	/// Returns the replacement value or an empty SDValue.
	static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
	return R;

	if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
	return ShiftRight;

	EVT VT = N->getValueType(0);
	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	SDLoc DL(N);

	// Attempt to recursively combine a bitmask AND with shuffles.
	if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	SDValue Op(N, 0);
	SmallVector<int, 1> NonceMask; // Just a placeholder.
	NonceMask.push_back(0);
	if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
	/Depth/ 1, /HasVarMask/ false, DAG,
	DCI, Subtarget))
	return SDValue(); // This routine will use CombineTo to replace N.
	}

	// Create BEXTR instructions
	// BEXTR is ((X >> imm) & (2**size-1))
	if (VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// BEXTR is only formed when the subtarget reports BMI or TBM.
	if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
	return SDValue();
	if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
	return SDValue();

	// Both the mask and the shift amount have to be constants.
	ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
	ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
	if (MaskNode && ShiftNode) {
	uint64_t Mask = MaskNode->getZExtValue();
	uint64_t Shift = ShiftNode->getZExtValue();
	if (isMask_64(Mask)) {
	uint64_t MaskSize = countPopulation(Mask);
	// The extracted field must lie entirely inside the value. The control
	// operand carries the start bit in bits [7:0] and the length from bit 8.
	if (Shift + MaskSize <= VT.getSizeInBits())
	return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
	DAG.getConstant(Shift \| (MaskSize << 8), DL,
	VT));
	}
	}
	return SDValue();
	}

	// Try to fold:
	//   (or (and (m, y), (pandn m, x)))
	// into:
	//   (vselect m, x, y)
	// As a special case, try to fold:
	//   (or (and (m, (sub 0, x)), (pandn m, x)))
	// into:
	//   (sub (xor X, M), M)
	//
	// N must be an ISD::OR node (asserted). The fold is only attempted for
	// 128-bit vectors when the subtarget has SSE2 and for 256-bit vectors when it
	// reports hasInt256(), and only when every element of the mask is known to be
	// all sign bits. The general vselect form additionally requires SSE4.1.
	// Returns the replacement value bitcast to the type of N, or an empty SDValue.
	static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
	                                            const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);

	  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
	        (VT.is256BitVector() && Subtarget.hasInt256())))
	    return SDValue();

	  // Canonicalize AND to LHS.
	  if (N1.getOpcode() == ISD::AND)
	    std::swap(N0, N1);

	  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
	  // ANDNP combine allows other combines to happen that prevent matching.
	  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
	    return SDValue();

	  SDValue Mask = N1.getOperand(0);
	  SDValue X = N1.getOperand(1);
	  SDValue Y;
	  if (N0.getOperand(0) == Mask)
	    Y = N0.getOperand(1);
	  if (N0.getOperand(1) == Mask)
	    Y = N0.getOperand(0);

	  // Check to see if the mask appeared in both the AND and ANDNP.
	  if (!Y.getNode())
	    return SDValue();

	  // Look through any bitcasts on X, Y, and Mask.
	  Mask = peekThroughBitcasts(Mask);
	  X = peekThroughBitcasts(X);
	  Y = peekThroughBitcasts(Y);

	  EVT MaskVT = Mask.getValueType();
	  unsigned EltBits = MaskVT.getScalarSizeInBits();

	  // The mask must be an integer vector whose elements are all-ones or
	  // all-zeros (every bit is a copy of the sign bit).
	  // TODO: Attempt to handle floating point cases as well?
	  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
	    return SDValue();

	  SDLoc DL(N);

	  // Try to match:
	  //   (or (and (M, (sub 0, X)), (pandn M, X)))
	  // which is a special case of vselect:
	  //   (vselect M, (sub 0, X), X)
	  // Per:
	  // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
	  // We know that, if fNegate is 0 or 1:
	  //   (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
	  //
	  // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
	  //   ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
	  //   ( M      ? -X : X) == ((X ^     M     ) + (M & 1))
	  // This lets us transform our vselect to:
	  //   (add (xor X, M), (and M, 1))
	  // And further to:
	  //   (sub (xor X, M), M)
	  if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
	      DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
	    // True when N computes (sub 0, V).
	    auto IsNegV = [](SDNode *N, SDValue V) {
	      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
	             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
	    };
	    SDValue V;
	    if (IsNegV(Y.getNode(), X))
	      V = X;
	    else if (IsNegV(X.getNode(), Y))
	      V = Y;

	    if (V) {
	      SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
	      SDValue SubOp2 = Mask;

	      // If the negate was on the false side of the select, then
	      // the operands of the SUB need to be swapped. PR 27251.
	      // This is because the pattern being matched above is
	      // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
	      // but if the pattern matched was
	      // (vselect M, X, (sub (0, X))), that is really negation of the pattern
	      // above, -(vselect M, (sub 0, X), X), and therefore the replacement
	      // pattern also needs to be a negation of the replacement pattern above.
	      // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
	      // sub accomplishes the negation of the replacement pattern.
	      if (V == Y)
	        std::swap(SubOp1, SubOp2);

	      SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
	      return DAG.getBitcast(VT, Res);
	    }
	  }

	  // PBLENDVB is only available on SSE 4.1.
	  if (!Subtarget.hasSSE41())
	    return SDValue();

	  // Pick the byte vector type matching the width of VT. Keying on the vector
	  // width (instead of comparing against v4i64 only) keeps the bitcasts below
	  // size-preserving for every 256-bit type accepted by the check at the top.
	  MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;

	  X = DAG.getBitcast(BlendVT, X);
	  Y = DAG.getBitcast(BlendVT, Y);
	  Mask = DAG.getBitcast(BlendVT, Mask);
	  SDValue Blend = DAG.getSelect(DL, BlendVT, Mask, Y, X);
	  return DAG.getBitcast(VT, Blend);
	}

	// Helper function for combineOrCmpEqZeroToCtlzSrl
	// Transforms:
	//   seteq(cmp x, 0)
	// into:
	//   srl(ctlz x), log2(bitsize(x))
	// Input pattern is checked by caller.
	//
	// Op is the setcc node; its operand 1 is the compare and that compare's
	// operand 0 is x. ExtTy is the type the final value is zero-extended or
	// truncated to.
	static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
	                                          SelectionDAG &DAG) {
	  SDValue Cmp = Op.getOperand(1);
	  SDValue X = Cmp.getOperand(0);
	  EVT VT = X.getValueType();
	  unsigned Log2b = Log2_32(VT.getSizeInBits());
	  SDLoc dl(Op);
	  SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, X);
	  // The result of the shift is true or false, and on X86, the 32-bit
	  // encoding of shr and lzcnt is more desirable.
	  SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
	  // The shift itself is an i32 node, so the amount must not be typed with the
	  // (possibly wider) compare operand type VT. Use i8, which is the shift
	  // amount type the other scalar shift combines in this file expect.
	  SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
	                            DAG.getConstant(Log2b, dl, MVT::i8));
	  return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
	}

	// Try to transform:
	// zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
	// into:
	// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
	// Will also attempt to match more generic cases, eg:
	// zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
	// Only applies if the target supports the FastLZCNT feature.
	//
	// N is the zero-extend node. The chain of ORs is walked from the outermost OR
	// towards the innermost one, which must combine two setcc candidates; the
	// replacement is then rebuilt from the inside out. Returns the new
	// zero-extended value, or an empty SDValue when the pattern does not match.
	static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize() \|\| !Subtarget.getTargetLowering()->isCtlzFast())
	return SDValue();

	// An OR qualifies only when it has a single use.
	auto isORCandidate = [](SDValue N) {
	return (N->getOpcode() == ISD::OR && N->hasOneUse());
	};

	// Check the zero extend is extending to 32-bit or more. The code generated by
	// srl(ctlz) for 16-bit or less variants of the pattern would require extra
	// instructions to clear the upper bits.
	if (!N->hasOneUse() \|\| !N->getSimpleValueType(0).bitsGE(MVT::i32) \|\|
	!isORCandidate(N->getOperand(0)))
	return SDValue();

	// Check the node matches: setcc(eq, cmp 0)
	// NOTE(review): the final bitsGE test inspects the value type of the CMP
	// node itself (operand 1 of the setcc), not the type of the value being
	// compared against zero - confirm that this is the intended check.
	auto isSetCCCandidate = [](SDValue N) {
	return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
	X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
	N->getOperand(1).getOpcode() == X86ISD::CMP &&
	isNullConstant(N->getOperand(1).getOperand(1)) &&
	N->getOperand(1).getValueType().bitsGE(MVT::i32);
	};

	SDNode *OR = N->getOperand(0).getNode();
	SDValue LHS = OR->getOperand(0);
	SDValue RHS = OR->getOperand(1);

	// Save nodes matching or(or, setcc(eq, cmp 0)).
	// Each iteration descends into whichever operand is the nested OR.
	SmallVector<SDNode *, 2> ORNodes;
	while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	(isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
	ORNodes.push_back(OR);
	OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	}

	// The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
	if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) \|\|
	!isORCandidate(SDValue(OR, 0)))
	return SDValue();

	// We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
	// to
	// or(srl(ctlz),srl(ctlz)).
	// The dag combiner can then fold it into:
	// srl(or(ctlz, ctlz)).
	EVT VT = OR->getValueType(0);
	SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
	SDValue Ret, NewRHS;
	if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);

	if (!Ret)
	return SDValue();

	// Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
	// The saved ORs are popped innermost-first, each one wrapping the value
	// built so far together with its lowered setcc operand.
	while (ORNodes.size() > 0) {
	OR = ORNodes.pop_back_val();
	LHS = OR->getOperand(0);
	RHS = OR->getOperand(1);
	// Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
	if (RHS->getOpcode() == ISD::OR)
	std::swap(LHS, RHS);
	EVT VT = OR->getValueType(0);
	SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
	if (!NewRHS)
	return SDValue();
	Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
	}

	// Re-apply the original zero extension on top of the rebuilt OR tree.
	if (Ret)
	Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);

	return Ret;
	}

	/// Top-level combine for ISD::OR nodes. Nothing is done before operation
	/// legalization. Afterwards the helpers are tried in order (compare-equal
	/// rewrite, FP logic conversion, blend formation), and finally i16/i32/i64
	/// ORs of opposite shifts are matched against the double-shift patterns
	/// listed below and turned into X86ISD::SHLD / X86ISD::SHRD.
	/// Returns the replacement value or an empty SDValue.
	static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
	return R;

	if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
	return FPLogic;

	if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
	return R;

	SDValue N0 = N->getOperand(0);
	SDValue N1 = N->getOperand(1);
	EVT VT = N->getValueType(0);

	if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
	return SDValue();

	// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
	bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();

	// SHLD/SHRD instructions have lower register pressure, but on some
	// platforms they have higher latency than the equivalent
	// series of shifts/or that would otherwise be generated.
	// Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
	// have higher latencies and we are not optimizing for size.
	if (!OptForSize && Subtarget.isSHLDSlow())
	return SDValue();

	// Canonicalize so that N0 is the SHL and N1 is the SRL; both must have a
	// single use.
	if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
	std::swap(N0, N1);
	if (N0.getOpcode() != ISD::SHL \|\| N1.getOpcode() != ISD::SRL)
	return SDValue();
	if (!N0.hasOneUse() \|\| !N1.hasOneUse())
	return SDValue();

	// Both shift amounts must be i8; look through a truncate on either one.
	SDValue ShAmt0 = N0.getOperand(1);
	if (ShAmt0.getValueType() != MVT::i8)
	return SDValue();
	SDValue ShAmt1 = N1.getOperand(1);
	if (ShAmt1.getValueType() != MVT::i8)
	return SDValue();
	if (ShAmt0.getOpcode() == ISD::TRUNCATE)
	ShAmt0 = ShAmt0.getOperand(0);
	if (ShAmt1.getOpcode() == ISD::TRUNCATE)
	ShAmt1 = ShAmt1.getOperand(0);

	SDLoc DL(N);
	unsigned Opc = X86ISD::SHLD;
	SDValue Op0 = N0.getOperand(0);
	SDValue Op1 = N1.getOperand(0);
	// If the SHL amount is the derived one (SUB or XOR), this is the SHRD form:
	// swap the roles so that ShAmt1 always holds the derived amount.
	if (ShAmt0.getOpcode() == ISD::SUB \|\|
	ShAmt0.getOpcode() == ISD::XOR) {
	Opc = X86ISD::SHRD;
	std::swap(Op0, Op1);
	std::swap(ShAmt0, ShAmt1);
	}

	// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
	// OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
	// OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
	unsigned Bits = VT.getSizeInBits();
	if (ShAmt1.getOpcode() == ISD::SUB) {
	// Variable amount: ShAmt1 must be (Bits - ShAmt0).
	SDValue Sum = ShAmt1.getOperand(0);
	if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
	SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
	if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op1 = ShAmt1Op1.getOperand(0);
	if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
	return DAG.getNode(Opc, DL, VT,
	Op0, Op1,
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	}
	} else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
	// Constant amounts: the two amounts must add up to the bit width.
	ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
	if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
	return DAG.getNode(Opc, DL, VT,
	N0.getOperand(0), N1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL,
	MVT::i8, ShAmt0));
	} else if (ShAmt1.getOpcode() == ISD::XOR) {
	// XOR form: ShAmt1 must be (ShAmt0 ^ (Bits - 1)) and the other value must
	// already be shifted by one in the same direction.
	SDValue Mask = ShAmt1.getOperand(1);
	if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
	unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
	SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
	if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
	ShAmt1Op0 = ShAmt1Op0.getOperand(0);
	if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
	if (Op1.getOpcode() == InnerShift &&
	isa<ConstantSDNode>(Op1.getOperand(1)) &&
	Op1.getConstantOperandVal(1) == 1) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
	if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
	Op1.getOperand(0) == Op1.getOperand(1)) {
	return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
	DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
	}
	}
	}
	}

	return SDValue();
	}

	/// Generate NEG and CMOV for integer abs.
	///
	/// Matches XOR(ADD(X, Y), Y) with Y == SRA(X, size(X)-1) and replaces it by an
	/// X86ISD::SUB computing 0 - X followed by an X86ISD::CMOV keyed on COND_GE.
	/// Returns an empty SDValue for 8-bit integers and for non-matching nodes.
	static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
	  EVT VT = N->getValueType(0);

	  // Since X86 does not have CMOV for 8-bit integer, we don't convert
	  // 8-bit integer abs to NEG and CMOV.
	  if (VT.isInteger() && VT.getSizeInBits() == 8)
	    return SDValue();

	  SDValue AddOp = N->getOperand(0);
	  SDValue SignOp = N->getOperand(1);
	  SDLoc DL(N);

	  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
	  // and change it to SUB and CMOV.
	  if (!VT.isInteger() || N->getOpcode() != ISD::XOR)
	    return SDValue();
	  if (AddOp.getOpcode() != ISD::ADD || AddOp.getOperand(1) != SignOp)
	    return SDValue();
	  if (SignOp.getOpcode() != ISD::SRA ||
	      SignOp.getOperand(0) != AddOp.getOperand(0))
	    return SDValue();

	  // The SRA has to shift by a constant equal to the bit width minus one.
	  auto *ShAmtC = dyn_cast<ConstantSDNode>(SignOp.getOperand(1));
	  if (!ShAmtC || !(ShAmtC->getAPIntValue() == VT.getSizeInBits() - 1))
	    return SDValue();

	  // Generate SUB & CMOV. The second result of the SUB feeds the CMOV.
	  SDValue X = AddOp.getOperand(0);
	  SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
	                            DAG.getConstant(0, DL, VT), X);
	  SDValue CMovOps[] = {X, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
	                       SDValue(Neg.getNode(), 1)};
	  return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), CMovOps);
	}

	/// Try to turn tests against the signbit in the form of:
	///   XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
	/// into:
	///   SETGT(X, -1)
	///
	/// Only i8 and i1 results are handled, the truncate and the shift must each
	/// have a single use, and the shifted value must be i16, i32 or i64. Returns
	/// an empty SDValue when the pattern does not match.
	static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
	  // This is only worth doing if the output type is i8 or i1.
	  EVT ResultType = N->getValueType(0);
	  if (!(ResultType == MVT::i8 || ResultType == MVT::i1))
	    return SDValue();

	  SDValue TruncOp = N->getOperand(0);
	  SDValue XorRHS = N->getOperand(1);

	  // We should be performing an xor against a truncated shift.
	  if (TruncOp.getOpcode() != ISD::TRUNCATE || !TruncOp.hasOneUse())
	    return SDValue();

	  // Make sure we are performing an xor against one.
	  if (!isOneConstant(XorRHS))
	    return SDValue();

	  // SetCC on x86 zero extends so only act on this if it's a logical shift.
	  SDValue Shift = TruncOp.getOperand(0);
	  if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
	    return SDValue();

	  // Make sure we are truncating from one of i16, i32 or i64.
	  EVT ShiftTy = Shift.getValueType();
	  const bool IsSupportedWidth =
	      ShiftTy == MVT::i16 || ShiftTy == MVT::i32 || ShiftTy == MVT::i64;
	  if (!IsSupportedWidth)
	    return SDValue();

	  // Make sure the shift amount extracts the sign bit.
	  if (!isa<ConstantSDNode>(Shift.getOperand(1)))
	    return SDValue();
	  if (Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
	    return SDValue();

	  // Create a greater-than comparison against -1.
	  // N.B. Using SETGE against 0 works but we want a canonical looking
	  // comparison, using SETGT matches up with what TranslateX86CC.
	  SDLoc DL(N);
	  SDValue Src = Shift.getOperand(0);
	  EVT SrcTy = Src.getValueType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT CmpTy = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
	                                     ResultType);
	  SDValue MinusOne = DAG.getConstant(-1, DL, SrcTy);
	  SDValue Result = DAG.getSetCC(DL, CmpTy, Src, MinusOne, ISD::SETGT);

	  // Widen the comparison result if the setcc type differs from the xor type.
	  if (CmpTy != ResultType)
	    Result = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Result);
	  return Result;
	}

	/// Turn vector tests of the signbit in the form of:
	///   xor (sra X, elt_size(X)-1), -1
	/// into:
	///   pcmpgt X, -1
	///
	/// This should be called before type legalization because the pattern may not
	/// persist after that. Returns an empty SDValue when the type is not covered
	/// by the subtarget or the pattern does not match.
	static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
	                                         const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isSimple())
	    return SDValue();

	  // Each vector type requires a specific subtarget feature.
	  bool HasCompare = false;
	  switch (VT.getSimpleVT().SimpleTy) {
	  case MVT::v16i8:
	  case MVT::v8i16:
	  case MVT::v4i32:
	    HasCompare = Subtarget.hasSSE2();
	    break;
	  case MVT::v2i64:
	    HasCompare = Subtarget.hasSSE42();
	    break;
	  case MVT::v32i8:
	  case MVT::v16i16:
	  case MVT::v8i32:
	  case MVT::v4i64:
	    HasCompare = Subtarget.hasAVX2();
	    break;
	  default:
	    break;
	  }
	  if (!HasCompare)
	    return SDValue();

	  // There must be a shift right algebraic before the xor, and the xor must be a
	  // 'not' operation.
	  SDValue Sra = N->getOperand(0);
	  SDValue AllOnes = N->getOperand(1);
	  if (Sra.getOpcode() != ISD::SRA || !Sra.hasOneUse())
	    return SDValue();
	  if (!ISD::isBuildVectorAllOnes(AllOnes.getNode()))
	    return SDValue();

	  // The shift should be smearing the sign bit across each vector element.
	  auto *AmtVec = dyn_cast<BuildVectorSDNode>(Sra.getOperand(1));
	  if (!AmtVec)
	    return SDValue();

	  EVT EltTy = Sra.getValueType().getVectorElementType();
	  auto *AmtSplat = AmtVec->getConstantSplatNode();
	  if (!AmtSplat)
	    return SDValue();
	  if (AmtSplat->getZExtValue() != EltTy.getSizeInBits() - 1)
	    return SDValue();

	  // Create a greater-than comparison against -1. We don't use the more obvious
	  // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
	  return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Sra.getOperand(0), AllOnes);
	}

	/// Check if truncation with saturation from type \p SrcVT to \p DstVT
	/// is valid for the given \p Subtarget.
	///
	/// Requires AVX-512, a simple vector source of at most 512 bits with 16..64
	/// bit elements, and 8..32 bit destination elements. Sources that are not
	/// 512 bits wide additionally need VLX, and source elements narrower than 32
	/// bits need BWI.
	static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
	                                        const X86Subtarget &Subtarget) {
	  if (!Subtarget.hasAVX512())
	    return false;

	  // FIXME: Scalar type may be supported if we move it to vector register.
	  const bool IsUsableSource =
	      SrcVT.isVector() && SrcVT.isSimple() && SrcVT.getSizeInBits() <= 512;
	  if (!IsUsableSource)
	    return false;

	  const unsigned SrcEltBits = SrcVT.getScalarType().getSizeInBits();
	  const unsigned DstEltBits = DstVT.getScalarType().getSizeInBits();
	  if (SrcEltBits < 16 || SrcEltBits > 64)
	    return false;
	  if (DstEltBits < 8 || DstEltBits > 32)
	    return false;

	  // Narrower-than-512-bit sources are only accepted with VLX.
	  if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
	    return false;

	  return SrcEltBits >= 32 || Subtarget.hasBWI();
	}

	/// Detect a pattern of truncation with saturation:
	/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
	/// \p In is the value being truncated and \p VT the destination type.
	/// Return the source value to be truncated or SDValue() if the pattern was not
	/// matched.
	static SDValue detectUSatPattern(SDValue In, EVT VT) {
	  if (In.getOpcode() != ISD::UMIN)
	    return SDValue();

	  // Saturation with truncation. We truncate from InVT to VT.
	  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
	         "Unexpected types for truncate operation");

	  // The clamp value has to be a constant splat.
	  APInt ClampVal;
	  if (!ISD::isConstantSplatVector(In.getOperand(1).getNode(), ClampVal))
	    return SDValue();

	  // The clamp should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
	  // the element size of the destination type.
	  if (!ClampVal.isMask(VT.getScalarSizeInBits()))
	    return SDValue();

	  return In.getOperand(0);
	}

	/// Match a truncation with unsigned saturation,
	///   (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type),
	/// but only when the source/destination types pass
	/// isSATValidOnAVX512Subtarget (so a VPMOVUS* instruction can be used).
	/// Returns the source value to be truncated, or an empty SDValue if either
	/// the type check or the pattern match fails.
	static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
	                                       const X86Subtarget &Subtarget) {
	  if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
	    return detectUSatPattern(In, VT);
	  return SDValue();
	}

	/// Try to turn a truncate of \p In to \p VT into an X86ISD::VTRUNCUS node.
	/// This succeeds only when both types are legal, \p In matches the unsigned
	/// saturation pattern, and the type pair passes
	/// isSATValidOnAVX512Subtarget. Returns an empty SDValue otherwise.
	static SDValue
	combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
	                        const X86Subtarget &Subtarget) {
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	  EVT InVT = In.getValueType();
	  if (!(TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT)))
	    return SDValue();

	  SDValue USatVal = detectUSatPattern(In, VT);
	  if (!USatVal)
	    return SDValue();

	  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
	    return SDValue();

	  return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
	}

	/// This function detects the AVG pattern between vectors of unsigned i8/i16,
	/// which is c = (a + b + 1) / 2, and replaces this operation with the
	/// efficient X86ISD::AVG instruction.
	///
	/// \p In is the (wider) value that is about to be truncated and \p VT is the
	/// truncated result type. Returns the X86ISD::AVG node of type \p VT when
	/// the pattern matches, and an empty SDValue otherwise.
	static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
	const X86Subtarget &Subtarget,
	const SDLoc &DL) {
	if (!VT.isVector() \|\| !VT.isSimple())
	return SDValue();
	EVT InVT = In.getValueType();
	unsigned NumElems = VT.getVectorNumElements();

	// Only i8/i16 element types with a power-of-two element count are handled.
	EVT ScalarVT = VT.getVectorElementType();
	if (!((ScalarVT == MVT::i8 \|\| ScalarVT == MVT::i16) &&
	isPowerOf2_32(NumElems)))
	return SDValue();

	// InScalarVT is the intermediate type in AVG pattern and it should be greater
	// than the original input type (i8/i16).
	EVT InScalarVT = InVT.getVectorElementType();
	if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
	return SDValue();

	// Limit the result vector width by subtarget: 512 bits with BWI, 256 bits
	// with AVX2, 128 bits otherwise (SSE2 is the minimum requirement).
	if (!Subtarget.hasSSE2())
	return SDValue();
	if (Subtarget.hasBWI()) {
	if (VT.getSizeInBits() > 512)
	return SDValue();
	} else if (Subtarget.hasAVX2()) {
	if (VT.getSizeInBits() > 256)
	return SDValue();
	} else {
	if (VT.getSizeInBits() > 128)
	return SDValue();
	}

	// Detect the following pattern:
	//
	// %1 = zext <N x i8> %a to <N x i32>
	// %2 = zext <N x i8> %b to <N x i32>
	// %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
	// %4 = add nuw nsw <N x i32> %3, %2
	// %5 = lshr <N x i32> %4, <i32 1 x N>
	// %6 = trunc <N x i32> %5 to <N x i8>
	//
	// In AVX512, the last instruction can also be a trunc store.

	if (In.getOpcode() != ISD::SRL)
	return SDValue();

	// A lambda checking the given SDValue is a constant vector and each element
	// is in the range [Min, Max]. Any operand that is not a ConstantSDNode
	// makes the check fail.
	auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
	BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
	if (!BV \|\| !BV->isConstant())
	return false;
	for (SDValue Op : V->ops()) {
	ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
	if (!C)
	return false;
	uint64_t Val = C->getZExtValue();
	if (Val < Min \|\| Val > Max)
	return false;
	}
	return true;
	};

	// Check if each element of the vector is logically right-shifted by one.
	auto LHS = In.getOperand(0);
	auto RHS = In.getOperand(1);
	if (!IsConstVectorInRange(RHS, 1, 1))
	return SDValue();
	if (LHS.getOpcode() != ISD::ADD)
	return SDValue();

	// Detect a pattern of a + b + 1 where the order doesn't matter.
	SDValue Operands[3];
	Operands[0] = LHS.getOperand(0);
	Operands[1] = LHS.getOperand(1);

	// Take care of the case when one of the operands is a constant vector whose
	// elements are in the range [1, 256] for i8 or [1, 65536] for i16. Here the
	// "+ 1" has been folded into the constant, so the add has only two operands.
	if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
	Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
	Operands[0].getOperand(0).getValueType() == VT) {
	// The pattern is detected. Subtract one from the constant vector, then
	// demote it and emit X86ISD::AVG instruction.
	SDValue VecOnes = DAG.getConstant(1, DL, InVT);
	Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
	Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
	return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
	Operands[1]);
	}

	// Otherwise one operand of the outer add must itself be an add. Move it
	// into Operands[1] and then split it into Operands[1] and Operands[2].
	if (Operands[0].getOpcode() == ISD::ADD)
	std::swap(Operands[0], Operands[1]);
	else if (Operands[1].getOpcode() != ISD::ADD)
	return SDValue();
	Operands[2] = Operands[1].getOperand(0);
	Operands[1] = Operands[1].getOperand(1);

	// Now we have three operands of two additions. Check that one of them is a
	// constant vector with ones, and the other two are promoted from i8/i16.
	for (int i = 0; i < 3; ++i) {
	if (!IsConstVectorInRange(Operands[i], 1, 1))
	continue;
	// Park the all-ones vector in the last slot so that slots 0 and 1 hold the
	// two candidate inputs.
	std::swap(Operands[i], Operands[2]);

	// Check if Operands[0] and Operands[1] are results of type promotion.
	for (int j = 0; j < 2; ++j)
	if (Operands[j].getOpcode() != ISD::ZERO_EXTEND \|\|
	Operands[j].getOperand(0).getValueType() != VT)
	return SDValue();

	// The pattern is detected, emit X86ISD::AVG instruction.
	return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
	Operands[1].getOperand(0));
	}

	return SDValue();
	}

	/// DAG combine for ISD::LOAD nodes.
	///
	/// After operation legalization, a non-extending 256-bit vector load is split
	/// into two 128-bit loads (joined with insert128BitVector and a TokenFactor)
	/// when either
	///  - the load is non-temporal, at least 16-byte aligned, and the subtarget
	///    lacks Int256 (such 32-byte loads would otherwise lower to regular
	///    temporal loads), or
	///  - the target allows the access but reports it as not fast.
	///
	/// Returns the result of DCI.CombineTo on success and an empty SDValue when
	/// no transform applies.
	static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  LoadSDNode *Ld = cast<LoadSDNode>(N);
	  EVT RegVT = Ld->getValueType(0);
	  EVT MemVT = Ld->getMemoryVT();
	  SDLoc dl(Ld);
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // For chips with slow 32-byte unaligned loads, break the 32-byte operation
	  // into two 16-byte operations. Also split non-temporal aligned loads on
	  // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
	  ISD::LoadExtType Ext = Ld->getExtensionType();
	  unsigned AddressSpace = Ld->getAddressSpace();
	  unsigned Alignment = Ld->getAlignment();
	  if (!(RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
	        Ext == ISD::NON_EXTLOAD))
	    return SDValue();

	  bool ShouldSplit =
	      Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16;
	  if (!ShouldSplit) {
	    // Only ask the target when the non-temporal rule did not already apply.
	    bool Fast = false;
	    ShouldSplit =
	        TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
	                               AddressSpace, Alignment, &Fast) &&
	        !Fast;
	  }
	  if (!ShouldSplit)
	    return SDValue();

	  unsigned NumElems = RegVT.getVectorNumElements();
	  if (NumElems < 2)
	    return SDValue();

	  SDValue Ptr = Ld->getBasePtr();

	  EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
	                                NumElems / 2);

	  // Lower half: same address, pointer info and alignment as the original.
	  SDValue Load1 =
	      DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
	                  Alignment, Ld->getMemOperand()->getFlags());

	  // Upper half: 16 bytes further on. The pointer info must carry the same
	  // 16-byte offset as the address so that the memory operand describes the
	  // bytes actually accessed; the alignment can be at most 16.
	  Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
	  SDValue Load2 =
	      DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
	                  Ld->getPointerInfo().getWithOffset(16),
	                  std::min(16U, Alignment), Ld->getMemOperand()->getFlags());

	  // Both half-loads hang off the original chain; merge their output chains.
	  SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
	                           Load1.getValue(1), Load2.getValue(1));

	  SDValue NewVec = DAG.getUNDEF(RegVT);
	  NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
	  NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
	  return DCI.CombineTo(N, NewVec, TF, true);
	}

	/// Given a build vector \p V of i1 constants, return the operand index of the
	/// single element that is true. Undef elements are skipped. Returns -1 if
	/// \p V is not such a build vector, if any defined element is not a constant,
	/// or if the number of true elements is not exactly one.
	static int getOneTrueElt(SDValue V) {
	  // This needs to be a build vector of booleans.
	  // TODO: Checking for the i1 type matches the IR definition for the mask,
	  // but the mask check could be loosened to i8 or other types. That might
	  // also require checking more than 'allOnesValue'; eg, the x86 HW
	  // instructions only require that the MSB is set for each mask element.
	  // The ISD::MSTORE comments/definition do not specify how the mask operand
	  // is formatted.
	  auto *MaskBV = dyn_cast<BuildVectorSDNode>(V);
	  if (!MaskBV)
	    return -1;
	  if (MaskBV->getValueType(0).getVectorElementType() != MVT::i1)
	    return -1;

	  int FoundIdx = -1;
	  unsigned EltCount = MaskBV->getValueType(0).getVectorNumElements();
	  for (unsigned Idx = 0; Idx != EltCount; ++Idx) {
	    const SDValue &Elt = MaskBV->getOperand(Idx);
	    if (Elt.isUndef())
	      continue;

	    auto *EltC = dyn_cast<ConstantSDNode>(Elt);
	    if (!EltC)
	      return -1;

	    // Elements that are not all-ones do not count as true.
	    if (!EltC->getAPIntValue().isAllOnesValue())
	      continue;

	    // A second true element disqualifies the mask.
	    if (FoundIdx >= 0)
	      return -1;
	    FoundIdx = Idx;
	  }
	  return FoundIdx;
	}

	/// For a masked load/store \p MaskedOp whose mask has exactly one true
	/// element (as decided by getOneTrueElt), compute what is needed to access
	/// that element as a scalar and return true:
	///  - \p Addr: base pointer plus the element's byte offset,
	///  - \p Index: the element's vector index as a pointer-sized constant,
	///  - \p Alignment: MinAlign of the original alignment and the element's
	///    store size.
	/// Returns false, leaving the outputs untouched, when the mask does not have
	/// exactly one true element.
	static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
	                                         SelectionDAG &DAG, SDValue &Addr,
	                                         SDValue &Index, unsigned &Alignment) {
	  const int EltIdx = getOneTrueElt(MaskedOp->getMask());
	  if (EltIdx < 0)
	    return false;

	  EVT MemEltVT = MaskedOp->getMemoryVT().getVectorElementType();

	  // Element 0 lives at the base pointer itself; every other element needs
	  // an explicit add of its byte offset.
	  SDValue ScalarAddr = MaskedOp->getBasePtr();
	  if (EltIdx != 0) {
	    unsigned ByteOffset = EltIdx * MemEltVT.getStoreSize();
	    ScalarAddr =
	        DAG.getMemBasePlusOffset(ScalarAddr, ByteOffset, SDLoc(MaskedOp));
	  }
	  Addr = ScalarAddr;

	  Index = DAG.getIntPtrConstant(EltIdx, SDLoc(MaskedOp));
	  Alignment = MinAlign(MaskedOp->getAlignment(), MemEltVT.getStoreSize());
	  return true;
	}

	/// If exactly one element of the mask is set for a non-extending masked load,
	/// it is a scalar load and vector insert.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// Returns the result of DCI.CombineTo (the INSERT_VECTOR_ELT into the
	/// pass-through vector plus the scalar load's chain) on success, or an empty
	/// SDValue when the mask does not have exactly one true element.
	static SDValue
	reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                             TargetLowering::DAGCombinerInfo &DCI) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  EVT EltVT = VT.getVectorElementType();

	  // Addr points at base + (index * element store size). Apply the same byte
	  // offset to the pointer info so the new memory operand describes the
	  // element actually loaded rather than the start of the vector.
	  // VecIndex was created with getIntPtrConstant, so it is a ConstantSDNode.
	  uint64_t EltIdx = cast<ConstantSDNode>(VecIndex)->getZExtValue();
	  uint64_t ByteOffset =
	      EltIdx * ML->getMemoryVT().getVectorElementType().getStoreSize();

	  // Load the one scalar element that is specified by the mask using the
	  // appropriate offset from the base pointer.
	  SDValue Load =
	      DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
	                  ML->getPointerInfo().getWithOffset(ByteOffset), Alignment,
	                  ML->getMemOperand()->getFlags());

	  // Insert the loaded element into the appropriate place in the vector.
	  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
	                               Load, VecIndex);
	  return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
	}

	/// Simplify a masked load whose mask is a build vector of constants.
	///
	/// If neither the first nor the last mask element is a null constant, the
	/// masked load is replaced by a plain vector load followed by a select with
	/// the pass-through value. Otherwise, unless the pass-through is already
	/// undef, it is replaced by a masked load with an undef pass-through followed
	/// by a select with the original pass-through. Returns an empty SDValue when
	/// neither rewrite applies.
	static SDValue
	combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
	                              TargetLowering::DAGCombinerInfo &DCI) {
	  SDValue Mask = ML->getMask();
	  if (!ISD::isBuildVectorOfConstantSDNodes(Mask.getNode()))
	    return SDValue();

	  SDLoc DL(ML);
	  EVT VT = ML->getValueType(0);
	  SDValue PassThru = ML->getSrc0();

	  // If we are loading the first and last elements of a vector, it is safe and
	  // always faster to load the whole vector. Replace the masked load with a
	  // vector load and select.
	  auto *MaskBV = cast<BuildVectorSDNode>(Mask);
	  const unsigned LastIdx = VT.getVectorNumElements() - 1;
	  const bool SkipsFirstElt = isNullConstant(MaskBV->getOperand(0));
	  const bool SkipsLastElt = isNullConstant(MaskBV->getOperand(LastIdx));
	  if (!SkipsFirstElt && !SkipsLastElt) {
	    SDValue FullLoad = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
	                                   ML->getMemOperand());
	    SDValue Select = DAG.getSelect(DL, VT, Mask, FullLoad, PassThru);
	    return DCI.CombineTo(ML, Select, FullLoad.getValue(1), true);
	  }

	  // Convert a masked load with a constant mask into a masked load and a select.
	  // This allows the select operation to use a faster kind of select instruction
	  // (for example, vblendvps -> vblendps).

	  // Bail out if the pass-through operand is already undefined: the node we
	  // would build is exactly that form, so combining again would never stop.
	  if (PassThru.isUndef())
	    return SDValue();

	  // The new masked load has an undef pass-through operand. The select uses the
	  // original pass-through operand.
	  SDValue UndefPassThruLoad =
	      DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(), Mask,
	                        DAG.getUNDEF(VT), ML->getMemoryVT(),
	                        ML->getMemOperand(), ML->getExtensionType());
	  SDValue Select = DAG.getSelect(DL, VT, Mask, UndefPassThruLoad, PassThru);

	  return DCI.CombineTo(ML, Select, UndefPassThruLoad.getValue(1), true);
	}

	/// DAG combine for masked load nodes.
	///
	/// Expanding loads are left alone. For a non-extending load this first tries
	/// reduceMaskedLoadToScalarLoad and then, on subtargets without AVX512,
	/// combineMaskedLoadConstantMask. Of the extending kinds only SEXTLOAD is
	/// handled further: it is rewritten as a non-extending masked load of a vector
	/// with the memory element type (same total width as the result type)
	/// followed by an X86ISD::VSEXT extend. Returns an empty SDValue when nothing
	/// applies.
	static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);

	// TODO: Expanding load with constant mask may be optimized as well.
	if (Mld->isExpandingLoad())
	return SDValue();

	if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
	if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
	return ScalarLoad;
	// TODO: Do some AVX512 subsets benefit from this transform?
	if (!Subtarget.hasAVX512())
	if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
	return Blend;
	}

	// Everything below only deals with sign-extending loads.
	if (Mld->getExtensionType() != ISD::SEXTLOAD)
	return SDValue();

	// Resolve extending loads.
	EVT VT = Mld->getValueType(0);
	unsigned NumElems = VT.getVectorNumElements();
	EVT LdVT = Mld->getMemoryVT();
	SDLoc dl(Mld);

	assert(LdVT != VT && "Cannot extend to the same type");
	unsigned ToSz = VT.getScalarSizeInBits();
	unsigned FromSz = LdVT.getScalarSizeInBits();
	// From/To sizes and ElemCount must be pow of two.
	assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	"Unexpected size for extending masked load");

	// Number of memory-sized elements that fit in one result element.
	unsigned SizeRatio = ToSz / FromSz;
	assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	LdVT.getScalarType(), NumElems*SizeRatio);
	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Convert Src0 value.
	SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
	if (!Mld->getSrc0().isUndef()) {
	// Gather narrow element (i * SizeRatio) of each original pass-through
	// element into the first NumElems lanes; the remaining lanes stay undef
	// (-1).
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	"WideVecVT should be legal");
	WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
	DAG.getUNDEF(WideVecVT), ShuffleVec);
	}
	// Prepare the new mask.
	SDValue NewMask;
	SDValue Mask = Mld->getMask();
	if (Mask.getValueType() == VT) {
	// Mask and original value have the same type.
	NewMask = DAG.getBitcast(WideVecVT, Mask);
	// Same gather as for Src0, but the upper lanes select index
	// NumElems * SizeRatio, i.e. element 0 of the second shuffle operand,
	// which is the all-zeros vector passed below.
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;
	for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
	ShuffleVec[i] = NumElems * SizeRatio;
	NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	DAG.getConstant(0, dl, WideVecVT),
	ShuffleVec);
	} else {
	// i1 mask: widen it by concatenating the original mask with zero vectors.
	assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	unsigned WidenNumElts = NumElems*SizeRatio;
	unsigned MaskNumElts = VT.getVectorNumElements();
	EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	WidenNumElts);

	unsigned NumConcat = WidenNumElts / MaskNumElts;
	SmallVector<SDValue, 16> Ops(NumConcat);
	SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	Ops[0] = Mask;
	for (unsigned i = 1; i != NumConcat; ++i)
	Ops[i] = ZeroVal;

	NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	}

	// Emit the non-extending masked load on the wide type, then sign-extend its
	// low elements to the original result type.
	SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
	Mld->getBasePtr(), NewMask, WideSrc0,
	Mld->getMemoryVT(), Mld->getMemOperand(),
	ISD::NON_EXTLOAD);
	SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
	return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
	}

	/// If exactly one element of the mask is set for a non-truncating masked store,
	/// it is a vector extract and scalar store.
	/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
	/// mask have already been optimized in IR, so we don't bother with those here.
	///
	/// Returns the scalar store node on success, or an empty SDValue when the
	/// mask does not have exactly one true element.
	static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
	                                              SelectionDAG &DAG) {
	  // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
	  // However, some target hooks may need to be added to know when the transform
	  // is profitable. Endianness would also have to be considered.

	  SDValue Addr, VecIndex;
	  unsigned Alignment;
	  if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
	    return SDValue();

	  // Extract the one scalar element that is actually being stored.
	  SDLoc DL(MS);
	  EVT VT = MS->getValue().getValueType();
	  EVT EltVT = VT.getVectorElementType();
	  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
	                                MS->getValue(), VecIndex);

	  // Addr points at base + (index * element store size). Apply the same byte
	  // offset to the pointer info so the new memory operand describes the
	  // element actually stored rather than the start of the vector.
	  // VecIndex was created with getIntPtrConstant, so it is a ConstantSDNode.
	  uint64_t EltIdx = cast<ConstantSDNode>(VecIndex)->getZExtValue();
	  uint64_t ByteOffset =
	      EltIdx * MS->getMemoryVT().getVectorElementType().getStoreSize();

	  // Store that element at the appropriate offset from the base pointer.
	  return DAG.getStore(MS->getChain(), DL, Extract, Addr,
	                      MS->getPointerInfo().getWithOffset(ByteOffset),
	                      Alignment, MS->getMemOperand()->getFlags());
	}

	/// DAG combine for masked store nodes.
	///
	/// Compressing stores are left alone. A non-truncating store is handed to
	/// reduceMaskedStoreToScalarStore. A truncating store that the target does not
	/// report as legal is rewritten: the stored value is bitcast to a vector of
	/// the memory element type, the low part of every original element is
	/// shuffled to the bottom of that vector, the mask is widened to match, and
	/// a new masked store of that vector is emitted. Returns an empty SDValue when
	/// nothing applies.
	/// NOTE(review): \p Subtarget is not referenced in this function body.
	static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);

	if (Mst->isCompressingStore())
	return SDValue();

	if (!Mst->isTruncatingStore())
	return reduceMaskedStoreToScalarStore(Mst, DAG);

	// Resolve truncating stores.
	EVT VT = Mst->getValue().getValueType();
	unsigned NumElems = VT.getVectorNumElements();
	EVT StVT = Mst->getMemoryVT();
	SDLoc dl(Mst);

	assert(StVT != VT && "Cannot truncate to the same type");
	unsigned FromSz = VT.getScalarSizeInBits();
	unsigned ToSz = StVT.getScalarSizeInBits();

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// The truncating store is legal in some cases. For example
	// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	// are designated for truncate store.
	// In this case we don't need any further transformations.
	if (TLI.isTruncStoreLegal(VT, StVT))
	return SDValue();

	// From/To sizes and ElemCount must be pow of two.
	assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
	"Unexpected size for truncating masked store");
	// We are going to use the original vector elt for storing.
	// Accumulated smaller vector elements must be a multiple of the store size.
	assert (((NumElems * FromSz) % ToSz) == 0 &&
	"Unexpected ratio for truncating masked store");

	// Number of memory-sized elements that fit in one source element.
	unsigned SizeRatio = FromSz / ToSz;
	assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle.
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	StVT.getScalarType(), NumElems*SizeRatio);

	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Gather narrow element (i * SizeRatio) of each source element into the
	// first NumElems lanes; the remaining lanes stay undef (-1).
	SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
	SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
	"WideVecVT should be legal");

	SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	DAG.getUNDEF(WideVecVT),
	ShuffleVec);

	SDValue NewMask;
	SDValue Mask = Mst->getMask();
	if (Mask.getValueType() == VT) {
	// Mask and original value have the same type.
	NewMask = DAG.getBitcast(WideVecVT, Mask);
	// Reuse ShuffleVec: same gather for the low lanes, while the upper lanes
	// select index NumElems*SizeRatio, i.e. element 0 of the second shuffle
	// operand, which is the all-zeros vector passed below.
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;
	for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
	ShuffleVec[i] = NumElems*SizeRatio;
	NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
	DAG.getConstant(0, dl, WideVecVT),
	ShuffleVec);
	} else {
	// i1 mask: widen it by concatenating the original mask with zero vectors.
	assert(Mask.getValueType().getVectorElementType() == MVT::i1);
	unsigned WidenNumElts = NumElems*SizeRatio;
	unsigned MaskNumElts = VT.getVectorNumElements();
	EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
	WidenNumElts);

	unsigned NumConcat = WidenNumElts / MaskNumElts;
	SmallVector<SDValue, 16> Ops(NumConcat);
	SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
	Ops[0] = Mask;
	for (unsigned i = 1; i != NumConcat; ++i)
	Ops[i] = ZeroVal;

	NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
	}

	// Emit the replacement masked store of the packed value.
	// NOTE(review): the trailing 'false' is presumably the "is truncating"
	// flag of getMaskedStore - confirm against its declaration.
	return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
	Mst->getBasePtr(), NewMask, StVT,
	Mst->getMemOperand(), false);
	}

	/// DAG combine for ISD::STORE nodes. In order, this tries to:
	///  1. split a non-truncating 256-bit vector store into two 128-bit stores
	///     when the target allows the access but reports it as not fast;
	///  2. for truncating vector stores: use X86ISD::AVG or an unsigned
	///     saturating truncating store when those patterns match, otherwise
	///     (if the truncating store is neither legal nor custom) pack the
	///     truncated elements with a shuffle and store them in larger chunks;
	///  3. turn a 64-bit load->store pair (vector types, or i64 on 32-bit
	///     targets with usable f64) into i64/f64 or two i32 load/store pairs;
	///  4. on 32-bit targets with usable f64, store an i64 extracted from a
	///     vector as an f64 extract.
	/// Returns the replacement node, or an empty SDValue when nothing applies.
	static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	StoreSDNode *St = cast<StoreSDNode>(N);
	EVT VT = St->getValue().getValueType();
	EVT StVT = St->getMemoryVT();
	SDLoc dl(St);
	SDValue StoredVal = St->getOperand(1);
	const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	// If we are saving a concatenation of two XMM registers and 32-byte stores
	// are slow, such as on Sandy Bridge, perform two 16-byte stores.
	bool Fast;
	unsigned AddressSpace = St->getAddressSpace();
	unsigned Alignment = St->getAlignment();
	if (VT.is256BitVector() && StVT == VT &&
	TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
	AddressSpace, Alignment, &Fast) &&
	!Fast) {
	unsigned NumElems = VT.getVectorNumElements();
	if (NumElems < 2)
	return SDValue();

	SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
	SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);

	SDValue Ptr0 = St->getBasePtr();
	SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);

	SDValue Ch0 =
	DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
	Alignment, St->getMemOperand()->getFlags());
	// The upper half is written 16 bytes past the base, so its pointer info
	// must carry the same offset as Ptr1.
	SDValue Ch1 =
	DAG.getStore(St->getChain(), dl, Value1, Ptr1,
	St->getPointerInfo().getWithOffset(16),
	std::min(16U, Alignment), St->getMemOperand()->getFlags());
	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
	}

	// Optimize trunc store (of multiple scalars) to shuffle and store.
	// First, pack all of the elements in one place. Next, store to memory
	// in fewer chunks.
	if (St->isTruncatingStore() && VT.isVector()) {
	// Check if we can detect an AVG pattern from the truncation. If yes,
	// replace the trunc store by a normal store with the result of X86ISD::AVG
	// instruction.
	if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
	Subtarget, dl))
	return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
	St->getPointerInfo(), St->getAlignment(),
	St->getMemOperand()->getFlags());

	if (SDValue Val =
	detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
	return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
	dl, Val, St->getBasePtr(),
	St->getMemoryVT(), St->getMemOperand(), DAG);

	const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	unsigned NumElems = VT.getVectorNumElements();
	assert(StVT != VT && "Cannot truncate to the same type");
	unsigned FromSz = VT.getScalarSizeInBits();
	unsigned ToSz = StVT.getScalarSizeInBits();

	// The truncating store is legal in some cases. For example
	// vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
	// are designated for truncate store.
	// In this case we don't need any further transformations.
	if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
	return SDValue();

	// From, To sizes and ElemCount must be pow of two
	if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
	// We are going to use the original vector elt for storing.
	// Accumulated smaller vector elements must be a multiple of the store size.
	if (0 != (NumElems * FromSz) % ToSz) return SDValue();

	unsigned SizeRatio = FromSz / ToSz;

	assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());

	// Create a type on which we perform the shuffle
	EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
	StVT.getScalarType(), NumElems*SizeRatio);

	assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());

	// Gather narrow element (i * SizeRatio) of each source element into the
	// first NumElems lanes; the remaining lanes stay undef (-1).
	SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
	SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
	for (unsigned i = 0; i != NumElems; ++i)
	ShuffleVec[i] = i * SizeRatio;

	// Can't shuffle using an illegal type.
	if (!TLI.isTypeLegal(WideVecVT))
	return SDValue();

	SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
	DAG.getUNDEF(WideVecVT),
	ShuffleVec);
	// At this point all of the data is stored at the bottom of the
	// register. We now need to save it to mem.

	// Find the largest store unit
	MVT StoreType = MVT::i8;
	for (MVT Tp : MVT::integer_valuetypes()) {
	if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
	StoreType = Tp;
	}

	// On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
	if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
	(64 <= NumElems * ToSz))
	StoreType = MVT::f64;

	// Bitcast the original vector into a vector of store-size units
	EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
	StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
	assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
	SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
	SmallVector<SDValue, 8> Chains;
	SDValue Ptr = St->getBasePtr();
	unsigned StoreSize = StoreType.getStoreSize();

	// Perform one or more big stores into memory.
	for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
	// Chunk i is written StoreSize * i bytes past the base. Reflect that in
	// both the pointer info and the alignment: the base alignment only holds
	// for the first chunk.
	unsigned Offset = i * StoreSize;
	SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
	StoreType, ShuffWide,
	DAG.getIntPtrConstant(i, dl));
	SDValue Ch =
	DAG.getStore(St->getChain(), dl, SubVec, Ptr,
	St->getPointerInfo().getWithOffset(Offset),
	MinAlign(St->getAlignment(), Offset),
	St->getMemOperand()->getFlags());
	Ptr = DAG.getMemBasePlusOffset(Ptr, StoreSize, dl);
	Chains.push_back(Ch);
	}

	return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
	}

	// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
	// the FP state in cases where an emms may be missing.
	// A preferable solution to the general problem is to figure out the right
	// places to insert EMMS. This qualifies as a quick hack.

	// Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
	if (VT.getSizeInBits() != 64)
	return SDValue();

	const Function *F = DAG.getMachineFunction().getFunction();
	bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
	bool F64IsLegal =
	!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
	if ((VT.isVector() \|\|
	(VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
	isa<LoadSDNode>(St->getValue()) &&
	!cast<LoadSDNode>(St->getValue())->isVolatile() &&
	St->getChain().hasOneUse() && !St->isVolatile()) {
	SDNode* LdVal = St->getValue().getNode();
	LoadSDNode *Ld = nullptr;
	int TokenFactorIndex = -1;
	SmallVector<SDValue, 8> Ops;
	SDNode* ChainVal = St->getChain().getNode();
	// Must be a store of a load. We currently handle two cases: the load
	// is a direct child, and it's under an intervening TokenFactor. It is
	// possible to dig deeper under nested TokenFactors.
	if (ChainVal == LdVal)
	Ld = cast<LoadSDNode>(St->getChain());
	else if (St->getValue().hasOneUse() &&
	ChainVal->getOpcode() == ISD::TokenFactor) {
	// Collect every TokenFactor operand except the load itself; the new
	// load's chain is appended further down.
	for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
	if (ChainVal->getOperand(i).getNode() == LdVal) {
	TokenFactorIndex = i;
	Ld = cast<LoadSDNode>(St->getValue());
	} else
	Ops.push_back(ChainVal->getOperand(i));
	}
	}

	if (!Ld \|\| !ISD::isNormalLoad(Ld))
	return SDValue();

	// If this is not the MMX case, i.e. we are just turning i64 load/store
	// into f64 load/store, avoid the transformation if there are multiple
	// uses of the loaded value.
	if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
	return SDValue();

	SDLoc LdDL(Ld);
	SDLoc StDL(N);
	// If we are a 64-bit capable x86, lower to a single movq load/store pair.
	// Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
	// pair instead.
	if (Subtarget.is64Bit() \|\| F64IsLegal) {
	MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
	SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
	Ld->getPointerInfo(), Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	SDValue NewChain = NewLd.getValue(1);
	if (TokenFactorIndex >= 0) {
	Ops.push_back(NewChain);
	NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
	}
	return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
	St->getPointerInfo(), St->getAlignment(),
	St->getMemOperand()->getFlags());
	}

	// Otherwise, lower to two pairs of 32-bit loads / stores.
	SDValue LoAddr = Ld->getBasePtr();
	SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);

	SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
	Ld->getPointerInfo(), Ld->getAlignment(),
	Ld->getMemOperand()->getFlags());
	SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
	Ld->getPointerInfo().getWithOffset(4),
	MinAlign(Ld->getAlignment(), 4),
	Ld->getMemOperand()->getFlags());

	SDValue NewChain = LoLd.getValue(1);
	if (TokenFactorIndex >= 0) {
	// A TokenFactor joins chains, so push the loads' chain results (value 1),
	// not their loaded i32 values (value 0).
	Ops.push_back(LoLd.getValue(1));
	Ops.push_back(HiLd.getValue(1));
	NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
	}

	LoAddr = St->getBasePtr();
	HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);

	SDValue LoSt =
	DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
	St->getAlignment(), St->getMemOperand()->getFlags());
	SDValue HiSt = DAG.getStore(
	NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
	MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
	return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
	}

	// This is similar to the above case, but here we handle a scalar 64-bit
	// integer store that is extracted from a vector on a 32-bit target.
	// If we have SSE2, then we can treat it like a floating-point double
	// to get past legalization. The execution dependencies fixup pass will
	// choose the optimal machine instruction for the store if this really is
	// an integer or v2f32 rather than an f64.
	if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
	St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
	SDValue OldExtract = St->getOperand(1);
	SDValue ExtOp0 = OldExtract.getOperand(0);
	unsigned VecSize = ExtOp0.getValueSizeInBits();
	EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
	SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
	SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
	BitCast, OldExtract.getOperand(1));
	return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
	St->getPointerInfo(), St->getAlignment(),
	St->getMemOperand()->getFlags());
	}

	return SDValue();
	}

	/// Return 'true' if this vector operation is "horizontal"
	/// and return the operands for the horizontal operation in LHS and RHS. A
	/// horizontal operation performs the binary operation on successive elements
	/// of its first operand, then on successive elements of its second operand,
	/// returning the resulting values in a vector. For example, if
	///   A = < float a0, float a1, float a2, float a3 >
	/// and
	///   B = < float b0, float b1, float b2, float b3 >
	/// then the result of doing a horizontal operation on A and B is
	///   A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
	/// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
	/// A horizontal-op B, for some already available A and B, and if so then LHS is
	/// set to A, RHS to B, and the routine returns 'true'.
	/// Note that the binary operation should have the property that if one of the
	/// operands is UNDEF then the result is UNDEF.
	///
	/// \p LHS and \p RHS are in/out parameters: they are only overwritten when the
	/// function returns 'true'. When \p IsCommutative is set, each pair of
	/// successive elements may also appear with its two elements swapped between
	/// LHS and RHS.
	static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
	  // Look for the following pattern: if
	  //   A = < float a0, float a1, float a2, float a3 >
	  //   B = < float b0, float b1, float b2, float b3 >
	  // and
	  //   LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
	  //   RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
	  // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
	  // which is A horizontal-op B.

	  // At least one of the operands should be a vector shuffle.
	  if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
	      RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
	    return false;

	  MVT VT = LHS.getSimpleValueType();

	  assert((VT.is128BitVector() || VT.is256BitVector()) &&
	         "Unsupported vector type for horizontal add/sub");

	  // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
	  // operate independently on 128-bit lanes.
	  unsigned NumElts = VT.getVectorNumElements();
	  unsigned NumLanes = VT.getSizeInBits()/128;
	  unsigned NumLaneElts = NumElts / NumLanes;
	  assert((NumLaneElts % 2 == 0) &&
	         "Vector type should have an even number of elements in each lane");
	  unsigned HalfLaneElts = NumLaneElts/2;

	  // View LHS in the form
	  //   LHS = VECTOR_SHUFFLE A, B, LMask
	  // If LHS is not a shuffle then pretend it is the shuffle
	  //   LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
	  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
	  // type VT.
	  SDValue A, B;
	  SmallVector<int, 16> LMask(NumElts);
	  if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    if (!LHS.getOperand(0).isUndef())
	      A = LHS.getOperand(0);
	    if (!LHS.getOperand(1).isUndef())
	      B = LHS.getOperand(1);
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
	    std::copy(Mask.begin(), Mask.end(), LMask.begin());
	  } else {
	    if (!LHS.isUndef())
	      A = LHS;
	    for (unsigned i = 0; i != NumElts; ++i)
	      LMask[i] = i;
	  }

	  // Likewise, view RHS in the form
	  //   RHS = VECTOR_SHUFFLE C, D, RMask
	  SDValue C, D;
	  SmallVector<int, 16> RMask(NumElts);
	  if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
	    if (!RHS.getOperand(0).isUndef())
	      C = RHS.getOperand(0);
	    if (!RHS.getOperand(1).isUndef())
	      D = RHS.getOperand(1);
	    ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
	    std::copy(Mask.begin(), Mask.end(), RMask.begin());
	  } else {
	    if (!RHS.isUndef())
	      C = RHS;
	    for (unsigned i = 0; i != NumElts; ++i)
	      RMask[i] = i;
	  }

	  // Check that the shuffles are both shuffling the same vectors.
	  if (!(A == C && B == D) && !(A == D && B == C))
	    return false;

	  // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
	  if (!A.getNode() && !B.getNode())
	    return false;

	  // If A and B occur in reverse order in RHS, then "swap" them (which means
	  // rewriting the mask).
	  if (A != C)
	    ShuffleVectorSDNode::commuteMask(RMask);

	  // At this point LHS and RHS are equivalent to
	  //   LHS = VECTOR_SHUFFLE A, B, LMask
	  //   RHS = VECTOR_SHUFFLE A, B, RMask
	  // Check that the masks correspond to performing a horizontal operation.
	  for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
	    for (unsigned i = 0; i != NumLaneElts; ++i) {
	      int LIdx = LMask[i+l], RIdx = RMask[i+l];

	      // Ignore any UNDEF components.
	      if (LIdx < 0 || RIdx < 0 ||
	          (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
	          (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
	        continue;

	      // Check that successive elements are being operated on. If not, this is
	      // not a horizontal operation.
	      // The first half of each result lane reads from A (mask indices
	      // [0, NumElts)) and the second half from B (indices offset by NumElts),
	      // so the expected index is the even element of the pair within the
	      // selected source, in the current lane.
	      unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
	      int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
	      if (!(LIdx == Index && RIdx == Index + 1) &&
	          !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
	        return false;
	    }
	  }

	  LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
	  RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
	  return true;
	}

	/// Do target-specific dag combines on floating-point adds/subs.
	///
	/// For the supported vector types, an FADD/FSUB whose operands match the
	/// pattern recognised by isHorizontalBinOp is replaced by a single
	/// X86ISD::FHADD / X86ISD::FHSUB node. Returns an empty SDValue when no
	/// combine applies.
	static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  bool IsFadd = N->getOpcode() == ISD::FADD;
	  assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");

	  // Try to synthesize horizontal add/sub from adds/subs of shuffles.
	  // Only FADD is treated as commutative; isHorizontalBinOp rewrites LHS/RHS
	  // in place on success.
	  if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
	       (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
	      isHorizontalBinOp(LHS, RHS, IsFadd)) {
	    auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
	    return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
	  }
	  return SDValue();
	}

	/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
	/// the codegen.
	/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
	///
	/// \p N must be an ISD::TRUNCATE node. Only vector truncations whose source
	/// has no other user are considered. Returns an empty SDValue when the
	/// transform is not applied.
	static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
	                                          const X86Subtarget &Subtarget,
	                                          SDLoc &DL) {
	  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
	  SDValue Src = N->getOperand(0);
	  unsigned Opcode = Src.getOpcode();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();

	  // Returns true when pre-truncating both operands costs at most one real
	  // truncation.
	  auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
	    unsigned TruncSizeInBits = VT.getScalarSizeInBits();

	    // Repeated operand, so we are only trading one output truncation for
	    // one input truncation.
	    if (Op0 == Op1)
	      return true;

	    // See if either operand has been extended from a smaller/equal size to
	    // the truncation size, allowing a truncation to combine with the extend.
	    unsigned Opcode0 = Op0.getOpcode();
	    if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
	         Opcode0 == ISD::ZERO_EXTEND) &&
	        Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
	      return true;

	    unsigned Opcode1 = Op1.getOpcode();
	    if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
	         Opcode1 == ISD::ZERO_EXTEND) &&
	        Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
	      return true;

	    // See if either operand is a single use constant which can be constant
	    // folded.
	    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
	    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
	    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
	           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
	  };

	  // Builds Opcode(TRUNC(N0), TRUNC(N1)) in the narrow type VT.
	  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
	    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
	    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
	    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
	  };

	  // Don't combine if the operation has other uses.
	  if (!N->isOnlyUserOf(Src.getNode()))
	    return SDValue();

	  // Only support vector truncation for now.
	  // TODO: i64 scalar math would benefit as well.
	  if (!VT.isVector())
	    return SDValue();

	  // In most cases its only worth pre-truncating if we're only facing the cost
	  // of one truncation.
	  // i.e. if one of the inputs will constant fold or the input is repeated.
	  switch (Opcode) {
	  case ISD::AND:
	  case ISD::XOR:
	  case ISD::OR: {
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }

	  case ISD::MUL:
	    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
	    // better to truncate if we have the chance.
	    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
	        !TLI.isOperationLegal(Opcode, SrcVT))
	      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
	    LLVM_FALLTHROUGH;
	  case ISD::ADD: {
	    SDValue Op0 = Src.getOperand(0);
	    SDValue Op1 = Src.getOperand(1);
	    if (TLI.isOperationLegal(Opcode, VT) &&
	        IsRepeatedOpOrFreeTruncation(Op0, Op1))
	      return TruncateArithmetic(Op0, Op1);
	    break;
	  }
	  }

	  return SDValue();
	}

	/// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
	///
	/// \p Regs holds the 128-bit input pieces (v4i32 or v2i64, per the assert
	/// below) and is modified in place. The result type is taken from \p N.
	static SDValue
	combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
	                             Regs[0].getValueType() == MVT::v2i64));
	  EVT OutVT = N->getValueType(0);
	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InVT = Regs[0].getValueType();
	  EVT InSVT = InVT.getVectorElementType();
	  SDLoc DL(N);

	  // First, use mask to unset all bits that won't appear in the result.
	  assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
	         "OutSVT can only be either i8 or i16.");
	  APInt Mask =
	      APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
	  SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
	  for (auto &Reg : Regs)
	    Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);

	  MVT UnpackedVT, PackedVT;
	  if (OutSVT == MVT::i8) {
	    UnpackedVT = MVT::v8i16;
	    PackedVT = MVT::v16i8;
	  } else {
	    UnpackedVT = MVT::v4i32;
	    PackedVT = MVT::v8i16;
	  }

	  // In each iteration, truncate the type by a half size.
	  auto RegNum = Regs.size();
	  for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
	       j < e; j *= 2, RegNum /= 2) {
	    for (unsigned i = 0; i < RegNum; i++)
	      Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
	    for (unsigned i = 0; i < RegNum / 2; i++)
	      Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
	                            Regs[i * 2 + 1]);
	  }

	  // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
	  // then extract a subvector as the result since v8i8 is not a legal type.
	  if (OutVT == MVT::v8i8) {
	    Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
	    Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
	                          DAG.getIntPtrConstant(0, DL));
	    return Regs[0];
	  } else if (RegNum > 1) {
	    // Several packed registers remain: drop the stale tail entries and
	    // concatenate the live ones.
	    Regs.resize(RegNum);
	    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
	  } else
	    return Regs[0];
	}

	/// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
	///
	/// Every register in \p Regs (modified in place) is first shifted left and
	/// then arithmetically shifted right by 16 bits, after which adjacent pairs
	/// are packed. The result type is taken from \p N.
	static SDValue
	combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
	                                  SelectionDAG &DAG,
	                                  SmallVector<SDValue, 8> &Regs) {
	  assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
	  EVT OutVT = N->getValueType(0);
	  SDLoc DL(N);

	  // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
	  SDValue ShiftBy16 = DAG.getConstant(16, DL, MVT::i32);
	  for (SDValue &Piece : Regs) {
	    SDValue Shl = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Piece,
	                                      ShiftBy16, Subtarget, DAG);
	    Piece = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Shl, ShiftBy16,
	                                Subtarget, DAG);
	  }

	  // Pack each adjacent pair of inputs into the front of the array.
	  const unsigned NumPairs = Regs.size() / 2;
	  for (unsigned Pair = 0; Pair != NumPairs; ++Pair) {
	    SDValue Lo = Regs[2 * Pair];
	    SDValue Hi = Regs[2 * Pair + 1];
	    Regs[Pair] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Lo, Hi);
	  }

	  // With at most two inputs the first entry is the whole result.
	  if (Regs.size() <= 2)
	    return Regs[0];

	  Regs.resize(Regs.size() / 2);
	  return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
	}

	/// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
	/// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
	/// legalization the truncation will be translated into a BUILD_VECTOR with each
	/// element that is extracted from a vector and then truncated, and it is
	/// difficult to do this optimization based on them.
	///
	/// Returns an empty SDValue when the truncation is not handled here.
	static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
	                                       const X86Subtarget &Subtarget) {
	  EVT OutVT = N->getValueType(0);
	  if (!OutVT.isVector())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  if (!In.getValueType().isSimple())
	    return SDValue();

	  EVT InVT = In.getValueType();
	  unsigned NumElems = OutVT.getVectorNumElements();

	  // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
	  // SSE2, and we need to take care of it specially.
	  // AVX512 provides vpmovdb.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
	    return SDValue();

	  EVT OutSVT = OutVT.getVectorElementType();
	  EVT InSVT = InVT.getVectorElementType();
	  if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
	        (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
	        NumElems >= 8))
	    return SDValue();

	  // SSSE3's pshufb results in less instructions in the cases below.
	  if (Subtarget.hasSSSE3() && NumElems == 8 &&
	      ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
	       (InSVT == MVT::i32 && OutSVT == MVT::i16)))
	    return SDValue();

	  SDLoc DL(N);

	  // Split a long vector into vectors of legal type.
	  unsigned RegNum = InVT.getSizeInBits() / 128;
	  SmallVector<SDValue, 8> SubVec(RegNum);
	  unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
	  EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);

	  for (unsigned i = 0; i < RegNum; i++)
	    SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
	                            DAG.getIntPtrConstant(i * NumSubRegElts, DL));

	  // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
	  // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
	  // truncate 2 x v4i32 to v8i16.
	  if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
	    return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
	  else if (InSVT == MVT::i32)
	    return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
	  else
	    return SDValue();
	}

	/// This function transforms vector truncation of 'all or none' bits values.
	/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
	///
	/// Returns an empty SDValue when the input is not known to be all sign bits
	/// or the types are not suited to PACKSS.
	static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
	                                               SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  // Requires SSE2 but AVX512 has fast truncate.
	  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
	    return SDValue();

	  if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
	    return SDValue();

	  SDValue In = N->getOperand(0);
	  if (!In.getValueType().isSimple())
	    return SDValue();

	  MVT VT = N->getValueType(0).getSimpleVT();
	  MVT SVT = VT.getScalarType();

	  MVT InVT = In.getValueType().getSimpleVT();
	  MVT InSVT = InVT.getScalarType();

	  // Use PACKSS if the input is a splatted sign bit.
	  // e.g. Comparison result, sext_in_reg, etc.
	  unsigned NumSignBits = DAG.ComputeNumSignBits(In);
	  if (NumSignBits != InSVT.getSizeInBits())
	    return SDValue();

	  // Check we have a truncation suited for PACKSS.
	  if (!VT.is128BitVector() && !VT.is256BitVector())
	    return SDValue();
	  if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
	    return SDValue();
	  if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
	    return SDValue();

	  return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
	}

	/// Do target-specific dag combines on truncate nodes. The individual
	/// strategies are tried in a fixed order and the first one that produces a
	/// value wins; the generic vector truncation combine is the final fallback.
	static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  SDValue Src = N->getOperand(0);
	  SDLoc DL(N);

	  // 1. Pre-truncate the inputs of an arithmetic op instead.
	  SDValue PreTrunc = combineTruncatedArithmetic(N, DAG, Subtarget, DL);
	  if (PreTrunc)
	    return PreTrunc;

	  // 2. AVG pattern.
	  SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL);
	  if (Avg)
	    return Avg;

	  // 3. Truncation combined with unsigned saturation.
	  SDValue USat = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget);
	  if (USat)
	    return USat;

	  // 4. i32 truncate of a bitcast whose source is a direct x86mmx value.
	  if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32 &&
	      Src.getOperand(0).getValueType() == MVT::x86mmx)
	    return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, Src.getOperand(0));

	  // 5. Truncate extended sign bits with PACKSS.
	  SDValue SignBits = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget);
	  if (SignBits)
	    return SignBits;

	  // 6. Generic PACKUS/PACKSS based vector truncation.
	  return combineVectorTruncation(N, DAG, Subtarget);
	}

	/// Returns the negated value if the node \p N flips sign of FP value.
	///
	/// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
	/// AVX512F does not have FXOR, so FNEG is lowered as
	/// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
	/// In this case we go though all bitcasts.
	///
	/// An empty SDValue is returned when \p N is not recognised as a negation.
	static SDValue isFNEG(SDNode *N) {
	  // Plain FNEG: the negated value is simply the operand.
	  if (N->getOpcode() == ISD::FNEG)
	    return N->getOperand(0);

	  // Otherwise only an (F)XOR, possibly hidden behind bitcasts, qualifies.
	  SDValue Xor = peekThroughBitcasts(SDValue(N, 0));
	  const unsigned XorOpc = Xor.getOpcode();
	  if (XorOpc != X86ISD::FXOR && XorOpc != ISD::XOR)
	    return SDValue();

	  SDValue MaskOp = peekThroughBitcasts(Xor.getOperand(1));
	  if (!MaskOp.getValueType().isFloatingPoint())
	    return SDValue();

	  SDValue Negated = peekThroughBitcasts(Xor.getOperand(0));

	  unsigned EltBits = MaskOp.getScalarValueSizeInBits();
	  auto HasOnlySignBitSet = [&](const ConstantFP *CFP) {
	    return CFP->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
	  };

	  // There is more than one way to represent the same constant on
	  // the different X86 targets. The type of the node may also depend on size.
	  //  - load scalar value and broadcast
	  //  - BUILD_VECTOR node
	  //  - load from a constant pool.
	  // We check all variants here.

	  // Variant 1: broadcast of a constant.
	  if (MaskOp.getOpcode() == X86ISD::VBROADCAST) {
	    auto *Scalar = getTargetConstantFromNode(MaskOp.getOperand(0));
	    if (Scalar && HasOnlySignBitSet(cast<ConstantFP>(Scalar)))
	      return Negated;
	    return SDValue();
	  }

	  // Variant 2: BUILD_VECTOR with an FP splat.
	  if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(MaskOp)) {
	    ConstantFPSDNode *Splat = BV->getConstantFPSplatNode();
	    if (Splat && HasOnlySignBitSet(Splat->getConstantFPValue()))
	      return Negated;
	    return SDValue();
	  }

	  // Variant 3: constant obtained from the node itself (vector splat or scalar).
	  auto *Cst = getTargetConstantFromNode(MaskOp);
	  if (!Cst)
	    return SDValue();

	  if (Cst->getType()->isVectorTy()) {
	    auto *SplatV = Cst->getSplatValue();
	    if (SplatV && HasOnlySignBitSet(cast<ConstantFP>(SplatV)))
	      return Negated;
	    return SDValue();
	  }

	  auto *FPConst = dyn_cast<ConstantFP>(Cst);
	  if (FPConst && HasOnlySignBitSet(FPConst))
	    return Negated;

	  return SDValue();
	}

	/// Do target-specific dag combines on floating point negations.
	///
	/// \p N must be a node for which isFNEG returns a value (asserted below).
	/// The result, if any, is bitcast back to the original type of \p N.
	static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  EVT OrigVT = N->getValueType(0);
	  SDValue Arg = isFNEG(N);
	  assert(Arg.getNode() && "N is expected to be an FNEG node");

	  EVT VT = Arg.getValueType();
	  EVT SVT = VT.getScalarType();
	  SDLoc DL(N);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  // If we're negating a FMUL node on a target with FMA, then we can avoid the
	  // use of a constant by performing (-0 - A*B) instead.
	  // FIXME: Check rounding control flags as well once it becomes available.
	  if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
	      Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
	    SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
	    SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
	                                  Arg.getOperand(1), Zero);
	    return DAG.getBitcast(OrigVT, NewNode);
	  }

	  // If we're negating an FMA node, then we can adjust the
	  // instruction to include the extra negation.
	  unsigned NewOpcode = 0;
	  if (Arg.hasOneUse()) {
	    switch (Arg.getOpcode()) {
	    case X86ISD::FMADD:      NewOpcode = X86ISD::FNMSUB;     break;
	    case X86ISD::FMSUB:      NewOpcode = X86ISD::FNMADD;     break;
	    case X86ISD::FNMADD:     NewOpcode = X86ISD::FMSUB;      break;
	    case X86ISD::FNMSUB:     NewOpcode = X86ISD::FMADD;      break;
	    case X86ISD::FMADD_RND:  NewOpcode = X86ISD::FNMSUB_RND; break;
	    case X86ISD::FMSUB_RND:  NewOpcode = X86ISD::FNMADD_RND; break;
	    case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND;  break;
	    case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND;  break;
	    // We can't handle scalar intrinsic node here because it would only
	    // invert one element and not the whole vector. But we could try to handle
	    // a negation of the lower element only.
	    }
	  }
	  if (NewOpcode)
	    return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
	                                              Arg.getNode()->ops()));

	  return SDValue();
	}

	/// Rewrite a vector X86ISD::FOR/FXOR/FAND/FANDN node as the corresponding
	/// integer logic op on a vector of i64 elements, bitcasting the operands in
	/// and the result back out. Returns an empty SDValue for non-vector types or
	/// when SSE2 is unavailable.
	static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
	                                 const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);

	  // The integer opcodes are only used when integer vector types are available.
	  if (!VT.isVector() || !Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc dl(N);
	  const unsigned NumI64Elts = VT.getSizeInBits() / 64;
	  MVT IntVT = MVT::getVectorVT(MVT::i64, NumI64Elts);

	  SDValue IntLHS = DAG.getBitcast(IntVT, N->getOperand(0));
	  SDValue IntRHS = DAG.getBitcast(IntVT, N->getOperand(1));

	  // Map the FP logic opcode onto its integer counterpart.
	  unsigned IntOpcode;
	  switch (N->getOpcode()) {
	  case X86ISD::FAND:
	    IntOpcode = ISD::AND;
	    break;
	  case X86ISD::FANDN:
	    IntOpcode = X86ISD::ANDNP;
	    break;
	  case X86ISD::FOR:
	    IntOpcode = ISD::OR;
	    break;
	  case X86ISD::FXOR:
	    IntOpcode = ISD::XOR;
	    break;
	  default:
	    llvm_unreachable("Unexpected FP logic op");
	  }

	  return DAG.getBitcast(VT, DAG.getNode(IntOpcode, dl, IntVT, IntLHS, IntRHS));
	}

	/// Do target-specific dag combines on xor nodes. The xor+shift-to-compare
	/// fold is tried unconditionally; all remaining folds are skipped while the
	/// combiner is still running before operation legalization.
	static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
	                          TargetLowering::DAGCombinerInfo &DCI,
	                          const X86Subtarget &Subtarget) {
	  SDValue VecCmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget);
	  if (VecCmp)
	    return VecCmp;

	  if (DCI.isBeforeLegalizeOps())
	    return SDValue();

	  SDValue TruncCmp = foldXorTruncShiftIntoCmp(N, DAG);
	  if (TruncCmp)
	    return TruncCmp;

	  // The integer-abs combine is only attempted on targets with CMOV.
	  if (Subtarget.hasCMov()) {
	    SDValue Abs = combineIntegerAbs(N, DAG);
	    if (Abs)
	      return Abs;
	  }

	  SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget);
	  if (FPLogic)
	    return FPLogic;

	  // Finally, treat the xor as an FP negation if it has that shape.
	  if (!isFNEG(N))
	    return SDValue();
	  return combineFneg(N, DAG, Subtarget);
	}


	/// Return true if \p V is a null FP constant (per isNullFPConstant) or a
	/// build vector of all zeros (per ISD::isBuildVectorAllZeros).
	static bool isNullFPScalarOrVectorConst(SDValue V) {
	  return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
	}

	/// Given a scalar FP zero or a vector FP zero (which may have undefined
	/// elements), produce a zero constant that can stand in for it when folding.
	/// For vectors a fresh zero vector is built, so the result carries no
	/// undefined elements even if \p V did; this makes it safe as a replacement
	/// operand for operations (eg, bitwise-and) where an undef should not
	/// propagate. Any other value yields an empty SDValue.
	static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
	                                        const X86Subtarget &Subtarget) {
	  if (isNullFPScalarOrVectorConst(V)) {
	    // A scalar zero can be reused directly; a vector zero is rebuilt.
	    const bool IsVec = V.getValueType().isVector();
	    return IsVec ? getZeroVector(V.getSimpleValueType(), Subtarget, DAG,
	                                 SDLoc(V))
	                 : V;
	  }

	  return SDValue();
	}

	/// Fold a scalar FAND of an inverted value into FANDN:
	///   fand (fxor X, -1), Y --> fandn X, Y
	///   fand X, (fxor Y, -1) --> fandn Y, X
	/// Only f32 (with SSE1) and f64 (with SSE2) are handled; returns an empty
	/// SDValue otherwise or when neither operand matches.
	static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
	  if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
	        (VT == MVT::f64 && Subtarget.hasSSE2())))
	    return SDValue();

	  auto isAllOnesConstantFP = [](SDValue V) {
	    auto *C = dyn_cast<ConstantFPSDNode>(V);
	    return C && C->getConstantFPValue()->isAllOnesValue();
	  };

	  // fand (fxor X, -1), Y --> fandn X, Y
	  if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);

	  // fand X, (fxor Y, -1) --> fandn Y, X
	  if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
	    return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);

	  return SDValue();
	}

	/// Do target-specific dag combines on X86ISD::FAND nodes.
	static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
	                           const X86Subtarget &Subtarget) {
	  // FAND(0.0, x) -> 0.0 and FAND(x, 0.0) -> 0.0: a zero on either side
	  // (checked left operand first) folds the whole node to zero.
	  for (unsigned OpNo = 0; OpNo != 2; ++OpNo) {
	    SDValue Zero = getNullFPConstForNullVal(N->getOperand(OpNo), DAG, Subtarget);
	    if (Zero)
	      return Zero;
	  }

	  // Next, try to absorb an inverted operand into an FANDN.
	  SDValue AndN = combineFAndFNotToFAndn(N, DAG, Subtarget);
	  if (AndN)
	    return AndN;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FANDN nodes.
	static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  SDValue Inverted = N->getOperand(0);
	  SDValue Other = N->getOperand(1);

	  // FANDN(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(Inverted))
	    return Other;

	  // FANDN(x, 0.0) -> 0.0
	  SDValue Zero = getNullFPConstForNullVal(Other, DAG, Subtarget);
	  if (Zero)
	    return Zero;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
	///
	/// A zero operand folds the node to its other operand; an FXOR that isFNEG
	/// recognises as a negation is handed to combineFneg; otherwise the node is
	/// passed to lowerX86FPLogicOp.
	static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);

	  // F[X]OR(0.0, x) -> x
	  if (isNullFPScalarOrVectorConst(N->getOperand(0)))
	    return N->getOperand(1);

	  // F[X]OR(x, 0.0) -> x
	  if (isNullFPScalarOrVectorConst(N->getOperand(1)))
	    return N->getOperand(0);

	  if (isFNEG(N))
	    if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
	      return NewVal;

	  return lowerX86FPLogicOp(N, DAG, Subtarget);
	}

	/// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
	///
	/// Returns an empty SDValue unless the UnsafeFPMath target option is set.
	static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
	  assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);

	  // Only perform optimizations if UnsafeMath is used.
	  if (!DAG.getTarget().Options.UnsafeFPMath)
	    return SDValue();

	  // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
	  // into FMINC and FMAXC, which are Commutative operations.
	  unsigned NewOp = 0;
	  switch (N->getOpcode()) {
	  default: llvm_unreachable("unknown opcode");
	  case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
	  case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
	  }

	  return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
	                     N->getOperand(0), N->getOperand(1));
	}

	/// Lower an FMINNUM/FMAXNUM node to an X86ISD::FMIN/FMAX with swapped
	/// operands plus a select that replaces a NaN first operand with the second
	/// operand. Returns an empty SDValue for soft-float, unsupported types, or a
	/// scalar op in a function optimized for minimum size.
	static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  if (Subtarget.useSoftFloat())
	    return SDValue();

	  // TODO: Check for global or instruction-level "nnan". In that case, we
	  //       should be able to lower to FMAX/FMIN alone.
	  // TODO: If an operand is already known to be a NaN or not a NaN, this
	  //       should be an optional swap and FMAX/FMIN.

	  EVT VT = N->getValueType(0);
	  if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
	        (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
	        (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
	    return SDValue();

	  // This takes at least 3 instructions, so favor a library call when operating
	  // on a scalar and minimizing code size.
	  if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
	    return SDValue();

	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);
	  SDLoc DL(N);
	  EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
	      DAG.getDataLayout(), *DAG.getContext(), VT);

	  // There are 4 possibilities involving NaN inputs, and these are the required
	  // outputs:
	  //                   Op1
	  //               Num     NaN
	  //            ----------------
	  //       Num  |  Max  |  Op0 |
	  // Op0        ----------------
	  //       NaN  |  Op1  |  NaN |
	  //            ----------------
	  //
	  // The SSE FP max/min instructions were not designed for this case, but rather
	  // to implement:
	  //   Min = Op1 < Op0 ? Op1 : Op0
	  //   Max = Op1 > Op0 ? Op1 : Op0
	  //
	  // So they always return Op0 if either input is a NaN. However, we can still
	  // use those instructions for fmaxnum by selecting away a NaN input.

	  // If either operand is NaN, the 2nd source operand (Op0) is passed through.
	  auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
	  SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
	  SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);

	  // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
	  // are NaN, the NaN value of Op1 is the result.
	  return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
	}

	/// Do target-specific dag combines on X86ISD::ANDNP nodes.
	///
	/// Zero operands are folded directly. Otherwise, for vectors whose scalar
	/// size is a multiple of 8 bits, the node is offered to the recursive shuffle
	/// combiner, which replaces \p N itself on success; in that case (and when
	/// nothing applies) an empty SDValue is returned.
	static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
	                            TargetLowering::DAGCombinerInfo &DCI,
	                            const X86Subtarget &Subtarget) {
	  // ANDNP(0, x) -> x
	  if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
	    return N->getOperand(1);

	  // ANDNP(x, 0) -> 0
	  if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
	    return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));

	  EVT VT = N->getValueType(0);

	  // Attempt to recursively combine a bitmask ANDNP with shuffles.
	  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
	    SDValue Op(N, 0);
	    SmallVector<int, 1> NonceMask; // Just a placeholder.
	    NonceMask.push_back(0);
	    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
	                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
	                                      DCI, Subtarget))
	      return SDValue(); // This routine will use CombineTo to replace N.
	  }

	  return SDValue();
	}

	/// Simplify the bit-index operand of a BT node using the fact that BT ignores
	/// the high bits of that operand. Any simplification is committed through
	/// \p DCI; the function itself always returns an empty SDValue.
	static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
	                         TargetLowering::DAGCombinerInfo &DCI) {
	  // BT ignores high bits in the bit index operand.
	  SDValue Op1 = N->getOperand(1);
	  if (Op1.hasOneUse()) {
	    unsigned BitWidth = Op1.getValueSizeInBits();
	    // Only the low log2(BitWidth) bits of the index are demanded.
	    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
	    KnownBits Known;
	    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
	                                          !DCI.isBeforeLegalizeOps());
	    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
	    if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
	        TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
	      DCI.CommitTargetLoweringOpt(TLO);
	  }
	  return SDValue();
	}

	/// Do target-specific dag combines on vector SIGN_EXTEND_INREG nodes: for a
	/// v4i64 result whose input is an any/sign extension of a v4i32 value, perform
	/// the in-register sign extension in v4i32 and sign-extend the result to
	/// v4i64. Returns an empty SDValue when the pattern does not match.
	static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDValue N0 = N->getOperand(0);
	  SDValue N1 = N->getOperand(1);
	  EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
	  SDLoc dl(N);

	  // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
	  // both SSE and AVX2 since there is no sign-extended shift right
	  // operation on a vector with 64-bit elements.
	  //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
	  // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
	  if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
	                           N0.getOpcode() == ISD::SIGN_EXTEND)) {
	    SDValue N00 = N0.getOperand(0);

	    // EXTLOAD has a better solution on AVX2,
	    // it may be replaced with X86ISD::VSEXT node.
	    if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
	      if (!ISD::isNormalLoad(N00.getNode()))
	        return SDValue();

	    if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
	      SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
	                                N00, N1);
	      return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
	    }
	  }
	  return SDValue();
	}

	/// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
	/// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
	/// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
	/// opportunities to combine math ops, use an LEA, or use a complex addressing
	/// mode. This can eliminate extend, add, and shift instructions.
	///
	/// Only i64 results are handled, and only when some user of \p Ext is an ADD
	/// or SHL. Returns an empty SDValue when the transform is not applied.
	static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
	                                   const X86Subtarget &Subtarget) {
	  if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
	      Ext->getOpcode() != ISD::ZERO_EXTEND)
	    return SDValue();

	  // TODO: This should be valid for other integer types.
	  EVT VT = Ext->getValueType(0);
	  if (VT != MVT::i64)
	    return SDValue();

	  SDValue Add = Ext->getOperand(0);
	  if (Add.getOpcode() != ISD::ADD)
	    return SDValue();

	  bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
	  bool NSW = Add->getFlags().hasNoSignedWrap();
	  bool NUW = Add->getFlags().hasNoUnsignedWrap();

	  // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
	  // into the 'zext'
	  if ((Sext && !NSW) || (!Sext && !NUW))
	    return SDValue();

	  // Having a constant operand to the 'add' ensures that we are not increasing
	  // the instruction count because the constant is extended for free below.
	  // A constant operand can also become the displacement field of an LEA.
	  auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
	  if (!AddOp1)
	    return SDValue();

	  // Don't make the 'add' bigger if there's no hope of combining it with some
	  // other 'add' or 'shl' instruction.
	  // TODO: It may be profitable to generate simpler LEA instructions in place
	  // of single 'add' instructions, but the cost model for selecting an LEA
	  // currently has a high threshold.
	  bool HasLEAPotential = false;
	  for (auto *User : Ext->uses()) {
	    if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
	      HasLEAPotential = true;
	      break;
	    }
	  }
	  if (!HasLEAPotential)
	    return SDValue();

	  // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
	  int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
	  SDValue AddOp0 = Add.getOperand(0);
	  SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
	  SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);

	  // The wider add is guaranteed to not wrap because both operands are
	  // sign-extended.
	  SDNodeFlags Flags;
	  Flags.setNoSignedWrap(NSW);
	  Flags.setNoUnsignedWrap(NUW);
	  return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
	}

	/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
	/// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
	/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
	/// extends from AH (which we otherwise need to do contortions to access).
	///
	/// Only the extension of result #1 of an i8 divrem to i32 is handled. On
	/// success all uses of the original result #0 are redirected to the new node
	/// and the new node's second result is returned; otherwise an empty SDValue is
	/// returned.
	static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
	  SDValue N0 = N->getOperand(0);
	  auto OpcodeN = N->getOpcode();
	  auto OpcodeN0 = N0.getOpcode();
	  // The extension kind must match the signedness of the divrem.
	  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
	        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  EVT InVT = N0.getValueType();
	  if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
	    return SDValue();

	  SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
	  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
	                                               : X86ISD::UDIVREM8_ZEXT_HREG;
	  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
	                          N0.getOperand(1));
	  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
	  return R.getValue(1);
	}

	/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
	/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
	/// with UNDEFs) of the input to vectors of the same size as the target type
	/// which then extends the lowest elements.
	///
	/// Only runs before operation legalization and on SSE2-or-later subtargets.
	/// \returns the replacement value, or an empty SDValue if \p N is not a
	/// vector SIGN_EXTEND/ZERO_EXTEND handled by one of the cases below.
	static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	unsigned Opcode = N->getOpcode();
	if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
	return SDValue();
	if (!DCI.isBeforeLegalizeOps())
	return SDValue();
	if (!Subtarget.hasSSE2())
	return SDValue();

	SDValue N0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT SVT = VT.getScalarType();
	EVT InVT = N0.getValueType();
	EVT InSVT = InVT.getScalarType();

	// Input type must be a vector and we must be extending legal integer types.
	if (!VT.isVector())
	return SDValue();
	if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
	return SDValue();
	if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
	return SDValue();

	// On AVX2+ targets, if the input/output types are both legal then we will be
	// able to use SIGN_EXTEND/ZERO_EXTEND directly.
	if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
	DAG.getTargetLoweringInfo().isTypeLegal(InVT))
	return SDValue();

	SDLoc DL(N);

	// Widen vector N to a total of Size bits by concatenating it with UNDEF
	// vectors of its own type; N becomes the lowest part of the result.
	auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
	EVT InVT = N.getValueType();
	EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
	Size / InVT.getScalarSizeInBits());
	SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
	DAG.getUNDEF(InVT));
	Opnds[0] = N;
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
	};

	// If target-size is less than 128-bits, extend to a type that would extend
	// to 128 bits, extend that and extract the original target vector.
	if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
	unsigned Scale = 128 / VT.getSizeInBits();
	EVT ExVT =
	EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
	SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
	// Despite its name, SExt holds a zero-extension when Opcode is ZERO_EXTEND.
	SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
	return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
	DAG.getIntPtrConstant(0, DL));
	}

	// If target-size is 128-bits (or 256-bits on AVX2 target, or 512-bits on
	// AVX512 target), then convert to ISD::*_EXTEND_VECTOR_INREG which ensures
	// lowering to X86ISD::VEXT.
	// Also use this if we don't have SSE41 to allow the legalizer do its job.
	if (!Subtarget.hasSSE41() \|\| VT.is128BitVector() \|\|
	(VT.is256BitVector() && Subtarget.hasInt256()) \|\|
	(VT.is512BitVector() && Subtarget.hasAVX512())) {
	SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
	return Opcode == ISD::SIGN_EXTEND
	? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
	: DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
	}

	// Split the extension into pieces whose results are SplitSize bits wide:
	// extract each group of source elements from N0, pad it with UNDEF up to
	// SplitSize bits, extend it in-register, and concatenate the pieces.
	auto SplitAndExtendInReg = [&](unsigned SplitSize) {
	unsigned NumVecs = VT.getSizeInBits() / SplitSize;
	unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
	EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
	EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

	SmallVector<SDValue, 8> Opnds;
	// Offset is the index of the first source element of the current piece.
	for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
	SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
	DAG.getIntPtrConstant(Offset, DL));
	SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
	SrcVec = Opcode == ISD::SIGN_EXTEND
	? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
	: DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
	Opnds.push_back(SrcVec);
	}
	return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
	};

	// On pre-AVX2 targets, split into 128-bit nodes of
	// ISD::*_EXTEND_VECTOR_INREG.
	if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
	return SplitAndExtendInReg(128);

	// On pre-AVX512 targets, split into 256-bit nodes of
	// ISD::*_EXTEND_VECTOR_INREG.
	if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
	return SplitAndExtendInReg(256);

	// No profitable form was found for this type/subtarget combination.
	return SDValue();
	}

	/// Target-specific DAG combines for ISD::SIGN_EXTEND.
	///
	/// \returns the replacement value, or an empty SDValue if no combine applies.
	static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT DstVT = N->getValueType(0);
	  const bool SrcIsI1 = Src.getValueType() == MVT::i1;
	  SDLoc DL(N);

	  // A sext of the remainder of an 8-bit sdivrem folds into the divrem node.
	  if (SDValue DivRem8 = getDivRem8(N, DAG))
	    return DivRem8;

	  // After operation legalization the only rewrite performed is turning a
	  // sext from i1 into a select between all-ones and zero.
	  if (!DCI.isBeforeLegalizeOps()) {
	    if (!SrcIsI1)
	      return SDValue();
	    SDValue Zero = DAG.getConstant(0, DL, DstVT);
	    SDValue AllOnes = DAG.getAllOnesConstant(DL, DstVT);
	    return DAG.getSelect(DL, DstVT, Src, AllOnes, Zero);
	  }

	  // Inverting and sign-extending a boolean equals zero-extending it and
	  // subtracting 1 (0 becomes -1, 1 becomes 0), i.e. select Bool, 0, -1. The
	  // subtract is efficiently lowered with an LEA or a DEC.
	  //   sext (xor Bool, -1) --> sub (zext Bool), 1
	  const bool IsInvertedBool = SrcIsI1 && Src.getOpcode() == ISD::XOR &&
	                              isAllOnesConstant(Src.getOperand(1)) &&
	                              Src.hasOneUse();
	  if (IsInvertedBool) {
	    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, DstVT, Src.getOperand(0));
	    SDValue One = DAG.getConstant(1, DL, DstVT);
	    return DAG.getNode(ISD::SUB, DL, DstVT, Zext, One);
	  }

	  if (SDValue InReg = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return InReg;

	  if (Subtarget.hasAVX() && DstVT.is256BitVector()) {
	    if (SDValue Widened = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
	      return Widened;
	  }

	  // Last attempt; an empty result here means nothing matched.
	  return promoteExtBeforeAdd(N, DAG, Subtarget);
	}

	/// Fold FNEG operands of an FMA-family node into the node's opcode.
	///
	/// Each negated operand is replaced by its un-negated source, and the
	/// FMADD/FMSUB/FNMADD/FNMSUB flavour (in the plain, _RND, S1_RND or S3_RND
	/// form matching \p N) is picked from which operands were negated.
	///
	/// \returns the new node, or an empty SDValue if the type is not yet legal,
	/// is not f32/f64 based, or the subtarget has no FMA support.
	static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  EVT VT = N->getValueType(0);

	  // Let legalize expand this if it isn't a legal type yet.
	  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
	    return SDValue();

	  EVT ScalarVT = VT.getScalarType();
	  const bool IsF32OrF64 = ScalarVT == MVT::f32 || ScalarVT == MVT::f64;
	  if (!(IsF32OrF64 && Subtarget.hasAnyFMA()))
	    return SDValue();

	  SDValue A = N->getOperand(0);
	  SDValue B = N->getOperand(1);
	  SDValue C = N->getOperand(2);

	  // Replace V by the operand of its FNEG, if it is one, and report whether
	  // that happened.
	  auto invertIfNegative = [](SDValue &V) {
	    SDValue NegVal = isFNEG(V.getNode());
	    if (!NegVal)
	      return false;
	    V = NegVal;
	    return true;
	  };

	  // Do not convert the passthru input of scalar intrinsics.
	  // FIXME: We could allow negations of the lower element only.
	  const unsigned Opc = N->getOpcode();
	  bool NegA = false;
	  if (Opc != X86ISD::FMADDS1_RND)
	    NegA = invertIfNegative(A);
	  bool NegB = invertIfNegative(B);
	  bool NegC = false;
	  if (Opc != X86ISD::FMADDS3_RND)
	    NegC = invertIfNegative(C);

	  // The product is negated when exactly one of its factors was negated.
	  const bool NegMul = NegA != NegB;

	  // Rows: opcode form of N. Columns: {MADD, MSUB, NMADD, NMSUB}.
	  enum { Plain, Rnd, S1Rnd, S3Rnd };
	  static const unsigned OpcodeTable[4][4] = {
	      {X86ISD::FMADD, X86ISD::FMSUB, X86ISD::FNMADD, X86ISD::FNMSUB},
	      {X86ISD::FMADD_RND, X86ISD::FMSUB_RND, X86ISD::FNMADD_RND,
	       X86ISD::FNMSUB_RND},
	      {X86ISD::FMADDS1_RND, X86ISD::FMSUBS1_RND, X86ISD::FNMADDS1_RND,
	       X86ISD::FNMSUBS1_RND},
	      {X86ISD::FMADDS3_RND, X86ISD::FMSUBS3_RND, X86ISD::FNMADDS3_RND,
	       X86ISD::FNMSUBS3_RND}};
	  const unsigned Flavour = (NegMul ? 2u : 0u) + (NegC ? 1u : 0u);

	  unsigned Form;
	  switch (Opc) {
	  case X86ISD::FMADD_RND:
	    Form = Rnd;
	    break;
	  case X86ISD::FMADDS1_RND:
	    Form = S1Rnd;
	    break;
	  case X86ISD::FMADDS3_RND:
	    Form = S3Rnd;
	    break;
	  default:
	    assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
	           "Unexpected opcode!");
	    // The plain forms take only the three arithmetic operands.
	    return DAG.getNode(OpcodeTable[Plain][Flavour], dl, VT, A, B, C);
	  }

	  // The *_RND forms forward operand 3 of the original node unchanged.
	  return DAG.getNode(OpcodeTable[Form][Flavour], dl, VT, A, B, C,
	                     N->getOperand(3));
	}

	/// Target-specific DAG combines for ISD::ZERO_EXTEND.
	///
	/// \returns the replacement value, or an empty SDValue if no combine applies.
	static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
	                           TargetLowering::DAGCombinerInfo &DCI,
	                           const X86Subtarget &Subtarget) {
	  SDLoc dl(N);
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);

	  // Rebuild a SETCC_CARRY directly in the destination type and mask it down
	  // to bit 0, which makes the zext unnecessary.
	  auto WidenSetCCCarry = [&](SDValue Carry) {
	    SDValue Wide = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
	                               Carry.getOperand(0), Carry.getOperand(1));
	    return DAG.getNode(ISD::AND, dl, VT, Wide, DAG.getConstant(1, dl, VT));
	  };

	  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
	  // (and (i32 x86isd::setcc_carry), 1)
	  // This transformation is necessary because ISD::SETCC is always legalized
	  // to i8.
	  if (Src.getOpcode() == ISD::AND && Src.hasOneUse() &&
	      Src.getOperand(0).hasOneUse()) {
	    SDValue Inner = Src.getOperand(0);
	    if (Inner.getOpcode() == X86ISD::SETCC_CARRY)
	      // A mask other than 1 ends the whole combine, not just this pattern.
	      return isOneConstant(Src.getOperand(1)) ? WidenSetCCCarry(Inner)
	                                              : SDValue();
	  }

	  // Same rewrite when the narrow value is reached through a truncate.
	  if (Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
	      Src.getOperand(0).hasOneUse()) {
	    SDValue Inner = Src.getOperand(0);
	    if (Inner.getOpcode() == X86ISD::SETCC_CARRY)
	      return WidenSetCCCarry(Inner);
	  }

	  if (SDValue InReg = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
	    return InReg;

	  if (VT.is256BitVector()) {
	    if (SDValue Widened = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
	      return Widened;
	  }

	  if (SDValue DivRem8 = getDivRem8(N, DAG))
	    return DivRem8;

	  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
	    return NewAdd;

	  // Last attempt; an empty result here means nothing matched.
	  return combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget);
	}

	/// Try to map a 128-bit or larger integer equality comparison to vector
	/// instructions before type legalization splits it up into chunks.
	///
	/// \p SetCC must carry a SETEQ or SETNE predicate (asserted).
	/// \returns the replacement setcc, or an empty SDValue if the operand type
	/// or subtarget is not handled.
	static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
	                                               const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
	  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

	  // We're looking for an oversized integer equality comparison, but ignore a
	  // comparison with zero because that gets special treatment in EmitTest().
	  SDValue X = SetCC->getOperand(0);
	  SDValue Y = SetCC->getOperand(1);
	  EVT OpVT = X.getValueType();
	  const unsigned OpSize = OpVT.getSizeInBits();
	  if (!OpVT.isScalarInteger())
	    return SDValue();
	  if (OpSize < 128)
	    return SDValue();
	  if (isNullConstant(Y))
	    return SDValue();

	  // TODO: Use PXOR + PTEST for SSE4.1 or later?
	  // TODO: Add support for AVX-512.
	  const bool Use128 = OpSize == 128 && Subtarget.hasSSE2();
	  const bool Use256 = OpSize == 256 && Subtarget.hasAVX2();
	  if (!Use128 && !Use256)
	    return SDValue();

	  EVT VT = SetCC->getValueType(0);
	  SDLoc DL(SetCC);

	  EVT VecVT = MVT::v32i8;
	  uint64_t AllBytesEqual = 0xFFFFFFFFu;
	  if (Use128) {
	    VecVT = MVT::v16i8;
	    AllBytesEqual = 0xFFFFu;
	  }
	  SDValue VecX = DAG.getBitcast(VecVT, X);
	  SDValue VecY = DAG.getBitcast(VecVT, Y);

	  // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
	  // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
	  // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
	  // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
	  // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
	  SDValue ByteEq = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
	  SDValue Mask = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, ByteEq);
	  SDValue FFFFs = DAG.getConstant(AllBytesEqual, DL, MVT::i32);
	  return DAG.getSetCC(DL, VT, Mask, FFFFs, CC);
	}

	/// Target-specific DAG combines for ISD::SETCC.
	///
	/// Tries, in order:
	///  * for SETEQ/SETNE, rewriting a compare against a one-use negation,
	///    (0 - x) ==/!= y, as (x + y) ==/!= 0;
	///  * for SETEQ/SETNE, combineVectorSizedSetCCEquality;
	///  * when the result element type is i1 and the predicate is an equality or
	///    signed comparison, folding a compare of (sext i1 X) with an all-zeros
	///    build vector to a constant, X, or NOT X;
	///  * on SSE1-only targets, lowering a v4f32 compare that produces v4i32 via
	///    LowerVSETCC.
	///
	/// \returns the replacement value, or an empty SDValue if nothing applies.
	static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
	                            const X86Subtarget &Subtarget) {
	  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);
	  EVT VT = N->getValueType(0);
	  SDLoc DL(N);

	  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
	    EVT OpVT = LHS.getValueType();
	    // 0-x == y --> x+y == 0
	    // 0-x != y --> x+y != 0
	    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
	        LHS.hasOneUse()) {
	      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
	      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	    }
	    // x == 0-y --> x+y == 0
	    // x != 0-y --> x+y != 0
	    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
	        RHS.hasOneUse()) {
	      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
	      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
	    }

	    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
	      return V;
	  }

	  if (VT.getScalarType() == MVT::i1 &&
	      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
	    // Matches "(sext i1-typed value) compared with an all-zeros vector".
	    auto IsSExtOfI1VsZero = [](SDValue Ext, SDValue Zero) {
	      bool IsSExt =
	          Ext.getOpcode() == ISD::SIGN_EXTEND &&
	          Ext.getOperand(0).getValueType().getScalarType() == MVT::i1;
	      bool IsZero = ISD::isBuildVectorAllZeros(Zero.getNode());
	      return IsSExt && IsZero;
	    };

	    bool Matches = IsSExtOfI1VsZero(LHS, RHS);
	    if (!Matches) {
	      // Swap the operands and update the condition code, then look again.
	      std::swap(LHS, RHS);
	      CC = ISD::getSetCCSwappedOperands(CC);
	      Matches = IsSExtOfI1VsZero(LHS, RHS);
	    }

	    if (Matches) {
	      // Every lane of the sign-extended operand is 0 or -1, so the signed
	      // comparison with zero depends only on the original i1 value.
	      assert(VT == LHS.getOperand(0).getValueType() &&
	             "Unexpected operand type");
	      if (CC == ISD::SETGT)
	        return DAG.getConstant(0, DL, VT);
	      if (CC == ISD::SETLE)
	        return DAG.getConstant(1, DL, VT);
	      if (CC == ISD::SETEQ || CC == ISD::SETGE)
	        return DAG.getNOT(DL, LHS.getOperand(0), VT);

	      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
	             "Unexpected condition code!");
	      return LHS.getOperand(0);
	    }
	  }

	  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
	  // to avoid scalarization via legalization because v4i32 is not a legal type.
	  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
	      LHS.getValueType() == MVT::v4f32)
	    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

	  return SDValue();
	}

	/// Drop a SIGN_EXTEND_INREG feeding the mask operand (operand 2) of a
	/// gather/scatter node.
	///
	/// The node is updated in place; the function always returns an empty
	/// SDValue.
	static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  // Gather and Scatter instructions use k-registers for masks. The type of
	  // the masks is v*i1. So the mask will be truncated anyway.
	  // The SIGN_EXTEND_INREG may be dropped.
	  SDValue Mask = N->getOperand(2);
	  if (Mask.getOpcode() != ISD::SIGN_EXTEND_INREG)
	    return SDValue();

	  SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
	  Ops[2] = Mask.getOperand(0);
	  DAG.UpdateNodeOperands(N, Ops);
	  return SDValue();
	}

	// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
	//
	// Operand 0 is the condition code and operand 1 the EFLAGS producer. Returns
	// a new SETCC when combineSetCCEFLAGS simplifies them, otherwise an empty
	// SDValue.
	static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  auto CC = static_cast<X86::CondCode>(N->getConstantOperandVal(0));
	  SDValue OldFlags = N->getOperand(1);

	  // Try to simplify the EFLAGS and condition code operands; CC is passed
	  // along so the rebuilt SETCC uses whatever code the helper leaves in it.
	  SDValue NewFlags = combineSetCCEFLAGS(OldFlags, CC, DAG);
	  if (!NewFlags)
	    return SDValue();
	  return getSETCC(CC, NewFlags, DL, DAG);
	}

	/// Optimize branch condition evaluation.
	///
	/// Operand 2 of \p N is the condition code and operand 3 the EFLAGS
	/// producer. \returns a rebuilt X86ISD::BRCOND when combineSetCCEFLAGS
	/// simplifies them, otherwise an empty SDValue.
	static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
	                             const X86Subtarget &Subtarget) {
	  SDLoc DL(N);
	  SDValue OldFlags = N->getOperand(3);
	  auto CC = static_cast<X86::CondCode>(N->getConstantOperandVal(2));

	  // Try to simplify the EFLAGS and condition code operands.
	  SDValue NewFlags = combineSetCCEFLAGS(OldFlags, CC, DAG);
	  if (!NewFlags)
	    return SDValue();

	  // Operands 0 and 1 are read from N only now: combineSetCCEFLAGS can RAUW
	  // them under us, so no references to them are kept across the call.
	  SDValue NewCond = DAG.getConstant(CC, DL, MVT::i8);
	  return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
	                     N->getOperand(1), NewCond, NewFlags);
	}

	/// Push a unary operation through an AND of a vector compare and a constant.
	///
	/// Vector comparisons produce 0 or -1 in each lane, so
	///   UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
	///   AND(VECTOR_CMP(x,y), constant2), with constant2 = UNARYOP(constant)
	///
	/// \returns the rewritten value (bitcast back to the type of \p N), or an
	/// empty SDValue if the pattern does not match.
	static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
	                                                  SelectionDAG &DAG) {
	  // Early exits: not a vector operation, operand is not a bitwise AND of a
	  // SETCC, or the operation changes the overall vector size.
	  EVT VT = N->getValueType(0);
	  if (!VT.isVector())
	    return SDValue();

	  SDValue Masked = N->getOperand(0);
	  if (Masked->getOpcode() != ISD::AND)
	    return SDValue();
	  if (Masked->getOperand(0)->getOpcode() != ISD::SETCC)
	    return SDValue();
	  if (VT.getSizeInBits() != Masked->getValueType(0).getSizeInBits())
	    return SDValue();

	  // The other operand of the AND must be a constant build vector. We could
	  // make the transformation for non-constant splats as well, but it's unclear
	  // that would be a benefit as it would not eliminate any operations, just
	  // perform one more step in scalar code before moving to the vector unit.
	  auto *BV = dyn_cast<BuildVectorSDNode>(Masked->getOperand(1));
	  if (!BV)
	    return SDValue();
	  if (!BV->isConstant())
	    return SDValue();

	  // Everything checks out. Build up the new and improved node.
	  SDLoc DL(N);
	  EVT IntVT = BV->getValueType(0);
	  // Apply the unary operation to the constant itself.
	  SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
	  // The AND node needs bitcasts to/from an integer vector type around it.
	  SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
	  SDValue NewAnd =
	      DAG.getNode(ISD::AND, DL, IntVT, Masked->getOperand(0), MaskConst);
	  return DAG.getBitcast(VT, NewAnd);
	}

	/// Target-specific DAG combines for ISD::UINT_TO_FP.
	///
	/// \returns the replacement conversion, or an empty SDValue if no combine
	/// applies.
	static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
	                               const X86Subtarget &Subtarget) {
	  SDValue Src = N->getOperand(0);
	  EVT VT = N->getValueType(0);
	  EVT SrcVT = Src.getValueType();
	  EVT SrcEltVT = SrcVT.getScalarType();
	  const TargetLowering &TLI = DAG.getTargetLoweringInfo();

	  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
	  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
	  const bool IsNarrowEltVector =
	      SrcVT.isVector() && (SrcEltVT == MVT::i8 || SrcEltVT == MVT::i16);
	  if (IsNarrowEltVector) {
	    SDLoc dl(N);
	    EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	                                  SrcVT.getVectorNumElements());
	    SDValue Widened = DAG.getNode(ISD::ZERO_EXTEND, dl, WideVT, Src);

	    // Keep the unsigned conversion when it is legal on the widened type.
	    const unsigned ConvOpc = TLI.isOperationLegal(ISD::UINT_TO_FP, WideVT)
	                                 ? ISD::UINT_TO_FP
	                                 : ISD::SINT_TO_FP;
	    return DAG.getNode(ConvOpc, dl, VT, Widened);
	  }

	  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
	  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
	  // the optimization here.
	  if (!DAG.SignBitIsZero(Src))
	    return SDValue();
	  return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Src);
	}

	/// Target-specific DAG combines for ISD::SINT_TO_FP.
	///
	/// Tries, in order: folding the conversion into a masked vector-compare
	/// constant, widening narrow-element vector inputs to vXi32, truncating
	/// wide inputs whose upper bits are all sign bits to 32 bits, and turning an
	/// i64 load on a 32-bit target into a FILD built by BuildFILD.
	///
	/// \returns the replacement value, or an empty SDValue if no combine applies.
	static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
	const X86Subtarget &Subtarget) {
	// First try to optimize away the conversion entirely when it's
	// conditionally from a constant. Vectors only.
	if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
	return Res;

	// Now move on to more general possibilities.
	SDValue Op0 = N->getOperand(0);
	EVT VT = N->getValueType(0);
	EVT InVT = Op0.getValueType();
	EVT InSVT = InVT.getScalarType();

	// SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
	// SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
	// SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
	// The vXi1 case is only taken while the vXi1 type is not legal.
	if (InVT.isVector() &&
	(InSVT == MVT::i8 \|\| InSVT == MVT::i16 \|\|
	(InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
	SDLoc dl(N);
	EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
	InVT.getVectorNumElements());
	SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
	return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
	}

	// Without AVX512DQ we only support i64 to float scalar conversion. For both
	// vectors and scalars, see if we know that the upper bits are all the sign
	// bit, in which case we can truncate the input to i32 and convert from that.
	if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
	unsigned BitWidth = InVT.getScalarSizeInBits();
	unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
	// At least BitWidth - 31 sign bits means the value fits in a signed i32.
	if (NumSignBits >= (BitWidth - 31)) {
	EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
	if (InVT.isVector())
	TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
	InVT.getVectorNumElements());
	SDLoc dl(N);
	SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
	return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
	}
	}

	// Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
	// a 32-bit target where SSE doesn't support i64->FP operations.
	if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
	LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
	EVT LdVT = Ld->getValueType(0);

	// This transformation is not supported if the result type is f16 or f128.
	if (VT == MVT::f16 \|\| VT == MVT::f128)
	return SDValue();

	// Requires a non-volatile, non-extending, single-use scalar i64 load on a
	// 32-bit target.
	if (!Ld->isVolatile() && !VT.isVector() &&
	ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
	!Subtarget.is64Bit() && LdVT == MVT::i64) {
	SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
	SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
	// Users of the load's chain result are redirected to result #1 of the
	// value returned by BuildFILD.
	DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
	return FILDChain;
	}
	}
	return SDValue();
	}

	// Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
	//
	// When legalizing carry, we create carries via add X, -1. If X comes from an
	// actual carry, via a setcc on COND_B, the EFLAGS result of this node is
	// replaced by that setcc's flags operand.
	static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
	                             X86TargetLowering::DAGCombinerInfo &DCI) {
	  // Only "add X, -1" whose flags result is actually used is of interest.
	  if (!isAllOnesConstant(N->getOperand(1)))
	    return SDValue();
	  if (!N->hasAnyUseOfValue(1))
	    return SDValue();

	  // Nodes that are looked through while searching for the setcc: truncates,
	  // any kind of extension, and an AND with the constant 1.
	  auto IsLookThroughOp = [](SDValue V) {
	    switch (V.getOpcode()) {
	    case ISD::TRUNCATE:
	    case ISD::ZERO_EXTEND:
	    case ISD::SIGN_EXTEND:
	    case ISD::ANY_EXTEND:
	      return true;
	    case ISD::AND:
	      return isOneConstant(V.getOperand(1));
	    default:
	      return false;
	    }
	  };

	  SDValue Carry = N->getOperand(0);
	  while (IsLookThroughOp(Carry))
	    Carry = Carry.getOperand(0);

	  const unsigned CarryOpc = Carry.getOpcode();
	  if (CarryOpc != X86ISD::SETCC && CarryOpc != X86ISD::SETCC_CARRY)
	    return SDValue();
	  if (Carry.getConstantOperandVal(0) != X86::COND_B)
	    return SDValue();

	  // Keep the arithmetic result; take the flags from the setcc's input.
	  return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
	}

	// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
	//
	// If the LHS and RHS of the ADC node are zero, then it can't overflow and
	// the result is either zero or one (depending on the input carry bit).
	// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
	static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
	                          X86TargetLowering::DAGCombinerInfo &DCI) {
	  // We don't have a good way to replace an EFLAGS use, so only do this when
	  // the flags result is dead right now.
	  const bool Applies = X86::isZeroNode(N->getOperand(0)) &&
	                       X86::isZeroNode(N->getOperand(1)) &&
	                       SDValue(N, 1).use_empty();
	  if (!Applies)
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));

	  // SETCC_CARRY on COND_B of the incoming flags, masked down to bit 0.
	  SDValue CondB = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	  SDValue SetCarry =
	      DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, CondB, N->getOperand(2));
	  SDValue One = DAG.getConstant(1, DL, VT);
	  SDValue Res = DAG.getNode(ISD::AND, DL, VT, SetCarry, One);
	  return DCI.CombineTo(N, Res, CarryOut);
	}

	/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
	/// which is more useful than 0/1 in some cases.
	///
	/// Builds an i8 SETCC_CARRY on COND_B of \p EFLAGS and narrows it to the
	/// result type of \p N: masked with 1 for i8, truncated for i1 (any other
	/// type is asserted against).
	static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
	  SDLoc DL(N);
	  // "Condition code B" is also known as "the carry flag" (CF).
	  SDValue CarryCond = DAG.getConstant(X86::COND_B, DL, MVT::i8);
	  SDValue SBB =
	      DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CarryCond, EFLAGS);

	  MVT VT = N->getSimpleValueType(0);
	  if (VT != MVT::i8) {
	    assert(VT == MVT::i1 && "Unexpected type for SETCC node");
	    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
	  }

	  SDValue One = DAG.getConstant(1, DL, VT);
	  return DAG.getNode(ISD::AND, DL, VT, SBB, One);
	}

	/// If this is an add or subtract where one operand is produced by a cmp+setcc,
	/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
	/// with CMP+{ADC, SBB}.
	///
	/// \p N is treated as a subtract when its opcode is ISD::SUB and as an add
	/// otherwise. \returns the replacement value, or an empty SDValue if no
	/// pattern matches.
	static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
	bool IsSub = N->getOpcode() == ISD::SUB;
	SDValue X = N->getOperand(0);
	SDValue Y = N->getOperand(1);

	// If this is an add, canonicalize a zext operand to the RHS.
	// TODO: Incomplete? What if both sides are zexts?
	if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
	Y.getOpcode() != ISD::ZERO_EXTEND)
	std::swap(X, Y);

	// Look through a one-use zext.
	bool PeekedThroughZext = false;
	if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
	Y = Y.getOperand(0);
	PeekedThroughZext = true;
	}

	// If this is an add, canonicalize a setcc operand to the RHS.
	// TODO: Incomplete? What if both sides are setcc?
	// TODO: Should we allow peeking through a zext of the other operand?
	if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
	Y.getOpcode() != X86ISD::SETCC)
	std::swap(X, Y);

	// From here on Y must be a one-use X86ISD::SETCC; X is the other operand.
	if (Y.getOpcode() != X86ISD::SETCC \|\| !Y.hasOneUse())
	return SDValue();

	SDLoc DL(N);
	EVT VT = N->getValueType(0);
	X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	auto *ConstantX = dyn_cast<ConstantSDNode>(X);
	if (ConstantX) {
	if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
	// This is a complicated way to get -1 or 0 from the carry flag:
	// -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	// 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	Y.getOperand(1));
	}

	if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) \|\|
	(IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
	SDValue EFLAGS = Y->getOperand(1);
	// Only a one-use integer X86ISD::SUB whose second operand is not a
	// constant is rewritten with swapped operands.
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	// Swap the operands of a SUB, and we have the same pattern as above.
	// -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
	// 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
	SDValue NewSub = DAG.getNode(
	X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	// Use the same result number of the new SUB that EFLAGS referred to.
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	NewEFLAGS);
	}
	}
	}

	if (CC == X86::COND_B) {
	// X + SETB Z --> X + (mask SBB Z, Z)
	// X - SETB Z --> X - (mask SBB Z, Z)
	// TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
	SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
	// Bring the materialized value to the width of the add/sub.
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}

	if (CC == X86::COND_A) {
	SDValue EFLAGS = Y->getOperand(1);
	// Try to convert COND_A into COND_B in an attempt to facilitate
	// materializing "setb reg".
	//
	// Do not flip "e > c", where "c" is a constant, because Cmp instruction
	// cannot take an immediate as its first operand.
	//
	if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
	EFLAGS.getValueType().isInteger() &&
	!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
	SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
	EFLAGS.getNode()->getVTList(),
	EFLAGS.getOperand(1), EFLAGS.getOperand(0));
	SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
	SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
	if (SBB.getValueSizeInBits() != VT.getSizeInBits())
	SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
	return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
	}
	}

	// The remaining patterns only handle equality tests against zero.
	if (CC != X86::COND_E && CC != X86::COND_NE)
	return SDValue();

	// The flags must come from a one-use integer X86ISD::CMP against zero.
	SDValue Cmp = Y.getOperand(1);
	if (Cmp.getOpcode() != X86ISD::CMP \|\| !Cmp.hasOneUse() \|\|
	!X86::isZeroNode(Cmp.getOperand(1)) \|\|
	!Cmp.getOperand(0).getValueType().isInteger())
	return SDValue();

	// Z is the value that was being compared with zero.
	SDValue Z = Cmp.getOperand(0);
	EVT ZVT = Z.getValueType();

	// If X is -1 or 0, then we have an opportunity to avoid constants required in
	// the general case below.
	if (ConstantX) {
	// 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
	// fake operands:
	// 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
	// -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
	if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
	SDValue Zero = DAG.getConstant(0, DL, ZVT);
	SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
	SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
	// Result #1 of the X86ISD::SUB is its i32 flags value.
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8),
	SDValue(Neg.getNode(), 1));
	}

	// cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
	// with fake operands:
	// 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
	// -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
	if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) \|\|
	(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
	return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
	DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
	}
	}

	// General case: X is arbitrary, so a real ADC/SBB with a constant second
	// operand is emitted.
	// (cmp Z, 1) sets the carry flag if Z is 0.
	SDValue One = DAG.getConstant(1, DL, ZVT);
	SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

	// Add the flags type for ADC/SBB nodes.
	SDVTList VTs = DAG.getVTList(VT, MVT::i32);

	// X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
	// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
	if (CC == X86::COND_NE)
	return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
	DAG.getConstant(-1ULL, DL, VT), Cmp1);

	// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
	// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
	return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
	DAG.getConstant(0, DL, VT), Cmp1);
	}

	/// Rewrite an add whose one operand is a width-reducible vector multiply as
	/// a VPMADDWD on operands truncated to i16, followed by an add with the
	/// other operand.
	///
	/// \returns the new add, or an empty SDValue if neither operand is a
	/// suitable ISD::MUL or the vector does not fit the subtarget's registers.
	static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
	                                      const X86Subtarget &Subtarget) {
	  SDValue Mul = N->getOperand(0);
	  SDValue Accum = N->getOperand(1);

	  // Accept the multiply on either side of the add.
	  if (Mul.getOpcode() != ISD::MUL) {
	    std::swap(Mul, Accum);
	    if (Mul.getOpcode() != ISD::MUL)
	      return SDValue();
	  }

	  ShrinkMode Mode;
	  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode))
	    return SDValue();
	  if (Mode == MULU16)
	    return SDValue();

	  EVT VT = N->getValueType(0);
	  const unsigned NumElts = VT.getVectorNumElements();

	  const unsigned RegSize =
	      Subtarget.hasBWI() ? 512 : (Subtarget.hasAVX2() ? 256 : 128);
	  // Width of the multiply operands once each element is shrunk to 16 bits.
	  const unsigned VectorSize = NumElts * 16;
	  // If the vector size is less than 128, or greater than the supported
	  // RegSize, do not use PMADD.
	  const bool FitsRegister = VectorSize >= 128 && VectorSize <= RegSize;
	  if (!FitsRegister)
	    return SDValue();

	  SDLoc DL(N);
	  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
	  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);

	  // Shrink the operands of mul.
	  SDValue Lo0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul->getOperand(0));
	  SDValue Lo1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul->getOperand(1));

	  // The madd result has half as many i32 elements as the original vector.
	  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, Lo0, Lo1);
	  // Fill the rest of the output with 0.
	  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
	  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
	  return DAG.getNode(ISD::ADD, DL, VT, Concat, Accum);
	}

	/// Try to rewrite a vector-reduction add of an absolute-difference select
	/// into a PSADBW followed by an add into (part of) the reduction vector.
	///
	/// \p N is the reduction ADD node. Returns an empty SDValue when the pattern
	/// does not match, when the subtarget lacks SSE2, or when the vector is too
	/// wide for the available registers.
	static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
	                                     const X86Subtarget &Subtarget) {
	  // The XMM form of PSADBW is an SSE2 instruction; the 128-bit default for
	  // RegSize below is only meaningful when SSE2 is available.
	  if (!Subtarget.hasSSE2())
	    return SDValue();

	  SDLoc DL(N);
	  EVT VT = N->getValueType(0);
	  SDValue Op0 = N->getOperand(0);
	  SDValue Op1 = N->getOperand(1);

	  // TODO: There's nothing special about i32, any integer type above i16 should
	  // work just as well.
	  if (!VT.isVector() || !VT.isSimple() ||
	      !(VT.getVectorElementType() == MVT::i32))
	    return SDValue();

	  unsigned RegSize = 128;
	  if (Subtarget.hasBWI())
	    RegSize = 512;
	  else if (Subtarget.hasAVX2())
	    RegSize = 256;

	  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
	  // TODO: We should be able to handle larger vectors by splitting them before
	  // feeding them into several SADs, and then reducing over those.
	  if (VT.getSizeInBits() / 4 > RegSize)
	    return SDValue();

	  // We know N is a reduction add, which means one of its operands is a phi.
	  // To match SAD, we need the other operand to be a vector select.
	  SDValue SelectOp, Phi;
	  if (Op0.getOpcode() == ISD::VSELECT) {
	    SelectOp = Op0;
	    Phi = Op1;
	  } else if (Op1.getOpcode() == ISD::VSELECT) {
	    SelectOp = Op1;
	    Phi = Op0;
	  } else {
	    return SDValue();
	  }

	  // Check whether we have an abs-diff pattern feeding into the select. Op0
	  // and Op1 are passed to the helper and then used as the PSADBW inputs.
	  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
	    return SDValue();

	  // SAD pattern detected. Now build a SAD instruction and an addition for
	  // reduction. Note that the number of elements of the result of SAD is less
	  // than the number of elements of its input. Therefore, we could only update
	  // part of elements in the reduction vector.
	  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

	  // The output of PSADBW is a vector of i64.
	  // We need to turn the vector of i64 into a vector of i32.
	  // If the reduction vector is at least as wide as the psadbw result, just
	  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
	  // anyway.
	  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
	  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
	    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
	  else
	    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

	  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
	    // Update part of elements of the reduction vector. This is done by first
	    // extracting a sub-vector from it, updating this sub-vector, and inserting
	    // it back.
	    SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
	                                 DAG.getIntPtrConstant(0, DL));
	    SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
	    return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
	                       DAG.getIntPtrConstant(0, DL));
	  }

	  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
	}

	/// Rewrite a vector add/sub of a splat-of-one as the opposite operation with
	/// an all-ones vector:
	///   add X, <1, 1...> --> sub X, <-1, -1...>
	///   sub X, <1, 1...> --> add X, <-1, -1...>
	/// An all-ones vector can be produced by a pcmpeq of a register with itself,
	/// a commonly recognized idiom with no register dependency, which is
	/// better/smaller than loading a splat 1 constant.
	static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
	  const unsigned Opc = N->getOpcode();
	  assert((Opc == ISD::ADD || Opc == ISD::SUB) &&
	         "Unexpected opcode for increment/decrement transform");

	  // Pseudo-legality check: getOnesVector() expects a 128/256/512-bit vector,
	  // so leave anything else alone until legalization has run.
	  EVT VT = N->getValueType(0);
	  const bool HasSupportedWidth =
	      VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector();
	  if (!HasSupportedWidth)
	    return SDValue();

	  // The right-hand operand has to be a constant splat of the value 1.
	  APInt Splat;
	  const bool IsSplatOfOne =
	      ISD::isConstantSplatVector(N->getOperand(1).getNode(), Splat) &&
	      Splat.isOneValue();
	  if (!IsSplatOfOne)
	    return SDValue();

	  SDLoc DL(N);
	  SDValue MinusOne = getOnesVector(VT, DAG, DL);
	  unsigned FlippedOpc = ISD::ADD;
	  if (Opc == ISD::ADD)
	    FlippedOpc = ISD::SUB;
	  return DAG.getNode(FlippedOpc, DL, VT, N->getOperand(0), MinusOne);
	}

	/// Target combine for ISD::ADD. Tries, in order: the loop SAD and MAdd
	/// reduction patterns (only for adds flagged as vector reductions), a
	/// horizontal add, the increment-to-sub-of-all-ones rewrite, and finally the
	/// ADC/SBB fold.
	static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  if (N->getFlags().hasVectorReduction()) {
	    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
	      return Sad;
	    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
	      return MAdd;
	  }

	  EVT VT = N->getValueType(0);
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // Try to synthesize horizontal adds from adds of shuffles.
	  const bool Has128BitHAdd =
	      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
	  const bool Has256BitHAdd =
	      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
	  // LHS/RHS are passed by name and then used as the HADD operands.
	  if ((Has128BitHAdd || Has256BitHAdd) && isHorizontalBinOp(LHS, RHS, true))
	    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, LHS, RHS);

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Target combine for ISD::SUB. Tries, in order: folding a constant LHS into
	/// a one-use XOR-with-constant on the RHS, a horizontal sub, the
	/// decrement-to-add-of-all-ones rewrite, and finally the ADC/SBB fold.
	static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
	                          const X86Subtarget &Subtarget) {
	  SDValue LHS = N->getOperand(0);
	  SDValue RHS = N->getOperand(1);

	  // X86 can't encode an immediate LHS of a sub. See if we can push the
	  // negation into a preceding instruction: when the RHS is a one-use XOR with
	  // a constant, invert that immediate and add one to the LHS constant, so
	  // that X-Y becomes X+~Y+1, saving one register.
	  auto *LHSC = dyn_cast<ConstantSDNode>(LHS);
	  if (LHSC && RHS->hasOneUse() && RHS.getOpcode() == ISD::XOR &&
	      isa<ConstantSDNode>(RHS.getOperand(1))) {
	    EVT VT = LHS.getValueType();
	    SDLoc XorDL(RHS);
	    APInt XorImm = cast<ConstantSDNode>(RHS.getOperand(1))->getAPIntValue();
	    SDValue InvertedImm = DAG.getConstant(~XorImm, XorDL, VT);
	    SDValue NewXor =
	        DAG.getNode(ISD::XOR, XorDL, VT, RHS.getOperand(0), InvertedImm);
	    SDValue BumpedLHS =
	        DAG.getConstant(LHSC->getAPIntValue() + 1, SDLoc(N), VT);
	    return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor, BumpedLHS);
	  }

	  // Try to synthesize horizontal subs from subs of shuffles.
	  EVT VT = N->getValueType(0);
	  const bool Has128BitHSub =
	      Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32);
	  const bool Has256BitHSub =
	      Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32);
	  // LHS/RHS are passed by name and then used as the HSUB operands.
	  if ((Has128BitHSub || Has256BitHSub) && isHorizontalBinOp(LHS, RHS, false))
	    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, LHS, RHS);

	  if (SDValue IncDec = combineIncDecVector(N, DAG))
	    return IncDec;

	  return combineAddOrSubToADCOrSBB(N, DAG);
	}

	/// Target combine for the vector extension nodes routed here by
	/// PerformDAGCombine. Does nothing before legalization. Afterwards it tries:
	///  1. constant folding of the extension,
	///  2. (vzext (bitcast (vzext x))) -> (vzext x),
	///  3. dropping a scalar_to_vector(extract_vector_elt x, 0) round trip
	///     feeding a vzext.
	/// Returns an empty SDValue when none of these apply.
	static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalize())
	return SDValue();

	SDLoc DL(N);
	unsigned Opcode = N->getOpcode();
	MVT VT = N->getSimpleValueType(0);
	MVT SVT = VT.getVectorElementType();
	unsigned NumElts = VT.getVectorNumElements();
	unsigned EltSizeInBits = SVT.getSizeInBits();

	SDValue Op = N->getOperand(0);
	MVT OpVT = Op.getSimpleValueType();
	MVT OpEltVT = OpVT.getVectorElementType();
	unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
	// Number of input bits actually consumed: one source element per result
	// element.
	unsigned InputBits = OpEltSizeInBits * NumElts;

	// Perform any constant folding.
	// FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
	APInt UndefElts;
	SmallVector<APInt, 64> EltBits;
	if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
	APInt Undefs(NumElts, 0);
	SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
	// Any opcode other than the two zero-extending ones is folded as a sign
	// extension.
	bool IsZEXT =
	(Opcode == X86ISD::VZEXT) \|\| (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
	for (unsigned i = 0; i != NumElts; ++i) {
	// Undef source elements stay undef in the folded constant.
	if (UndefElts[i]) {
	Undefs.setBit(i);
	continue;
	}
	Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
	: EltBits[i].sextOrTrunc(EltSizeInBits);
	}
	return getConstVector(Vals, Undefs, VT, DAG, DL);
	}

	// (vzext (bitcast (vzext (x)) -> (vzext x)
	// TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
	SDValue V = peekThroughBitcasts(Op);
	if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
	MVT InnerVT = V.getSimpleValueType();
	MVT InnerEltVT = InnerVT.getVectorElementType();

	// If the element sizes match exactly, we can just do one larger vzext. This
	// is always an exact type match as vzext operates on integer types.
	if (OpEltVT == InnerEltVT) {
	assert(OpVT == InnerVT && "Types must match for vzext!");
	return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
	}

	// The only other way we can combine them is if only a single element of the
	// inner vzext is used in the input to the outer vzext.
	if (InnerEltVT.getSizeInBits() < InputBits)
	return SDValue();

	// In this case, the inner vzext is completely dead because we're going to
	// only look at bits inside of the low element. Just do the outer vzext on
	// a bitcast of the input to the inner.
	// NOTE(review): the code bitcasts V (the inner vzext itself), not
	// V.getOperand(0) as the comment above suggests - confirm the intent.
	return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
	}

	// Check if we can bypass extracting and re-inserting an element of an input
	// vector. Essentially:
	// (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
	// TODO: Add X86ISD::VSEXT support
	if (Opcode == X86ISD::VZEXT &&
	V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
	V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
	V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
	SDValue ExtractedV = V.getOperand(0);
	SDValue OrigV = ExtractedV.getOperand(0);
	// Only an extract of element 0 can be bypassed.
	if (isNullConstant(ExtractedV.getOperand(1))) {
	MVT OrigVT = OrigV.getSimpleValueType();
	// Extract a subvector if necessary...
	if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
	int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
	OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
	OrigVT.getVectorNumElements() / Ratio);
	OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
	DAG.getIntPtrConstant(0, DL));
	}
	Op = DAG.getBitcast(OpVT, OrigV);
	return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
	}
	}

	return SDValue();
	}

	/// Canonicalize a locked subtract of the constant 1 into a locked add of -1:
	///   (LSUB p, 1) -> (LADD p, -1)
	/// Any other subtrahend leaves the node untouched (empty SDValue returned).
	static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
	                              const X86Subtarget &Subtarget) {
	  SDValue Amount = N->getOperand(2);
	  MVT VT = Amount.getSimpleValueType();

	  auto *AmountC = dyn_cast<ConstantSDNode>(Amount);
	  const bool IsSubOfOne = AmountC && AmountC->getZExtValue() == 1;
	  if (!IsSubOfOne)
	    return SDValue();

	  SDLoc DL(N);
	  // Same chain and pointer operands as the original node, with -1 as the
	  // value operand.
	  SDValue Ops[] = {N->getOperand(0), N->getOperand(1),
	                   DAG.getConstant(-1, DL, VT)};
	  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
	  SDVTList ResultTys = DAG.getVTList(MVT::i32, MVT::Other);
	  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL, ResultTys, Ops, VT, MMO);
	}

	// When both TESTM operands are the same AND node, test the AND's operands
	// directly:
	//   TESTM (AND a, b), (AND a, b) -> TESTM a, b
	static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
	  SDValue Src = N->getOperand(0);

	  const bool IsSameAndTwice =
	      Src == N->getOperand(1) && Src->getOpcode() == ISD::AND;
	  if (!IsSameAndTwice)
	    return SDValue();

	  return DAG.getNode(X86ISD::TESTM, SDLoc(N), N->getValueType(0),
	                     Src->getOperand(0), Src->getOperand(1));
	}

	/// Fold a packed compare of a value with itself: PCMPEQ x, x becomes the
	/// all-ones vector and PCMPGT x, x becomes the zero vector. Anything else is
	/// left alone.
	static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
	                                    const X86Subtarget &Subtarget) {
	  MVT VT = N->getSimpleValueType(0);
	  SDLoc DL(N);

	  if (N->getOperand(0) != N->getOperand(1))
	    return SDValue();

	  switch (N->getOpcode()) {
	  case X86ISD::PCMPEQ:
	    return getOnesVector(VT, DAG, DL);
	  case X86ISD::PCMPGT:
	    return getZeroVector(VT, Subtarget, DAG, DL);
	  default:
	    return SDValue();
	  }
	}

	/// Target combine for ISD::INSERT_SUBVECTOR. Does nothing before operation
	/// legalization. Afterwards it tries:
	///  1. turning an insert of an extract_subvector (taken from a vector of the
	///     result type at a non-zero index) into a vector shuffle,
	///  2. merging two half-width loads inserted into the low and high halves
	///     into a single wide load,
	///  3. turning the same load (or the same subv_broadcast) inserted into both
	///     halves into a SUBV_BROADCAST.
	/// Returns an empty SDValue when none of these apply.
	static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
	TargetLowering::DAGCombinerInfo &DCI,
	const X86Subtarget &Subtarget) {
	if (DCI.isBeforeLegalizeOps())
	return SDValue();

	SDLoc dl(N);
	SDValue Vec = N->getOperand(0);
	SDValue SubVec = N->getOperand(1);
	SDValue Idx = N->getOperand(2);

	// The insertion index is required to be a constant here (cast<> asserts).
	unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
	MVT OpVT = N->getSimpleValueType(0);
	MVT SubVecVT = SubVec.getSimpleValueType();

	// If this is an insert of an extract, combine to a shuffle. Don't do this
	// if the insert or extract can be represented with a subvector operation.
	if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
	SubVec.getOperand(0).getSimpleValueType() == OpVT &&
	(IdxVal != 0 \|\| !Vec.isUndef())) {
	int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
	if (ExtIdxVal != 0) {
	int VecNumElts = OpVT.getVectorNumElements();
	int SubVecNumElts = SubVecVT.getVectorNumElements();
	SmallVector<int, 64> Mask(VecNumElts);
	// First create an identity shuffle mask.
	for (int i = 0; i != VecNumElts; ++i)
	Mask[i] = i;
	// Now insert the extracted portion. Indices >= VecNumElts select from the
	// second shuffle operand, i.e. the vector the subvector was extracted from.
	for (int i = 0; i != SubVecNumElts; ++i)
	Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

	return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
	}
	}

	// Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
	// load:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr + 16), Elts/2)
	// --> load32 addr
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr + 32), Elts/2)
	// --> load64 addr
	// or a 16-byte or 32-byte broadcast:
	// (insert_subvector (insert_subvector undef, (load16 addr), 0),
	// (load16 addr), Elts/2)
	// --> X86SubVBroadcast(load16 addr)
	// or:
	// (insert_subvector (insert_subvector undef, (load32 addr), 0),
	// (load32 addr), Elts/2)
	// --> X86SubVBroadcast(load32 addr)
	if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
	Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
	OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
	// The inner insert must place its subvector at index 0 (the low half).
	auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
	if (Idx2 && Idx2->getZExtValue() == 0) {
	SDValue SubVec2 = Vec.getOperand(1);
	// If needed, look through bitcasts to get to the load.
	if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
	bool Fast;
	unsigned Alignment = FirstLd->getAlignment();
	unsigned AS = FirstLd->getAddressSpace();
	const X86TargetLowering *TLI = Subtarget.getTargetLowering();
	// Only merge when a full-width access at the first load's alignment is
	// both allowed and reported as fast.
	if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
	OpVT, AS, Alignment, &Fast) && Fast) {
	SDValue Ops[] = {SubVec2, SubVec};
	if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
	Subtarget, false))
	return Ld;
	}
	}
	// If lower/upper loads are the same and the only users of the load, then
	// lower to a VBROADCASTF128/VBROADCASTI128/etc.
	if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
	if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
	SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
	}
	}
	// If this is subv_broadcast insert into both halves, use a larger
	// subv_broadcast.
	if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
	return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
	SubVec.getOperand(0));
	}
	}
	}

	return SDValue();
	}


	/// Entry point for X86-specific DAG combines: dispatches on the opcode of
	/// \p N to the matching combine* helper and returns that helper's result.
	/// Opcodes without a case here fall out of the switch and yield an empty
	/// SDValue.
	SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {
	SelectionDAG &DAG = DCI.DAG;
	switch (N->getOpcode()) {
	default: break;
	case ISD::EXTRACT_VECTOR_ELT:
	return combineExtractVectorElt(N, DAG, DCI, Subtarget);
	case X86ISD::PEXTRW:
	case X86ISD::PEXTRB:
	return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
	case ISD::INSERT_SUBVECTOR:
	return combineInsertSubvector(N, DAG, DCI, Subtarget);
	case ISD::VSELECT:
	case ISD::SELECT:
	case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
	case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
	case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
	case ISD::ADD: return combineAdd(N, DAG, Subtarget);
	case ISD::SUB: return combineSub(N, DAG, Subtarget);
	case X86ISD::ADD: return combineX86ADD(N, DAG, DCI);
	case X86ISD::ADC: return combineADC(N, DAG, DCI);
	case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
	case ISD::SHL:
	case ISD::SRA:
	case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
	case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
	case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
	case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
	case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
	case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
	case ISD::STORE: return combineStore(N, DAG, Subtarget);
	case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
	case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
	case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
	case ISD::FADD:
	case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
	case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
	case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
	case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
	case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
	case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
	case X86ISD::FXOR:
	case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
	case X86ISD::FMIN:
	case X86ISD::FMAX: return combineFMinFMax(N, DAG);
	case ISD::FMINNUM:
	case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
	case X86ISD::BT: return combineBT(N, DAG, DCI);
	case ISD::ANY_EXTEND:
	case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
	case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
	case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
	case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
	case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
	case X86ISD::VSHLI:
	case X86ISD::VSRAI:
	case X86ISD::VSRLI:
	return combineVectorShiftImm(N, DAG, DCI, Subtarget);
	// All four vector-extension opcodes share one combine.
	case ISD::SIGN_EXTEND_VECTOR_INREG:
	case ISD::ZERO_EXTEND_VECTOR_INREG:
	case X86ISD::VSEXT:
	case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
	case X86ISD::PINSRB:
	case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
	case X86ISD::SHUFP: // Handle all target specific shuffles
	case X86ISD::INSERTPS:
	case X86ISD::EXTRQI:
	case X86ISD::INSERTQI:
	case X86ISD::PALIGNR:
	case X86ISD::VSHLDQ:
	case X86ISD::VSRLDQ:
	case X86ISD::BLENDI:
	case X86ISD::UNPCKH:
	case X86ISD::UNPCKL:
	case X86ISD::MOVHLPS:
	case X86ISD::MOVLHPS:
	case X86ISD::PSHUFB:
	case X86ISD::PSHUFD:
	case X86ISD::PSHUFHW:
	case X86ISD::PSHUFLW:
	case X86ISD::MOVSHDUP:
	case X86ISD::MOVSLDUP:
	case X86ISD::MOVDDUP:
	case X86ISD::MOVSS:
	case X86ISD::MOVSD:
	case X86ISD::VPPERM:
	case X86ISD::VPERMI:
	case X86ISD::VPERMV:
	case X86ISD::VPERMV3:
	case X86ISD::VPERMIV3:
	case X86ISD::VPERMIL2:
	case X86ISD::VPERMILPI:
	case X86ISD::VPERMILPV:
	case X86ISD::VPERM2X128:
	case X86ISD::VZEXT_MOVL:
	case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
	case X86ISD::FMADD:
	case X86ISD::FMADD_RND:
	case X86ISD::FMADDS1_RND:
	case X86ISD::FMADDS3_RND:
	case ISD::FMA: return combineFMA(N, DAG, Subtarget);
	case ISD::MGATHER:
	case ISD::MSCATTER: return combineGatherScatter(N, DAG);
	case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
	case X86ISD::TESTM: return combineTestM(N, DAG);
	case X86ISD::PCMPEQ:
	case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
	}

	// No combine is registered for this opcode.
	return SDValue();
	}

	/// Report whether \p VT is both legal and worth using for operation \p Opc.
	/// Illegal types are never desirable. Every legal type other than i16 is
	/// always desirable. i16 is legal on x86, but its instruction encodings are
	/// longer and some i16 instructions are slow, so it is reported as
	/// undesirable for the load, extend, shift and integer arithmetic/logic
	/// opcodes listed below.
	bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
	  const bool IsLegal = isTypeLegal(VT);
	  if (!IsLegal || VT != MVT::i16)
	    return IsLegal;

	  // From here on VT is a legal i16.
	  switch (Opc) {
	  case ISD::LOAD:
	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	  case ISD::SHL:
	  case ISD::SRL:
	  case ISD::SUB:
	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	    return false;
	  default:
	    return true;
	  }
	}

	/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
	/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
	/// we don't adjust the stack we clobber the first frame index.
	/// See X86InstrInfo::copyPhysReg.
	static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
	const MachineRegisterInfo &MRI = MF.getRegInfo();
	return any_of(MRI.reg_instructions(X86::EFLAGS),
	[](const MachineInstr &RI) { return RI.isCopy(); });
	}

	/// Record on the frame info whether \p MF contains a COPY of EFLAGS, then
	/// run the generic finalization.
	void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
	  if (hasCopyImplyingStackAdjustment(MF))
	    MF.getFrameInfo().setHasCopyImplyingStackAdjustment(true);

	  TargetLoweringBase::finalizeLowering(MF);
	}

	/// Query whether it is beneficial for the DAG combiner to promote the node
	/// \p Op. Only i16 operations are considered. When the switch below is
	/// reached and does not bail out early, \p PVT is set to i32; the return
	/// value says whether promotion is actually wanted.
	bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
	  if (Op.getValueType() != MVT::i16)
	    return false;

	  const unsigned Opc = Op.getOpcode();
	  switch (Opc) {
	  default:
	    // Unhandled opcode: PVT is still written, but no promotion is requested.
	    PVT = MVT::i32;
	    return false;

	  case ISD::SIGN_EXTEND:
	  case ISD::ZERO_EXTEND:
	  case ISD::ANY_EXTEND:
	    break;

	  case ISD::SHL:
	  case ISD::SRL:
	    // Look out for (store (shl (load), x)).
	    if (MayFoldLoad(Op.getOperand(0)) && MayFoldIntoStore(Op))
	      return false;
	    break;

	  case ISD::ADD:
	  case ISD::MUL:
	  case ISD::AND:
	  case ISD::OR:
	  case ISD::XOR:
	  case ISD::SUB: {
	    SDValue N0 = Op.getOperand(0);
	    SDValue N1 = Op.getOperand(1);
	    // SUB is the only non-commutative opcode in this group.
	    const bool IsCommutative = Opc != ISD::SUB;
	    if (!IsCommutative && MayFoldLoad(N1))
	      return false;
	    // Avoid disabling potential load folding opportunities.
	    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
	      return false;
	    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
	      return false;
	    break;
	  }
	  }

	  PVT = MVT::i32;
	  return true;
	}

	//===----------------------------------------------------------------------===//
	// X86 Inline Assembly Support
	//===----------------------------------------------------------------------===//

	// Return true if S consists of exactly the given Pieces, in order, separated
	// by blanks (spaces or tabs). Leading and trailing blanks are allowed; a
	// piece that is immediately followed by a non-blank character does not
	// match.
	static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
	  // Skip leading whitespace.
	  StringRef Rest = S.substr(S.find_first_not_of(" \t"));

	  for (StringRef Token : Pieces) {
	    // The next token has to start right here.
	    if (!Rest.startswith(Token))
	      return false;
	    Rest = Rest.substr(Token.size());

	    // A non-blank character directly after the token means we only matched
	    // a prefix of a longer word.
	    const StringRef::size_type NextNonBlank = Rest.find_first_not_of(" \t");
	    if (NextNonBlank == 0)
	      return false;
	    Rest = Rest.substr(NextNonBlank);
	  }

	  // Everything must have been consumed.
	  return Rest.empty();
	}

	static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

	if (AsmPieces.size() == 3 \|\| AsmPieces.size() == 4) {
	if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
	std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

	if (AsmPieces.size() == 3)
	return true;
	else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
	return true;
	}
	}
	return false;
	}

	/// Recognize a few hand-written byte-swap inline-asm idioms and replace the
	/// call with a byte-swap via IntrinsicLowering::LowerToByteSwap. Returns
	/// false when the asm is not one of the recognized forms.
	bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
	  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

	  // Only integer results whose width is a multiple of 16 bits are candidates.
	  IntegerType *RetTy = dyn_cast<IntegerType>(CI->getType());
	  if (!RetTy || RetTy->getBitWidth() % 16 != 0)
	    return false;
	  const unsigned RetBits = RetTy->getBitWidth();

	  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
	  SmallVector<StringRef, 4> Stmts;
	  SplitString(IA->getAsmString(), Stmts, ";\n");

	  // True when the constraint string begins with "=r,0,".
	  auto HasTiedRegPrefix = [IA]() {
	    return IA->getConstraintString().compare(0, 5, "=r,0,") == 0;
	  };
	  // True when the constraints following that prefix are the flag clobbers.
	  auto TrailingConstraintsClobberFlags = [IA]() {
	    SmallVector<StringRef, 4> Trailing;
	    StringRef ConstraintsStr = IA->getConstraintString();
	    SplitString(ConstraintsStr.substr(5), Trailing, ",");
	    array_pod_sort(Trailing.begin(), Trailing.end());
	    return clobbersFlagRegisters(Trailing);
	  };

	  if (Stmts.size() == 1) {
	    StringRef Stmt = Stmts[0];

	    // FIXME: this should verify that we are targeting a 486 or better. If
	    // not, we will turn this bswap into something that will be lowered to
	    // logical ops instead of emitting the bswap asm. For now, we don't
	    // support 486 or lower so don't worry about this.
	    // bswap $0
	    for (const char *Mnemonic : {"bswap", "bswapl", "bswapq"})
	      for (const char *Operand : {"$0", "${0:q}"})
	        if (matchAsm(Stmt, {Mnemonic, Operand})) {
	          // No need to check constraints, nothing other than the equivalent
	          // of "=r,0" would be valid here.
	          return IntrinsicLowering::LowerToByteSwap(CI);
	        }

	    // rorw $$8, ${0:w} --> llvm.bswap.i16
	    if (RetBits == 16 && HasTiedRegPrefix() &&
	        (matchAsm(Stmt, {"rorw", "$$8,", "${0:w}"}) ||
	         matchAsm(Stmt, {"rolw", "$$8,", "${0:w}"})) &&
	        TrailingConstraintsClobberFlags())
	      return IntrinsicLowering::LowerToByteSwap(CI);

	    return false;
	  }

	  if (Stmts.size() == 3) {
	    // rorw / rorl / rorw sequence on a 32-bit value.
	    if (RetBits == 32 && HasTiedRegPrefix() &&
	        matchAsm(Stmts[0], {"rorw", "$$8,", "${0:w}"}) &&
	        matchAsm(Stmts[1], {"rorl", "$$16,", "$0"}) &&
	        matchAsm(Stmts[2], {"rorw", "$$8,", "${0:w}"}) &&
	        TrailingConstraintsClobberFlags())
	      return IntrinsicLowering::LowerToByteSwap(CI);

	    if (RetBits == 64) {
	      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
	      auto IsSingleCode = [&Constraints](unsigned Idx, const char *Code) {
	        return Constraints[Idx].Codes.size() == 1 &&
	               Constraints[Idx].Codes[0] == Code;
	      };
	      // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
	      if (Constraints.size() >= 2 && IsSingleCode(0, "A") &&
	          IsSingleCode(1, "0") &&
	          matchAsm(Stmts[0], {"bswap", "%eax"}) &&
	          matchAsm(Stmts[1], {"bswap", "%edx"}) &&
	          matchAsm(Stmts[2], {"xchgl", "%eax,", "%edx"}))
	        return IntrinsicLowering::LowerToByteSwap(CI);
	    }
	  }

	  return false;
	}

	/// Classify an inline-asm constraint string for this target. Single-letter
	/// constraints and the two-letter "Yk" are handled here; everything else is
	/// deferred to the generic TargetLowering implementation.
	X86TargetLowering::ConstraintType
	X86TargetLowering::getConstraintType(StringRef Constraint) const {
	  // The only two-letter constraint classified here.
	  if (Constraint.size() == 2 && Constraint[0] == 'Y' && Constraint[1] == 'k')
	    return C_Register;

	  if (Constraint.size() == 1) {
	    switch (Constraint[0]) {
	    // Register classes.
	    case 'R': case 'q': case 'Q':
	    case 'f': case 't': case 'u':
	    case 'y': case 'x': case 'v':
	    case 'Y': case 'l':
	      return C_RegisterClass;

	    // Specific registers.
	    case 'k': // AVX512 masking registers.
	    case 'a': case 'b': case 'c': case 'd':
	    case 'S': case 'D': case 'A':
	      return C_Register;

	    // Immediates and other operands.
	    case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
	    case 'G': case 'C': case 'e': case 'Z':
	      return C_Other;

	    default:
	      break;
	    }
	  }

	  return TargetLowering::getConstraintType(Constraint);
	}

	/// Examine constraint type and operand type and determine a weight value.
	/// This object must already have been set up with the operand type
	/// and the current alternative constraint selected.
	///
	/// \p info supplies the call operand being matched and \p constraint points
	/// at the constraint text; only its first character (and, for 'Y', the
	/// second) is inspected here. Returns CW_Default when there is no operand
	/// value, otherwise the weight chosen by the switch below (CW_Invalid when
	/// the operand does not satisfy the constraint's condition).
	TargetLowering::ConstraintWeight
	X86TargetLowering::getSingleConstraintMatchWeight(
	AsmOperandInfo &info, const char *constraint) const {
	ConstraintWeight weight = CW_Invalid;
	Value *CallOperandVal = info.CallOperandVal;
	// If we don't have a value, we can't do a match,
	// but allow it at the lowest weight.
	if (!CallOperandVal)
	return CW_Default;
	Type *type = CallOperandVal->getType();
	// Look at the constraint type.
	switch (*constraint) {
	default:
	weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
	// NOTE(review): this falls through into the integer-register group, so for
	// an integer operand the generic weight computed above is replaced by
	// CW_SpecificReg - confirm this is intended.
	LLVM_FALLTHROUGH;
	case 'R':
	case 'q':
	case 'Q':
	case 'a':
	case 'b':
	case 'c':
	case 'd':
	case 'S':
	case 'D':
	case 'A':
	if (CallOperandVal->getType()->isIntegerTy())
	weight = CW_SpecificReg;
	break;
	case 'f':
	case 't':
	case 'u':
	if (type->isFloatingPointTy())
	weight = CW_SpecificReg;
	break;
	case 'y':
	if (type->isX86_MMXTy() && Subtarget.hasMMX())
	weight = CW_SpecificReg;
	break;
	case 'Y':
	// Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
	if (constraint[1] == 'k') {
	// Support for 'Yk' (similarly to the 'k' variant below).
	weight = CW_SpecificReg;
	break;
	}
	// Else fall through (handle "Y" constraint).
	LLVM_FALLTHROUGH;
	case 'v':
	// 512-bit operands need AVX512; narrower ones are checked by the 'x' case
	// that this falls into.
	if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
	weight = CW_Register;
	LLVM_FALLTHROUGH;
	case 'x':
	if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) \|\|
	((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
	weight = CW_Register;
	break;
	case 'k':
	// Enable conditional vector operations using %k<#> registers.
	weight = CW_SpecificReg;
	break;
	// The remaining cases accept constant operands within the range checked in
	// each case.
	case 'I':
	if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
	if (C->getZExtValue() <= 31)
	weight = CW_Constant;
	}
	break;
	case 'J':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 63)
	weight = CW_Constant;
	}
	break;
	case 'K':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
	weight = CW_Constant;
	}
	break;
	case 'L':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getZExtValue() == 0xff) \|\| (C->getZExtValue() == 0xffff))
	weight = CW_Constant;
	}
	break;
	case 'M':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 3)
	weight = CW_Constant;
	}
	break;
	case 'N':
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 0xff)
	weight = CW_Constant;
	}
	break;
	case 'G':
	case 'C':
	// Any floating-point constant is accepted.
	if (isa<ConstantFP>(CallOperandVal)) {
	weight = CW_Constant;
	}
	break;
	case 'e':
	// Value must fit in a signed 32-bit integer.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if ((C->getSExtValue() >= -0x80000000LL) &&
	(C->getSExtValue() <= 0x7fffffffLL))
	weight = CW_Constant;
	}
	break;
	case 'Z':
	// Value must fit in an unsigned 32-bit integer.
	if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
	if (C->getZExtValue() <= 0xffffffff)
	weight = CW_Constant;
	}
	break;
	}
	return weight;
	}

	/// Replace an "X" constraint, which matches anything, with a more specific
	/// one chosen from the operand type. Floating-point operands get "Y" when
	/// SSE2 is available, otherwise "x" when SSE1 is available; every other
	/// case is deferred to the generic implementation.
	const char *X86TargetLowering::
	LowerXConstraint(EVT ConstraintVT) const {
	  const bool IsFP = ConstraintVT.isFloatingPoint();
	  if (IsFP && Subtarget.hasSSE2())
	    return "Y";
	  if (IsFP && Subtarget.hasSSE1())
	    return "x";

	  return TargetLowering::LowerXConstraint(ConstraintVT);
	}

	/// Lower the specified operand into the Ops vector.
	/// If it is invalid, don't add anything to Ops.
	void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
	std::string &Constraint,
	std::vector<SDValue>&Ops,
	SelectionDAG &DAG) const {
	SDValue Result;

	// Only support length 1 constraints for now.
	if (Constraint.length() > 1) return;

	char ConstraintLetter = Constraint[0];
	switch (ConstraintLetter) {
	default: break;
	case 'I':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 31) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'J':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 63) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'K':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (isInt<8>(C->getSExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'L':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() == 0xff \|\| C->getZExtValue() == 0xffff \|\|
	(Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'M':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 3) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'N':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 255) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'O':
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (C->getZExtValue() <= 127) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	return;
	case 'e': {
	// 32-bit signed value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getSExtValue())) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	}
	return;
	}
	case 'Z': {
	// 32-bit unsigned value
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
	if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
	C->getZExtValue())) {
	Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
	Op.getValueType());
	break;
	}
	}
	// FIXME gcc accepts some relocatable values here too, but only in certain
	// memory models; it's complicated.
	return;
	}
	case 'i': {
	// Literal immediates are always ok.
	if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
	// Widen to 64 bits here to get it sign extended.
	Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
	break;
	}

	// In any sort of PIC mode addresses need to be computed at runtime by
	// adding in a register or some sort of table lookup. These can't
	// be used as immediates.
	if (Subtarget.isPICStyleGOT() \|\| Subtarget.isPICStyleStubPIC())
	return;

	// If we are in non-pic codegen mode, we allow the address of a global (with
	// an optional displacement) to be used with 'i'.
	GlobalAddressSDNode *GA = nullptr;
	int64_t Offset = 0;

	// Match either (GA), (GA+C), (GA+C1+C2), etc.
	while (1) {
	if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
	Offset += GA->getOffset();
	break;
	} else if (Op.getOpcode() == ISD::ADD) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	} else if (Op.getOpcode() == ISD::SUB) {
	if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
	Offset += -C->getZExtValue();
	Op = Op.getOperand(0);
	continue;
	}
	}

	// Otherwise, this isn't something we can handle, reject it.
	return;
	}

	const GlobalValue *GV = GA->getGlobal();
	// If we require an extra load to get this address, as in PIC mode, we
	// can't accept it.
	if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
	return;

	Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
	GA->getValueType(0), Offset);
	break;
	}
	}

	if (Result.getNode()) {
	Ops.push_back(Result);
	return;
	}
	return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
	}

	/// Check if \p RC is a general purpose register class.
	/// I.e., GR* or one of their variant.
	static bool isGRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::GR8RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR16RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR32RegClass) \|\|
	RC.hasSuperClassEq(&X86::GR64RegClass) \|\|
	RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
	}

	/// Check if \p RC is a vector register class.
	/// I.e., FR* / VR* or one of their variant.
	static bool isFRClass(const TargetRegisterClass &RC) {
	return RC.hasSuperClassEq(&X86::FR32XRegClass) \|\|
	RC.hasSuperClassEq(&X86::FR64XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR128XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR256XRegClass) \|\|
	RC.hasSuperClassEq(&X86::VR512RegClass);
	}

	/// Translate an inline-asm register constraint into a
	/// (physical register, register class) pair for the value type \p VT.
	///
	/// The lookup proceeds in three stages:
	///  1. Single-letter constraints and the two-letter "Yk" constraint are
	///     mapped directly to a register class; the register number of the
	///     returned pair is 0 in these cases.
	///  2. Otherwise the generic TargetLowering implementation is consulted. If
	///     it finds nothing, a few X86-specific spellings are recognised:
	///     "{st(N)}", "{st}", "{flags}" and "A".
	///  3. If the generic code found a register whose class is not legal for
	///     \p VT, the register and/or class are adjusted to one matching the
	///     size/type of \p VT, or the result is cleared to (0, nullptr).
	///
	/// \param TRI        Register info used for the generic lookup and for the
	///                   type-legality checks.
	/// \param Constraint The constraint string to resolve.
	/// \param VT         The value type of the operand; MVT::Other denotes a
	///                   clobber.
	/// \returns The selected register (or 0) together with its register class
	///          (or nullptr when no match was found).
	std::pair<unsigned, const TargetRegisterClass *>
	X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
	StringRef Constraint,
	MVT VT) const {
	// First, see if this is a constraint that directly corresponds to an LLVM
	// register class.
	if (Constraint.size() == 1) {
	// GCC Constraint Letters
	switch (Constraint[0]) {
	default: break;
	// TODO: Slight differences here in allocation order and leaving
	// RIP in the class. Do they matter any more here than they do
	// in the normal allocation?
	case 'k':
	if (Subtarget.hasAVX512()) {
	// Only supported in AVX512 or later.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32RegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16RegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8RegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1RegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64RegClass);
	}
	}
	break;
	case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
	if (Subtarget.is64Bit()) {
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i64 \|\| VT == MVT::f64)
	return std::make_pair(0U, &X86::GR64RegClass);
	// Unhandled type in 64-bit mode: fall back to the generic lookup below.
	break;
	}
	LLVM_FALLTHROUGH;
	// 32-bit fallthrough
	case 'Q': // Q_REGS
	if (VT == MVT::i32 \|\| VT == MVT::f32)
	return std::make_pair(0U, &X86::GR32_ABCDRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_ABCDRegClass);
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
	if (VT == MVT::i64)
	return std::make_pair(0U, &X86::GR64_ABCDRegClass);
	break;
	case 'r': // GENERAL_REGS
	case 'l': // INDEX_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8RegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16RegClass);
	// On non-64-bit subtargets every remaining type gets a 32-bit register.
	if (VT == MVT::i32 \|\| VT == MVT::f32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32RegClass);
	return std::make_pair(0U, &X86::GR64RegClass);
	case 'R': // LEGACY_REGS
	if (VT == MVT::i8 \|\| VT == MVT::i1)
	return std::make_pair(0U, &X86::GR8_NOREXRegClass);
	if (VT == MVT::i16)
	return std::make_pair(0U, &X86::GR16_NOREXRegClass);
	if (VT == MVT::i32 \|\| !Subtarget.is64Bit())
	return std::make_pair(0U, &X86::GR32_NOREXRegClass);
	return std::make_pair(0U, &X86::GR64_NOREXRegClass);
	case 'f': // FP Stack registers.
	// If SSE is enabled for this VT, use f80 to ensure the isel moves the
	// value to the correct fpstack register class.
	if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP32RegClass);
	if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
	return std::make_pair(0U, &X86::RFP64RegClass);
	return std::make_pair(0U, &X86::RFP80RegClass);
	case 'y': // MMX_REGS if MMX allowed.
	if (!Subtarget.hasMMX()) break;
	return std::make_pair(0U, &X86::VR64RegClass);
	case 'Y': // SSE_REGS if SSE2 allowed
	if (!Subtarget.hasSSE2()) break;
	LLVM_FALLTHROUGH;
	case 'v':
	case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
	if (!Subtarget.hasSSE1()) break;
	// Only the 'v' constraint may select the extended (X-suffixed) classes.
	bool VConstraint = (Constraint[0] == 'v');

	switch (VT.SimpleTy) {
	default: break;
	// Scalar SSE types.
	case MVT::f32:
	case MVT::i32:
	if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR32XRegClass);
	return std::make_pair(0U, &X86::FR32RegClass);
	case MVT::f64:
	case MVT::i64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::FR64XRegClass);
	return std::make_pair(0U, &X86::FR64RegClass);
	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	// Vector types.
	case MVT::v16i8:
	case MVT::v8i16:
	case MVT::v4i32:
	case MVT::v2i64:
	case MVT::v4f32:
	case MVT::v2f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR128XRegClass);
	return std::make_pair(0U, &X86::VR128RegClass);
	// AVX types.
	case MVT::v32i8:
	case MVT::v16i16:
	case MVT::v8i32:
	case MVT::v4i64:
	case MVT::v8f32:
	case MVT::v4f64:
	if (VConstraint && Subtarget.hasVLX())
	return std::make_pair(0U, &X86::VR256XRegClass);
	return std::make_pair(0U, &X86::VR256RegClass);
	// 512-bit types always use VR512, regardless of the constraint letter.
	case MVT::v8f64:
	case MVT::v16f32:
	case MVT::v16i32:
	case MVT::v8i64:
	return std::make_pair(0U, &X86::VR512RegClass);
	}
	break;
	}
	} else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
	// Two-letter constraints starting with 'Y'; only "Yk" is handled here.
	switch (Constraint[1]) {
	default:
	break;
	case 'k':
	// This register class doesn't allocate k0 for masked vector operation.
	if (Subtarget.hasAVX512()) { // Only supported in AVX512.
	switch (VT.SimpleTy) {
	default: break;
	case MVT::i32:
	return std::make_pair(0U, &X86::VK32WMRegClass);
	case MVT::i16:
	return std::make_pair(0U, &X86::VK16WMRegClass);
	case MVT::i8:
	return std::make_pair(0U, &X86::VK8WMRegClass);
	case MVT::i1:
	return std::make_pair(0U, &X86::VK1WMRegClass);
	case MVT::i64:
	return std::make_pair(0U, &X86::VK64WMRegClass);
	}
	}
	break;
	}
	}

	// Use the default implementation in TargetLowering to convert the register
	// constraint into a member of a register class.
	std::pair<unsigned, const TargetRegisterClass*> Res;
	Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

	// Not found as a standard register?
	if (!Res.second) {
	// Map "{st(0)}" .. "{st(7)}" (case-insensitive "st") to FP0 .. FP7.
	// NOTE(review): tolower() takes an int; if char is signed, a Constraint
	// byte with the high bit set is passed as a negative value. Consider
	// casting to unsigned char first.
	if (Constraint.size() == 7 && Constraint[0] == '{' &&
	tolower(Constraint[1]) == 's' &&
	tolower(Constraint[2]) == 't' &&
	Constraint[3] == '(' &&
	(Constraint[4] >= '0' && Constraint[4] <= '7') &&
	Constraint[5] == ')' &&
	Constraint[6] == '}') {

	// The digit selects the register as an offset from FP0.
	Res.first = X86::FP0+Constraint[4]-'0';
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// GCC allows "st(0)" to be called just plain "st".
	if (StringRef("{st}").equals_lower(Constraint)) {
	Res.first = X86::FP0;
	Res.second = &X86::RFP80RegClass;
	return Res;
	}

	// flags -> EFLAGS
	if (StringRef("{flags}").equals_lower(Constraint)) {
	Res.first = X86::EFLAGS;
	Res.second = &X86::CCRRegClass;
	return Res;
	}

	// 'A' means [ER]AX + [ER]DX.
	if (Constraint == "A") {
	if (Subtarget.is64Bit()) {
	Res.first = X86::RAX;
	Res.second = &X86::GR64_ADRegClass;
	} else {
	assert((Subtarget.is32Bit() \|\| Subtarget.is16Bit()) &&
	"Expecting 64, 32 or 16 bit subtarget");
	Res.first = X86::EAX;
	Res.second = &X86::GR32_ADRegClass;
	}
	return Res;
	}
	// Nothing matched: return the empty result from the generic lookup.
	return Res;
	}

	// Otherwise, check to see if this is a register class of the wrong value
	// type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
	// turn into {ax},{dx}.
	// MVT::Other is used to specify clobber names.
	if (TRI->isTypeLegalForClass(*Res.second, VT) \|\| VT == MVT::Other)
	return Res; // Correct type already, nothing to do.

	// Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
	// return "eax". This should even work for things like getting 64bit integer
	// registers when given an f64 type.
	const TargetRegisterClass *Class = Res.second;
	// The generic code will match the first register class that contains the
	// given register. Thus, based on the ordering of the tablegened file,
	// the "plain" GR classes might not come first.
	// Therefore, use a helper method.
	if (isGRClass(*Class)) {
	unsigned Size = VT.getSizeInBits();
	// A 1-bit value is held in an 8-bit register.
	if (Size == 1) Size = 8;
	unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
	if (DestReg > 0) {
	Res.first = DestReg;
	// Any size other than 8/16/32 selects GR64; the assert below checks that
	// the chosen class really contains the register.
	Res.second = Size == 8 ? &X86::GR8RegClass
	: Size == 16 ? &X86::GR16RegClass
	: Size == 32 ? &X86::GR32RegClass
	: &X86::GR64RegClass;
	assert(Res.second->contains(Res.first) && "Register in register class");
	} else {
	// No register found/type mismatch.
	Res.first = 0;
	Res.second = nullptr;
	}
	} else if (isFRClass(*Class)) {
	// Handle references to XMM physical registers that got mapped into the
	// wrong class. This can happen with constraints like {xmm0} where the
	// target independent register mapper will just pick the first match it can
	// find, ignoring the required type.

	// Only the register class is replaced here; Res.first is kept as-is unless
	// no class fits.
	// TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
	if (VT == MVT::f32 \|\| VT == MVT::i32)
	Res.second = &X86::FR32RegClass;
	else if (VT == MVT::f64 \|\| VT == MVT::i64)
	Res.second = &X86::FR64RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
	Res.second = &X86::VR128RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
	Res.second = &X86::VR256RegClass;
	else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
	Res.second = &X86::VR512RegClass;
	else {
	// Type mismatch and not a clobber: Return an error;
	Res.first = 0;
	Res.second = nullptr;
	}
	}

	// Classes that are neither GR nor FR are returned unchanged.
	return Res;
	}

	/// Return the cost of using the addressing mode \p AM for an access of type
	/// \p Ty in address space \p AS: 0 when no scaled register is involved, 1
	/// when one is, and -1 when the addressing mode is not legal at all.
	int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
	                                            const AddrMode &AM, Type *Ty,
	                                            unsigned AS) const {
	  // An illegal addressing mode has no meaningful cost.
	  if (!isLegalAddressingMode(DL, AM, Ty, AS))
	    return -1;

	  // Scaling factors are not free. A folded indexed access such as
	  //   vaddps (%rsi,%rdx), %ymm0, %ymm1
	  // needs two allocations in the out-of-order engine (one for the load, one
	  // for the computation), whereas the plain form
	  //   vaddps (%rsi), %ymm0, %ymm1
	  // needs only one, leaving allocations free for other operations and
	  // executing fewer micro-operations.
	  //
	  // On some X86 architectures it is worse still: for stores, the complex
	  // addressing mode forces the instruction onto the "load" ports instead of
	  // the dedicated "store" port. E.g., on Haswell:
	  //   vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
	  //   vmovaps %ymm1, (%r8)       can use port 2, 3, or 7.
	  //
	  // Scale stands for reg2 * scale, so charge 1 as soon as a second register
	  // takes part in the address.
	  const bool UsesScaledRegister = AM.Scale != 0;
	  return UsesScaledRegister ? 1 : 0;
	}

	/// Report whether an integer division of type \p VT should be kept as a div
	/// instruction rather than expanded into an alternative sequence.
	bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
	  // x86 has no vector integer division: leaving a vector division in place
	  // is a loss even for size, because it has to be scalarized while the
	  // alternative sequence can stay in vector form.
	  if (VT.isVector())
	    return false;

	  // Scalar integer division is expensive, but when aggressively optimizing
	  // for code size (MinSize) the div instruction is usually smaller than the
	  // alternative sequence, so prefer it.
	  return Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
	}

	/// Mark the function containing \p Entry as using split callee-saved
	/// registers. Only done on 64-bit subtargets; otherwise this is a no-op.
	void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
	  if (Subtarget.is64Bit()) {
	    // Record IsSplitCSR in the function's X86MachineFunctionInfo.
	    auto *FuncInfo = Entry->getParent()->getInfo<X86MachineFunctionInfo>();
	    FuncInfo->setIsSplitCSR(true);
	  }
	}

	/// For every callee-saved register that is handled via copy, save it into a
	/// fresh virtual register at the top of \p Entry and restore it right before
	/// the terminator of each block in \p Exits.
	///
	/// Does nothing when the register info reports no such registers for the
	/// function (null list).
	///
	/// \param Entry The entry block; receives the live-ins and the saving copies.
	/// \param Exits The exit blocks; each receives the restoring copies.
	void X86TargetLowering::insertCopiesSplitCSR(
	    MachineBasicBlock *Entry,
	    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
	  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
	  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
	  if (!IStart)
	    return;

	  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
	  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
	  MachineBasicBlock::iterator MBBI = Entry->begin();
	  // Walk the register list through a pointer and stop at its terminating
	  // zero entry. The iterator must be a pointer: the body dereferences it to
	  // obtain each physical register.
	  for (const MCPhysReg *I = IStart; *I; ++I) {
	    const TargetRegisterClass *RC = nullptr;
	    if (X86::GR64RegClass.contains(*I))
	      RC = &X86::GR64RegClass;
	    else
	      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

	    unsigned NewVR = MRI->createVirtualRegister(RC);
	    // Create copy from CSR to a virtual register.
	    // FIXME: this currently does not emit CFI pseudo-instructions, it works
	    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
	    // nounwind. If we want to generalize this later, we may need to emit
	    // CFI pseudo-instructions.
	    assert(Entry->getParent()->getFunction()->hasFnAttribute(
	               Attribute::NoUnwind) &&
	           "Function should be nounwind in insertCopiesSplitCSR!");
	    Entry->addLiveIn(*I);
	    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
	        .addReg(*I);

	    // Insert the copy-back instructions right before the terminator.
	    for (auto *Exit : Exits)
	      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
	              TII->get(TargetOpcode::COPY), *I)
	          .addReg(NewVR);
	  }
	}

	/// Swift error support is reported only for 64-bit subtargets.
	bool X86TargetLowering::supportSwiftError() const {
	  const bool Is64BitTarget = Subtarget.is64Bit();
	  return Is64BitTarget;
	}

	/// Pick the symbol to call for stack probing in \p MF.
	///
	/// \returns the value of the function's "probe-stack" attribute when it is
	/// present; otherwise the Windows stack-probe symbol matching the subtarget,
	/// or the empty string when no probe applies.
	StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
	  // An explicit request on the function always wins.
	  const auto *Fn = MF.getFunction();
	  if (Fn->hasFnAttribute("probe-stack"))
	    return Fn->getFnAttribute("probe-stack").getValueAsString();

	  // Outside of Windows (and for MachO targets) the platform ABI does not
	  // include stack probes, so none are emitted.
	  const bool WantsWindowsProbe =
	      Subtarget.isOSWindows() && !Subtarget.isTargetMachO();
	  if (!WantsWindowsProbe)
	    return "";

	  // The Windows ABI requires a probe; the symbol depends on the pointer
	  // width and on whether this is a Cygwin/MinGW environment.
	  const bool IsCygMing = Subtarget.isTargetCygMing();
	  if (Subtarget.is64Bit())
	    return IsCygMing ? "___chkstk_ms" : "__chkstk";
	  return IsCygMing ? "_alloca" : "_chkstk";
	}
	Index: head/contrib/llvm/lib/Target/X86/X86InstrSSE.td
	===================================================================
	--- head/contrib/llvm/lib/Target/X86/X86InstrSSE.td (revision 322319)
	+++ head/contrib/llvm/lib/Target/X86/X86InstrSSE.td (revision 322320)
	@@ -1,8711 +1,8710 @@
	//===-- X86InstrSSE.td - SSE Instruction Set ---------------*- tablegen -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file describes the X86 SSE instruction set, defining the instructions,
	// and properties of the instructions which are needed for code generation,
	// machine code emission, and analysis.
	//
	//===----------------------------------------------------------------------===//

	// Bundles the two itinerary classes of an instruction pair: `rr` is used by
	// the register-source (MRMSrcReg) definitions and `rm` by the memory-source
	// (MRMSrcMem) definitions of the multiclasses in this file.
	class OpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm> {
	InstrItinClass rr = arg_rr;
	InstrItinClass rm = arg_rm;
	// InstrSchedModel info.
	// Defaults to WriteFAdd; records override it with `let Sched = ... in`.
	X86FoldableSchedWrite Sched = WriteFAdd;
	}

	// Groups two OpndItins records. In every instantiation visible in this part
	// of the file, `s` receives the F32 record and `d` the F64 record.
	class SizeItins<OpndItins arg_s, OpndItins arg_d> {
	OpndItins s = arg_s;
	OpndItins d = arg_d;
	}


	// Like OpndItins, but with a third itinerary class `ri` and no Sched field.
	// NOTE(review): `ri` is presumably the register/immediate form - confirm
	// against the shift instruction definitions that consume it.
	class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
	InstrItinClass arg_ri> {
	InstrItinClass rr = arg_rr;
	InstrItinClass rm = arg_rm;
	InstrItinClass ri = arg_ri;
	}

	// Scalar FP itineraries: one F32S and one F64S OpndItins record per
	// operation (ALU, MUL, DIV), each pair then grouped into a SizeItins.
	// The enclosing `let Sched = ... in { }` sets the Sched field of both
	// records in the braces.
	let Sched = WriteFAdd in {
	def SSE_ALU_F32S : OpndItins<
	IIC_SSE_ALU_F32S_RR, IIC_SSE_ALU_F32S_RM
	>;

	def SSE_ALU_F64S : OpndItins<
	IIC_SSE_ALU_F64S_RR, IIC_SSE_ALU_F64S_RM
	>;
	}

	def SSE_ALU_ITINS_S : SizeItins<
	SSE_ALU_F32S, SSE_ALU_F64S
	>;

	// NOTE(review): SSE_MUL_F32S pairs the F32S register itinerary with the
	// F64S memory itinerary (IIC_SSE_MUL_F64S_RM), unlike SSE_ALU_F32S above,
	// which uses F32S for both. Possibly a copy/paste slip - confirm whether
	// IIC_SSE_MUL_F32S_RM was intended.
	let Sched = WriteFMul in {
	def SSE_MUL_F32S : OpndItins<
	IIC_SSE_MUL_F32S_RR, IIC_SSE_MUL_F64S_RM
	>;

	def SSE_MUL_F64S : OpndItins<
	IIC_SSE_MUL_F64S_RR, IIC_SSE_MUL_F64S_RM
	>;
	}

	def SSE_MUL_ITINS_S : SizeItins<
	SSE_MUL_F32S, SSE_MUL_F64S
	>;

	// NOTE(review): same mismatch as for MUL - SSE_DIV_F32S uses
	// IIC_SSE_DIV_F64S_RM for its memory form; confirm the intent.
	let Sched = WriteFDiv in {
	def SSE_DIV_F32S : OpndItins<
	IIC_SSE_DIV_F32S_RR, IIC_SSE_DIV_F64S_RM
	>;

	def SSE_DIV_F64S : OpndItins<
	IIC_SSE_DIV_F64S_RR, IIC_SSE_DIV_F64S_RM
	>;
	}

	def SSE_DIV_ITINS_S : SizeItins<
	SSE_DIV_F32S, SSE_DIV_F64S
	>;

	// Packed ("parallel") FP itineraries: one F32P and one F64P OpndItins record
	// per operation (ALU, MUL, DIV), each pair then grouped into a SizeItins.
	let Sched = WriteFAdd in {
	def SSE_ALU_F32P : OpndItins<
	IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
	>;

	def SSE_ALU_F64P : OpndItins<
	IIC_SSE_ALU_F64P_RR, IIC_SSE_ALU_F64P_RM
	>;
	}

	def SSE_ALU_ITINS_P : SizeItins<
	SSE_ALU_F32P, SSE_ALU_F64P
	>;

	// NOTE(review): SSE_MUL_F32P uses the F64P memory itinerary
	// (IIC_SSE_MUL_F64P_RM), unlike SSE_ALU_F32P above, which uses F32P for
	// both. Possibly a copy/paste slip - confirm whether IIC_SSE_MUL_F32P_RM
	// was intended.
	let Sched = WriteFMul in {
	def SSE_MUL_F32P : OpndItins<
	IIC_SSE_MUL_F32P_RR, IIC_SSE_MUL_F64P_RM
	>;

	def SSE_MUL_F64P : OpndItins<
	IIC_SSE_MUL_F64P_RR, IIC_SSE_MUL_F64P_RM
	>;
	}

	def SSE_MUL_ITINS_P : SizeItins<
	SSE_MUL_F32P, SSE_MUL_F64P
	>;

	// NOTE(review): same mismatch as for MUL - SSE_DIV_F32P uses
	// IIC_SSE_DIV_F64P_RM for its memory form; confirm the intent.
	let Sched = WriteFDiv in {
	def SSE_DIV_F32P : OpndItins<
	IIC_SSE_DIV_F32P_RR, IIC_SSE_DIV_F64P_RM
	>;

	def SSE_DIV_F64P : OpndItins<
	IIC_SSE_DIV_F64P_RR, IIC_SSE_DIV_F64P_RM
	>;
	}

	def SSE_DIV_ITINS_P : SizeItins<
	SSE_DIV_F32P, SSE_DIV_F64P
	>;

	// Itinerary records for logic, integer, move and miscellaneous SSE
	// operations. A `let Sched = ... in` without braces applies only to the
	// single definition that follows it; records with no `let` keep the class
	// default Sched (WriteFAdd).

	// Same itinerary classes as SSE_BIT_ITINS_P below, but scheduled as
	// WriteVecLogic.
	let Sched = WriteVecLogic in
	def SSE_VEC_BIT_ITINS_P : OpndItins<
	IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
	>;

	def SSE_BIT_ITINS_P : OpndItins<
	IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
	>;

	let Sched = WriteVecALU in {
	def SSE_INTALU_ITINS_P : OpndItins<
	IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
	>;

	def SSE_INTALUQ_ITINS_P : OpndItins<
	IIC_SSE_INTALUQ_P_RR, IIC_SSE_INTALUQ_P_RM
	>;
	}

	let Sched = WriteVecIMul in
	def SSE_INTMUL_ITINS_P : OpndItins<
	IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
	>;

	// The only ShiftOpndItins record here: it carries a third (RI) itinerary.
	def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
	IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
	>;

	def SSE_MOVA_ITINS : OpndItins<
	IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
	>;

	def SSE_MOVU_ITINS : OpndItins<
	IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
	>;

	def SSE_DPPD_ITINS : OpndItins<
	IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
	>;

	// NOTE(review): the memory itinerary here is IIC_SSE_DPPD_RM (DPPD, not
	// DPPS). Possibly a copy/paste slip - confirm whether IIC_SSE_DPPS_RM was
	// intended.
	def SSE_DPPS_ITINS : OpndItins<
	IIC_SSE_DPPS_RR, IIC_SSE_DPPD_RM
	>;

	def DEFAULT_ITINS : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	def SSE_EXTRACT_ITINS : OpndItins<
	IIC_SSE_EXTRACTPS_RR, IIC_SSE_EXTRACTPS_RM
	>;

	def SSE_INSERT_ITINS : OpndItins<
	IIC_SSE_INSERTPS_RR, IIC_SSE_INSERTPS_RM
	>;

	let Sched = WriteMPSAD in
	def SSE_MPSADBW_ITINS : OpndItins<
	IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
	>;

	let Sched = WriteVecIMul in
	def SSE_PMULLD_ITINS : OpndItins<
	IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
	>;

	// Definitions for backward compatibility.
	// The instructions mapped onto these definitions use a different itinerary
	// than the actual scheduling model.
	// Each record below reuses either the generic ALU itinerary classes
	// (IIC_ALU_NONMEM / IIC_ALU_MEM) or the SSE integer-ALU ones and differs
	// from its siblings only in the Sched value set by the preceding `let`.
	let Sched = WriteShuffle in
	def DEFAULT_ITINS_SHUFFLESCHED : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	let Sched = WriteVecIMul in
	def DEFAULT_ITINS_VECIMULSCHED : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	let Sched = WriteShuffle in
	def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
	IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
	>;

	let Sched = WriteMPSAD in
	def DEFAULT_ITINS_MPSADSCHED : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	let Sched = WriteFBlend in
	def DEFAULT_ITINS_FBLENDSCHED : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	let Sched = WriteBlend in
	def DEFAULT_ITINS_BLENDSCHED : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	let Sched = WriteVarBlend in
	def DEFAULT_ITINS_VARBLENDSCHED : OpndItins<
	IIC_ALU_NONMEM, IIC_ALU_MEM
	>;

	let Sched = WriteFBlend in
	def SSE_INTALU_ITINS_FBLEND_P : OpndItins<
	IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
	>;

	let Sched = WriteBlend in
	def SSE_INTALU_ITINS_BLEND_P : OpndItins<
	IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
	>;

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 Instructions Classes
	//===----------------------------------------------------------------------===//

	/// sse12_fp_scalar - SSE 1 & 2 scalar instructions class
	///
	/// Defines two instructions for opcode `opc`:
	///  - `rr`: both sources in registers of class RC; marked commutable.
	///  - `rm`: second source loaded from memory through `x86memop`.
	/// Is2Addr selects the asm string: the two-operand form "$src2, $dst" when
	/// set (the default), otherwise the three-operand form
	/// "$src2, $src1, $dst". `itins` supplies the rr/rm itinerary classes and
	/// the scheduling write; the memory form uses its Folded variant together
	/// with ReadAfterLd.
	multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
	RegisterClass RC, X86MemOperand x86memop,
	Domain d, OpndItins itins, bit Is2Addr = 1> {
	let isCommutable = 1 in {
	def rr : SI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpNode RC:$src1, RC:$src2))], itins.rr, d>,
	Sched<[itins.Sched]>;
	}
	def rm : SI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpNode RC:$src1, (load addr:$src2)))], itins.rm, d>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	/// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
	///
	/// Defines `rr_Int` and `rm_Int`, both marked isCodeGenOnly with
	/// hasSideEffects = 0. The result of OpNode is typed as `VT`. The memory
	/// form takes its second source through `memopr`, matches it with the
	/// complex pattern `mem_cpat`, and is additionally marked mayLoad.
	/// Is2Addr selects between the two-operand (default) and three-operand asm
	/// strings built from `asm`.
	multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
	SDPatternOperator OpNode, RegisterClass RC,
	ValueType VT, string asm, Operand memopr,
	ComplexPattern mem_cpat, Domain d,
	OpndItins itins, bit Is2Addr = 1> {
	let isCodeGenOnly = 1, hasSideEffects = 0 in {
	def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(asm, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
	Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(asm, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	}

	/// sse12_fp_packed - SSE 1 & 2 packed instructions class
	///
	/// Defines `rr` (commutable, both sources in RC) and `rm` (mayLoad, second
	/// source read from memory via `x86memop` and matched with `mem_frag`).
	/// The register pattern types its result as `vt`. Is2Addr selects between
	/// the two-operand (default) and three-operand asm strings.
	multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
	RegisterClass RC, ValueType vt,
	X86MemOperand x86memop, PatFrag mem_frag,
	Domain d, OpndItins itins, bit Is2Addr = 1> {
	let isCommutable = 1 in
	def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
	Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpNode RC:$src1, (mem_frag addr:$src2)))],
	itins.rm, d>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	/// sse12_fp_packed_logical_rm - SSE 1 & 2 packed instructions class
	///
	/// Unlike sse12_fp_packed, the selection patterns are passed in by the
	/// caller (`pat_rr` for the register form, `pat_rm` for the memory form),
	/// no itinerary is attached (NoItinerary), and the scheduling writes are
	/// fixed to WriteVecLogic / WriteVecLogicLd. The `let` on the first line
	/// has no braces, so isCommutable and hasSideEffects = 0 apply to `rr`
	/// only.
	multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
	string OpcodeStr, X86MemOperand x86memop,
	list<dag> pat_rr, list<dag> pat_rm,
	bit Is2Addr = 1> {
	let isCommutable = 1, hasSideEffects = 0 in
	def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	pat_rr, NoItinerary, d>,
	Sched<[WriteVecLogic]>;
	def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	pat_rm, NoItinerary, d>,
	Sched<[WriteVecLogicLd, ReadAfterLd]>;
	}

	//===----------------------------------------------------------------------===//
	// Non-instruction patterns
	//===----------------------------------------------------------------------===//

	// A vector extract of the first f32/f64 position is a subregister copy.
	// Element 0 of a VR128 value is selected as a COPY_TO_REGCLASS into
	// FR32 / FR64.
	def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
	(COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
	def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
	(COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;

	// A 128-bit subvector extract from the first 256-bit vector position
	// is a subregister copy that needs no instruction.
	// One pattern per 256-bit element type; each selects EXTRACT_SUBREG with
	// the sub_xmm index.
	def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
	(v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
	def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
	(v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;

	def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
	(v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
	def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
	(v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;

	def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
	(v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
	def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
	(v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;

	// A 128-bit subvector insert to the first 256-bit vector position
	// is a subregister copy that needs no instruction.
	// Only inserts into an undef vector are matched; the result is an
	// INSERT_SUBREG into an IMPLICIT_DEF of the matching 256-bit type.
	let AddedComplexity = 25 in { // to give priority over vinsertf128rm
	def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
	(INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
	def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
	(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
	def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
	def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
	(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
	def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
	(INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
	def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
	(INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
	}

	// Implicitly promote a 32-bit scalar to a vector.
	def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
	(COPY_TO_REGCLASS FR32:$src, VR128)>;
	// Implicitly promote a 64-bit scalar to a vector.
	def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
	(COPY_TO_REGCLASS FR64:$src, VR128)>;

	// Bitcasts between 128-bit vector types. Return the original type since
	// no instruction is needed for the conversion.
	// The list holds all 30 ordered pairs among v2i64, v4i32, v8i16, v16i8,
	// v4f32 and v2f64, grouped by destination type, followed by the
	// f128 <-> i128 pair on FR128.
	def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
	def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
	def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
	def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
	def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
	def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
	def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
	// Scalar 128-bit types use the FR128 register class.
	def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
	def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;

	// Bitcasts between 256-bit vector types. Return the original type since
	// no instruction is needed for the conversion.
	// The list holds all 30 ordered pairs among v4i64, v8i32, v16i16, v32i8,
	// v8f32 and v4f64, grouped by destination type.
	def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
	def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
	def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
	def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
	def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
	def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
	def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
	def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
	def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
	def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
	def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
	def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
	def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
	def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
	def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
	def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
	def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
	def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
	def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
	def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
	def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
	def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
	def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
	def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
	def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
	def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
	def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
	def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
	def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
	def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;

	// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
	// This is expanded by ExpandPostRAPseudos.
	// Both pseudos take no inputs: FsFLD0SS sets an FR32 to fp32imm0 and
	// FsFLD0SD sets an FR64 to fpimm0. They are marked rematerializable, as
	// cheap as a move, and foldable as a load, and require NoAVX512.
	let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
	isPseudo = 1, SchedRW = [WriteZero] in {
	def FsFLD0SS : I<0, Pseudo, (outs FR32:$dst), (ins), "",
	[(set FR32:$dst, fp32imm0)]>, Requires<[HasSSE1, NoAVX512]>;
	def FsFLD0SD : I<0, Pseudo, (outs FR64:$dst), (ins), "",
	[(set FR64:$dst, fpimm0)]>, Requires<[HasSSE2, NoAVX512]>;
	}

	//===----------------------------------------------------------------------===//
	// AVX & SSE - Zero/One Vectors
	//===----------------------------------------------------------------------===//

	// Alias instruction that maps zero vector to pxor / xorp* for sse.
	// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
	// swizzled by ExecutionDepsFix to pxor.
	// We set canFoldAsLoad because this can be converted to a constant-pool
	// load of an all-zeros value if folding it would be beneficial.
	// The pseudo itself carries the v4f32 all-zeros pattern and has no
	// predicate of its own.
	let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
	isPseudo = 1, SchedRW = [WriteZero] in {
	def V_SET0 : I<0, Pseudo, (outs VR128:$dst), (ins), "",
	[(set VR128:$dst, (v4f32 immAllZerosV))]>;
	}

	// The v4i32 all-zeros vector also selects V_SET0, but only without AVX-512.
	let Predicates = [NoAVX512] in
	def : Pat<(v4i32 immAllZerosV), (V_SET0)>;


	// The same as done above but for AVX. The 256-bit AVX1 ISA doesn't support PI,
	// and doesn't need it because on sandy bridge the register is set to zero
	// at the rename stage without using any execution unit, so SET0PSY
	// and SET0PDY can be used for vector int instructions without penalty
	// AVX_SET0 is the 256-bit (VR256) zero pseudo; it matches the v8i32 all-zeros
	// vector and is predicated on NoAVX512.
	let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
	isPseudo = 1, Predicates = [NoAVX512], SchedRW = [WriteZero] in {
	def AVX_SET0 : I<0, Pseudo, (outs VR256:$dst), (ins), "",
	[(set VR256:$dst, (v8i32 immAllZerosV))]>;
	}

	// All-ones vector pseudos.
	// We set canFoldAsLoad because this can be converted to a constant-pool
	// load of an all-ones value if folding it would be beneficial.
	//   V_SETALLONES    - 128-bit (v4i32), no predicate of its own.
	//   AVX1_SETALLONES - 256-bit (v8i32), only with HasAVX1Only and
	//                     OptForMinSize.
	//   AVX2_SETALLONES - 256-bit (v8i32), with HasAVX2.
	let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
	isPseudo = 1, SchedRW = [WriteZero] in {
	def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "",
	[(set VR128:$dst, (v4i32 immAllOnesV))]>;
	let Predicates = [HasAVX1Only, OptForMinSize] in {
	def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "",
	[(set VR256:$dst, (v8i32 immAllOnesV))]>;
	}
	let Predicates = [HasAVX2] in
	def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "",
	[(set VR256:$dst, (v8i32 immAllOnesV))]>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Move FP Scalar Instructions
	//
	// Move Instructions. Register-to-register movss/movsd is not used for FR32/64
	// register copies because it's a partial register update; Register-to-register
	// movss/movsd is not modeled as an INSERT_SUBREG because INSERT_SUBREG requires
	// that the insert be implementable in terms of a copy, and as just mentioned,
	// we don't use movss/movsd for copies.
	//===----------------------------------------------------------------------===//

	// Register-register forms of a scalar move.
	//   rr     - opcode 0x10, MRMSrcReg, commutable; selected for
	//            (OpNode VR128:$src1, (scalar_to_vector RC:$src2)) of type vt.
	//   rr_REV - opcode 0x11, MRMDestReg; codegen-only with no pattern, kept for
	//            the disassembler and linked to Name#rr through FoldGenData.
	// The asm string is base_opc followed by asm_opr. x86memop is not referenced
	// inside this multiclass.
	// NOTE(review): d has a default but the following Name parameter does not;
	// the instantiations visible in this file pass d explicitly.
	multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
	X86MemOperand x86memop, string base_opc,
	string asm_opr, Domain d = GenericDomain,
	string Name> {
	let isCommutable = 1 in
	def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, RC:$src2),
	!strconcat(base_opc, asm_opr),
	[(set VR128:$dst, (vt (OpNode VR128:$src1,
	(scalar_to_vector RC:$src2))))],
	IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;

	// For the disassembler
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
	def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
	(ins VR128:$src1, RC:$src2),
	!strconcat(base_opc, asm_opr),
	[], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
	FoldGenData<Name#rr>;
	}

	// Instantiates a scalar move in both encodings:
	//   V#NAME (via sse12_move_rr) and V#NAME#mr - VEX-encoded three-operand
	//     register forms plus a store of RC:$src to x86memop:$dst (opcode 0x11).
	//   NAME (via sse12_move_rr) and NAME#mr     - non-VEX forms with $src1 tied
	//     to $dst, plus the same store.
	// Name is forwarded (with a "V" prefix for the VEX variant) to
	// sse12_move_rr for its FoldGenData link.
	multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
	X86MemOperand x86memop, string OpcodeStr,
	Domain d = GenericDomain, string Name> {
	// AVX
	defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}", d,
	"V"#Name>,
	VEX_4V, VEX_LIG, VEX_WIG;

	def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
	VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
	// SSE1 & 2
	let Constraints = "$src1 = $dst" in {
	defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
	"\t{$src2, $dst\|$dst, $src2}", d, Name>;
	}

	def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
	Sched<[WriteStore]>;
	}

	// Loading from memory automatically zeroing upper bits.
	// Defines the VEX-encoded V#NAME#rm and the non-VEX NAME#rm (opcode 0x10,
	// MRMSrcMem); both load through mem_pat from x86memop:$src into RC:$dst.
	multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
	PatFrag mem_pat, string OpcodeStr,
	Domain d = GenericDomain> {
	def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (mem_pat addr:$src))],
	IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG;
	def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (mem_pat addr:$src))],
	IIC_SSE_MOV_S_RM, d>, Sched<[WriteLoad]>;
	}

	// Register and store forms of movss (FR32, XS prefix) and movsd (FR64, XD
	// prefix); each defm also produces the V-prefixed VEX variants.
	defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss",
	SSEPackedSingle, "MOVSS">, XS;
	defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
	SSEPackedDouble, "MOVSD">, XD;

	// Load forms, marked foldable as a load and rematerializable.
	let canFoldAsLoad = 1, isReMaterializable = 1 in {
	defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss",
	SSEPackedSingle>, XS;

	// Only the movsd load gets the extra AddedComplexity.
	let AddedComplexity = 20 in
	defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd",
	SSEPackedDouble>, XD;
	}

	// Patterns
	// AVX selection patterns for vmovss/vmovsd: zero-extending loads, extract
	// and store, and the X86Movss / X86Movsd / X86Movlpd / X86Movlps shuffles.
	let Predicates = [UseAVX] in {
	let AddedComplexity = 20 in {
	// MOVSSrm zeros the high parts of the register, so the FR32 result of
	// VMOVSSrm is just retyped to VR128 with COPY_TO_REGCLASS.
	// The AVX versions also write: DST[255:128] <- 0
	def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
	(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
	def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
	(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
	def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
	(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;
	def : Pat<(v4f32 (X86vzload addr:$src)),
	(COPY_TO_REGCLASS (VMOVSSrm addr:$src), VR128)>;

	// MOVSDrm zeros the high parts of the register, so the FR64 result of
	// VMOVSDrm is just retyped to VR128 with COPY_TO_REGCLASS.
	// The AVX versions also write: DST[255:128] <- 0
	def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
	(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
	(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
	(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
	(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (X86vzload addr:$src)),
	(COPY_TO_REGCLASS (VMOVSDrm addr:$src), VR128)>;

	// Represent the same patterns above but in the form they appear for
	// 256-bit types; here the scalar load is placed into the low xmm
	// subregister with SUBREG_TO_REG.
	def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
	(v4f32 (scalar_to_vector (loadf32 addr:$src))), (iPTR 0)))),
	(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
	def : Pat<(v8f32 (X86vzload addr:$src)),
	(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
	def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
	(v2f64 (scalar_to_vector (loadf64 addr:$src))), (iPTR 0)))),
	(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
	def : Pat<(v4f64 (X86vzload addr:$src)),
	(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
	}

	// Extract and store.
	def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
	addr:$dst),
	(VMOVSSmr addr:$dst, (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32))>;

	// Shuffle with VMOVSS
	def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
	(VMOVSSrr (v4i32 VR128:$src1),
	(COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
	def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
	(VMOVSSrr (v4f32 VR128:$src1),
	(COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;

	// 256-bit variants
	def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
	(SUBREG_TO_REG (i32 0),
	(VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
	(EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
	sub_xmm)>;
	def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
	(SUBREG_TO_REG (i32 0),
	(VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
	(EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
	sub_xmm)>;

	// Shuffle with VMOVSD
	def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
	(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
	(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

	// 256-bit variants
	def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
	(SUBREG_TO_REG (i32 0),
	(VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
	(EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
	sub_xmm)>;
	def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
	(SUBREG_TO_REG (i32 0),
	(VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
	(EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
	sub_xmm)>;

	// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
	// is during lowering, where it's not possible to recognize the fold because
	// it has two uses through a bitcast. One use disappears at isel time and the
	// fold opportunity reappears.
	def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
	(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
	(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
	(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
	(VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	}

	// Non-AVX SSE1 selection patterns for movss: zero-extending moves and loads,
	// extract and store, and the X86Movss shuffle.
	let Predicates = [UseSSE1] in {
	// A nested `let Predicates` replaces the enclosing list rather than
	// extending it, so UseSSE1 is repeated here to keep these patterns
	// restricted to the non-AVX SSE1 configuration.
	let Predicates = [UseSSE1, NoSSE41], AddedComplexity = 15 in {
	// Move scalar to XMM zero-extended, zeroing a VR128 then do a
	// MOVSS to the lower bits.
	def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
	(MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
	def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
	(MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
	def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
	(MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
	}

	let AddedComplexity = 20 in {
	// MOVSSrm already zeros the high parts of the register.
	def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
	(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
	def : Pat<(v4f32 (scalar_to_vector (loadf32 addr:$src))),
	(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
	def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
	(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
	def : Pat<(v4f32 (X86vzload addr:$src)),
	(COPY_TO_REGCLASS (MOVSSrm addr:$src), VR128)>;
	}

	// Extract and store.
	def : Pat<(store (f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
	addr:$dst),
	(MOVSSmr addr:$dst, (COPY_TO_REGCLASS VR128:$src, FR32))>;

	// Shuffle with MOVSS
	def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
	(MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
	def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
	(MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
	}

	// Non-AVX SSE2 selection patterns for movsd: zero-extending moves and loads,
	// and the X86Movsd / X86Movlpd / X86Movlps shuffles.
	let Predicates = [UseSSE2] in {
	// A nested `let Predicates` replaces the enclosing list rather than
	// extending it, so UseSSE2 is repeated here to keep this pattern
	// restricted to the non-AVX SSE2 configuration.
	let Predicates = [UseSSE2, NoSSE41], AddedComplexity = 15 in {
	// Move scalar to XMM zero-extended, zeroing a VR128 then do a
	// MOVSD to the lower bits.
	def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
	(MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
	}

	let AddedComplexity = 20 in {
	// MOVSDrm already zeros the high parts of the register.
	def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
	(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (scalar_to_vector (loadf64 addr:$src))),
	(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
	(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
	(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
	def : Pat<(v2f64 (X86vzload addr:$src)),
	(COPY_TO_REGCLASS (MOVSDrm addr:$src), VR128)>;
	}

	// Shuffle with MOVSD
	def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
	(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
	(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;

	// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
	// is during lowering, where it's not possible to recognize the fold because
	// it has two uses through a bitcast. One use disappears at isel time and the
	// fold opportunity reappears.
	def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
	(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
	(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
	(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
	(MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
	}

	// Aliases to help the assembler pick two byte VEX encodings by swapping the
	// operands relative to the normal instructions to use VEX.R instead of VEX.B.
	// They map to the rr_REV forms when $dst is in VR128L and $src2 is in
	// VR128H; the trailing 0 is the alias's last (emit) argument.
	def : InstAlias<"vmovss\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	(VMOVSSrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;
	def : InstAlias<"vmovsd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	(VMOVSDrr_REV VR128L:$dst, VR128:$src1, VR128H:$src2), 0>;

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Move Aligned/Unaligned FP Instructions
	//===----------------------------------------------------------------------===//

	// Packed FP move with a register source (rr) and a memory source (rm).
	//   rr - no selection pattern and no side effects.
	//   rm - loads through ld_frag from x86memop:$src; foldable as a load and
	//        rematerializable.
	// opc is the opcode byte, asm the mnemonic, d the execution domain, and
	// itins supplies the rr/rm itineraries.
	multiclass sse12_mov_packed<bits<8> opc, RegisterClass RC,
	X86MemOperand x86memop, PatFrag ld_frag,
	string asm, Domain d,
	OpndItins itins> {
	let hasSideEffects = 0 in
	def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
	!strconcat(asm, "\t{$src, $dst\|$dst, $src}"), [], itins.rr, d>,
	Sched<[WriteFShuffle]>;
	let canFoldAsLoad = 1, isReMaterializable = 1 in
	def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
	!strconcat(asm, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
	Sched<[WriteLoad]>;
	}

	// VEX-encoded packed FP moves, enabled with HasAVX and NoVLX. Opcode 0x28 is
	// used for the aligned (movap*) forms and 0x10 for the unaligned (movup*)
	// forms. The first four are 128-bit (VR128); the Y-suffixed four are
	// 256-bit (VR256) and add VEX_L.
	let Predicates = [HasAVX, NoVLX] in {
	defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
	"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
	PS, VEX, VEX_WIG;
	defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
	"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
	PD, VEX, VEX_WIG;
	defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
	"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
	PS, VEX, VEX_WIG;
	defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
	"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
	PD, VEX, VEX_WIG;

	defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
	"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
	PS, VEX, VEX_L, VEX_WIG;
	defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
	"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
	PD, VEX, VEX_L, VEX_WIG;
	defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
	"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
	PS, VEX, VEX_L, VEX_WIG;
	defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
	"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
	PD, VEX, VEX_L, VEX_WIG;
	}

	// Non-VEX packed FP moves: the single-precision forms require UseSSE1 and
	// the double-precision forms require UseSSE2. Opcode 0x28 is the aligned
	// form, 0x10 the unaligned form.
	let Predicates = [UseSSE1] in {
	defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
	"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
	PS;
	defm MOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
	"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
	PS;
	}
	let Predicates = [UseSSE2] in {
	defm MOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
	"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
	PD;
	defm MOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
	"movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
	PD;
	}

	// VEX-encoded packed FP stores (HasAVX, NoVLX). Opcode 0x29 stores through
	// alignedstore / alignedstore256, opcode 0x11 through the plain store
	// fragment. The Y-suffixed defs take VR256 / f256mem and add VEX_L.
	let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
	def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movaps\t{$src, $dst\|$dst, $src}",
	[(alignedstore (v4f32 VR128:$src), addr:$dst)],
	IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
	def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movapd\t{$src, $dst\|$dst, $src}",
	[(alignedstore (v2f64 VR128:$src), addr:$dst)],
	IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
	def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movups\t{$src, $dst\|$dst, $src}",
	[(store (v4f32 VR128:$src), addr:$dst)],
	IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
	def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movupd\t{$src, $dst\|$dst, $src}",
	[(store (v2f64 VR128:$src), addr:$dst)],
	IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
	def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
	"movaps\t{$src, $dst\|$dst, $src}",
	[(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
	IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
	def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
	"movapd\t{$src, $dst\|$dst, $src}",
	[(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
	IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
	def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
	"movups\t{$src, $dst\|$dst, $src}",
	[(store (v8f32 VR256:$src), addr:$dst)],
	IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
	def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
	"movupd\t{$src, $dst\|$dst, $src}",
	[(store (v4f64 VR256:$src), addr:$dst)],
	IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
	} // SchedRW

	// For disassembler
	// Register-to-register forms using the store-direction opcodes (0x29 / 0x11,
	// MRMDestReg). They are codegen-only, carry no selection pattern, and each
	// is linked to its regular rr instruction through FoldGenData.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
	SchedRW = [WriteFShuffle] in {
	def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
	(ins VR128:$src),
	"movaps\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
	FoldGenData<"VMOVAPSrr">;
	def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
	(ins VR128:$src),
	"movapd\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG,
	FoldGenData<"VMOVAPDrr">;
	def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
	(ins VR128:$src),
	"movups\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
	FoldGenData<"VMOVUPSrr">;
	def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
	(ins VR128:$src),
	"movupd\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG,
	FoldGenData<"VMOVUPDrr">;
	def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
	(ins VR256:$src),
	"movaps\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
	FoldGenData<"VMOVAPSYrr">;
	def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
	(ins VR256:$src),
	"movapd\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
	FoldGenData<"VMOVAPDYrr">;
	def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
	(ins VR256:$src),
	"movups\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
	FoldGenData<"VMOVUPSYrr">;
	def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
	(ins VR256:$src),
	"movupd\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
	FoldGenData<"VMOVUPDYrr">;
	}

	// Aliases to help the assembler pick two byte VEX encodings by swapping the
	// operands relative to the normal instructions to use VEX.R instead of VEX.B.
	// Each alias maps to the matching rr_REV instruction when $dst is in the
	// L register class and $src is in the H register class (VR128L/VR128H for
	// the 128-bit forms, VR256L/VR256H for the 256-bit forms).
	def : InstAlias<"vmovaps\t{$src, $dst\|$dst, $src}",
	(VMOVAPSrr_REV VR128L:$dst, VR128H:$src), 0>;
	def : InstAlias<"vmovapd\t{$src, $dst\|$dst, $src}",
	(VMOVAPDrr_REV VR128L:$dst, VR128H:$src), 0>;
	def : InstAlias<"vmovups\t{$src, $dst\|$dst, $src}",
	(VMOVUPSrr_REV VR128L:$dst, VR128H:$src), 0>;
	def : InstAlias<"vmovupd\t{$src, $dst\|$dst, $src}",
	(VMOVUPDrr_REV VR128L:$dst, VR128H:$src), 0>;
	def : InstAlias<"vmovaps\t{$src, $dst\|$dst, $src}",
	(VMOVAPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
	def : InstAlias<"vmovapd\t{$src, $dst\|$dst, $src}",
	(VMOVAPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
	def : InstAlias<"vmovups\t{$src, $dst\|$dst, $src}",
	(VMOVUPSYrr_REV VR256L:$dst, VR256H:$src), 0>;
	def : InstAlias<"vmovupd\t{$src, $dst\|$dst, $src}",
	(VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;

	// Non-VEX packed FP stores of a VR128 to f128mem. Opcode 0x29 stores through
	// alignedstore, opcode 0x11 through the plain store fragment. No Predicates
	// are set in this block.
	let SchedRW = [WriteStore] in {
	def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movaps\t{$src, $dst\|$dst, $src}",
	[(alignedstore (v4f32 VR128:$src), addr:$dst)],
	IIC_SSE_MOVA_P_MR>;
	def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movapd\t{$src, $dst\|$dst, $src}",
	[(alignedstore (v2f64 VR128:$src), addr:$dst)],
	IIC_SSE_MOVA_P_MR>;
	def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movups\t{$src, $dst\|$dst, $src}",
	[(store (v4f32 VR128:$src), addr:$dst)],
	IIC_SSE_MOVU_P_MR>;
	def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	"movupd\t{$src, $dst\|$dst, $src}",
	[(store (v2f64 VR128:$src), addr:$dst)],
	IIC_SSE_MOVU_P_MR>;
	} // SchedRW

	// For disassembler
	// Non-VEX register-to-register forms using the store-direction opcodes
	// (0x29 / 0x11, MRMDestReg). They are codegen-only, carry no selection
	// pattern, and are linked to the regular rr instructions via FoldGenData.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
	SchedRW = [WriteFShuffle] in {
	def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	"movaps\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">;
	def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	"movapd\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">;
	def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	"movups\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">;
	def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	"movupd\t{$src, $dst\|$dst, $src}", [],
	IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">;
	}

	// AVX (HasAVX, NoVLX) patterns that route 256-bit integer loads/stores and
	// low-half subvector stores through the packed FP move instructions.
	let Predicates = [HasAVX, NoVLX] in {
	// 256-bit load/store need to use floating point load/store in case we don't
	// have AVX2. Execution domain fixing will convert to integer if AVX2 is
	// available and changing the domain is beneficial.
	def : Pat<(alignedloadv4i64 addr:$src),
	(VMOVAPSYrm addr:$src)>;
	def : Pat<(loadv4i64 addr:$src),
	(VMOVUPSYrm addr:$src)>;
	def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
	(VMOVAPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
	(VMOVAPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
	(VMOVAPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
	(VMOVAPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(store (v4i64 VR256:$src), addr:$dst),
	(VMOVUPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(store (v8i32 VR256:$src), addr:$dst),
	(VMOVUPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(store (v16i16 VR256:$src), addr:$dst),
	(VMOVUPSYmr addr:$dst, VR256:$src)>;
	def : Pat<(store (v32i8 VR256:$src), addr:$dst),
	(VMOVUPSYmr addr:$dst, VR256:$src)>;

	// Special patterns for storing subvector extracts of the lower 128 bits.
	// It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
	def : Pat<(alignedstore (v2f64 (extract_subvector
	(v4f64 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(alignedstore (v4f32 (extract_subvector
	(v8f32 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

	def : Pat<(store (v2f64 (extract_subvector
	(v4f64 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(store (v4f32 (extract_subvector
	(v8f32 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	}

	// Use movaps / movups for SSE integer load / store (one byte shorter).
	// The instructions selected below are then converted to MOVDQA/MOVDQU
	// during the SSE domain pass.
	// Loads are matched as v2i64 (aligned and unaligned); stores are matched for
	// all four 128-bit integer vector types.
	let Predicates = [UseSSE1] in {
	def : Pat<(alignedloadv2i64 addr:$src),
	(MOVAPSrm addr:$src)>;
	def : Pat<(loadv2i64 addr:$src),
	(MOVUPSrm addr:$src)>;

	def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
	(MOVAPSmr addr:$dst, VR128:$src)>;
	def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
	(MOVAPSmr addr:$dst, VR128:$src)>;
	def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
	(MOVAPSmr addr:$dst, VR128:$src)>;
	def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
	(MOVAPSmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v2i64 VR128:$src), addr:$dst),
	(MOVUPSmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v4i32 VR128:$src), addr:$dst),
	(MOVUPSmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v8i16 VR128:$src), addr:$dst),
	(MOVUPSmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v16i8 VR128:$src), addr:$dst),
	(MOVUPSmr addr:$dst, VR128:$src)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Move Low packed FP Instructions
	//===----------------------------------------------------------------------===//

	// Memory-source forms shared by the movlp* / movhp* instantiations below.
	// Both variants read VR128:$src1 and a 64-bit memory operand f64mem:$src2.
	//   PSrm - mnemonic base_opc#"s"; matches psnode applied to $src1 and the
	//          loaded f64 placed in a v2f64 and bitcast to v4f32.
	//   PDrm - mnemonic base_opc#"d"; matches pdnode (v2f64) applied to $src1
	//          and the loaded f64 via scalar_to_vector.
	multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
	string base_opc, string asm_opr,
	InstrItinClass itin> {
	def PSrm : PI<opc, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
	!strconcat(base_opc, "s", asm_opr),
	[(set VR128:$dst,
	(psnode VR128:$src1,
	(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
	itin, SSEPackedSingle>, PS,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;

	def PDrm : PI<opc, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
	!strconcat(base_opc, "d", asm_opr),
	[(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
	(scalar_to_vector (loadf64 addr:$src2)))))],
	itin, SSEPackedDouble>, PD,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;

	}

	// Instantiates sse12_mov_hilo_packed_base twice:
	//   V#NAME - VEX three-operand form, predicated on UseAVX.
	//   NAME   - two-operand form with $src1 tied to $dst.
	multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
	string base_opc, InstrItinClass itin> {
	let Predicates = [UseAVX] in
	defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	itin>, VEX_4V, VEX_WIG;

	let Constraints = "$src1 = $dst" in
	defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
	"\t{$src2, $dst\|$dst, $src2}",
	itin>;
	}

	// movlps / movlpd (and VEX variants) memory-source forms: opcode 0x12,
	// matched through X86Movlps / X86Movlpd.
	let AddedComplexity = 20 in {
	defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
	IIC_SSE_MOV_LH>;
	}

	// movlps / movlpd stores (opcode 0x13): store element 0 of the source,
	// viewed as f64, to a 64-bit memory operand. The PS forms first bitcast the
	// v4f32 source to v2f64. The VEX variants are predicated on UseAVX.
	let SchedRW = [WriteStore] in {
	let Predicates = [UseAVX] in {
	def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movlps\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOV_LH>, VEX, VEX_WIG;
	def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movlpd\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt (v2f64 VR128:$src),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOV_LH>, VEX, VEX_WIG;
	}// UseAVX
	def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movlps\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOV_LH>;
	def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movlpd\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt (v2f64 VR128:$src),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOV_LH>;
	} // SchedRW

	// AVX selection patterns for vmovlps / vmovlpd: shuffles with a memory
	// second operand, and load-shuffle-store sequences to the same address.
	let Predicates = [UseAVX] in {
	// Shuffle with VMOVLPS
	def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
	(VMOVLPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
	(VMOVLPSrm VR128:$src1, addr:$src2)>;

	// Shuffle with VMOVLPD
	def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
	(VMOVLPDrm VR128:$src1, addr:$src2)>;
	def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
	(VMOVLPDrm VR128:$src1, addr:$src2)>;
	def : Pat<(v2f64 (X86Movsd VR128:$src1,
	(v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
	(VMOVLPDrm VR128:$src1, addr:$src2)>;

	// Store patterns
	// The load and the store use the same address ($src1), so the whole
	// sequence selects a single store of $src2.
	def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
	addr:$src1),
	(VMOVLPSmr addr:$src1, VR128:$src2)>;
	def : Pat<(store (v4i32 (X86Movlps
	(bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
	(VMOVLPSmr addr:$src1, VR128:$src2)>;
	def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
	addr:$src1),
	(VMOVLPDmr addr:$src1, VR128:$src2)>;
	def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
	addr:$src1),
	(VMOVLPDmr addr:$src1, VR128:$src2)>;
	}

	// Non-AVX SSE1 selection patterns for movlps: an i64 low-element store,
	// shuffles with a memory second operand, and load-shuffle-store sequences
	// to the same address.
	let Predicates = [UseSSE1] in {
	// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
	def : Pat<(store (i64 (extractelt (bc_v2i64 (v4f32 VR128:$src2)),
	(iPTR 0))), addr:$src1),
	(MOVLPSmr addr:$src1, VR128:$src2)>;

	// Shuffle with MOVLPS
	def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
	(MOVLPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
	(MOVLPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(X86Movlps VR128:$src1,
	(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
	(MOVLPSrm VR128:$src1, addr:$src2)>;

	// Store patterns
	// The load and the store use the same address ($src1), so the whole
	// sequence selects a single store of $src2.
	def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
	addr:$src1),
	(MOVLPSmr addr:$src1, VR128:$src2)>;
	def : Pat<(store (v4i32 (X86Movlps
	(bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
	addr:$src1),
	(MOVLPSmr addr:$src1, VR128:$src2)>;
	}

	// Non-AVX SSE2 selection patterns for movlpd: shuffles with a memory second
	// operand, and load-shuffle-store sequences to the same address.
	let Predicates = [UseSSE2] in {
	// Shuffle with MOVLPD
	def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
	(MOVLPDrm VR128:$src1, addr:$src2)>;
	def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
	(MOVLPDrm VR128:$src1, addr:$src2)>;
	def : Pat<(v2f64 (X86Movsd VR128:$src1,
	(v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
	(MOVLPDrm VR128:$src1, addr:$src2)>;

	// Store patterns
	// The load and the store use the same address ($src1), so the whole
	// sequence selects a single store of $src2.
	def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
	addr:$src1),
	(MOVLPDmr addr:$src1, VR128:$src2)>;
	def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
	addr:$src1),
	(MOVLPDmr addr:$src1, VR128:$src2)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Move Hi packed FP Instructions
	//===----------------------------------------------------------------------===//

	// movhps / movhpd (and VEX variants) memory-source forms: opcode 0x16,
	// matched through X86Movlhps / X86Movlhpd.
	let AddedComplexity = 20 in {
	defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
	IIC_SSE_MOV_LH>;
	}

	// movhps / movhpd stores (opcode 0x17) to a 64-bit memory operand. The
	// patterns match a store of element 0 of X86Unpckh applied to the source
	// with itself; the PS forms bitcast the v4f32 source to v2f64 first. The
	// VEX variants are predicated on UseAVX.
	let SchedRW = [WriteStore] in {
	// v2f64 extract element 1 is always custom lowered to unpack high to low
	// and extract element 0 so the non-store version isn't too horrible.
	let Predicates = [UseAVX] in {
	def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movhps\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt
	(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
	(bc_v2f64 (v4f32 VR128:$src))),
	(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
	def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movhpd\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt
	(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
	(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
	} // UseAVX
	def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movhps\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt
	(X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
	(bc_v2f64 (v4f32 VR128:$src))),
	(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
	def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movhpd\t{$src, $dst\|$dst, $src}",
	[(store (f64 (extractelt
	(v2f64 (X86Unpckh VR128:$src, VR128:$src)),
	(iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>;
	} // SchedRW

	// AVX selection patterns for VMOVHPS/VMOVHPD: load folding into the rm
	// forms, plus two extra ways of recognising a "store element 0 of a
	// shuffled $src" sequence as VMOVHPDmr.
	let Predicates = [UseAVX] in {
	// VMOVHPS patterns
	def : Pat<(X86Movlhps VR128:$src1,
	(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
	(VMOVHPSrm VR128:$src1, addr:$src2)>;
	// NOTE(review): this pattern uses bc_v4i32 while the matching UseSSE1
	// pattern for MOVHPSrm uses bc_v4f32 -- confirm the difference is
	// intentional.
	def : Pat<(X86Movlhps VR128:$src1,
	(bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
	(VMOVHPSrm VR128:$src1, addr:$src2)>;

	// VMOVHPD patterns

	// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
	// is during lowering, where it's not possible to recognize the load fold
	// cause it has two uses through a bitcast. One use disappears at isel time
	// and the fold opportunity reappears.
	def : Pat<(v2f64 (X86Unpckl VR128:$src1,
	(scalar_to_vector (loadf64 addr:$src2)))),
	(VMOVHPDrm VR128:$src1, addr:$src2)>;

	// Also handle an i64 load because that may get selected as a faster way to
	// load the data.
	def : Pat<(v2f64 (X86Unpckl VR128:$src1,
	(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
	(VMOVHPDrm VR128:$src1, addr:$src2)>;

	// Store of element 0 of (X86Movhlps $src, $src), viewed as v2f64.
	def : Pat<(store (f64 (extractelt
	(bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
	(iPTR 0))), addr:$dst),
	(VMOVHPDmr addr:$dst, VR128:$src)>;

	// Store of element 0 of (X86VPermilpi $src, 1).
	def : Pat<(store (f64 (extractelt
	(v2f64 (X86VPermilpi VR128:$src, (i8 1))),
	(iPTR 0))), addr:$dst),
	(VMOVHPDmr addr:$dst, VR128:$src)>;
	}

	// SSE1 load-folding patterns for MOVHPS: an X86Movlhps whose second
	// operand is a 64-bit load (either scalar_to_vector of loadi64 or an
	// X86vzload), bitcast to v4f32, is selected to MOVHPSrm.
	let Predicates = [UseSSE1] in {
	// MOVHPS patterns
	def : Pat<(X86Movlhps VR128:$src1,
	(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
	(MOVHPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(X86Movlhps VR128:$src1,
	(bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
	(MOVHPSrm VR128:$src1, addr:$src2)>;
	}

	// SSE2 selection patterns for MOVHPD: load folding into MOVHPDrm, plus two
	// extra ways of recognising a "store element 0 of a shuffled $src"
	// sequence as MOVHPDmr.
	let Predicates = [UseSSE2] in {
	// MOVHPD patterns

	// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
	// is during lowering, where it's not possible to recognize the load fold
	// cause it has two uses through a bitcast. One use disappears at isel time
	// and the fold opportunity reappears.
	def : Pat<(v2f64 (X86Unpckl VR128:$src1,
	(scalar_to_vector (loadf64 addr:$src2)))),
	(MOVHPDrm VR128:$src1, addr:$src2)>;

	// Also handle an i64 load because that may get selected as a faster way to
	// load the data.
	def : Pat<(v2f64 (X86Unpckl VR128:$src1,
	(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
	(MOVHPDrm VR128:$src1, addr:$src2)>;

	// Store of element 0 of (X86Movhlps $src, $src), viewed as v2f64.
	def : Pat<(store (f64 (extractelt
	(bc_v2f64 (v4f32 (X86Movhlps VR128:$src, VR128:$src))),
	(iPTR 0))), addr:$dst),
	(MOVHPDmr addr:$dst, VR128:$src)>;

	// Store of element 0 of (X86Shufp $src, $src, 1).
	def : Pat<(store (f64 (extractelt
	(v2f64 (X86Shufp VR128:$src, VR128:$src, (i8 1))),
	(iPTR 0))), addr:$dst),
	(MOVHPDmr addr:$dst, VR128:$src)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Move Low to High and High to Low packed FP Instructions
	//===----------------------------------------------------------------------===//

	// AVX register-register forms of MOVLHPS (opcode 0x16) and MOVHLPS
	// (opcode 0x12). Both take two VR128 inputs and write a separate VR128
	// output, and are selected for the v4f32 X86Movlhps / X86Movhlps nodes.
	let AddedComplexity = 20, Predicates = [UseAVX] in {
	def VMOVLHPSrr : VPSI<0x16, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	"movlhps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
	IIC_SSE_MOV_LH>,
	VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
	def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	"movhlps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
	IIC_SSE_MOV_LH>,
	VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
	}
	// Non-VEX register-register forms of MOVLHPS (0x16) and MOVHLPS (0x12).
	// The constraint ties $src1 to $dst, which is why the asm strings print
	// only $src2 and $dst. Only MOVHLPSrr is marked isCommutable here.
	let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
	def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	"movlhps\t{$src2, $dst\|$dst, $src2}",
	[(set VR128:$dst,
	(v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
	IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
	let isCommutable = 1 in
	def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	"movhlps\t{$src2, $dst\|$dst, $src2}",
	[(set VR128:$dst,
	(v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
	IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
	}

	// AVX: integer-typed X86Movlhps (v4i32, v2i64) and X86Movhlps (v4i32)
	// nodes reuse the register forms VMOVLHPSrr / VMOVHLPSrr, whose own
	// patterns only cover v4f32.
	let Predicates = [UseAVX] in {
	// MOVLHPS patterns
	def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
	(VMOVLHPSrr VR128:$src1, VR128:$src2)>;
	def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
	(VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

	// MOVHLPS patterns
	def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
	(VMOVHLPSrr VR128:$src1, VR128:$src2)>;
	}

	// SSE1: integer-typed X86Movlhps (v4i32, v2i64) and X86Movhlps (v4i32)
	// nodes reuse the register forms MOVLHPSrr / MOVHLPSrr, whose own
	// patterns only cover v4f32.
	let Predicates = [UseSSE1] in {
	// MOVLHPS patterns
	def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
	(MOVLHPSrr VR128:$src1, VR128:$src2)>;
	def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
	(MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;

	// MOVHLPS patterns
	def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
	(MOVHLPSrr VR128:$src1, VR128:$src2)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Conversion Instructions
	//===----------------------------------------------------------------------===//

	// Itinerary bundles for the conversion instructions. Each record pairs a
	// register-form itinerary class (*_RR) with a memory-form one (*_RM); the
	// conversion multiclasses in this file read them as itins.rr / itins.rm
	// and take the scheduling class from itins.Sched.

	// Packed double conversions. No Sched override is given for this record.
	def SSE_CVT_PD : OpndItins<
	IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
	>;

	// Packed single conversions, scheduled as WriteCvtI2F.
	let Sched = WriteCvtI2F in
	def SSE_CVT_PS : OpndItins<
	IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
	>;

	// Scalar conversions, scheduled as WriteCvtI2F.
	let Sched = WriteCvtI2F in
	def SSE_CVT_Scalar : OpndItins<
	IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
	>;

	// Scalar single -> 32-bit integer, scheduled as WriteCvtF2I.
	let Sched = WriteCvtF2I in
	def SSE_CVT_SS2SI_32 : OpndItins<
	IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
	>;

	// Scalar single -> 64-bit integer, scheduled as WriteCvtF2I.
	let Sched = WriteCvtF2I in
	def SSE_CVT_SS2SI_64 : OpndItins<
	IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
	>;

	// Scalar double -> integer, scheduled as WriteCvtF2I.
	let Sched = WriteCvtF2I in
	def SSE_CVT_SD2SI : OpndItins<
	IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
	>;

	// FIXME: We probably want to match the rm form only when optimizing for
	// size, to avoid false dependencies (see sse_fp_unop_s for details)
	// Scalar conversion multiclass producing a register form (rr) and a
	// load-folding form (rm) of one unary conversion.
	//   opc      - opcode byte
	//   SrcRC    - register class of the source operand
	//   DstRC    - register class of the result
	//   OpNode   - DAG node matched by both forms
	//   x86memop - memory operand type used by rm
	//   ld_frag  - load fragment wrapped around addr:$src in the rm pattern
	//   asm      - complete assembly string (mnemonic and operands)
	//   itins    - supplies the rr/rm itineraries and the scheduling class
	multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
	SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
	string asm, OpndItins itins> {
	def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), asm,
	[(set DstRC:$dst, (OpNode SrcRC:$src))],
	itins.rr>, Sched<[itins.Sched]>;
	// The memory form uses the Folded variant of the same scheduling class.
	def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins x86memop:$src), asm,
	[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))],
	itins.rm>, Sched<[itins.Sched.Folded]>;
	}

	// Packed signed-integer-to-FP conversion multiclass producing rr and rm
	// forms. Source and destination share the register class RC; the pattern
	// is (DstTy (sint_to_fp (SrcTy ...))).
	//   opc      - opcode byte
	//   RC       - register class of both operands
	//   x86memop - memory operand type used by rm
	//   DstTy    - result vector type
	//   SrcTy    - source vector type
	//   ld_frag  - load fragment; its result is bitconverted to SrcTy
	//   asm      - complete assembly string
	//   d        - execution domain passed to the instruction class
	//   itins    - supplies the rr/rm itineraries and the scheduling class
	multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
	ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
	string asm, Domain d, OpndItins itins> {
	let hasSideEffects = 0 in {
	def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
	[(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))],
	itins.rr, d>, Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
	[(set RC:$dst, (DstTy (sint_to_fp
	(SrcTy (bitconvert (ld_frag addr:$src))))))],
	itins.rm, d>, Sched<[itins.Sched.Folded]>;
	}
	}

	// FIXME: We probably want to match the rm form only when optimizing for
	// size, to avoid false dependencies (see sse_fp_unop_s for details)
	// AVX three-operand scalar conversion multiclass. Both forms take an
	// extra DstRC:$src1 input ahead of the converted source, and both have an
	// empty pattern list, so they are only selected through separate Pat
	// records.
	//   opc      - opcode byte
	//   SrcRC    - register class of the converted source
	//   DstRC    - register class of the result and of $src1
	//   x86memop - memory operand type used by rm
	//   asm      - mnemonic only; the operand string is appended here
	multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
	X86MemOperand x86memop, string asm> {
	let hasSideEffects = 0, Predicates = [UseAVX] in {
	def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
	!strconcat(asm,"\t{$src, $src1, $dst\|$dst, $src1, $src}"), []>,
	Sched<[WriteCvtI2F]>;
	// mayLoad must be set explicitly because there is no pattern to infer
	// it from.
	let mayLoad = 1 in
	def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
	(ins DstRC:$src1, x86memop:$src),
	!strconcat(asm,"\t{$src, $src1, $dst\|$dst, $src1, $src}"), []>,
	Sched<[WriteCvtI2FLd, ReadAfterLd]>;
	} // hasSideEffects = 0
	}

	// AVX truncating scalar FP -> integer conversions (opcode 0x2C), matched
	// on fp_to_sint. Four instantiations cover f32/f64 sources and GR32/GR64
	// results; the 64-bit results additionally carry VEX_W. The InstAlias
	// records accept explicit {l}/{q} suffixed spellings for each rr and rm
	// form (the trailing 0 is the alias' last argument, as written).
	let Predicates = [UseAVX] in {
	defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
	"cvttss2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SS2SI_32>,
	XS, VEX, VEX_LIG;
	defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
	"cvttss2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SS2SI_64>,
	XS, VEX, VEX_W, VEX_LIG;
	defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
	"cvttsd2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SD2SI>,
	XD, VEX, VEX_LIG;
	defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
	"cvttsd2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SD2SI>,
	XD, VEX, VEX_W, VEX_LIG;

	// Suffixed spellings: {l} maps to the GR32 forms, {q} to the GR64 forms.
	def : InstAlias<"vcvttss2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
	def : InstAlias<"vcvttss2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
	def : InstAlias<"vcvttsd2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
	def : InstAlias<"vcvttsd2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
	def : InstAlias<"vcvttss2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
	def : InstAlias<"vcvttss2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
	def : InstAlias<"vcvttsd2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
	def : InstAlias<"vcvttsd2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
	}
	// AVX integer -> scalar FP conversions (opcode 0x2A), built from
	// sse12_vcvt_avx. The mnemonic passed in already carries the {l}/{q}
	// suffix; the 64-bit source variants additionally carry VEX_W.
	// The assembler can recognize rr 64-bit instructions by seeing a rxx
	// register, but the same isn't true when only using memory operands,
	// provide other assembly "l" and "q" forms to address this explicitly
	// where appropriate to do so.
	defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
	XS, VEX_4V, VEX_LIG;
	defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
	XS, VEX_4V, VEX_W, VEX_LIG;
	defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
	XD, VEX_4V, VEX_LIG;
	defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
	XD, VEX_4V, VEX_W, VEX_LIG;

	// AVX aliases and selection patterns for the VCVTSI2SS/VCVTSI2SD family.
	// The instruction records themselves have empty pattern lists, so every
	// sint_to_fp selection for them is expressed here.
	let Predicates = [UseAVX] in {
	// Unsuffixed memory spellings resolve to the 32-bit (i32mem) forms.
	// NOTE(review): the vcvtsi2ss alias names FR64 operands although
	// VCVTSI2SS is instantiated with FR32 as its destination class --
	// confirm this is intentional.
	def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst\|$dst, $src1, $src}",
	(VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
	def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst\|$dst, $src1, $src}",
	(VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;

	// Load-folding forms. The extra first operand is supplied as
	// IMPLICIT_DEF of the result type.
	def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
	(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
	def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
	(VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
	def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
	(VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
	def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
	(VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;

	// Register forms, again with IMPLICIT_DEF as the first operand.
	def : Pat<(f32 (sint_to_fp GR32:$src)),
	(VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
	def : Pat<(f32 (sint_to_fp GR64:$src)),
	(VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
	def : Pat<(f64 (sint_to_fp GR32:$src)),
	(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
	def : Pat<(f64 (sint_to_fp GR64:$src)),
	(VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
	}

	// Non-VEX scalar conversions built from sse12_cvt_s.
	// Opcode 0x2C: truncating FP -> integer, matched on fp_to_sint.
	// Opcode 0x2A: integer -> FP, matched on sint_to_fp, with the {l}/{q}
	// suffix embedded in the assembly string.
	// The variants with a 64-bit integer operand additionally carry REX_W.
	defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
	"cvttss2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SS2SI_32>, XS;
	defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
	"cvttss2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SS2SI_64>, XS, REX_W;
	defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
	"cvttsd2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SD2SI>, XD;
	defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
	"cvttsd2si\t{$src, $dst\|$dst, $src}",
	SSE_CVT_SD2SI>, XD, REX_W;
	defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
	"cvtsi2ss{l}\t{$src, $dst\|$dst, $src}",
	SSE_CVT_Scalar>, XS;
	defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
	"cvtsi2ss{q}\t{$src, $dst\|$dst, $src}",
	SSE_CVT_Scalar>, XS, REX_W;
	defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
	"cvtsi2sd{l}\t{$src, $dst\|$dst, $src}",
	SSE_CVT_Scalar>, XD;
	defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
	"cvtsi2sd{q}\t{$src, $dst\|$dst, $src}",
	SSE_CVT_Scalar>, XD, REX_W;

	// Assembler aliases for the non-VEX scalar conversions: explicit {l}/{q}
	// suffixed spellings of cvttss2si/cvttsd2si for each rr and rm form.
	// {l} maps to the GR32 records, {q} to the GR64 records.
	def : InstAlias<"cvttss2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
	def : InstAlias<"cvttss2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
	def : InstAlias<"cvttsd2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
	def : InstAlias<"cvttsd2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
	def : InstAlias<"cvttss2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
	def : InstAlias<"cvttss2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
	def : InstAlias<"cvttsd2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
	def : InstAlias<"cvttsd2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;

	// Unsuffixed memory spellings of cvtsi2ss/cvtsi2sd resolve to the 32-bit
	// (i32mem) forms.
	// NOTE(review): the cvtsi2ss alias names FR64:$dst although CVTSI2SS is
	// instantiated with FR32 as its destination class -- confirm this is
	// intentional.
	def : InstAlias<"cvtsi2ss\t{$src, $dst\|$dst, $src}",
	(CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
	def : InstAlias<"cvtsi2sd\t{$src, $dst\|$dst, $src}",
	(CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;

	// Conversion Instructions Intrinsics - Match intrinsics which expect MM
	// and/or XMM operand(s).

	// FIXME: We probably want to match the rm form only when optimizing for
	// size, to avoid false dependencies (see sse_fp_unop_s for details)
	// Intrinsic-based unary conversion multiclass producing rr and rm forms.
	// Unlike sse12_cvt_s, the pattern applies an Intrinsic rather than an
	// SDNode, and the memory form matches a ComplexPattern operand.
	//   opc      - opcode byte
	//   SrcRC    - register class of the source operand
	//   DstRC    - register class of the result
	//   Int      - intrinsic matched by both forms
	//   memop    - memory operand type used by rm
	//   mem_cpat - complex pattern matched as the rm source
	//   asm      - mnemonic only; the operand string is appended here
	//   itins    - supplies the rr/rm itineraries and the scheduling class
	multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
	Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
	string asm, OpndItins itins> {
	def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
	!strconcat(asm, "\t{$src, $dst\|$dst, $src}"),
	[(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
	Sched<[itins.Sched]>;
	def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
	!strconcat(asm, "\t{$src, $dst\|$dst, $src}"),
	[(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
	Sched<[itins.Sched.Folded]>;
	}

	// Intrinsic-based binary conversion multiclass producing rr and rm forms.
	// The intrinsic takes DstRC:$src1 plus the converted source $src2.
	//   opc      - opcode byte
	//   SrcRC    - register class of $src2 in the rr form
	//   DstRC    - register class of the result and of $src1
	//   Int      - intrinsic matched by both forms
	//   x86memop - memory operand type used by rm
	//   ld_frag  - load fragment wrapped around addr:$src2 in the rm pattern
	//   asm      - mnemonic only; the operand string is appended here
	//   itins    - supplies the rr/rm itineraries and the scheduling class
	//   Is2Addr  - 1 (default) prints "$src2, $dst"; 0 prints the
	//              three-operand "$src2, $src1, $dst" form
	multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
	RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
	PatFrag ld_frag, string asm, OpndItins itins,
	bit Is2Addr = 1> {
	def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(asm, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
	itins.rr>, Sched<[itins.Sched]>;
	def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
	(ins DstRC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(asm, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
	itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	// (V)CVTSD2SI: opcode 0x2D, VR128 source, matched on the
	// int_x86_sse2_cvtsd2si / int_x86_sse2_cvtsd2si64 intrinsics. The 64-bit
	// result variants carry VEX_W (AVX) or REX_W (non-VEX).
	let Predicates = [UseAVX] in {
	defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
	int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
	SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
	defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
	int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
	SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
	}
	// Non-VEX forms; no predicate is set for them in this span.
	defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
	sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
	defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
	sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;


	// Intrinsic (VR128-destination) variants of (V)CVTSI2SS/(V)CVTSI2SD,
	// opcode 0x2A, all marked isCodeGenOnly. The AVX group passes Is2Addr = 0
	// to get the three-operand assembly string; the non-VEX group keeps the
	// default two-operand string and ties $src1 to $dst.
	let isCodeGenOnly = 1 in {
	let Predicates = [UseAVX] in {
	defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
	int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
	SSE_CVT_Scalar, 0>, XS, VEX_4V;
	defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
	int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
	SSE_CVT_Scalar, 0>, XS, VEX_4V,
	VEX_W;
	defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
	int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
	SSE_CVT_Scalar, 0>, XD, VEX_4V;
	defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
	int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
	SSE_CVT_Scalar, 0>, XD,
	VEX_4V, VEX_W;
	}
	// Non-VEX forms: destination tied to the first source.
	let Constraints = "$src1 = $dst" in {
	defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
	int_x86_sse_cvtsi2ss, i32mem, loadi32,
	"cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
	defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
	int_x86_sse_cvtsi642ss, i64mem, loadi64,
	"cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
	defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
	int_x86_sse2_cvtsi2sd, i32mem, loadi32,
	"cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
	defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
	int_x86_sse2_cvtsi642sd, i64mem, loadi64,
	"cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
	}
	} // isCodeGenOnly = 1

	/// SSE 1 Only

	// Aliases for intrinsics
	// Intrinsic (VR128-source) variants of the truncating conversions
	// (V)CVTTSS2SI/(V)CVTTSD2SI, opcode 0x2C, all marked isCodeGenOnly. They
	// match the int_x86_sse*_cvtts*2si* intrinsics instead of fp_to_sint.
	let isCodeGenOnly = 1 in {
	let Predicates = [UseAVX] in {
	defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
	ssmem, sse_load_f32, "cvttss2si",
	SSE_CVT_SS2SI_32>, XS, VEX;
	defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
	int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
	"cvttss2si", SSE_CVT_SS2SI_64>,
	XS, VEX, VEX_W;
	defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
	sdmem, sse_load_f64, "cvttsd2si",
	SSE_CVT_SD2SI>, XD, VEX;
	defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
	int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
	"cvttsd2si", SSE_CVT_SD2SI>,
	XD, VEX, VEX_W;
	}
	// Non-VEX forms; 64-bit results carry REX_W.
	defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
	ssmem, sse_load_f32, "cvttss2si",
	SSE_CVT_SS2SI_32>, XS;
	defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
	int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
	"cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
	defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
	sdmem, sse_load_f64, "cvttsd2si",
	SSE_CVT_SD2SI>, XD;
	defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
	int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
	"cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
	} // isCodeGenOnly = 1

	// (V)CVTSS2SI: opcode 0x2D, VR128 source, matched on the
	// int_x86_sse_cvtss2si / int_x86_sse_cvtss2si64 intrinsics. The 64-bit
	// result variants carry VEX_W (AVX) or REX_W (non-VEX).
	let Predicates = [UseAVX] in {
	defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
	ssmem, sse_load_f32, "cvtss2si",
	SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
	defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
	ssmem, sse_load_f32, "cvtss2si",
	SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
	}
	// Non-VEX forms; no predicate is set for them in this span.
	defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
	ssmem, sse_load_f32, "cvtss2si",
	SSE_CVT_SS2SI_32>, XS;
	defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
	ssmem, sse_load_f32, "cvtss2si",
	SSE_CVT_SS2SI_64>, XS, REX_W;

	// Packed signed doubleword -> single-precision conversions, opcode 0x5B,
	// built from sse12_cvt_p (sint_to_fp pattern).
	//   VCVTDQ2PS  - 128-bit, v4i32 -> v4f32, requires HasAVX and NoVLX
	//   VCVTDQ2PSY - 256-bit, v8i32 -> v8f32 (VEX_L), requires HasAVX and NoVLX
	//   CVTDQ2PS   - non-VEX 128-bit form, requires UseSSE2; uses the
	//                memopv2i64 load fragment where the AVX forms use
	//                loadv2i64 / loadv4i64
	defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
	"vcvtdq2ps\t{$src, $dst\|$dst, $src}",
	SSEPackedSingle, SSE_CVT_PS>,
	PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
	defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
	"vcvtdq2ps\t{$src, $dst\|$dst, $src}",
	SSEPackedSingle, SSE_CVT_PS>,
	PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;

	defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
	"cvtdq2ps\t{$src, $dst\|$dst, $src}",
	SSEPackedSingle, SSE_CVT_PS>,
	PS, Requires<[UseSSE2]>;

	// AVX assembler aliases: explicit {l}/{q} suffixed spellings of
	// vcvtss2si/vcvtsd2si for each rr and rm form. {l} maps to the GR32
	// records, {q} to the GR64 records.
	let Predicates = [UseAVX] in {
	def : InstAlias<"vcvtss2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
	def : InstAlias<"vcvtss2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
	def : InstAlias<"vcvtsd2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
	def : InstAlias<"vcvtsd2si{l}\t{$src, $dst\|$dst, $src}",
	(VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
	def : InstAlias<"vcvtss2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
	def : InstAlias<"vcvtss2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
	def : InstAlias<"vcvtsd2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
	def : InstAlias<"vcvtsd2si{q}\t{$src, $dst\|$dst, $src}",
	(VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
	}

	// Non-VEX assembler aliases: explicit {l}/{q} suffixed spellings of
	// cvtss2si/cvtsd2si for each rr and rm form. {l} maps to the GR32
	// records, {q} to the GR64 records.
	def : InstAlias<"cvtss2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
	def : InstAlias<"cvtss2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
	def : InstAlias<"cvtsd2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
	def : InstAlias<"cvtsd2si{l}\t{$src, $dst\|$dst, $src}",
	(CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
	def : InstAlias<"cvtss2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
	def : InstAlias<"cvtss2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
	def : InstAlias<"cvtsd2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
	def : InstAlias<"cvtsd2si{q}\t{$src, $dst\|$dst, $src}",
	(CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;

	/// SSE 2 Only

	// Convert scalar double to scalar single
	// AVX scalar double -> single conversion, opcode 0x5A. Both forms take an
	// extra FR32:$src1 input and have empty pattern lists, so selection goes
	// through separate Pat records.
	// NOTE(review): the rr form passes "cvtsd2ss" to VSDI while the rm form
	// passes "vcvtsd2ss" to I; presumably VSDI adds the "v" prefix itself --
	// confirm against the class definition.
	let hasSideEffects = 0, Predicates = [UseAVX] in {
	def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
	(ins FR32:$src1, FR64:$src2),
	"cvtsd2ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}", [],
	IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
	Sched<[WriteCvtF2F]>, VEX_WIG;
	// The memory form additionally requires OptForSize.
	let mayLoad = 1 in
	def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
	(ins FR32:$src1, f64mem:$src2),
	"vcvtsd2ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[], IIC_SSE_CVT_Scalar_RM>,
	XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
	Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
	}

	// AVX selection of f64 -> f32 fpround. The source register is used for
	// both operands: copied to the FR32 class as $src1 and passed unchanged
	// as the value to convert.
	def : Pat<(f32 (fpround FR64:$src)),
	(VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
	Requires<[UseAVX]>;

	// Non-VEX scalar double -> single conversion, opcode 0x5A, matched
	// directly on fpround. The load-folding form is only available under
	// UseSSE2 together with OptForSize.
	def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
	"cvtsd2ss\t{$src, $dst\|$dst, $src}",
	[(set FR32:$dst, (fpround FR64:$src))],
	IIC_SSE_CVT_Scalar_RR>, Sched<[WriteCvtF2F]>;
	def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
	"cvtsd2ss\t{$src, $dst\|$dst, $src}",
	[(set FR32:$dst, (fpround (loadf64 addr:$src)))],
	IIC_SSE_CVT_Scalar_RM>,
	XD,
	Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;

	// Intrinsic (VR128) variants of (V)CVTSD2SS, opcode 0x5A, matched on
	// int_x86_sse2_cvtsd2ss and marked isCodeGenOnly. The AVX forms are
	// three-operand and require HasAVX; the non-VEX forms tie $src1 to $dst
	// and require UseSSE2.
	let isCodeGenOnly = 1 in {
	def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	"vcvtsd2ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
	IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
	Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
	def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
	"vcvtsd2ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
	VR128:$src1, sse_load_f64:$src2))],
	IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG,
	Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;

	// Non-VEX forms: destination tied to the first source.
	let Constraints = "$src1 = $dst" in {
	def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	"cvtsd2ss\t{$src2, $dst\|$dst, $src2}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
	IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
	Sched<[WriteCvtF2F]>;
	def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
	"cvtsd2ss\t{$src2, $dst\|$dst, $src2}",
	[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
	VR128:$src1, sse_load_f64:$src2))],
	IIC_SSE_CVT_Scalar_RM>, XD, Requires<[UseSSE2]>,
	Sched<[WriteCvtF2FLd, ReadAfterLd]>;
	}
	} // isCodeGenOnly = 1

	// Convert scalar single to scalar double
	// SSE2 instructions with XS prefix
	// AVX scalar single -> double conversion, opcode 0x5A with the XS prefix.
	// Both forms take an extra FR64:$src1 input and have empty pattern lists,
	// so selection goes through separate Pat records. The memory form
	// additionally requires OptForSize.
	let hasSideEffects = 0, Predicates = [UseAVX] in {
	def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
	(ins FR64:$src1, FR32:$src2),
	"vcvtss2sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[], IIC_SSE_CVT_Scalar_RR>,
	XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
	Sched<[WriteCvtF2F]>, VEX_WIG;
	let mayLoad = 1 in
	def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
	(ins FR64:$src1, f32mem:$src2),
	"vcvtss2sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[], IIC_SSE_CVT_Scalar_RM>,
	XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
	Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
	}

	// AVX selection patterns for f32 -> f64 extension.
	// Register source: the source is used for both operands, copied to the
	// FR64 class as $src1 and passed unchanged as the value to convert.
	def : Pat<(f64 (fpextend FR32:$src)),
	(VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
	// fpextend of an explicit f32 load folds into the rm form, with
	// IMPLICIT_DEF as $src1.
	def : Pat<(fpextend (loadf32 addr:$src)),
	(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;

	// extloadf32 is selected differently depending on the optimisation goal:
	// OptForSize folds the load into VCVTSS2SDrm, OptForSpeed emits a separate
	// VMOVSSrm load feeding VCVTSS2SDrr.
	def : Pat<(extloadf32 addr:$src),
	(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
	Requires<[UseAVX, OptForSize]>;
	def : Pat<(extloadf32 addr:$src),
	(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
	Requires<[UseAVX, OptForSpeed]>;

	// Non-VEX scalar single -> double conversion, opcode 0x5A with the XS
	// prefix. The rr form matches fpextend; the rm form matches extloadf32
	// and is only available under UseSSE2 together with OptForSize.
	def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
	"cvtss2sd\t{$src, $dst\|$dst, $src}",
	[(set FR64:$dst, (fpextend FR32:$src))],
	IIC_SSE_CVT_Scalar_RR>, XS,
	Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>;
	def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
	"cvtss2sd\t{$src, $dst\|$dst, $src}",
	[(set FR64:$dst, (extloadf32 addr:$src))],
	IIC_SSE_CVT_Scalar_RM>, XS,
	Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;

	// extload f32 -> f64. This matches load+fpextend because we have a hack in
	// the isel (PreprocessForFPConvert) that can introduce loads after dag
	// combine.
	// Since these loads aren't folded into the fpextend, we have to match it
	// explicitly here.
	def : Pat<(fpextend (loadf32 addr:$src)),
	(CVTSS2SDrm addr:$src)>, Requires<[UseSSE2]>;
	// When optimising for speed, extloadf32 becomes a separate MOVSSrm load
	// feeding the register form rather than the folded rm form.
	def : Pat<(extloadf32 addr:$src),
	(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;

	// Intrinsic (VR128) variants of (V)CVTSS2SD, opcode 0x5A with the XS
	// prefix, matched on int_x86_sse2_cvtss2sd and marked isCodeGenOnly. The
	// AVX forms are three-operand and require HasAVX; the non-VEX forms tie
	// $src1 to $dst and require UseSSE2.
	let isCodeGenOnly = 1 in {
	def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	"vcvtss2sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
	IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
	Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
	def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
	"vcvtss2sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
	IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
	Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
	let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
	def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	"cvtss2sd\t{$src2, $dst\|$dst, $src2}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
	IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
	Sched<[WriteCvtF2F]>;
	def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
	"cvtss2sd\t{$src2, $dst\|$dst, $src2}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
	IIC_SSE_CVT_Scalar_RM>, XS, Requires<[UseSSE2]>,
	Sched<[WriteCvtF2FLd, ReadAfterLd]>;
	}
	} // isCodeGenOnly = 1

	// Patterns used for matching (v)cvtsi2ss, (v)cvtsi2sd, (v)cvtsd2ss and
	// (v)cvtss2sd intrinsic sequences from clang which produce unnecessary
	// vmovs{s,d} instructions
	// AVX: fold "scalar convert, scalar_to_vector, then X86Movss/X86Movsd
	// into $dst" into a single Int_V* conversion that takes $dst as its first
	// operand. Covers fpround, fpextend and sint_to_fp from GR32/GR64.
	let Predicates = [UseAVX] in {
	// f64 element 0 of $src rounded to f32 and inserted into $dst.
	def : Pat<(v4f32 (X86Movss
	(v4f32 VR128:$dst),
	(v4f32 (scalar_to_vector
	(f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
	(Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;

	// f32 element 0 of $src extended to f64 and inserted into $dst.
	def : Pat<(v2f64 (X86Movsd
	(v2f64 VR128:$dst),
	(v2f64 (scalar_to_vector
	(f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
	(Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;

	// Integer -> f32 inserted into $dst.
	def : Pat<(v4f32 (X86Movss
	(v4f32 VR128:$dst),
	(v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
	(Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;

	def : Pat<(v4f32 (X86Movss
	(v4f32 VR128:$dst),
	(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
	(Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;

	// Integer -> f64 inserted into $dst.
	def : Pat<(v2f64 (X86Movsd
	(v2f64 VR128:$dst),
	(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
	(Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;

	def : Pat<(v2f64 (X86Movsd
	(v2f64 VR128:$dst),
	(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
	(Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
	} // Predicates = [UseAVX]

	// SSE2: fold "scalar convert, scalar_to_vector, then X86Movss/X86Movsd
	// into $dst" into a single Int_* conversion that takes $dst as its first
	// operand. Covers fpround, fpextend and sint_to_fp to f64; the
	// sint_to_fp-to-f32 cases are under UseSSE1.
	let Predicates = [UseSSE2] in {
	// f64 element 0 of $src rounded to f32 and inserted into $dst.
	def : Pat<(v4f32 (X86Movss
	(v4f32 VR128:$dst),
	(v4f32 (scalar_to_vector
	(f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
	(Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;

	// f32 element 0 of $src extended to f64 and inserted into $dst.
	def : Pat<(v2f64 (X86Movsd
	(v2f64 VR128:$dst),
	(v2f64 (scalar_to_vector
	(f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
	(Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;

	// Integer -> f64 inserted into $dst.
	def : Pat<(v2f64 (X86Movsd
	(v2f64 VR128:$dst),
	(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
	(Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;

	def : Pat<(v2f64 (X86Movsd
	(v2f64 VR128:$dst),
	(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
	(Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
	} // Predicates = [UseSSE2]

	// SSE1: fold "sint_to_fp to f32, scalar_to_vector, then X86Movss into
	// $dst" into a single Int_CVTSI2SS* conversion that takes $dst as its
	// first operand.
	let Predicates = [UseSSE1] in {
	def : Pat<(v4f32 (X86Movss
	(v4f32 VR128:$dst),
	(v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
	(Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;

	def : Pat<(v4f32 (X86Movss
	(v4f32 VR128:$dst),
	(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
	(Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
	} // Predicates = [UseSSE1]

	// Convert packed single/double fp to doubleword
	// (V)CVTPS2DQ: packed single -> doubleword conversion, opcode 0x5B,
	// matched on intrinsics (int_x86_sse2_cvtps2dq for 128-bit,
	// int_x86_avx_cvt_ps2dq_256 for 256-bit). The Y variants operate on VR256
	// and carry VEX_L.
	// NOTE(review): no Predicates/Requires is attached to any record in this
	// span; presumably the VPDI/PDI classes supply the feature predicate --
	// confirm against the class definitions.
	def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
	IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
	def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvtps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
	IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
	def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	"cvtps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst,
	(int_x86_avx_cvt_ps2dq_256 VR256:$src))],
	IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
	def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
	"cvtps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst,
	(int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
	IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
	// Non-VEX forms; the memory form uses memopv4f32 where the AVX form uses
	// loadv4f32.
	def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
	IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
	def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvtps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(int_x86_sse2_cvtps2dq (memopv4f32 addr:$src)))],
	IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;


	// Convert Packed Double FP to Packed DW Integers
	// AVX packed double -> doubleword conversion, opcode 0xE6, matched on
	// X86cvtp2Int. Both the 128-bit and 256-bit source forms write a VR128
	// result, so the memory forms carry an {x}/{y} suffix in their assembly
	// string and aliases accept the suffixed spelling for every form.
	let Predicates = [HasAVX, NoVLX] in {
	// The assembler can recognize rr 256-bit instructions by seeing a ymm
	// register, but the same isn't true when using memory operands instead.
	// Provide other assembly rr and rm forms to address this explicitly.
	def VCVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"vcvtpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
	VEX, Sched<[WriteCvtF2I]>, VEX_WIG;

	// XMM only
	def : InstAlias<"vcvtpd2dqx\t{$src, $dst\|$dst, $src}",
	(VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
	def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"vcvtpd2dq{x}\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
	Sched<[WriteCvtF2ILd]>, VEX_WIG;
	def : InstAlias<"vcvtpd2dqx\t{$src, $dst\|$dst, $src}",
	(VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;

	// YMM only
	// Source is VR256 / f256mem (v4f64); the result is still VR128 (v4i32).
	def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
	"vcvtpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
	VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
	def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
	"vcvtpd2dq{y}\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
	VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
	def : InstAlias<"vcvtpd2dqy\t{$src, $dst\|$dst, $src}",
	(VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
	def : InstAlias<"vcvtpd2dqy\t{$src, $dst\|$dst, $src}",
	(VCVTPD2DQYrm VR128:$dst, f256mem:$src), 0>;
	}

	// Legacy SSE2 cvtpd2dq: v2f64 -> v4i32 via X86cvtp2Int. The rm form folds a
	// memopv2f64 load and uses the _RM itinerary / load-folding Sched class; the
	// rr form uses the register counterparts.
	def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvtpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))],
	IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;
	def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;

	// Convert with truncation packed single/double fp to doubleword
	// SSE2 packed instructions with XS prefix
	// VEX-encoded cvttps2dq, gated on [HasAVX, NoVLX]. These select the generic
	// fp_to_sint node (v4f32 -> v4i32 and v8f32 -> v8i32) rather than an
	// intrinsic; the rm forms fold loadv4f32 / loadv8f32.
	let Predicates = [HasAVX, NoVLX] in {
	def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvttps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (fp_to_sint (v4f32 VR128:$src))))],
	IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
	def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvttps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
	IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
	def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	"cvttps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst,
	(v8i32 (fp_to_sint (v8f32 VR256:$src))))],
	IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
	def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
	"cvttps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst,
	(v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
	IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
	Sched<[WriteCvtF2ILd]>, VEX_WIG;
	}

	// Legacy SSE2 cvttps2dq: v4f32 -> v4i32 through fp_to_sint. The rm form
	// folds a memopv4f32 load.
	def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvttps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (fp_to_sint (v4f32 VR128:$src))))],
	IIC_SSE_CVT_PS_RR>, Sched<[WriteCvtF2I]>;
	def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvttps2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (fp_to_sint (memopv4f32 addr:$src))))],
	IIC_SSE_CVT_PS_RM>, Sched<[WriteCvtF2ILd]>;

	// VEX-encoded cvttpd2dq (truncating packed double -> doubleword). The
	// instruction records are individually gated on [HasAVX, NoVLX]; the
	// InstAlias records sit outside those let-scopes. The 128-bit forms select
	// X86cvttp2si, the 256-bit forms select fp_to_sint; all write VR128.
	let Predicates = [HasAVX, NoVLX] in
	def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvttpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;

	// The assembler can recognize rr 256-bit instructions by seeing a ymm
	// register, but the same isn't true when using memory operands instead.
	// Provide other assembly rr and rm forms to address this explicitly.

	// XMM only
	def : InstAlias<"vcvttpd2dqx\t{$src, $dst\|$dst, $src}",
	(VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
	let Predicates = [HasAVX, NoVLX] in
	def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvttpd2dq{x}\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
	IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
	def : InstAlias<"vcvttpd2dqx\t{$src, $dst\|$dst, $src}",
	(VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;

	// YMM only
	let Predicates = [HasAVX, NoVLX] in {
	def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
	"cvttpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (fp_to_sint (v4f64 VR256:$src))))],
	IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
	def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
	"cvttpd2dq{y}\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
	IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
	}
	def : InstAlias<"vcvttpd2dqy\t{$src, $dst\|$dst, $src}",
	(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
	def : InstAlias<"vcvttpd2dqy\t{$src, $dst\|$dst, $src}",
	(VCVTTPD2DQYrm VR128:$dst, f256mem:$src), 0>;

	// Fold an X86vzmovl wrapped around the bitcast result of a 128-bit packed
	// double -> dword conversion into the bare VEX conversion instruction.
	// AddedComplexity = 15 raises the priority of these patterns.
	let Predicates = [HasAVX, NoVLX] in {
	let AddedComplexity = 15 in {
	def : Pat<(X86vzmovl (v2i64 (bitconvert
	(v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
	(VCVTPD2DQrr VR128:$src)>;
	def : Pat<(X86vzmovl (v2i64 (bitconvert
	(v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
	(VCVTTPD2DQrr VR128:$src)>;
	}
	} // Predicates = [HasAVX, NoVLX]

	// Legacy SSE2 cvttpd2dq: v2f64 -> v4i32 via X86cvttp2si. The rm form folds
	// a memopv2f64 load.
	def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvttpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2I]>;
	def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
	"cvttpd2dq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (X86cvttp2si (memopv2f64 addr:$src))))],
	IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2ILd]>;

	// SSE2 counterparts of the AVX patterns: an X86vzmovl of the bitcast
	// conversion result selects the plain CVTPD2DQrr / CVTTPD2DQrr record.
	let Predicates = [UseSSE2] in {
	let AddedComplexity = 15 in {
	def : Pat<(X86vzmovl (v2i64 (bitconvert
	(v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
	(CVTPD2DQrr VR128:$src)>;
	def : Pat<(X86vzmovl (v2i64 (bitconvert
	(v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
	(CVTTPD2DQrr VR128:$src)>;
	}
	} // Predicates = [UseSSE2]

	// Convert packed single to packed double
	// VEX forms. The 128-bit records match X86vfpext (register) and
	// extloadv2f32 (memory, through an f64mem operand); the 256-bit records
	// match fpextend (v4f32 -> v4f64) and extloadv4f32 (f128mem operand).
	let Predicates = [HasAVX, NoVLX] in {
	// SSE2 instructions without OpSize prefix
	def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"vcvtps2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
	def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
	"vcvtps2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
	IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
	def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
	"vcvtps2pd\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
	def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
	"vcvtps2pd\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
	IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
	}

	// Legacy SSE2 cvtps2pd: X86vfpext for the register form, extloadv2f32 from
	// an f64mem operand for the memory form.
	let Predicates = [UseSSE2] in {
	def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtps2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, PS, Sched<[WriteCvtF2F]>;
	def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
	"cvtps2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
	IIC_SSE_CVT_PD_RM>, PS, Sched<[WriteCvtF2FLd]>;
	}

	// Convert Packed DW Integers to Packed Double FP
	// VEX forms. The 128-bit records match X86VSintToFP; the memory form reads
	// an i64mem operand through X86vzload. The 256-bit records match sint_to_fp
	// on a v4i32 source (VR128 or an i128mem load) and produce v4f64 in VR256.
	let Predicates = [HasAVX, NoVLX] in {
	let hasSideEffects = 0, mayLoad = 1 in
	def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
	"vcvtdq2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>,
	VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
	def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"vcvtdq2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
	VEX, Sched<[WriteCvtI2F]>, VEX_WIG;
	def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
	"vcvtdq2pd\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst,
	(v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
	VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG;
	def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
	"vcvtdq2pd\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst,
	(v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
	VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG;
	}

	// Legacy SSE2 cvtdq2pd: packed doubleword integers -> packed double FP via
	// X86VSintToFP. The memory form reads an i64mem operand through X86vzload.
	// The itinerary class follows the operand form, in line with the Sched
	// class on each record: _RM for the load-folding record (WriteCvtI2FLd) and
	// _RR for the register record (WriteCvtI2F).
	let hasSideEffects = 0, mayLoad = 1 in
	def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
	"cvtdq2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))],
	IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtI2FLd]>;
	def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtdq2pd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2f64 (X86VSintToFP (v4i32 VR128:$src))))],
	IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2F]>;

	// AVX register conversion intrinsics
	// A 64-bit scalar load placed in a vector via scalar_to_vector and then
	// converted with X86VSintToFP is selected as the load-folding cvtdq2pd.
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	(VCVTDQ2PDrm addr:$src)>;
	} // Predicates = [HasAVX, NoVLX]

	// SSE2 register conversion intrinsics
	// Same fold for the legacy encoding.
	let Predicates = [UseSSE2] in {
	def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	(CVTDQ2PDrm addr:$src)>;
	} // Predicates = [UseSSE2]

	// Convert packed double to packed single
	// The assembler can recognize rr 256-bit instructions by seeing a ymm
	// register, but the same isn't true when using memory operands instead.
	// Provide other assembly rr and rm forms to address this explicitly.
	// The 128-bit VEX records select X86vfpround, the 256-bit ones select
	// fpround; every form writes VR128. Instruction records are gated on
	// [HasAVX, NoVLX]; the InstAlias records are outside those let-scopes.
	let Predicates = [HasAVX, NoVLX] in
	def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtpd2ps\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
	IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;

	// XMM only
	def : InstAlias<"vcvtpd2psx\t{$src, $dst\|$dst, $src}",
	(VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
	let Predicates = [HasAVX, NoVLX] in
	def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvtpd2ps{x}\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
	IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
	def : InstAlias<"vcvtpd2psx\t{$src, $dst\|$dst, $src}",
	(VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;

	// YMM only
	let Predicates = [HasAVX, NoVLX] in {
	def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
	"cvtpd2ps\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (fpround VR256:$src))],
	IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
	def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
	"cvtpd2ps{y}\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
	IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
	}
	def : InstAlias<"vcvtpd2psy\t{$src, $dst\|$dst, $src}",
	(VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
	def : InstAlias<"vcvtpd2psy\t{$src, $dst\|$dst, $src}",
	(VCVTPD2PSYrm VR128:$dst, f256mem:$src), 0>;

	// Legacy SSE2 cvtpd2ps: v2f64 -> packed single through X86vfpround. The rm
	// form folds a memopv2f64 load.
	def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"cvtpd2ps\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
	IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtF2F]>;
	def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	"cvtpd2ps\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))],
	IIC_SSE_CVT_PD_RM>, Sched<[WriteCvtF2FLd]>;

	// AVX 256-bit register conversion intrinsics
	// FIXME: Migrate SSE conversion intrinsics matching to use patterns as below
	// whenever possible to avoid declaring two versions of each one.

	let Predicates = [HasAVX, NoVLX] in {
	// Fold an X86vzmovl around the bitcast result of a 128-bit X86vfpround
	// into the bare VEX cvtpd2ps register instruction.
	let AddedComplexity = 15 in
	def : Pat<(X86vzmovl (v2f64 (bitconvert
	(v4f32 (X86vfpround (v2f64 VR128:$src)))))),
	(VCVTPD2PSrr VR128:$src)>;
	}

	let Predicates = [UseSSE2] in {
	// Same fold for the legacy SSE2 encoding.
	let AddedComplexity = 15 in
	def : Pat<(X86vzmovl (v2f64 (bitconvert
	(v4f32 (X86vfpround (v2f64 VR128:$src)))))),
	(CVTPD2PSrr VR128:$src)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Compare Instructions
	//===----------------------------------------------------------------------===//

	// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
	//
	// Emits four records with opcode 0xC2:
	//   rr / rm         - take the condition as a CC operand and select
	//                     (OpNode src1, src2, immLeaf:$cc); rm folds ld_frag.
	//   rr_alt / rm_alt - take the condition as a raw u8imm, use asm_alt, have
	//                     no selection pattern and are asm-parser only.
	// itins supplies the itinerary for rr/rm and the Sched class for all four.
	// NOTE(review): the _alt records hard-code IIC_SSE_ALU_F32S_RR/_RM instead
	// of itins.rr/itins.rm, so an instantiation that passes a different itins
	// (e.g. SSE_ALU_F64S) still gets F32S itineraries there - confirm intent.
	multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
	Operand CC, SDNode OpNode, ValueType VT,
	PatFrag ld_frag, string asm, string asm_alt,
	OpndItins itins, ImmLeaf immLeaf> {
	let isCommutable = 1 in
	def rr : SIi8<0xC2, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
	[(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
	itins.rr>, Sched<[itins.Sched]>;
	def rm : SIi8<0xC2, MRMSrcMem,
	(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
	[(set RC:$dst, (OpNode (VT RC:$src1),
	(ld_frag addr:$src2), immLeaf:$cc))],
	itins.rm>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;

	// Accept explicit immediate argument form instead of comparison code.
	let isAsmParserOnly = 1, hasSideEffects = 0 in {
	def rr_alt : SIi8<0xC2, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2, u8imm:$cc), asm_alt, [],
	IIC_SSE_ALU_F32S_RR>, Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def rm_alt : SIi8<0xC2, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2, u8imm:$cc), asm_alt, [],
	IIC_SSE_ALU_F32S_RM>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	}

	// VEX three-operand scalar compares on FR32/FR64. They use the AVXCC
	// condition operand with the i8immZExt5 immediate leaf; both instantiations
	// pass SSE_ALU_F32S as the itinerary bundle.
	let ExeDomain = SSEPackedSingle in
	defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
	"cmp${cc}ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	"cmpss\t{$cc, $src2, $src1, $dst\|$dst, $src1, $src2, $cc}",
	SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG;
	let ExeDomain = SSEPackedDouble in
	defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
	"cmp${cc}sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	"cmpsd\t{$cc, $src2, $src1, $dst\|$dst, $src1, $src2, $cc}",
	SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
	XD, VEX_4V, VEX_LIG, VEX_WIG;

	// Legacy two-operand scalar compares: $src1 is tied to $dst. They use the
	// SSECC condition operand with the i8immZExt3 immediate leaf.
	let Constraints = "$src1 = $dst" in {
	let ExeDomain = SSEPackedSingle in
	defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
	"cmp${cc}ss\t{$src2, $dst\|$dst, $src2}",
	"cmpss\t{$cc, $src2, $dst\|$dst, $src2, $cc}", SSE_ALU_F32S,
	i8immZExt3>, XS;
	let ExeDomain = SSEPackedDouble in
	defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
	"cmp${cc}sd\t{$src2, $dst\|$dst, $src2}",
	"cmpsd\t{$cc, $src2, $dst\|$dst, $src2, $cc}",
	SSE_ALU_F64S, i8immZExt3>, XD;
	}

	// sse12_cmp_scalar_int - intrinsic flavour of the scalar compare. Both
	// records operate on VR128 and select (Int src1, src, immLeaf:$cc); the rm
	// record matches its memory operand through the mem_cpat complex pattern
	// and is marked mayLoad.
	multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
	Intrinsic Int, string asm, OpndItins itins,
	ImmLeaf immLeaf, ComplexPattern mem_cpat> {
	def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src, CC:$cc), asm,
	[(set VR128:$dst, (Int VR128:$src1,
	VR128:$src, immLeaf:$cc))],
	itins.rr>,
	Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, memop:$src, CC:$cc), asm,
	[(set VR128:$dst, (Int VR128:$src1,
	mem_cpat:$src, immLeaf:$cc))],
	itins.rm>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	// Codegen-only instantiations of sse12_cmp_scalar_int for the
	// int_x86_sse_cmp_ss / int_x86_sse2_cmp_sd intrinsics. The VEX variants use
	// AVXCC with i8immZExt5; the legacy variants tie $src1 to $dst and use
	// SSECC with i8immZExt3.
	let isCodeGenOnly = 1 in {
	// Aliases to match intrinsics which expect XMM operand(s).
	let ExeDomain = SSEPackedSingle in
	defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
	"cmp${cc}ss\t{$src, $src1, $dst\|$dst, $src1, $src}",
	SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
	XS, VEX_4V;
	let ExeDomain = SSEPackedDouble in
	defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
	"cmp${cc}sd\t{$src, $src1, $dst\|$dst, $src1, $src}",
	SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
	XD, VEX_4V;
	let Constraints = "$src1 = $dst" in {
	let ExeDomain = SSEPackedSingle in
	defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
	"cmp${cc}ss\t{$src, $dst\|$dst, $src}",
	SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
	let ExeDomain = SSEPackedDouble in
	defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
	"cmp${cc}sd\t{$src, $dst\|$dst, $src}",
	SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
	XD;
	}
	}


	// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
	//
	// Emits rr and rm records with no register outputs; the pattern sets EFLAGS
	// from (OpNode src1, src2). The rm record folds ld_frag and is marked
	// mayLoad. Itineraries (IIC_SSE_COMIS_*) and Sched classes (WriteFAdd /
	// WriteFAddLd) are fixed inside the multiclass.
	multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
	ValueType vt, X86MemOperand x86memop,
	PatFrag ld_frag, string OpcodeStr> {
	def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1\|$src1, $src2}"),
	[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
	IIC_SSE_COMIS_RR>,
	Sched<[WriteFAdd]>;
	let mayLoad = 1 in
	def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1\|$src1, $src2}"),
	[(set EFLAGS, (OpNode (vt RC:$src1),
	(ld_frag addr:$src2)))],
	IIC_SSE_COMIS_RM>,
	Sched<[WriteFAddLd, ReadAfterLd]>;
	}

	// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
	//
	// Same shape as sse12_ord_cmp, except that the memory operand is a generic
	// Operand matched through the mem_cpat complex pattern instead of a load
	// fragment.
	multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
	ValueType vt, Operand memop,
	ComplexPattern mem_cpat, string OpcodeStr> {
	def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1\|$src1, $src2}"),
	[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
	IIC_SSE_COMIS_RR>,
	Sched<[WriteFAdd]>;
	let mayLoad = 1 in
	def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1\|$src1, $src2}"),
	[(set EFLAGS, (OpNode (vt RC:$src1),
	mem_cpat:$src2))],
	IIC_SSE_COMIS_RM>,
	Sched<[WriteFAddLd, ReadAfterLd]>;
	}

	// (v)ucomis{s,d} (opcode 0x2E) and (v)comis{s,d} (opcode 0x2F); every record
	// in this scope is declared to define EFLAGS.
	//  - The FR32/FR64 ucomis records select X86cmp.
	//  - The FR32/FR64 comis records pass undef as the node and sit inside
	//    "let Pattern = []<dag>", so they carry no selection pattern.
	//  - The Int_* records are codegen-only, operate on VR128 and select
	//    X86ucomi / X86comi, matching memory via sse_load_f32 / sse_load_f64.
	let Defs = [EFLAGS] in {
	defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
	"ucomiss">, PS, VEX, VEX_LIG, VEX_WIG;
	defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
	"ucomisd">, PD, VEX, VEX_LIG, VEX_WIG;
	let Pattern = []<dag> in {
	defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
	"comiss">, PS, VEX, VEX_LIG, VEX_WIG;
	defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
	"comisd">, PD, VEX, VEX_LIG, VEX_WIG;
	}

	let isCodeGenOnly = 1 in {
	defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
	sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG;
	defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
	sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG;

	defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
	sse_load_f32, "comiss">, PS, VEX, VEX_WIG;
	defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
	sse_load_f64, "comisd">, PD, VEX, VEX_WIG;
	}
	// Legacy (non-VEX) encodings of the same set.
	defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
	"ucomiss">, PS;
	defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
	"ucomisd">, PD;

	let Pattern = []<dag> in {
	defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
	"comiss">, PS;
	defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
	"comisd">, PD;
	}

	let isCodeGenOnly = 1 in {
	defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
	sse_load_f32, "ucomiss">, PS;
	defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
	sse_load_f64, "ucomisd">, PD;

	defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
	sse_load_f32, "comiss">, PS;
	defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
	sse_load_f64, "comisd">, PD;
	}
	} // Defs = [EFLAGS]

	// sse12_cmp_packed - sse 1 & 2 compare packed instructions
	//
	// Emits four records with opcode 0xC2 in execution domain d:
	//   rri / rmi         - condition as a CC operand; select
	//                       (VT (X86cmpp src1, src2, immLeaf:$cc)); rmi folds
	//                       ld_frag.
	//   rri_alt / rmi_alt - condition as a raw u8imm, asm_alt syntax, no
	//                       pattern, asm-parser only.
	// itins (default SSE_ALU_F32P) supplies only the itineraries; the Sched
	// classes are fixed to WriteFAdd / WriteFAddLd regardless of itins.
	multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
	Operand CC, ValueType VT, string asm,
	string asm_alt, Domain d, ImmLeaf immLeaf,
	PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
	let isCommutable = 1 in
	def rri : PIi8<0xC2, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
	[(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))],
	itins.rr, d>,
	Sched<[WriteFAdd]>;
	def rmi : PIi8<0xC2, MRMSrcMem,
	(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
	[(set RC:$dst,
	(VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))],
	itins.rm, d>,
	Sched<[WriteFAddLd, ReadAfterLd]>;

	// Accept explicit immediate argument form instead of comparison code.
	let isAsmParserOnly = 1, hasSideEffects = 0 in {
	def rri_alt : PIi8<0xC2, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc),
	asm_alt, [], itins.rr, d>, Sched<[WriteFAdd]>;
	let mayLoad = 1 in
	def rmi_alt : PIi8<0xC2, MRMSrcMem,
	(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc),
	asm_alt, [], itins.rm, d>,
	Sched<[WriteFAddLd, ReadAfterLd]>;
	}
	}

	// 128-bit VEX packed compares: three-operand syntax, AVXCC condition
	// operand, i8immZExt5 immediate leaf, default itinerary bundle.
	defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
	"cmp${cc}ps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	"cmpps\t{$cc, $src2, $src1, $dst\|$dst, $src1, $src2, $cc}",
	SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG;
	defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
	"cmp${cc}pd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	"cmppd\t{$cc, $src2, $src1, $dst\|$dst, $src1, $src2, $cc}",
	SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG;
	// 256-bit VEX packed compares (VEX_L, VR256 operands). They carry VEX_WIG
	// like the 128-bit VCMPPS/VCMPPD records: the VEX.W bit is ignored for
	// these encodings.
	defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
	"cmp${cc}ps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	"cmpps\t{$cc, $src2, $src1, $dst\|$dst, $src1, $src2, $cc}",
	SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L, VEX_WIG;
	defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
	"cmp${cc}pd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	"cmppd\t{$cc, $src2, $src1, $dst\|$dst, $src1, $src2, $cc}",
	SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L, VEX_WIG;
	// Legacy two-operand packed compares: $src1 is tied to $dst, the condition
	// operand is SSECC and the memory forms fold memopv4f32 / memopv2f64.
	// NOTE(review): these pass i8immZExt5 while the legacy scalar CMPSS/CMPSD
	// records pass i8immZExt3 - confirm the wider immediate leaf is intended.
	let Constraints = "$src1 = $dst" in {
	defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
	"cmp${cc}ps\t{$src2, $dst\|$dst, $src2}",
	"cmpps\t{$cc, $src2, $dst\|$dst, $src2, $cc}",
	SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
	defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
	"cmp${cc}pd\t{$src2, $dst\|$dst, $src2}",
	"cmppd\t{$cc, $src2, $dst\|$dst, $src2, $cc}",
	SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Shuffle Instructions
	//===----------------------------------------------------------------------===//

	/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
	///
	/// Emits rmi and rri records with opcode 0xC6 in domain d. Both select
	/// (vt (X86Shufp src1, src2, (i8 imm:$src3))); rmi folds mem_frag for the
	/// second source. The shuffle control is a u8imm operand.
	multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
	ValueType vt, string asm, PatFrag mem_frag,
	Domain d> {
	def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
	[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
	(i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;
	def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
	[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
	(i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
	Sched<[WriteFShuffle]>;
	}

	// VEX shufps/shufpd in 128-bit and 256-bit (VEX_L, Y-suffixed) forms,
	// using three-operand syntax and plain load fragments.
	let Predicates = [HasAVX, NoVLX] in {
	defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
	"shufps\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
	defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
	"shufps\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
	defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
	"shufpd\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
	defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
	"shufpd\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
	}
	// Legacy shufps/shufpd: $src1 is tied to $dst and the memory forms fold
	// memopv4f32 / memopv2f64.
	let Constraints = "$src1 = $dst" in {
	defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
	"shufps\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	memopv4f32, SSEPackedSingle>, PS;
	defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
	"shufpd\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	memopv2f64, SSEPackedDouble>, PD;
	}

	// Select X86Shufp on integer vector types with the FP shuffle instructions:
	// v4i32/v8i32 map to VSHUFPS(Y) and v2i64/v4i64 map to VSHUFPD(Y). The
	// memory patterns match a v2i64/v4i64 load, bitcast where the element type
	// differs.
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4i32 (X86Shufp VR128:$src1,
	(bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
	(VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	(VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;

	def : Pat<(v2i64 (X86Shufp VR128:$src1,
	(loadv2i64 addr:$src2), (i8 imm:$imm))),
	(VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	(VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;

	// 256-bit patterns
	def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v8i32 (X86Shufp VR256:$src1,
	(bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
	(VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;

	def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v4i64 (X86Shufp VR256:$src1,
	(loadv4i64 addr:$src2), (i8 imm:$imm))),
	(VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
	}

	// v4i32 X86Shufp is selected as the legacy SHUFPS; the memory pattern
	// matches a memopv2i64 load bitcast to v4i32.
	let Predicates = [UseSSE1] in {
	def : Pat<(v4i32 (X86Shufp VR128:$src1,
	(bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
	(SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	(SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
	}

	let Predicates = [UseSSE2] in {
	// Generic SHUFPD patterns
	// v2i64 X86Shufp is selected as the legacy SHUFPD (memory form folds
	// memopv2i64).
	def : Pat<(v2i64 (X86Shufp VR128:$src1,
	(memopv2i64 addr:$src2), (i8 imm:$imm))),
	(SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	(SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Unpack FP Instructions
	//===----------------------------------------------------------------------===//

	/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
	///
	/// Emits rr and rm records for opcode opc in domain d, both selecting
	/// (vt (OpNode src1, src2)); rm folds mem_frag for the second source.
	/// IsCommutable (default 0) is applied to the rr record only.
	multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
	PatFrag mem_frag, RegisterClass RC,
	X86MemOperand x86memop, string asm,
	Domain d, bit IsCommutable = 0> {
	let isCommutable = IsCommutable in
	def rr : PI<opc, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, RC:$src2),
	asm, [(set RC:$dst,
	(vt (OpNode RC:$src1, RC:$src2)))],
	IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
	def rm : PI<opc, MRMSrcMem,
	(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	asm, [(set RC:$dst,
	(vt (OpNode RC:$src1,
	(mem_frag addr:$src2))))],
	IIC_SSE_UNPCK, d>,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;
	}

	// VEX unpck{h,l}p{s,d}: opcode 0x15 selects X86Unpckh and 0x14 selects
	// X86Unpckl. The first four are 128-bit; the Y-suffixed four add VEX_L and
	// operate on VR256. None of them passes IsCommutable.
	let Predicates = [HasAVX, NoVLX] in {
	defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
	VR128, f128mem, "unpckhps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
	defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
	VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
	defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
	VR128, f128mem, "unpcklps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
	defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
	VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedDouble>, PD, VEX_4V, VEX_WIG;

	defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
	VR256, f256mem, "unpckhps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
	defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
	VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
	defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
	VR256, f256mem, "unpcklps\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
	defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
	VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
	}// Predicates = [HasAVX, NoVLX]
	// Legacy unpck{h,l}p{s,d}: $src1 is tied to $dst and the memory forms fold
	// memopv4f32 / memopv2f64. UNPCKHPD is the only instantiation that passes
	// IsCommutable = 1.
	let Constraints = "$src1 = $dst" in {
	defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
	VR128, f128mem, "unpckhps\t{$src2, $dst\|$dst, $src2}",
	SSEPackedSingle>, PS;
	defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
	VR128, f128mem, "unpckhpd\t{$src2, $dst\|$dst, $src2}",
	SSEPackedDouble, 1>, PD;
	defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
	VR128, f128mem, "unpcklps\t{$src2, $dst\|$dst, $src2}",
	SSEPackedSingle>, PS;
	defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
	VR128, f128mem, "unpcklpd\t{$src2, $dst\|$dst, $src2}",
	SSEPackedDouble>, PD;
	} // Constraints = "$src1 = $dst"

	// Under HasAVX1Only, 256-bit integer unpacks are selected with the FP
	// unpack instructions: v8i32 maps to VUNPCK{L,H}PSY and v4i64 maps to
	// VUNPCK{L,H}PDY. Memory patterns match a loadv4i64, bitcast for v8i32.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(v8i32 (X86Unpckl VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
	(VUNPCKLPSYrm VR256:$src1, addr:$src2)>;
	def : Pat<(v8i32 (X86Unpckl VR256:$src1, VR256:$src2)),
	(VUNPCKLPSYrr VR256:$src1, VR256:$src2)>;
	def : Pat<(v8i32 (X86Unpckh VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
	(VUNPCKHPSYrm VR256:$src1, addr:$src2)>;
	def : Pat<(v8i32 (X86Unpckh VR256:$src1, VR256:$src2)),
	(VUNPCKHPSYrr VR256:$src1, VR256:$src2)>;

	def : Pat<(v4i64 (X86Unpckl VR256:$src1, (loadv4i64 addr:$src2))),
	(VUNPCKLPDYrm VR256:$src1, addr:$src2)>;
	def : Pat<(v4i64 (X86Unpckl VR256:$src1, VR256:$src2)),
	(VUNPCKLPDYrr VR256:$src1, VR256:$src2)>;
	def : Pat<(v4i64 (X86Unpckh VR256:$src1, (loadv4i64 addr:$src2))),
	(VUNPCKHPDYrm VR256:$src1, addr:$src2)>;
	def : Pat<(v4i64 (X86Unpckh VR256:$src1, VR256:$src2)),
	(VUNPCKHPDYrr VR256:$src1, VR256:$src2)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Extract Floating-Point Sign mask
	//===----------------------------------------------------------------------===//

	/// sse12_extr_sign_mask - sse 1 & 2 extract floating-point sign mask
	///
	/// Emits a single rr record (opcode 0x50) that reads a vector register of
	/// class RC and writes a GR32orGR64 register, selecting
	/// (X86movmsk (vt src)). There is no memory form.
	multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
	string asm, Domain d> {
	def rr : PI<0x50, MRMSrcReg, (outs GR32orGR64:$dst), (ins RC:$src),
	!strconcat(asm, "\t{$src, $dst\|$dst, $src}"),
	[(set GR32orGR64:$dst, (X86movmsk (vt RC:$src)))], IIC_SSE_MOVMSK, d>,
	Sched<[WriteVecLogic]>;
	}

	// VEX movmskps/movmskpd for 128-bit and 256-bit (VEX_L) sources. Unlike
	// most VEX groups in this section they are gated on [HasAVX] alone.
	let Predicates = [HasAVX] in {
	defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
	SSEPackedSingle>, PS, VEX, VEX_WIG;
	defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
	SSEPackedDouble>, PD, VEX, VEX_WIG;
	defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
	SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
	defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
	SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
	}

	// Legacy movmskps/movmskpd on VR128; no predicate is attached here.
	defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
	SSEPackedSingle>, PS;
	defm MOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
	SSEPackedDouble>, PD;

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Logical Instructions
	//===---------------------------------------------------------------------===//

	let ExeDomain = SSEPackedInt in { // SSE integer instructions

	/// PDI_binop_rm - Simple SSE2 binary operator.
	///
	/// Emits rr and rm records selecting (OpVT (OpNode src1, src2)); rm folds
	/// (bitconvert (memop_frag addr)). Is2Addr picks the assembly syntax: when
	/// set, the two-operand "$src2, $dst" string is used, otherwise the
	/// three-operand "$src2, $src1, $dst" string. IsCommutable is applied to
	/// the rr record only. itins supplies the itineraries and the Sched class.
	multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
	X86MemOperand x86memop, OpndItins itins,
	bit IsCommutable, bit Is2Addr> {
	let isCommutable = IsCommutable in
	def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))], itins.rr>,
	Sched<[itins.Sched]>;
	def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpVT (OpNode RC:$src1,
	(bitconvert (memop_frag addr:$src2)))))],
	itins.rm>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	} // ExeDomain = SSEPackedInt

	/// PDI_binop_all - Instantiates PDI_binop_rm three times for one operator:
	///   V#NAME   - 128-bit VEX form (three-operand asm), needs HasAVX and prd;
	///   NAME     - 128-bit form with $src1 tied to $dst (two-operand asm);
	///   V#NAME#Y - 256-bit VEX form (VEX_L), needs HasAVX2 and prd.
	/// The VEX forms use loadv2i64/loadv4i64 as the load fragment, the tied form
	/// uses memopv2i64.
	multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
	ValueType OpVT128, ValueType OpVT256,
	OpndItins itins, bit IsCommutable = 0, Predicate prd> {
	let Predicates = [HasAVX, prd] in
	defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
	VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG;

	let Constraints = "$src1 = $dst" in
	defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
	memopv2i64, i128mem, itins, IsCommutable, 1>;

	let Predicates = [HasAVX2, prd] in
	defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
	OpVT256, VR256, loadv4i64, i256mem, itins,
	IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
	}

	// These are ordered here for pattern ordering requirements with the fp versions
	// Integer logical ops on v2i64/v4i64. PAND/POR/PXOR are marked commutable
	// (IsCommutable = 1); PANDN is not. All VEX forms additionally require NoVLX.

	defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
	SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
	defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
	SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
	defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
	SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
	defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
	SSE_VEC_BIT_ITINS_P, 0, NoVLX>;

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Logical Instructions
	//===----------------------------------------------------------------------===//

	/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
	///
	/// For one opcode/node this expands to six instruction families, all built
	/// on sse12_fp_packed_logical_rm (defined outside this section):
	///   V#NAME#PSY / V#NAME#PDY - 256-bit VEX forms (VR256, VEX_L);
	///   V#NAME#PS  / V#NAME#PD  - 128-bit VEX forms (VR128);
	///   PS / PD                 - 128-bit forms with $src1 tied to $dst.
	/// Every pattern bitcasts the FP-typed register operands to v2i64/v4i64 and
	/// applies OpNode on the integer type; the memory patterns load the integer
	/// type directly.
	/// NOTE(review): the trailing `0` passed to the VEX instantiations is
	/// presumably the Is2Addr flag of sse12_fp_packed_logical_rm - confirm.
	multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
	SDNode OpNode> {
	let Predicates = [HasAVX, NoVLX] in {
	defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
	!strconcat(OpcodeStr, "ps"), f256mem,
	[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
	(bc_v4i64 (v8f32 VR256:$src2))))],
	[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
	(loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG;

	defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
	!strconcat(OpcodeStr, "pd"), f256mem,
	[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
	(bc_v4i64 (v4f64 VR256:$src2))))],
	[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
	(loadv4i64 addr:$src2)))], 0>,
	PD, VEX_4V, VEX_L, VEX_WIG;

	defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
	!strconcat(OpcodeStr, "ps"), f128mem,
	[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
	(bc_v2i64 (v4f32 VR128:$src2))))],
	[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
	(loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG;

	defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
	!strconcat(OpcodeStr, "pd"), f128mem,
	[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
	(bc_v2i64 (v2f64 VR128:$src2))))],
	[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
	(loadv2i64 addr:$src2)))], 0>,
	PD, VEX_4V, VEX_WIG;
	}

	// Tied (destructive) forms; these use memopv2i64 rather than loadv2i64.
	let Constraints = "$src1 = $dst" in {
	defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
	!strconcat(OpcodeStr, "ps"), f128mem,
	[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
	(bc_v2i64 (v4f32 VR128:$src2))))],
	[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
	(memopv2i64 addr:$src2)))]>, PS;

	defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
	!strconcat(OpcodeStr, "pd"), f128mem,
	[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
	(bc_v2i64 (v2f64 VR128:$src2))))],
	[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
	(memopv2i64 addr:$src2)))]>, PD;
	}
	}

	// Packed FP logical instruction families. ANDN is the only one explicitly
	// marked non-commutable here.
	defm AND : sse12_fp_packed_logical<0x54, "and", and>;
	defm OR : sse12_fp_packed_logical<0x56, "or", or>;
	defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
	let isCommutable = 0 in
	defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;

	// If only AVX1 is supported, we need to handle integer operations with
	// floating point instructions since the integer versions aren't available.
	// The first group maps register-register v4i64 and/or/xor/andnp onto the
	// 256-bit V*PSYrr instructions; the second group does the same for a
	// loadv4i64 second operand using the V*PSYrm forms.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(v4i64 (and VR256:$src1, VR256:$src2)),
	(VANDPSYrr VR256:$src1, VR256:$src2)>;
	def : Pat<(v4i64 (or VR256:$src1, VR256:$src2)),
	(VORPSYrr VR256:$src1, VR256:$src2)>;
	def : Pat<(v4i64 (xor VR256:$src1, VR256:$src2)),
	(VXORPSYrr VR256:$src1, VR256:$src2)>;
	def : Pat<(v4i64 (X86andnp VR256:$src1, VR256:$src2)),
	(VANDNPSYrr VR256:$src1, VR256:$src2)>;

	def : Pat<(and VR256:$src1, (loadv4i64 addr:$src2)),
	(VANDPSYrm VR256:$src1, addr:$src2)>;
	def : Pat<(or VR256:$src1, (loadv4i64 addr:$src2)),
	(VORPSYrm VR256:$src1, addr:$src2)>;
	def : Pat<(xor VR256:$src1, (loadv4i64 addr:$src2)),
	(VXORPSYrm VR256:$src1, addr:$src2)>;
	def : Pat<(X86andnp VR256:$src1, (loadv4i64 addr:$src2)),
	(VANDNPSYrm VR256:$src1, addr:$src2)>;
	}

	// Scalar X86fand/X86for/X86fxor/X86fandn on f64 and f32 for AVX targets
	// (HasAVX and NoVLX_Or_NoDQI). Each pattern copies both scalar operands into
	// VR128, applies the packed VEX instruction, and copies the result back to
	// the scalar register class.
	let Predicates = [HasAVX, NoVLX_Or_NoDQI] in {
	// Use packed logical operations for scalar ops.
	def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (VANDPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (VORPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (VXORPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (VANDNPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;

	def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (VANDPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (VORPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (VXORPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (VANDNPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	}

	// f32 scalar logical ops for UseSSE1: operands are copied into VR128, the
	// packed *PSrr instruction is applied, and the result is copied back to FR32.
	let Predicates = [UseSSE1] in {
	// Use packed logical operations for scalar ops.
	def : Pat<(f32 (X86fand FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (ANDPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	def : Pat<(f32 (X86for FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (ORPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	def : Pat<(f32 (X86fxor FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (XORPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	def : Pat<(f32 (X86fandn FR32:$src1, FR32:$src2)),
	(COPY_TO_REGCLASS (ANDNPSrr
	(COPY_TO_REGCLASS FR32:$src1, VR128),
	(COPY_TO_REGCLASS FR32:$src2, VR128)), FR32)>;
	}

	// f64 scalar logical ops for UseSSE2: operands are copied into VR128, the
	// packed *PDrr instruction is applied, and the result is copied back to FR64.
	let Predicates = [UseSSE2] in {
	// Use packed logical operations for scalar ops.
	def : Pat<(f64 (X86fand FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (ANDPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	def : Pat<(f64 (X86for FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (ORPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	def : Pat<(f64 (X86fxor FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (XORPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	def : Pat<(f64 (X86fandn FR64:$src1, FR64:$src2)),
	(COPY_TO_REGCLASS (ANDNPDrr
	(COPY_TO_REGCLASS FR64:$src1, VR128),
	(COPY_TO_REGCLASS FR64:$src2, VR128)), FR64)>;
	}

	// Patterns for packed operations when we don't have integer type available.
	// v4f32 X86fand/X86for/X86fxor/X86fandn map directly onto the non-VEX *PS
	// instructions: register-register forms first, then forms whose second
	// operand is a memopv4f32 load. No Predicates list is applied at this site.
	def : Pat<(v4f32 (X86fand VR128:$src1, VR128:$src2)),
	(ANDPSrr VR128:$src1, VR128:$src2)>;
	def : Pat<(v4f32 (X86for VR128:$src1, VR128:$src2)),
	(ORPSrr VR128:$src1, VR128:$src2)>;
	def : Pat<(v4f32 (X86fxor VR128:$src1, VR128:$src2)),
	(XORPSrr VR128:$src1, VR128:$src2)>;
	def : Pat<(v4f32 (X86fandn VR128:$src1, VR128:$src2)),
	(ANDNPSrr VR128:$src1, VR128:$src2)>;

	def : Pat<(X86fand VR128:$src1, (memopv4f32 addr:$src2)),
	(ANDPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(X86for VR128:$src1, (memopv4f32 addr:$src2)),
	(ORPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(X86fxor VR128:$src1, (memopv4f32 addr:$src2)),
	(XORPSrm VR128:$src1, addr:$src2)>;
	def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
	(ANDNPSrm VR128:$src1, addr:$src2)>;

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Arithmetic Instructions
	//===----------------------------------------------------------------------===//

	/// basic_sse12_fp_binop_xxx - SSE 1 & 2 binops come in both scalar and
	/// vector forms.
	///
	/// In addition, we also have a special variant of the scalar form here to
	/// represent the associated intrinsic operation. This form is unlike the
	/// plain scalar form, in that it takes an entire vector (instead of a scalar)
	/// and leaves the top elements unmodified (therefore these cannot be commuted).
	///
	/// These three forms can each be reg+reg or reg+mem.
	///

	/// FIXME: once all 256-bit intrinsics are matched, cleanup and refactor those
	/// classes below
	/// basic_sse12_fp_binop_p - packed forms of an SSE 1 & 2 FP binary op.
	/// Expands to 128-bit and 256-bit VEX forms (HasAVX, NoVLX) plus 128-bit
	/// forms with $src1 tied to $dst, for both "ps" and "pd" suffixes.
	/// itins.s supplies the single-precision itineraries, itins.d the double.
	multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
	SDNode OpNode, SizeItins itins> {
	let Predicates = [HasAVX, NoVLX] in {
	defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
	VR128, v4f32, f128mem, loadv4f32,
	SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG;
	defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
	VR128, v2f64, f128mem, loadv2f64,
	SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG;

	defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
	OpNode, VR256, v8f32, f256mem, loadv8f32,
	SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
	defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
	OpNode, VR256, v4f64, f256mem, loadv4f64,
	SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
	}

	// Tied (destructive) forms; these use the memop* load fragments.
	let Constraints = "$src1 = $dst" in {
	defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
	v4f32, f128mem, memopv4f32, SSEPackedSingle,
	itins.s>, PS;
	defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
	v2f64, f128mem, memopv2f64, SSEPackedDouble,
	itins.d>, PD;
	}
	}

	/// basic_sse12_fp_binop_s - scalar forms of an SSE 1 & 2 FP binary op on
	/// FR32 ("ss") and FR64 ("sd"). V#NAME#SS / V#NAME#SD are the VEX forms;
	/// SS / SD have $src1 tied to $dst.
	multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
	SizeItins itins> {
	defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
	OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
	XS, VEX_4V, VEX_LIG, VEX_WIG;
	defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
	OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
	XD, VEX_4V, VEX_LIG, VEX_WIG;

	let Constraints = "$src1 = $dst" in {
	defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
	OpNode, FR32, f32mem, SSEPackedSingle,
	itins.s>, XS;
	defm SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
	OpNode, FR64, f64mem, SSEPackedDouble,
	itins.d>, XD;
	}
	}

	/// basic_sse12_fp_binop_s_int - scalar forms that operate on whole VR128
	/// vectors (v4f32 / v2f64) through sse12_fp_scalar_int, with ssmem/sdmem
	/// memory operands and sse_load_f32/sse_load_f64 load patterns.
	/// OpNode is an SDPatternOperator, so callers may pass null_frag.
	multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
	SDPatternOperator OpNode,
	SizeItins itins> {
	defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
	!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
	SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
	defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
	!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
	SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;

	let Constraints = "$src1 = $dst" in {
	defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
	!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
	SSEPackedSingle, itins.s>, XS;
	defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
	!strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
	SSEPackedDouble, itins.d>, XD;
	}
	}

	// Binary Arithmetic instructions
	// Each defm combines the packed, scalar, and whole-vector scalar ("_int")
	// multiclasses. ADD/MUL/SUB/DIV pass null_frag as the _int node, while
	// MAX/MIN pass X86fmaxs/X86fmins. SUB, DIV, MAX and MIN are defined with
	// isCommutable = 0.
	defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
	basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
	basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>;
	defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
	basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
	basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
	let isCommutable = 0 in {
	defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
	basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
	basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,SSE_ALU_ITINS_S>;
	defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
	basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
	basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,SSE_DIV_ITINS_S>;
	defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
	basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
	basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
	defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
	basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
	basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
	}

	// Codegen-only MAX/MIN variants selected from X86fmaxc/X86fminc. They reuse
	// the MAX/MIN opcodes (0x5F/0x5D) and mnemonics, and have packed and scalar
	// forms only (no _int forms).
	let isCodeGenOnly = 1 in {
	defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>,
	basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
	defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>,
	basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
	}

	// Patterns used to select SSE scalar fp arithmetic instructions from
	// either:
	//
	// (1) a scalar fp operation followed by a blend
	//
	// The effect is that the backend no longer emits unnecessary vector
	// insert instructions immediately after SSE scalar fp instructions
	// like addss or mulss.
	//
	// For example, given the following code:
	// __m128 foo(__m128 A, __m128 B) {
	// A[0] += B[0];
	// return A;
	// }
	//
	// Previously we generated:
	// addss %xmm0, %xmm1
	// movss %xmm1, %xmm0
	//
	// We now generate:
	// addss %xmm1, %xmm0
	//
	// (2) a vector packed single/double fp operation followed by a vector insert
	//
	// The effect is that the backend converts the packed fp instruction
	// followed by a vector insert into a single SSE scalar fp instruction.
	//
	// For example, given the following code:
	// __m128 foo(__m128 A, __m128 B) {
	// __m128 C = A + B;
	// return (__m128) {C[0], A[1], A[2], A[3]};
	// }
	//
	// Previously we generated:
	// addps %xmm0, %xmm1
	// movss %xmm1, %xmm0
	//
	// We now generate:
	// addss %xmm1, %xmm0

	// TODO: Some canonicalization in lowering would simplify the number of
	// patterns we have to try to match.
	/// scalar_math_f32_patterns - select the <OpcPrefix>SSrr_Int instruction
	/// (or its "V"-prefixed AVX counterpart) for a v4f32 whose element 0 is the
	/// result of Op and whose other elements come from $dst.
	///   Op        - the FP node to match (used in both scalar and v4f32 form).
	///   OpcPrefix - instruction name prefix pasted in front of "SSrr_Int".
	multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
	let Predicates = [UseSSE1] in {
	// extracted scalar math op with insert via movss
	def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
	(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
	FR32:$src))))),
	(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
	(COPY_TO_REGCLASS FR32:$src, VR128))>;

	// vector math op with insert via movss
	def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
	(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
	(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
	}

	// With SSE 4.1, blendi is preferred to movss, so match that too.
	let Predicates = [UseSSE41] in {
	// extracted scalar math op with insert via blend
	def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
	(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
	FR32:$src))), (i8 1))),
	(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
	(COPY_TO_REGCLASS FR32:$src, VR128))>;

	// vector math op with insert via blend
	def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
	(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
	(!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;

	}

	// Repeat everything for AVX.
	let Predicates = [UseAVX] in {
	// extracted scalar math op with insert via movss
	def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
	(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
	FR32:$src))))),
	(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
	(COPY_TO_REGCLASS FR32:$src, VR128))>;

	// extracted scalar math op with insert via blend
	def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
	(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
	FR32:$src))), (i8 1))),
	(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
	(COPY_TO_REGCLASS FR32:$src, VR128))>;

	// vector math op with insert via movss
	def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
	(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
	(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;

	// vector math op with insert via blend
	def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
	(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
	(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
	}
	}

	// Instantiate the f32 scalar-math patterns for add, sub, mul and div.
	defm : scalar_math_f32_patterns<fadd, "ADD">;
	defm : scalar_math_f32_patterns<fsub, "SUB">;
	defm : scalar_math_f32_patterns<fmul, "MUL">;
	defm : scalar_math_f32_patterns<fdiv, "DIV">;

	/// scalar_math_f64_patterns - select the <OpcPrefix>SDrr_Int instruction
	/// (or its "V"-prefixed AVX counterpart) for a v2f64 whose element 0 is the
	/// result of Op and whose other element comes from $dst.
	///   Op        - the FP node to match (used in both scalar and v2f64 form).
	///   OpcPrefix - instruction name prefix pasted in front of "SDrr_Int".
	multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
	let Predicates = [UseSSE2] in {
	// extracted scalar math op with insert via movsd
	def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
	(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
	FR64:$src))))),
	(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
	(COPY_TO_REGCLASS FR64:$src, VR128))>;

	// vector math op with insert via movsd
	def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
	(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
	(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
	}

	// With SSE 4.1, blendi is preferred to movsd, so match those too.
	let Predicates = [UseSSE41] in {
	// extracted scalar math op with insert via blend
	def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
	(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
	FR64:$src))), (i8 1))),
	(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
	(COPY_TO_REGCLASS FR64:$src, VR128))>;

	// vector math op with insert via blend
	def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
	(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
	(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
	}

	// Repeat everything for AVX.
	let Predicates = [UseAVX] in {
	// extracted scalar math op with insert via movsd
	def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
	(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
	FR64:$src))))),
	(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
	(COPY_TO_REGCLASS FR64:$src, VR128))>;

	// extracted scalar math op with insert via blend
	def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
	(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
	FR64:$src))), (i8 1))),
	(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
	(COPY_TO_REGCLASS FR64:$src, VR128))>;

	// vector math op with insert via movsd
	def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
	(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
	(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;

	// vector math op with insert via blend
	def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
	(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
	(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
	}
	}

	// Instantiate the f64 scalar-math patterns for add, sub, mul and div.
	defm : scalar_math_f64_patterns<fadd, "ADD">;
	defm : scalar_math_f64_patterns<fsub, "SUB">;
	defm : scalar_math_f64_patterns<fmul, "MUL">;
	defm : scalar_math_f64_patterns<fdiv, "DIV">;


	/// Unop Arithmetic
	/// In addition, we also have a special variant of the scalar form here to
	/// represent the associated intrinsic operation. This form is unlike the
	/// plain scalar form, in that it takes an entire vector (instead of a
	/// scalar) and leaves the top elements undefined.
	///
	/// And, we have a special variant form for a full-vector intrinsic form.

	// Itinerary bundles for square root: each pairs a register (_RR) and a
	// memory (_RM) itinerary class and takes WriteFSqrt as its Sched field.
	let Sched = WriteFSqrt in {
	def SSE_SQRTPS : OpndItins<
	IIC_SSE_SQRTPS_RR, IIC_SSE_SQRTPS_RM
	>;

	def SSE_SQRTSS : OpndItins<
	IIC_SSE_SQRTSS_RR, IIC_SSE_SQRTSS_RM
	>;

	def SSE_SQRTPD : OpndItins<
	IIC_SSE_SQRTPD_RR, IIC_SSE_SQRTPD_RM
	>;

	def SSE_SQRTSD : OpndItins<
	IIC_SSE_SQRTSD_RR, IIC_SSE_SQRTSD_RM
	>;
	}

	// Itinerary bundles for reciprocal square root (packed and scalar single);
	// Sched is WriteFRsqrt.
	let Sched = WriteFRsqrt in {
	def SSE_RSQRTPS : OpndItins<
	IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
	>;

	def SSE_RSQRTSS : OpndItins<
	IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
	>;
	}

	// Itinerary bundles for reciprocal (SSE_RCPP packed, SSE_RCPS scalar);
	// Sched is WriteFRcp.
	let Sched = WriteFRcp in {
	def SSE_RCPP : OpndItins<
	IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
	>;

	def SSE_RCPS : OpndItins<
	IIC_SSE_RCPS_RR, IIC_SSE_RCPS_RM
	>;
	}

	/// sse_fp_unop_s - SSE1 unops in scalar form
	/// For the non-AVX defs, we need $src1 to be tied to $dst because
	/// the HW instructions are 2 operand / destructive.
	///
	/// Records produced per instantiation:
	///   r     - scalar register form on RC, selected from OpNode;
	///   m     - scalar load form, selected only under OptForSize;
	///   r_Int - codegen-only whole-vector (VR128) register form, $src1 tied;
	///   m_Int - codegen-only whole-vector form with a memory second operand.
	/// Intr is the intrinsic matched onto the *_Int forms; Suffix is pasted
	/// between NAME and "r_Int"/"m_Int" when looking those records up.
	multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
	ValueType vt, ValueType ScalarVT,
	X86MemOperand x86memop,
	Intrinsic Intr,
	SDNode OpNode, Domain d, OpndItins itins,
	Predicate target, string Suffix> {
	let hasSideEffects = 0 in {
	def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1),
	!strconcat(OpcodeStr, "\t{$src1, $dst\|$dst, $src1}"),
	[(set RC:$dst, (OpNode RC:$src1))], itins.rr, d>, Sched<[itins.Sched]>,
	Requires<[target]>;
	let mayLoad = 1 in
	def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1),
	!strconcat(OpcodeStr, "\t{$src1, $dst\|$dst, $src1}"),
	[(set RC:$dst, (OpNode (load addr:$src1)))], itins.rm, d>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>,
	Requires<[target, OptForSize]>;

	let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
	// Register-register form: it has no memory operand, so it takes the plain
	// scheduling class (same as `r`), not the load-folded one.
	def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	[]>, Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	}

	let Predicates = [target] in {
	// These are unary operations, but they are modeled as having 2 source operands
	// because the high elements of the destination are unchanged in SSE.
	def : Pat<(Intr VR128:$src),
	(!cast<Instruction>(NAME#Suffix##r_Int) VR128:$src, VR128:$src)>;
	}
	// We don't want to fold scalar loads into these instructions unless
	// optimizing for size. This is because the folded instruction will have a
	// partial register update, while the unfolded sequence will not, e.g.
	// movss mem, %xmm0
	// rcpss %xmm0, %xmm0
	// which has a clobber before the rcp, vs.
	// rcpss mem, %xmm0
	let Predicates = [target, OptForSize] in {
	def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
	(!cast<Instruction>(NAME#Suffix##m_Int)
	(vt (IMPLICIT_DEF)), addr:$src2)>;
	}
	}

	/// avx_fp_unop_s - VEX-encoded scalar unops. All forms take two sources
	/// ($src1, $src2) and use the three-operand asm string.
	///
	/// Records produced per instantiation:
	///   r / m         - scalar forms on RC, defined without patterns;
	///   r_Int / m_Int - codegen-only whole-vector (VR128) forms.
	/// Selection is done by the Pat records at the end: OpNode maps onto r/m
	/// with an IMPLICIT_DEF first source, Intr maps onto r_Int/m_Int. The
	/// load-folding patterns are restricted to OptForSize.
	multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
	ValueType vt, ValueType ScalarVT,
	X86MemOperand x86memop,
	Intrinsic Intr, SDNode OpNode, Domain d,
	OpndItins itins, string Suffix> {
	let hasSideEffects = 0 in {
	def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[], itins.rr, d>, Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
	let isCodeGenOnly = 1, ExeDomain = d in {
	// Register-register form: it has no memory operand, so it takes the plain
	// scheduling class (same as `r`), not the load-folded one.
	def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[itins.Sched]>;
	let mayLoad = 1 in
	def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, x86memop:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	}

	// We don't want to fold scalar loads into these instructions unless
	// optimizing for size. This is because the folded instruction will have a
	// partial register update, while the unfolded sequence will not, e.g.
	// vmovss mem, %xmm0
	// vrcpss %xmm0, %xmm0, %xmm0
	// which has a clobber before the rcp, vs.
	// vrcpss mem, %xmm0, %xmm0
	// TODO: In theory, we could fold the load, and avoid the stall caused by
	// the partial register store, either in ExecutionDepsFix or with smarter RA.
	let Predicates = [UseAVX] in {
	def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
	(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
	}
	let Predicates = [HasAVX] in {
	def : Pat<(Intr VR128:$src),
	(!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
	VR128:$src)>;
	}
	let Predicates = [HasAVX, OptForSize] in {
	def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
	(!cast<Instruction>("V"#NAME#Suffix##m_Int)
	(vt (IMPLICIT_DEF)), addr:$src2)>;
	}
	let Predicates = [UseAVX, OptForSize] in {
	def : Pat<(ScalarVT (OpNode (load addr:$src))),
	(!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
	addr:$src)>;
	}
	}

	/// sse1_fp_unop_p - SSE1 unops in packed form.
	/// Defines VEX 128-bit and 256-bit "ps" forms (register and memory) guarded
	/// by the caller-supplied predicate list `prds`, plus non-VEX 128-bit PSr /
	/// PSm forms that are defined outside the `prds` scope.
	multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
	OpndItins itins, list<Predicate> prds> {
	let Predicates = prds in {
	def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	!strconcat("v", OpcodeStr,
	"ps\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
	itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
	def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	!strconcat("v", OpcodeStr,
	"ps\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
	itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
	def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	!strconcat("v", OpcodeStr,
	"ps\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
	itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
	def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
	!strconcat("v", OpcodeStr,
	"ps\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
	itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
	}

	// Non-VEX forms; the memory form uses memopv4f32 rather than loadv4f32.
	def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	!strconcat(OpcodeStr, "ps\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>,
	Sched<[itins.Sched]>;
	def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	!strconcat(OpcodeStr, "ps\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>,
	Sched<[itins.Sched.Folded]>;
	}

	/// sse2_fp_unop_p - SSE2 unops in vector forms.
	/// Defines VEX 128-bit and 256-bit "pd" forms (register and memory) under
	/// HasAVX, plus non-VEX 128-bit PDr / PDm forms. Unlike sse1_fp_unop_p, the
	/// predicate list is fixed here rather than supplied by the caller.
	multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
	SDNode OpNode, OpndItins itins> {
	let Predicates = [HasAVX] in {
	def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	!strconcat("v", OpcodeStr,
	"pd\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
	itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
	def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	!strconcat("v", OpcodeStr,
	"pd\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
	itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
	def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	!strconcat("v", OpcodeStr,
	"pd\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
	itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
	def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
	!strconcat("v", OpcodeStr,
	"pd\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
	itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
	}

	// Non-VEX forms; the memory form uses memopv2f64 rather than loadv2f64.
	def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	!strconcat(OpcodeStr, "pd\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>,
	Sched<[itins.Sched]>;
	def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
	!strconcat(OpcodeStr, "pd\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>,
	Sched<[itins.Sched.Folded]>;
	}

	/// sse1_fp_unop_s - scalar single-precision unop: an SSE form (SS, via
	/// sse_fp_unop_s, predicate UseSSE1) and a VEX form (V#NAME#SS, via
	/// avx_fp_unop_s). The matched intrinsic is looked up by name as
	/// "int_x86_sse_" + OpcodeStr + "_ss" for both.
	multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
	OpndItins itins> {
	defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
	!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
	SSEPackedSingle, itins, UseSSE1, "SS">, XS;
	defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
	f32mem,
	!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
	SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG;
	}

	/// sse2_fp_unop_s - scalar double-precision unop: an SSE form (SD, via
	/// sse_fp_unop_s, predicate UseSSE2) and a VEX form (V#NAME#SD, via
	/// avx_fp_unop_s). The matched intrinsic is looked up by name as
	/// "int_x86_sse2_" + OpcodeStr + "_sd" for both.
	multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
	OpndItins itins> {
	defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
	!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
	OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
	defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
	f64mem,
	!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
	OpNode, SSEPackedDouble, itins, "SD">,
	XD, VEX_4V, VEX_LIG, VEX_WIG;
	}

	// Square root.
	// SQRT gets scalar and packed forms in both single and double precision;
	// its packed-single VEX forms are guarded by [HasAVX] only.
	defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
	sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
	sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
	sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;

	// Reciprocal approximations. Note that these typically require refinement
	// in order to obtain suitable precision.
	// Only single-precision forms are instantiated; their packed VEX forms are
	// guarded by [HasAVX, NoVLX].
	defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
	sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
	defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
	sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;

	// There is no f64 version of the reciprocal approximation instructions.

	// TODO: We should add scalar op patterns for these just like we have for
	// the binops above. If the binop and unop patterns could all be unified
	// that would be even better.

	/// scalar_unary_math_patterns - fold "Intr applied to $src, then inserted
	/// into $dst via Move (or via X86Blendi with mask 1)" into the single
	/// <OpcPrefix>r_Int instruction, or its "V"-prefixed form under HasAVX.
	///   Intr          - the scalar intrinsic to match.
	///   OpcPrefix     - instruction name prefix pasted in front of "r_Int".
	///   Move          - the move node used for the insert (X86Movss/X86Movsd).
	///   VT            - vector type of both operands and the result.
	///   BasePredicate - predicate guarding the non-AVX Move pattern.
	multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
	SDNode Move, ValueType VT,
	Predicate BasePredicate> {
	let Predicates = [BasePredicate] in {
	def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
	(!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
	}

	// With SSE 4.1, blendi is preferred to movs*, so match that too.
	let Predicates = [UseSSE41] in {
	def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
	(!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
	}

	// Repeat for AVX versions of the instructions.
	let Predicates = [HasAVX] in {
	def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
	(!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;

	def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
	(!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
	}
	}

	// Instantiate the unary scalar patterns for rcpss, rsqrtss, sqrtss (v4f32,
	// X86Movss, UseSSE1) and sqrtsd (v2f64, X86Movsd, UseSSE2).
	defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
	v4f32, UseSSE1>;
	defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
	v4f32, UseSSE1>;
	defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
	v4f32, UseSSE1>;
	defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
	v2f64, UseSSE2>;


	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Non-temporal stores
	//===----------------------------------------------------------------------===//

	let AddedComplexity = 400 in { // Prefer non-temporal versions
	let SchedRW = [WriteStore] in {
	// VEX-encoded aligned non-temporal stores from 128-bit (VR128) and 256-bit
	// (VR256) registers. Each record matches alignednontemporalstore for exactly
	// one vector type. The asm strings use the {a|b} alternation to list both
	// operand orders.
	let Predicates = [HasAVX, NoVLX] in {
	  // 128-bit forms.
	  def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
	      (ins f128mem:$dst, VR128:$src),
	      "movntps\t{$src, $dst|$dst, $src}",
	      [(alignednontemporalstore (v4f32 VR128:$src),
	          addr:$dst)],
	      IIC_SSE_MOVNT>, VEX, VEX_WIG;
	  def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
	      (ins f128mem:$dst, VR128:$src),
	      "movntpd\t{$src, $dst|$dst, $src}",
	      [(alignednontemporalstore (v2f64 VR128:$src),
	          addr:$dst)],
	      IIC_SSE_MOVNT>, VEX, VEX_WIG;

	  // The integer store is placed in the packed-integer execution domain.
	  let ExeDomain = SSEPackedInt in
	  def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
	      (ins i128mem:$dst, VR128:$src),
	      "movntdq\t{$src, $dst|$dst, $src}",
	      [(alignednontemporalstore (v2i64 VR128:$src),
	          addr:$dst)],
	      IIC_SSE_MOVNT>, VEX, VEX_WIG;

	  // 256-bit forms: same opcodes, with VEX_L added.
	  def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
	      (ins f256mem:$dst, VR256:$src),
	      "movntps\t{$src, $dst|$dst, $src}",
	      [(alignednontemporalstore (v8f32 VR256:$src),
	          addr:$dst)],
	      IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
	  def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
	      (ins f256mem:$dst, VR256:$src),
	      "movntpd\t{$src, $dst|$dst, $src}",
	      [(alignednontemporalstore (v4f64 VR256:$src),
	          addr:$dst)],
	      IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
	  let ExeDomain = SSEPackedInt in
	  def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
	      (ins i256mem:$dst, VR256:$src),
	      "movntdq\t{$src, $dst|$dst, $src}",
	      [(alignednontemporalstore (v4i64 VR256:$src),
	          addr:$dst)],
	      IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
	}

	// Legacy-encoded aligned non-temporal stores of packed single (v4f32) and
	// packed double (v2f64) values from a VR128 register.
	def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	    "movntps\t{$src, $dst|$dst, $src}",
	    [(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
	    IIC_SSE_MOVNT>;
	def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
	    "movntpd\t{$src, $dst|$dst, $src}",
	    [(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
	    IIC_SSE_MOVNT>;

	// Legacy-encoded aligned non-temporal store of a 128-bit integer vector.
	// The destination uses the integer memory operand (i128mem), matching
	// VMOVNTDQmr and the MOVDQA/MOVDQU stores in this file.
	let ExeDomain = SSEPackedInt in
	def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
	    "movntdq\t{$src, $dst|$dst, $src}",
	    [(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
	    IIC_SSE_MOVNT>;

	// There is no AVX form for instructions below this point
	// Non-temporal stores from general-purpose registers. The 32-bit form takes
	// GR32, the 64-bit form (RI class) takes GR64; both match nontemporalstore
	// and require HasSSE2.
	def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
	    "movnti{l}\t{$src, $dst|$dst, $src}",
	    [(nontemporalstore (i32 GR32:$src), addr:$dst)],
	    IIC_SSE_MOVNT>,
	    PS, Requires<[HasSSE2]>;
	def MOVNTI_64mr : RI<0xC3, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
	    "movnti{q}\t{$src, $dst|$dst, $src}",
	    [(nontemporalstore (i64 GR64:$src), addr:$dst)],
	    IIC_SSE_MOVNT>,
	    PS, Requires<[HasSSE2]>;
	} // SchedRW = [WriteStore]

	// The MOVNTDQ instruction records only carry a pattern for the 64-bit
	// element type, so map the remaining integer element types onto the same
	// instructions.
	let Predicates = [HasAVX, NoVLX] in {
	// 256-bit integer vectors -> VMOVNTDQYmr.
	def : Pat<(alignednontemporalstore (v8i32 VR256:$src), addr:$dst),
	(VMOVNTDQYmr addr:$dst, VR256:$src)>;
	def : Pat<(alignednontemporalstore (v16i16 VR256:$src), addr:$dst),
	(VMOVNTDQYmr addr:$dst, VR256:$src)>;
	def : Pat<(alignednontemporalstore (v32i8 VR256:$src), addr:$dst),
	(VMOVNTDQYmr addr:$dst, VR256:$src)>;

	// 128-bit integer vectors -> VMOVNTDQmr.
	def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
	(VMOVNTDQmr addr:$dst, VR128:$src)>;
	def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
	(VMOVNTDQmr addr:$dst, VR128:$src)>;
	def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
	(VMOVNTDQmr addr:$dst, VR128:$src)>;
	}

	// Same 128-bit mappings for the legacy-encoded MOVNTDQmr.
	let Predicates = [UseSSE2] in {
	def : Pat<(alignednontemporalstore (v4i32 VR128:$src), addr:$dst),
	(MOVNTDQmr addr:$dst, VR128:$src)>;
	def : Pat<(alignednontemporalstore (v8i16 VR128:$src), addr:$dst),
	(MOVNTDQmr addr:$dst, VR128:$src)>;
	def : Pat<(alignednontemporalstore (v16i8 VR128:$src), addr:$dst),
	(MOVNTDQmr addr:$dst, VR128:$src)>;
	}

	} // AddedComplexity

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Prefetch and memory fence
	//===----------------------------------------------------------------------===//

	// Prefetch intrinsic.
	// Four prefetch variants sharing opcode 0x18; they differ in the ModRM form
	// (MRM1m/MRM2m/MRM3m/MRM0m) and in the third operand of the matched prefetch
	// node: 3 -> prefetcht0, 2 -> prefetcht1, 1 -> prefetcht2, 0 -> prefetchnta.
	// (Presumably that operand is the locality argument of llvm.prefetch --
	// confirm against the LLVM language reference.)
	let Predicates = [HasSSE1], SchedRW = [WriteLoad] in {
	def PREFETCHT0 : I<0x18, MRM1m, (outs), (ins i8mem:$src),
	"prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3), (i32 1))],
	IIC_SSE_PREFETCH>, TB;
	def PREFETCHT1 : I<0x18, MRM2m, (outs), (ins i8mem:$src),
	"prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2), (i32 1))],
	IIC_SSE_PREFETCH>, TB;
	def PREFETCHT2 : I<0x18, MRM3m, (outs), (ins i8mem:$src),
	"prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1), (i32 1))],
	IIC_SSE_PREFETCH>, TB;
	def PREFETCHNTA : I<0x18, MRM0m, (outs), (ins i8mem:$src),
	"prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0), (i32 1))],
	IIC_SSE_PREFETCH>, TB;
	}

	// FIXME: How should flush instruction be modeled?
	// CLFLUSH is scheduled as a load (WriteLoad) and reuses the prefetch
	// itinerary; see the FIXME above about how it should really be modeled.
	let SchedRW = [WriteLoad] in {
	// Flush cache
	// Selected from the int_x86_sse2_clflush intrinsic; requires HasSSE2.
	def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
	"clflush\t$src", [(int_x86_sse2_clflush addr:$src)],
	IIC_SSE_PREFETCH>, PS, Requires<[HasSSE2]>;
	}

	let SchedRW = [WriteNop] in {
	// Pause. This "instruction" is encoded as "rep; nop", so even though it
	// was introduced with SSE2, it's backward compatible.
	def PAUSE : I<0x90, RawFrm, (outs), (ins),
	- "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>,
	- OBXS, Requires<[HasSSE2]>;
	+ "pause", [(int_x86_sse2_pause)], IIC_SSE_PAUSE>, OBXS;
	}

	// SFENCE, LFENCE and MFENCE share opcode 0xAE and differ in the fixed ModRM
	// byte (MRM_F8, MRM_E8, MRM_F0). Each is selected from its intrinsic; note
	// that MFENCE is gated on HasMFence rather than on an SSE level.
	let SchedRW = [WriteFence] in {
	// Load, store, and memory fence
	// TODO: As with mfence, we may want to ease the availability of sfence/lfence
	// to include any 64-bit target.
	def SFENCE : I<0xAE, MRM_F8, (outs), (ins),
	"sfence", [(int_x86_sse_sfence)], IIC_SSE_SFENCE>,
	PS, Requires<[HasSSE1]>;
	def LFENCE : I<0xAE, MRM_E8, (outs), (ins),
	"lfence", [(int_x86_sse2_lfence)], IIC_SSE_LFENCE>,
	TB, Requires<[HasSSE2]>;
	def MFENCE : I<0xAE, MRM_F0, (outs), (ins),
	"mfence", [(int_x86_sse2_mfence)], IIC_SSE_MFENCE>,
	TB, Requires<[HasMFence]>;
	} // SchedRW

	// The target-specific X86MFence node also selects MFENCE.
	def : Pat<(X86MFence), (MFENCE)>;

	//===----------------------------------------------------------------------===//
	// SSE 1 & 2 - Load/Store XCSR register
	//===----------------------------------------------------------------------===//

	// Load (MRM2m) and store (MRM3m) of the MXCSR register through a 32-bit
	// memory operand. The VEX-encoded and legacy-encoded records match the same
	// intrinsics; loads are scheduled as WriteLoad and stores as WriteStore.
	def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
	"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
	IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG;
	def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
	"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
	IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG;

	// Legacy encodings.
	def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
	"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
	IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
	def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
	"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
	IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;

	//===---------------------------------------------------------------------===//
	// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
	//===---------------------------------------------------------------------===//

	let ExeDomain = SSEPackedInt in { // SSE integer instructions

	// VEX-encoded register-to-register integer vector moves (opcode 0x6F,
	// MRMSrcReg). These records carry no selection pattern.
	let hasSideEffects = 0, SchedRW = [WriteMove] in {
	  def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
	      VEX, VEX_WIG;
	  def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
	      VEX, VEX_L, VEX_WIG;
	  def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	      "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
	      VEX, VEX_WIG;
	  def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	      "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
	      VEX, VEX_L, VEX_WIG;
	}

	// For Disassembler
	// Alternate encodings of the VEX register moves using the store opcode
	// (0x7F, MRMDestReg). They are code-gen only but forced into the
	// disassembler tables; FoldGenData names the corresponding 0x6F record.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
	    SchedRW = [WriteMove] in {
	  def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	      "movdqa\t{$src, $dst|$dst, $src}", [],
	      IIC_SSE_MOVA_P_RR>,
	      VEX, VEX_WIG, FoldGenData<"VMOVDQArr">;
	  def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
	      "movdqa\t{$src, $dst|$dst, $src}", [],
	      IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG,
	      FoldGenData<"VMOVDQAYrr">;
	  def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	      "movdqu\t{$src, $dst|$dst, $src}", [],
	      IIC_SSE_MOVU_P_RR>,
	      VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">;
	  def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
	      "movdqu\t{$src, $dst|$dst, $src}", [],
	      IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG,
	      FoldGenData<"VMOVDQUYrr">;
	}

	// VEX-encoded integer vector loads. Only the 128-bit forms carry a selection
	// pattern, and each of those is individually gated on [HasAVX,NoVLX] by the
	// single-statement `let` directly above it; the 256-bit forms have an empty
	// pattern list.
	let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
	    hasSideEffects = 0, SchedRW = [WriteLoad] in {
	  let Predicates = [HasAVX,NoVLX] in
	  def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	      "movdqa\t{$src, $dst|$dst, $src}",
	      [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
	      IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG;
	  def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
	      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
	      VEX, VEX_L, VEX_WIG;
	  // The unaligned forms use the plain I class with an explicit XS prefix,
	  // so the mnemonic is spelled out in full ("vmovdqu").
	  let Predicates = [HasAVX,NoVLX] in
	  def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	      "vmovdqu\t{$src, $dst|$dst, $src}",
	      [(set VR128:$dst, (loadv2i64 addr:$src))],
	      IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG;
	  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
	      "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
	      XS, VEX, VEX_L, VEX_WIG;
	}

	// VEX-encoded integer vector stores (opcode 0x7F, MRMDestMem). As with the
	// loads, only the 128-bit forms carry a pattern (v2i64) and are gated on
	// [HasAVX,NoVLX]; the 256-bit forms have an empty pattern list.
	let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
	  let Predicates = [HasAVX,NoVLX] in
	  def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
	      (ins i128mem:$dst, VR128:$src),
	      "movdqa\t{$src, $dst|$dst, $src}",
	      [(alignedstore (v2i64 VR128:$src), addr:$dst)],
	      IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
	  def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
	      (ins i256mem:$dst, VR256:$src),
	      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
	      VEX, VEX_L, VEX_WIG;
	  let Predicates = [HasAVX,NoVLX] in
	  def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
	      "vmovdqu\t{$src, $dst|$dst, $src}",
	      [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>,
	      XS, VEX, VEX_WIG;
	  def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
	      "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
	      XS, VEX, VEX_L, VEX_WIG;
	}

	// Legacy-encoded register-to-register integer vector moves, plus their
	// 0x7F (MRMDestReg) alternate encodings for the disassembler. None of these
	// records carries a selection pattern.
	let SchedRW = [WriteMove] in {
	  let hasSideEffects = 0 in {
	    def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	        "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;

	    def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	        "movdqu\t{$src, $dst|$dst, $src}",
	        [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
	  }

	  // For Disassembler
	  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
	    def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	        "movdqa\t{$src, $dst|$dst, $src}", [],
	        IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">;

	    def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	        "movdqu\t{$src, $dst|$dst, $src}",
	        [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>,
	        FoldGenData<"MOVDQUrr">;
	  }
	} // SchedRW

	// Legacy-encoded integer vector loads. The selection patterns are kept only
	// as comments inside the pattern list, so both records have an empty
	// pattern.
	let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
	    hasSideEffects = 0, SchedRW = [WriteLoad] in {
	  def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	      "movdqa\t{$src, $dst|$dst, $src}",
	      [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
	      IIC_SSE_MOVA_P_RM>;
	  def MOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	      "movdqu\t{$src, $dst|$dst, $src}",
	      [/*(set VR128:$dst, (loadv2i64 addr:$src))*/],
	      IIC_SSE_MOVU_P_RM>,
	      XS, Requires<[UseSSE2]>;
	}

	// Legacy-encoded integer vector stores. As with the loads, the selection
	// patterns are present only as comments, leaving the pattern list empty.
	let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
	  def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
	      "movdqa\t{$src, $dst|$dst, $src}",
	      [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
	      IIC_SSE_MOVA_P_MR>;
	  def MOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
	      "movdqu\t{$src, $dst|$dst, $src}",
	      [/*(store (v2i64 VR128:$src), addr:$dst)*/],
	      IIC_SSE_MOVU_P_MR>,
	      XS, Requires<[UseSSE2]>;
	}

	} // ExeDomain = SSEPackedInt

	// Aliases to help the assembler pick two byte VEX encodings by swapping the
	// operands relative to the normal instructions to use VEX.R instead of VEX.B.
	// Each alias routes a move whose destination is in the L register class and
	// whose source is in the H register class to the _REV record. The trailing
	// 0 is the alias's final argument (presumably the emit/print-priority flag
	// -- confirm against InstAlias).
	def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
	    (VMOVDQArr_REV VR128L:$dst, VR128H:$src), 0>;
	def : InstAlias<"vmovdqa\t{$src, $dst|$dst, $src}",
	    (VMOVDQAYrr_REV VR256L:$dst, VR256H:$src), 0>;
	def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
	    (VMOVDQUrr_REV VR128L:$dst, VR128H:$src), 0>;
	def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
	    (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;

	// Extra store patterns for the VEX-encoded integer moves: the instruction
	// records only match v2i64, so the other 128-bit integer types and the
	// low-half extracts of 256-bit vectors are mapped here.
	let Predicates = [HasAVX, NoVLX] in {
	// Additional patterns for other integer sizes.
	// Aligned stores select VMOVDQAmr; plain stores select VMOVDQUmr.
	def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
	(VMOVDQAmr addr:$dst, VR128:$src)>;
	def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
	(VMOVDQAmr addr:$dst, VR128:$src)>;
	def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
	(VMOVDQAmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v4i32 VR128:$src), addr:$dst),
	(VMOVDQUmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v8i16 VR128:$src), addr:$dst),
	(VMOVDQUmr addr:$dst, VR128:$src)>;
	def : Pat<(store (v16i8 VR128:$src), addr:$dst),
	(VMOVDQUmr addr:$dst, VR128:$src)>;

	// Special patterns for storing subvector extracts of lower 128-bits
	// It's cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr
	// The extract at index 0 becomes an EXTRACT_SUBREG of sub_xmm.
	def : Pat<(alignedstore (v2i64 (extract_subvector
	(v4i64 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(alignedstore (v4i32 (extract_subvector
	(v8i32 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(alignedstore (v8i16 (extract_subvector
	(v16i16 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(alignedstore (v16i8 (extract_subvector
	(v32i8 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;

	// Unaligned versions of the same low-half stores.
	def : Pat<(store (v2i64 (extract_subvector
	(v4i64 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(store (v4i32 (extract_subvector
	(v8i32 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(store (v8i16 (extract_subvector
	(v16i16 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	def : Pat<(store (v16i8 (extract_subvector
	(v32i8 VR256:$src), (iPTR 0))), addr:$dst),
	(VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
	}

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Arithmetic Instructions
	//===---------------------------------------------------------------------===//

	// Operand itinerary bundle for pmaddwd: both OpndItins arguments are
	// IIC_SSE_PMADD, and the scheduling class is WriteVecIMul.
	let Sched = WriteVecIMul in
	def SSE_PMADD : OpndItins<
	IIC_SSE_PMADD, IIC_SSE_PMADD
	>;

	let ExeDomain = SSEPackedInt in { // SSE integer instructions

	/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
	///   opc        - opcode byte shared by the rr and rm forms.
	///   OpcodeStr  - mnemonic placed in front of the operand list.
	///   OpNode     - SDNode matched by both forms.
	///   DstVT      - result value type.
	///   SrcVT      - value type applied to $src1.
	///   RC         - register class of $dst, $src1 and (rr only) $src2.
	///   memop_frag - load fragment for the rm form; its result is bitconverted.
	///   x86memop   - memory operand type of $src2 in the rm form.
	///   itins      - supplies the scheduling class (itins.Sched).
	///   Is2Addr    - 1 prints the two-operand asm form, 0 the three-operand one.
	multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
	    ValueType DstVT, ValueType SrcVT, RegisterClass RC,
	    PatFrag memop_frag, X86MemOperand x86memop,
	    OpndItins itins, bit Is2Addr = 1> {
	  // Only the register-register form is marked commutable.
	  let isCommutable = 1 in
	  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
	      (ins RC:$src1, RC:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
	      Sched<[itins.Sched]>;
	  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
	      (ins RC:$src1, x86memop:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
	          (bitconvert (memop_frag addr:$src2)))))]>,
	      Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	} // ExeDomain = SSEPackedInt

	// Packed integer arithmetic built from PDI_binop_all (defined elsewhere in
	// the file). Arguments, in order: opcode, mnemonic, SDNode, 128-bit type,
	// 256-bit type, itinerary bundle, a 0/1 flag, and a predicate.
	// NOTE(review): the flag is 1 for add/mul/min/max/avg and 0 for every
	// subtract, which is consistent with a commutativity bit -- confirm against
	// the PDI_binop_all definition.
	// Byte and word operations pass NoVLX_Or_NoBWI; dword and qword operations
	// pass NoVLX.

	// Wrapping, signed-saturating and unsigned-saturating adds.
	defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
	SSE_INTALU_ITINS_P, 1, NoVLX>;
	defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
	SSE_INTALUQ_ITINS_P, 1, NoVLX>;
	defm PADDSB : PDI_binop_all<0xEC, "paddsb", X86adds, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PADDSW : PDI_binop_all<0xED, "paddsw", X86adds, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PADDUSB : PDI_binop_all<0xDC, "paddusb", X86addus, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PADDUSW : PDI_binop_all<0xDD, "paddusw", X86addus, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	// Word multiplies (mul, mulhu, mulhs) use the multiply itinerary.
	defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
	SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
	SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PMULHW : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
	SSE_INTMUL_ITINS_P, 1, NoVLX_Or_NoBWI>;
	// Subtracts: flag is 0 throughout.
	defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
	defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
	defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
	SSE_INTALU_ITINS_P, 0, NoVLX>;
	defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
	SSE_INTALUQ_ITINS_P, 0, NoVLX>;
	defm PSUBSB : PDI_binop_all<0xE8, "psubsb", X86subs, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
	defm PSUBSW : PDI_binop_all<0xE9, "psubsw", X86subs, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
	defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
	defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 0, NoVLX_Or_NoBWI>;
	// Min/max: unsigned on bytes, signed on words.
	defm PMINUB : PDI_binop_all<0xDA, "pminub", umin, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PMINSW : PDI_binop_all<0xEA, "pminsw", smin, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", umax, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", smax, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	// Averages.
	defm PAVGB : PDI_binop_all<0xE0, "pavgb", X86avg, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;
	defm PAVGW : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, NoVLX_Or_NoBWI>;

	// Binary operations whose result type differs from the source type, built
	// from PDI_binop_rm2. Each comes in three flavours: VEX 128-bit (HasAVX),
	// VEX 256-bit (HasAVX2, VEX_L) and the legacy two-address form tied with
	// "$src1 = $dst". The VEX forms pass Is2Addr = 0 and use loadv2i64/loadv4i64;
	// the legacy form uses memopv2i64.

	// pmaddwd: v8i16/v16i16 sources -> v4i32/v8i32 result.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
	defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
	loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG;

	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
	defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
	VR256, loadv4i64, i256mem, SSE_PMADD,
	0>, VEX_4V, VEX_L, VEX_WIG;
	let Constraints = "$src1 = $dst" in
	defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
	memopv2i64, i128mem, SSE_PMADD>;

	// psadbw: v16i8/v32i8 sources -> v2i64/v4i64 result.
	// NOTE(review): the VEX forms use SSE_INTMUL_ITINS_P but the legacy PSADBW
	// below uses SSE_INTALU_ITINS_P -- confirm whether the difference is
	// intentional.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
	defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
	loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
	VEX_4V, VEX_WIG;
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
	defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
	loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
	VEX_4V, VEX_L, VEX_WIG;
	let Constraints = "$src1 = $dst" in
	defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
	memopv2i64, i128mem, SSE_INTALU_ITINS_P>;

	// pmuludq: v4i32/v8i32 sources -> v2i64/v4i64 result; gated on NoVLX only.
	let Predicates = [HasAVX, NoVLX] in
	defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
	loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
	VEX_4V, VEX_WIG;
	let Predicates = [HasAVX2, NoVLX] in
	defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
	VR256, loadv4i64, i256mem,
	SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
	let Constraints = "$src1 = $dst" in
	defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
	memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Logical Instructions
	//===---------------------------------------------------------------------===//

	// PDI_binop_rmi - Binary operation with register (rr), memory (rm) and
	// 8-bit immediate (ri) second-operand forms.
	//   opc       - opcode byte of the rr and rm forms.
	//   opc2      - opcode byte of the ri form.
	//   ImmForm   - encoding format of the ri form.
	//   OpcodeStr - mnemonic placed in front of the operand list.
	//   OpNode    - SDNode matched by rr and rm.
	//   OpNode2   - SDNode matched by ri.
	//   RC        - register class of $dst and $src1.
	//   DstVT     - result value type.
	//   SrcVT     - value type of the 128-bit $src2 in rr and rm.
	//   ld_frag   - load fragment for the rm form; its result is bitconverted.
	//   Is2Addr   - 1 prints the two-operand asm form, 0 the three-operand one.
	multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
	    string OpcodeStr, SDNode OpNode,
	    SDNode OpNode2, RegisterClass RC,
	    ValueType DstVT, ValueType SrcVT,
	    PatFrag ld_frag, bit Is2Addr = 1> {
	  // src2 is always 128-bit
	  def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
	      (ins RC:$src1, VR128:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set RC:$dst, (DstVT (OpNode RC:$src1, (SrcVT VR128:$src2))))],
	      SSE_INTSHIFT_ITINS_P.rr>, Sched<[WriteVecShift]>;
	  def rm : PDI<opc, MRMSrcMem, (outs RC:$dst),
	      (ins RC:$src1, i128mem:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set RC:$dst, (DstVT (OpNode RC:$src1,
	          (SrcVT (bitconvert (ld_frag addr:$src2))))))],
	      SSE_INTSHIFT_ITINS_P.rm>, Sched<[WriteVecShiftLd, ReadAfterLd]>;
	  def ri : PDIi8<opc2, ImmForm, (outs RC:$dst),
	      (ins RC:$src1, u8imm:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i8 imm:$src2))))],
	      SSE_INTSHIFT_ITINS_P.ri>, Sched<[WriteVecShift]>;
	}

	// PDI_binop_rmi_all - Expands PDI_binop_rmi three times for one operation:
	//   V#NAME   - VEX 128-bit form, gated on [HasAVX, prd].
	//   V#NAME#Y - VEX 256-bit form (VEX_L), gated on [HasAVX2, prd].
	//   NAME     - legacy form with $src1 tied to $dst.
	// The VEX forms prefix the mnemonic with "v" and pass Is2Addr = 0. Both VEX
	// widths load the second operand with loadv2i64, i.e. a 128-bit load even
	// for the 256-bit form; the legacy form uses memopv2i64.
	multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
	string OpcodeStr, SDNode OpNode,
	SDNode OpNode2, ValueType DstVT128,
	ValueType DstVT256, ValueType SrcVT,
	Predicate prd> {
	let Predicates = [HasAVX, prd] in
	defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
	OpNode, OpNode2, VR128, DstVT128, SrcVT,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	let Predicates = [HasAVX2, prd] in
	defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
	OpNode, OpNode2, VR256, DstVT256, SrcVT,
	loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG;
	let Constraints = "$src1 = $dst" in
	defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
	VR128, DstVT128, SrcVT, memopv2i64>;
	}

	// PDI_binop_ri - Binary operation that exists only in register/8-bit
	// immediate form.
	//   opc       - opcode byte.
	//   ImmForm   - encoding format.
	//   OpcodeStr - mnemonic placed in front of the operand list.
	//   OpNode    - SDNode matched on ($src1, i8 immediate).
	//   RC        - register class of $dst and $src1.
	//   VT        - result value type.
	//   Is2Addr   - 1 prints the two-operand asm form, 0 the three-operand one.
	multiclass PDI_binop_ri<bits<8> opc, Format ImmForm, string OpcodeStr,
	    SDNode OpNode, RegisterClass RC, ValueType VT,
	    bit Is2Addr = 1> {
	  def ri : PDIi8<opc, ImmForm, (outs RC:$dst), (ins RC:$src1, u8imm:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set RC:$dst, (VT (OpNode RC:$src1, (i8 imm:$src2))))],
	      IIC_SSE_INTSHDQ_P_RI>, Sched<[WriteVecShift]>;
	}

	// PDI_binop_ri_all - Expands PDI_binop_ri into the VEX 128-bit (V#NAME,
	// v16i8), VEX 256-bit (V#NAME#Y, v32i8, VEX_L) and legacy tied-operand
	// (NAME, v16i8) forms. The VEX forms are gated on NoVLX_Or_NoBWI together
	// with HasAVX / HasAVX2 and pass Is2Addr = 0.
	multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
	SDNode OpNode> {
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
	defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
	VR128, v16i8, 0>, VEX_4V, VEX_WIG;
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
	defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
	VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG;
	let Constraints = "$src1 = $dst" in
	defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
	}

	// Packed shift instantiations. For PDI_binop_rmi_all the first opcode is
	// the register/memory-count form and the second opcode plus MRMnr format is
	// the immediate-count form; word shifts pass NoVLX_Or_NoBWI, dword and qword
	// shifts pass NoVLX.
	let ExeDomain = SSEPackedInt in {
	// Logical left shifts (immediate form encoded with MRM6r).
	defm PSLLW : PDI_binop_rmi_all<0xF1, 0x71, MRM6r, "psllw", X86vshl, X86vshli,
	v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
	defm PSLLD : PDI_binop_rmi_all<0xF2, 0x72, MRM6r, "pslld", X86vshl, X86vshli,
	v4i32, v8i32, v4i32, NoVLX>;
	defm PSLLQ : PDI_binop_rmi_all<0xF3, 0x73, MRM6r, "psllq", X86vshl, X86vshli,
	v2i64, v4i64, v2i64, NoVLX>;

	// Logical right shifts (immediate form encoded with MRM2r).
	defm PSRLW : PDI_binop_rmi_all<0xD1, 0x71, MRM2r, "psrlw", X86vsrl, X86vsrli,
	v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
	defm PSRLD : PDI_binop_rmi_all<0xD2, 0x72, MRM2r, "psrld", X86vsrl, X86vsrli,
	v4i32, v8i32, v4i32, NoVLX>;
	defm PSRLQ : PDI_binop_rmi_all<0xD3, 0x73, MRM2r, "psrlq", X86vsrl, X86vsrli,
	v2i64, v4i64, v2i64, NoVLX>;

	// Arithmetic right shifts (immediate form encoded with MRM4r); word and
	// dword only.
	defm PSRAW : PDI_binop_rmi_all<0xE1, 0x71, MRM4r, "psraw", X86vsra, X86vsrai,
	v8i16, v16i16, v8i16, NoVLX_Or_NoBWI>;
	defm PSRAD : PDI_binop_rmi_all<0xE2, 0x72, MRM4r, "psrad", X86vsra, X86vsrai,
	v4i32, v8i32, v4i32, NoVLX>;

	// Whole-register shifts: immediate form only, sharing opcode 0x73 and
	// distinguished by MRM7r / MRM3r.
	defm PSLLDQ : PDI_binop_ri_all<0x73, MRM7r, "pslldq", X86vshldq>;
	defm PSRLDQ : PDI_binop_ri_all<0x73, MRM3r, "psrldq", X86vshrdq>;
	// PSRADQri doesn't exist in SSE[1-3].
	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Comparison Instructions
	//===---------------------------------------------------------------------===//

	// Packed integer compares for byte, word and dword elements. All pass
	// TruePredicate as the extra predicate. The equality compares pass 1 for
	// the flag argument and the greater-than compares pass 0 (presumably the
	// commutativity bit of PDI_binop_all -- confirm against its definition).
	defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 1, TruePredicate>;
	defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 1, TruePredicate>;
	defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
	SSE_INTALU_ITINS_P, 1, TruePredicate>;
	defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
	SSE_INTALU_ITINS_P, 0, TruePredicate>;
	defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
	SSE_INTALU_ITINS_P, 0, TruePredicate>;
	defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
	SSE_INTALU_ITINS_P, 0, TruePredicate>;

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Shuffle Instructions
	//===---------------------------------------------------------------------===//

	let ExeDomain = SSEPackedInt in {
	// sse2_pshuffle - Immediate-controlled shuffle (opcode 0x70) in register
	// (ri) and memory (mi) source forms.
	//   OpcodeStr - mnemonic; the VEX forms prepend "v".
	//   vt128     - result type of the 128-bit forms.
	//   vt256     - result type of the 256-bit forms.
	//   OpNode    - SDNode matched on (source, i8 immediate).
	//   prd       - extra predicate added to the VEX forms.
	// Expands to V#NAME (HasAVX), V#NAME#Y (HasAVX2, VEX_L) and the legacy
	// records (UseSSE2). The mandatory prefix is not set here; it is attached
	// where the multiclass is instantiated.
	multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
	    SDNode OpNode, Predicate prd> {
	  let Predicates = [HasAVX, prd] in {
	    def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
	        (ins VR128:$src1, u8imm:$src2),
	        !strconcat("v", OpcodeStr,
	            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	        [(set VR128:$dst,
	            (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
	        IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG;
	    def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
	        (ins i128mem:$src1, u8imm:$src2),
	        !strconcat("v", OpcodeStr,
	            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	        [(set VR128:$dst,
	            (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
	                (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
	        Sched<[WriteShuffleLd]>, VEX_WIG;
	  }

	  let Predicates = [HasAVX2, prd] in {
	    def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
	        (ins VR256:$src1, u8imm:$src2),
	        !strconcat("v", OpcodeStr,
	            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	        [(set VR256:$dst,
	            (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
	        IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG;
	    def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
	        (ins i256mem:$src1, u8imm:$src2),
	        !strconcat("v", OpcodeStr,
	            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	        [(set VR256:$dst,
	            (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
	                (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
	        Sched<[WriteShuffleLd]>, VEX_WIG;
	  }

	  // Legacy forms: the memory variant uses memopv2i64 and adds ReadAfterLd.
	  let Predicates = [UseSSE2] in {
	    def ri : Ii8<0x70, MRMSrcReg,
	        (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2),
	        !strconcat(OpcodeStr,
	            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	        [(set VR128:$dst,
	            (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
	        IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
	    def mi : Ii8<0x70, MRMSrcMem,
	        (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
	        !strconcat(OpcodeStr,
	            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	        [(set VR128:$dst,
	            (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
	                (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
	        Sched<[WriteShuffleLd, ReadAfterLd]>;
	  }
	}
	} // ExeDomain = SSEPackedInt

	// The three shuffles share sse2_pshuffle (and therefore opcode 0x70) and
	// are told apart by the prefix class attached here: PD for pshufd, XS for
	// pshufhw, XD for pshuflw. The word shuffles pass NoVLX_Or_NoBWI, the dword
	// shuffle NoVLX.
	defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
	defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
	NoVLX_Or_NoBWI>, XS;
	defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
	NoVLX_Or_NoBWI>, XD;

	//===---------------------------------------------------------------------===//
	// Packed Integer Pack Instructions (SSE & AVX)
	//===---------------------------------------------------------------------===//

	let ExeDomain = SSEPackedInt in {
	// sse2_pack - 128-bit pack operation (PDI class) in register (rr) and
	// memory (rm) second-operand forms.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic placed in front of the operand list.
	//   OutVT     - result value type.
	//   ArgVT     - value type applied to $src1.
	//   OpNode    - SDNode matched by both forms.
	//   ld_frag   - load fragment for the rm form; its result is bitconverted.
	//   Is2Addr   - 1 prints the two-operand asm form, 0 the three-operand one.
	multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
	    ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
	    bit Is2Addr = 1> {
	  def rr : PDI<opc, MRMSrcReg,
	      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr,
	              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set VR128:$dst,
	          (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
	      Sched<[WriteShuffle]>;
	  def rm : PDI<opc, MRMSrcMem,
	      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr,
	              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set VR128:$dst,
	          (OutVT (OpNode (ArgVT VR128:$src1),
	              (bitconvert (ld_frag addr:$src2)))))]>,
	      Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// sse2_pack_y - 256-bit counterpart of sse2_pack (PDI class, VR256
	// operands). Always prints the three-operand asm form and always loads the
	// memory operand with loadv4i64. Produces the Yrr and Yrm records.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic placed in front of the operand list.
	//   OutVT     - result value type.
	//   ArgVT     - value type applied to $src1.
	//   OpNode    - SDNode matched by both forms.
	multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
	    ValueType ArgVT, SDNode OpNode> {
	  def Yrr : PDI<opc, MRMSrcReg,
	      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
	      !strconcat(OpcodeStr,
	          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	      [(set VR256:$dst,
	          (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
	      Sched<[WriteShuffle]>;
	  def Yrm : PDI<opc, MRMSrcMem,
	      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
	      !strconcat(OpcodeStr,
	          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	      [(set VR256:$dst,
	          (OutVT (OpNode (ArgVT VR256:$src1),
	              (bitconvert (loadv4i64 addr:$src2)))))]>,
	      Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// sse4_pack - Same shape as sse2_pack, but the records derive from SS48I
	// instead of PDI.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic placed in front of the operand list.
	//   OutVT     - result value type.
	//   ArgVT     - value type applied to $src1.
	//   OpNode    - SDNode matched by both forms.
	//   ld_frag   - load fragment for the rm form; its result is bitconverted.
	//   Is2Addr   - 1 prints the two-operand asm form, 0 the three-operand one.
	multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
	    ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
	    bit Is2Addr = 1> {
	  def rr : SS48I<opc, MRMSrcReg,
	      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr,
	              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set VR128:$dst,
	          (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
	      Sched<[WriteShuffle]>;
	  def rm : SS48I<opc, MRMSrcMem,
	      (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
	      !if(Is2Addr,
	          !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
	          !strconcat(OpcodeStr,
	              "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
	      [(set VR128:$dst,
	          (OutVT (OpNode (ArgVT VR128:$src1),
	              (bitconvert (ld_frag addr:$src2)))))]>,
	      Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// sse4_pack_y - 256-bit counterpart of sse4_pack (SS48I class, VR256
	// operands). Always prints the three-operand asm form and always loads the
	// memory operand with loadv4i64. Produces the Yrr and Yrm records.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic placed in front of the operand list.
	//   OutVT     - result value type.
	//   ArgVT     - value type applied to $src1.
	//   OpNode    - SDNode matched by both forms.
	multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
	    ValueType ArgVT, SDNode OpNode> {
	  def Yrr : SS48I<opc, MRMSrcReg,
	      (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
	      !strconcat(OpcodeStr,
	          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	      [(set VR256:$dst,
	          (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
	      Sched<[WriteShuffle]>;
	  def Yrm : SS48I<opc, MRMSrcMem,
	      (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
	      !strconcat(OpcodeStr,
	          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	      [(set VR256:$dst,
	          (OutVT (OpNode (ArgVT VR256:$src1),
	              (bitconvert (loadv4i64 addr:$src2)))))]>,
	      Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
	loadv2i64, 0>, VEX_4V, VEX_WIG;

	defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
	loadv2i64, 0>, VEX_4V;
	}

	// VEX.L (256-bit) pack instructions, guarded by HasAVX2 and NoVLX_Or_NoBWI.
	// The defm names match the 128-bit group; the multiclasses append Yrr/Yrm,
	// so the resulting records are distinct (e.g. VPACKSSWBYrr).
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
	VEX_4V, VEX_L, VEX_WIG;

	defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
	VEX_4V, VEX_L;
	}

	// Legacy (non-VEX) pack instructions. $src1 is tied to $dst, so the default
	// two-operand assembly string is used, and memory operands are matched with
	// memopv2i64 instead of loadv2i64.
	let Constraints = "$src1 = $dst" in {
	defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
	memopv2i64>;
	defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
	memopv2i64>;

	defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
	memopv2i64>;

	defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
	memopv2i64>;
	}
	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Unpack Instructions
	//===---------------------------------------------------------------------===//

	let ExeDomain = SSEPackedInt in {
	// 128-bit integer unpack multiclass built on PDI. Defines a register form
	// (rr) and a memory form (rm). Is2Addr (default 1) selects the two-operand
	// assembly string; 0 selects the three-operand string. ld_frag is the load
	// fragment used for the memory operand, whose result is bitconverted.
	multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
	SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
	// Register-register form.
	def rr : PDI<opc, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr,"\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr,"\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
	IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
	// Register-memory form.
	def rm : PDI<opc, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr,"\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr,"\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set VR128:$dst, (vt (OpNode VR128:$src1,
	(bitconvert (ld_frag addr:$src2)))))],
	IIC_SSE_UNPCK>,
	Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// 256-bit integer unpack multiclass built on PDI. Defines Yrr and Yrm over
	// VR256, always with the three-operand assembly string. The memory operand
	// is loaded with loadv4i64 and bitconverted. Unlike sse2_unpack, no
	// itinerary argument is passed here.
	multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
	SDNode OpNode> {
	// Register-register form.
	def Yrr : PDI<opc, MRMSrcReg,
	(outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
	!strconcat(OpcodeStr,"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
	Sched<[WriteShuffle]>;
	// Register-memory form.
	def Yrm : PDI<opc, MRMSrcMem,
	(outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
	!strconcat(OpcodeStr,"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set VR256:$dst, (vt (OpNode VR256:$src1,
	(bitconvert (loadv4i64 addr:$src2)))))]>,
	Sched<[WriteShuffleLd, ReadAfterLd]>;
	}


	// VEX-encoded 128-bit unpack instructions (three-operand syntax). The byte
	// and word element variants are guarded by NoVLX_Or_NoBWI.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	}
	// The dword and qword element variants are guarded by NoVLX only.
	let Predicates = [HasAVX, NoVLX] in {
	defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
	loadv2i64, 0>, VEX_4V, VEX_WIG;
	}

	// VEX.L (256-bit) unpack instructions. The byte and word element variants
	// are guarded by NoVLX_Or_NoBWI.
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
	VEX_4V, VEX_L, VEX_WIG;
	}
	// The dword and qword element variants are guarded by NoVLX only.
	let Predicates = [HasAVX2, NoVLX] in {
	defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
	VEX_4V, VEX_L, VEX_WIG;
	defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
	VEX_4V, VEX_L, VEX_WIG;
	}

	// Legacy (non-VEX) unpack instructions. $src1 is tied to $dst, the default
	// two-operand assembly string is used, and memory operands are matched with
	// memopv2i64.
	let Constraints = "$src1 = $dst" in {
	// Low-half unpacks (X86Unpckl).
	defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
	memopv2i64>;
	defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
	memopv2i64>;
	defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
	memopv2i64>;
	defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
	memopv2i64>;

	// High-half unpacks (X86Unpckh).
	defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
	memopv2i64>;
	defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
	memopv2i64>;
	defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
	memopv2i64>;
	defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
	memopv2i64>;
	}
	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Integer Extract and Insert
	//===---------------------------------------------------------------------===//

	let ExeDomain = SSEPackedInt in {
	// Word-insert multiclass (opcode 0xC4). Defines rri, which inserts from a
	// GR32/GR64 register, and rmi, which inserts from a 16-bit memory operand
	// read via extloadi16. The mnemonic is hard-coded: Is2Addr = 1 gives the
	// two-operand "pinsrw" string, Is2Addr = 0 gives the three-operand
	// "vpinsrw" string.
	multiclass sse2_pinsrw<bit Is2Addr = 1> {
	// Register source form.
	def rri : Ii8<0xC4, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1,
	GR32orGR64:$src2, u8imm:$src3),
	!if(Is2Addr,
	"pinsrw\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	"vpinsrw\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[(set VR128:$dst,
	(X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))],
	IIC_SSE_PINSRW>, Sched<[WriteShuffle]>;
	// Memory source form.
	def rmi : Ii8<0xC4, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1,
	i16mem:$src2, u8imm:$src3),
	!if(Is2Addr,
	"pinsrw\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	"vpinsrw\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[(set VR128:$dst,
	(X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
	imm:$src3))], IIC_SSE_PINSRW>,
	Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// Extract
	// VEX-encoded word extract (opcode 0xC5): selects X86pextrw of a v8i16
	// register with an immediate index into a GR32/GR64 destination. The
	// brace-less `let` below applies to this definition only.
	let Predicates = [HasAVX, NoBWI] in
	def VPEXTRWri : Ii8<0xC5, MRMSrcReg,
	(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
	"vpextrw\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
	imm:$src2))]>, PD, VEX,
	Sched<[WriteShuffle]>;
	// Legacy SSE2 word extract (opcode 0xC5): selects X86pextrw of a v8i16
	// register with an immediate index into a GR32/GR64 destination. This is a
	// register-only form (MRMSrcReg, no memory operand), so it is scheduled as
	// a plain shuffle, the same class VPEXTRWri uses, and not as the
	// load-folded WriteShuffleLd/ReadAfterLd pair.
	def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
	(outs GR32orGR64:$dst), (ins VR128:$src1, u8imm:$src2),
	"pextrw\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set GR32orGR64:$dst, (X86pextrw (v8i16 VR128:$src1),
	imm:$src2))], IIC_SSE_PEXTRW>,
	Sched<[WriteShuffle]>;

	// Insert
	// VEX form: three-operand syntax (Is2Addr = 0), guarded by HasAVX and NoBWI.
	let Predicates = [HasAVX, NoBWI] in
	defm VPINSRW : sse2_pinsrw<0>, PD, VEX_4V;

	// Legacy SSE2 form: default two-operand syntax with $src1 tied to $dst.
	let Predicates = [UseSSE2], Constraints = "$src1 = $dst" in
	defm PINSRW : sse2_pinsrw, PD;

	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// SSE2 - Packed Mask Creation
	//===---------------------------------------------------------------------===//

	// pmovmskb family (opcode 0xD7): selects X86movmsk of a byte vector into a
	// GR32/GR64 destination. All three definitions share the WriteVecLogic
	// scheduling class set here.
	let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {

	// VEX-encoded 128-bit form.
	// NOTE(review): the asm string is "pmovmskb"; presumably the VPDI class
	// prepends the "v" prefix - confirm against the class definition.
	def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
	(ins VR128:$src),
	"pmovmskb\t{$src, $dst\|$dst, $src}",
	[(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
	IIC_SSE_MOVMSK>, VEX, VEX_WIG;

	// VEX.L 256-bit form (v32i8 source), guarded by HasAVX2.
	let Predicates = [HasAVX2] in {
	def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
	(ins VR256:$src),
	"pmovmskb\t{$src, $dst\|$dst, $src}",
	[(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
	VEX, VEX_L, VEX_WIG;
	}

	// Legacy SSE2 form.
	def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
	"pmovmskb\t{$src, $dst\|$dst, $src}",
	[(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
	IIC_SSE_MOVMSK>;

	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// SSE2 - Conditional Store
	//===---------------------------------------------------------------------===//

	// maskmovdqu family (opcode 0xF7): selects the int_x86_sse2_maskmov_dqu
	// intrinsic. The intrinsic's third operand is an implicit register, listed
	// in Uses: EDI for the Not64BitMode variants, RDI for the In64BitMode
	// variants. No explicit outputs.
	let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {

	// VEX-encoded forms.
	// NOTE(review): the asm string is "maskmovdqu"; presumably the VPDI class
	// prepends the "v" prefix - confirm against the class definition.
	let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
	def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
	(ins VR128:$src, VR128:$mask),
	"maskmovdqu\t{$mask, $src\|$src, $mask}",
	[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
	IIC_SSE_MASKMOV>, VEX, VEX_WIG;
	let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
	def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
	(ins VR128:$src, VR128:$mask),
	"maskmovdqu\t{$mask, $src\|$src, $mask}",
	[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
	IIC_SSE_MASKMOV>, VEX, VEX_WIG;

	// Legacy SSE2 forms.
	let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
	def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
	"maskmovdqu\t{$mask, $src\|$src, $mask}",
	[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
	IIC_SSE_MASKMOV>;
	let Uses = [RDI], Predicates = [UseSSE2,In64BitMode] in
	def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
	"maskmovdqu\t{$mask, $src\|$src, $mask}",
	[(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
	IIC_SSE_MASKMOV>;

	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// SSE2 - Move Doubleword/Quadword
	//===---------------------------------------------------------------------===//

	//===---------------------------------------------------------------------===//
	// Move Int Doubleword to Packed Double Int
	//
	// movd/movq into an XMM register (opcode 0x6E), VEX forms first and legacy
	// forms second. Register sources use WriteMove; memory sources use
	// WriteLoad.
	let ExeDomain = SSEPackedInt in {
	// VEX: GR32 -> v4i32 via scalar_to_vector.
	def VMOVDI2PDIrr : VS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
	VEX, Sched<[WriteMove]>;
	// VEX: 32-bit load -> v4i32.
	def VMOVDI2PDIrm : VS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
	IIC_SSE_MOVDQ>,
	VEX, Sched<[WriteLoad]>;
	// VEX: GR64 -> v2i64.
	def VMOV64toPQIrr : VRS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2i64 (scalar_to_vector GR64:$src)))],
	IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
	// VEX: 64-bit memory form with an empty pattern; it is codegen-only but
	// forced into the disassembler tables, and mayLoad is set by hand because
	// there is no pattern to infer it from.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
	def VMOV64toPQIrm : VRS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[], IIC_SSE_MOVDQ>, VEX, Sched<[WriteLoad]>;
	// VEX: GR64 -> FR64 bitconvert (codegen-only).
	let isCodeGenOnly = 1 in
	def VMOV64toSDrr : VRS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set FR64:$dst, (bitconvert GR64:$src))],
	IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

	// Legacy: GR32 -> v4i32 via scalar_to_vector.
	def MOVDI2PDIrr : S2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
	Sched<[WriteMove]>;
	// Legacy: 32-bit load -> v4i32.
	def MOVDI2PDIrm : S2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
	IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
	// Legacy: GR64 -> v2i64.
	def MOV64toPQIrr : RS2I<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2i64 (scalar_to_vector GR64:$src)))],
	IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
	// Legacy: 64-bit memory form with an empty pattern (see the VEX twin).
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
	def MOV64toPQIrm : RS2I<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[], IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
	// Legacy: GR64 -> FR64 bitconvert (codegen-only).
	let isCodeGenOnly = 1 in
	def MOV64toSDrr : RS2I<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set FR64:$dst, (bitconvert GR64:$src))],
	IIC_SSE_MOVDQ>, Sched<[WriteMove]>;
	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// Move Int Doubleword to Single Scalar
	//
	// Codegen-only movd forms (opcode 0x6E) that bitconvert a 32-bit integer,
	// from GR32 or from a loadi32, into an FR32 register. VEX forms first,
	// legacy forms second.
	let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
	def VMOVDI2SSrr : VS2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set FR32:$dst, (bitconvert GR32:$src))],
	IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;

	def VMOVDI2SSrm : VS2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
	IIC_SSE_MOVDQ>,
	VEX, Sched<[WriteLoad]>;
	def MOVDI2SSrr : S2I<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set FR32:$dst, (bitconvert GR32:$src))],
	IIC_SSE_MOVDQ>, Sched<[WriteMove]>;

	def MOVDI2SSrm : S2I<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set FR32:$dst, (bitconvert (loadi32 addr:$src)))],
	IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
	} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

	//===---------------------------------------------------------------------===//
	// Move Packed Doubleword Int to Doubleword Int
	//
	// movd out of an XMM register (opcode 0x7E): element 0 of the v4i32 source
	// is extracted into a GR32 (rr) or stored to a 32-bit memory location (mr).
	// VEX forms first, legacy forms second.
	let ExeDomain = SSEPackedInt in {
	def VMOVPDI2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (extractelt (v4i32 VR128:$src),
	(iPTR 0)))], IIC_SSE_MOVD_ToGP>, VEX,
	Sched<[WriteMove]>;
	def VMOVPDI2DImr : VS2I<0x7E, MRMDestMem, (outs),
	(ins i32mem:$dst, VR128:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(store (i32 (extractelt (v4i32 VR128:$src),
	(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
	VEX, Sched<[WriteStore]>;
	def MOVPDI2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (extractelt (v4i32 VR128:$src),
	(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
	Sched<[WriteMove]>;
	def MOVPDI2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(store (i32 (extractelt (v4i32 VR128:$src),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
	} // ExeDomain = SSEPackedInt
	//===---------------------------------------------------------------------===//
	// Move Packed Quadword Int first element to Quadword Int
	//
	// movq out of an XMM register (opcode 0x7E): element 0 of the v2i64 source
	// is extracted into a GR64 (rr). The mr forms carry empty patterns.
	let ExeDomain = SSEPackedInt in {
	// Register destination forms share the WriteMove scheduling class.
	let SchedRW = [WriteMove] in {
	def VMOVPQIto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
	(iPTR 0)))],
	IIC_SSE_MOVD_ToGP>,
	VEX;

	def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (extractelt (v2i64 VR128:$src),
	(iPTR 0)))],
	IIC_SSE_MOVD_ToGP>;
	} //SchedRW

	// Memory destination forms: codegen-only but forced into the disassembler
	// tables; mayStore is set by hand because the pattern list is empty.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
	def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
	(ins i64mem:$dst, VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
	def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
	} // ExeDomain = SSEPackedInt

	//===---------------------------------------------------------------------===//
	// Bitcast FR64 <-> GR64
	//
	// Codegen-only movq forms (opcode 0x7E) that bitconvert between FR64 and
	// 64-bit integers: load i64 -> FR64 (rm), FR64 -> GR64 (rr), and
	// FR64 -> i64 store (mr). VEX forms first, legacy forms second.
	let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
	// The brace-less `let` applies to VMOV64toSDrm only. No itinerary argument
	// is passed to this definition, unlike its legacy counterpart below.
	let Predicates = [UseAVX] in
	def VMOV64toSDrm : VS2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>,
	VEX, Sched<[WriteLoad]>;
	def VMOVSDto64rr : VRS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (bitconvert FR64:$src))],
	IIC_SSE_MOVDQ>, VEX, Sched<[WriteMove]>;
	def VMOVSDto64mr : VRS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
	IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;

	def MOV64toSDrm : S2SI<0x7E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set FR64:$dst, (bitconvert (loadi64 addr:$src)))],
	IIC_SSE_MOVDQ>, Sched<[WriteLoad]>;
	def MOVSDto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (bitconvert FR64:$src))],
	IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
	def MOVSDto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(store (i64 (bitconvert FR64:$src)), addr:$dst)],
	IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
	} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

	//===---------------------------------------------------------------------===//
	// Move Scalar Single to Double Int
	//
	// Codegen-only movd forms (opcode 0x7E) that bitconvert an FR32 value into
	// a GR32 (rr) or store it as an i32 (mr). VEX forms first, legacy forms
	// second.
	let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
	def VMOVSS2DIrr : VS2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (bitconvert FR32:$src))],
	IIC_SSE_MOVD_ToGP>, VEX, Sched<[WriteMove]>;
	def VMOVSS2DImr : VS2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
	IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
	def MOVSS2DIrr : S2I<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (bitconvert FR32:$src))],
	IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
	def MOVSS2DImr : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
	"movd\t{$src, $dst\|$dst, $src}",
	[(store (i32 (bitconvert FR32:$src)), addr:$dst)],
	IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
	} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1

	// Selection patterns (UseAVX) that map zero-extending vector moves
	// (X86vzmovl / X86vzload) of scalar integers onto the VEX movd/movq
	// instructions. 256-bit results wrap the 128-bit instruction in
	// SUBREG_TO_REG with sub_xmm.
	let Predicates = [UseAVX] in {
	// Register sources.
	let AddedComplexity = 15 in {
	def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
	(VMOVDI2PDIrr GR32:$src)>;

	def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
	(VMOV64toPQIrr GR64:$src)>;

	def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
	(v2i64 (scalar_to_vector GR64:$src)),(iPTR 0)))),
	(SUBREG_TO_REG (i64 0), (VMOV64toPQIrr GR64:$src), sub_xmm)>;
	}
	// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
	// These instructions also write zeros in the high part of a 256-bit register.
	// Memory sources: all of these fold into the 32-bit load form VMOVDI2PDIrm.
	let AddedComplexity = 20 in {
	def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
	(VMOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
	(VMOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
	(VMOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
	(VMOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzload addr:$src)),
	(VMOVDI2PDIrm addr:$src)>;
	def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
	(v4i32 (scalar_to_vector (loadi32 addr:$src))), (iPTR 0)))),
	(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
	def : Pat<(v8i32 (X86vzload addr:$src)),
	(SUBREG_TO_REG (i64 0), (VMOVDI2PDIrm addr:$src), sub_xmm)>;
	}
	// Use regular 128-bit instructions to match 256-bit scalar_to_vec+zext.
	def : Pat<(v8i32 (X86vzmovl (insert_subvector undef,
	(v4i32 (scalar_to_vector GR32:$src)),(iPTR 0)))),
	(SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src), sub_xmm)>;
	}

	// Selection patterns (UseSSE2) that map zero-extending vector moves
	// (X86vzmovl / X86vzload) of scalar integers onto the legacy movd/movq
	// instructions. Only 128-bit result types appear here.
	let Predicates = [UseSSE2] in {
	// Register sources.
	let AddedComplexity = 15 in {
	def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector GR32:$src)))),
	(MOVDI2PDIrr GR32:$src)>;

	def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
	(MOV64toPQIrr GR64:$src)>;
	}
	// Memory sources: all of these fold into the 32-bit load form MOVDI2PDIrm.
	let AddedComplexity = 20 in {
	def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
	(MOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
	(MOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
	(MOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
	(MOVDI2PDIrm addr:$src)>;
	def : Pat<(v4i32 (X86vzload addr:$src)),
	(MOVDI2PDIrm addr:$src)>;
	}
	}

	// Before the MC layer of LLVM existed, clang emitted "movd" assembly instead of
	// "movq" due to a MacOS parsing limitation. In order to parse old assembly, we
	// add these aliases. The trailing 0 argument keeps each alias parse-only, so
	// the instruction is still printed with its own mnemonic.
	def : InstAlias<"movd\t{$src, $dst\|$dst, $src}",
	(MOV64toPQIrr VR128:$dst, GR64:$src), 0>;
	def : InstAlias<"movd\t{$src, $dst\|$dst, $src}",
	(MOVPQIto64rr GR64:$dst, VR128:$src), 0>;
	// Allow "vmovd" but print "vmovq" since we don't need compatibility for AVX.
	def : InstAlias<"vmovd\t{$src, $dst\|$dst, $src}",
	(VMOV64toPQIrr VR128:$dst, GR64:$src), 0>;
	def : InstAlias<"vmovd\t{$src, $dst\|$dst, $src}",
	(VMOVPQIto64rr GR64:$dst, VR128:$src), 0>;

	//===---------------------------------------------------------------------===//
	// SSE2 - Move Quadword
	//===---------------------------------------------------------------------===//

	//===---------------------------------------------------------------------===//
	// Move Quadword Int to Packed Quadword Int
	//

	// movq load forms (opcode 0x7E with the XS prefix): a 64-bit load placed
	// into a v2i64 via scalar_to_vector. Both share the WriteLoad scheduling
	// class. The VEX form spells out "vmovq" and passes no itinerary argument.
	let ExeDomain = SSEPackedInt, SchedRW = [WriteLoad] in {
	def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
	"vmovq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
	VEX, Requires<[UseAVX]>, VEX_WIG;
	def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst,
	(v2i64 (scalar_to_vector (loadi64 addr:$src))))],
	IIC_SSE_MOVDQ>, XS,
	Requires<[UseSSE2]>; // SSE2 instruction with XS Prefix
	} // ExeDomain, SchedRW

	//===---------------------------------------------------------------------===//
	// Move Packed Quadword Int to Quadword Int
	//
	// movq store forms (opcode 0xD6): element 0 of the v2i64 source is stored
	// as an i64. Both share the WriteStore scheduling class.
	let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
	def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(store (i64 (extractelt (v2i64 VR128:$src),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOVDQ>, VEX, VEX_WIG;
	def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(store (i64 (extractelt (v2i64 VR128:$src),
	(iPTR 0))), addr:$dst)],
	IIC_SSE_MOVDQ>;
	} // ExeDomain, SchedRW

	// For disassembler only
	// Register-register encodings of opcode 0xD6 (MRMDestReg). They carry no
	// selection pattern; ForceDisassemble keeps them in the disassembler tables
	// even though they are marked isCodeGenOnly.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
	SchedRW = [WriteVecLogic] in {
	def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG;
	def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
	}

	// Aliases to help the assembler pick two byte VEX encodings by swapping the
	// operands relative to the normal instructions to use VEX.R instead of VEX.B.
	// NOTE(review): VR128L/VR128H presumably restrict $dst to the low and $src
	// to the high XMM registers so the alias only fires when the swap helps -
	// confirm against the register class definitions.
	def : InstAlias<"vmovq\t{$src, $dst\|$dst, $src}",
	(VMOVPQI2QIrr VR128L:$dst, VR128H:$src), 0>;

	// Selection patterns (UseAVX) that fold zero-extending 64-bit vector loads
	// into VMOVQI2PQIrm. The v4i64 results wrap the 128-bit instruction in
	// SUBREG_TO_REG with sub_xmm.
	let Predicates = [UseAVX], AddedComplexity = 20 in {
	def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
	(VMOVQI2PQIrm addr:$src)>;
	def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
	(VMOVQI2PQIrm addr:$src)>;
	def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
	(VMOVQI2PQIrm addr:$src)>;
	def : Pat<(v2i64 (X86vzload addr:$src)),
	(VMOVQI2PQIrm addr:$src)>;
	def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
	(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
	(SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
	def : Pat<(v4i64 (X86vzload addr:$src)),
	(SUBREG_TO_REG (i64 0), (VMOVQI2PQIrm addr:$src), sub_xmm)>;
	}

	// Selection patterns (UseSSE2) that fold zero-extending 64-bit vector loads
	// into MOVQI2PQIrm.
	let Predicates = [UseSSE2], AddedComplexity = 20 in {
	def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
	(MOVQI2PQIrm addr:$src)>;
	def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
	(MOVQI2PQIrm addr:$src)>;
	def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
	(MOVQI2PQIrm addr:$src)>;
	def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
	}

	//===---------------------------------------------------------------------===//
	// Moving from XMM to XMM and clear upper 64 bits. Note, there is a bug in
	// IA32 document. movq xmm1, xmm2 does clear the high bits.
	//
	// Register-to-register movq (opcode 0x7E with the XS prefix) selected for
	// X86vzmovl on v2i64. Each brace-less `let AddedComplexity = 15` applies
	// to the single definition that follows it.
	let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in {
	let AddedComplexity = 15 in
	def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"vmovq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
	IIC_SSE_MOVQ_RR>,
	XS, VEX, Requires<[UseAVX]>, VEX_WIG;
	let AddedComplexity = 15 in
	def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	"movq\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
	IIC_SSE_MOVQ_RR>,
	XS, Requires<[UseSSE2]>;
	} // ExeDomain, SchedRW

	// Reuse the integer register-to-register movq for X86vzmovl on v2f64
	// values, for both the UseAVX and UseSSE2 cases.
	let AddedComplexity = 20 in {
	let Predicates = [UseAVX] in {
	def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
	(VMOVZPQILo2PQIrr VR128:$src)>;
	}
	let Predicates = [UseSSE2] in {
	def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
	(MOVZPQILo2PQIrr VR128:$src)>;
	}
	}

	//===---------------------------------------------------------------------===//
	// SSE3 - Replicate Single FP - MOVSHDUP and MOVSLDUP
	//===---------------------------------------------------------------------===//
	// Single-source multiclass for MOVSHDUP/MOVSLDUP built on S3SI. Defines a
	// register form (rr) and a memory form (rm) over register class RC.
	// mem_frag is the load fragment and x86memop the memory operand type for
	// rm. The rm pattern, unlike rr, does not wrap the OpNode result in vt.
	multiclass sse3_replicate_sfp<bits<8> op, SDNode OpNode, string OpcodeStr,
	ValueType vt, RegisterClass RC, PatFrag mem_frag,
	X86MemOperand x86memop> {
	def rr : S3SI<op, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (vt (OpNode RC:$src)))],
	IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
	def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (OpNode (mem_frag addr:$src)))],
	IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
	}

	// VEX-encoded movshdup (0x16) and movsldup (0x12): 128-bit (v4f32, VR128)
	// and 256-bit (v8f32, VR256, VEX_L) instantiations, guarded by HasAVX and
	// NoVLX.
	let Predicates = [HasAVX, NoVLX] in {
	defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
	v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
	defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
	v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
	defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
	v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
	defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
	v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
	}
	// Legacy SSE3 forms; memory operands are matched with memopv4f32.
	defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
	memopv4f32, f128mem>;
	defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128,
	memopv4f32, f128mem>;

	// Integer-typed (v4i32 / v8i32) X86Movshdup and X86Movsldup nodes select
	// the same VEX instructions as the floating-point types. Memory operands
	// are matched as a bitcast of a loadv2i64 / loadv4i64.
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4i32 (X86Movshdup VR128:$src)),
	(VMOVSHDUPrr VR128:$src)>;
	def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))),
	(VMOVSHDUPrm addr:$src)>;
	def : Pat<(v4i32 (X86Movsldup VR128:$src)),
	(VMOVSLDUPrr VR128:$src)>;
	def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (loadv2i64 addr:$src)))),
	(VMOVSLDUPrm addr:$src)>;
	def : Pat<(v8i32 (X86Movshdup VR256:$src)),
	(VMOVSHDUPYrr VR256:$src)>;
	def : Pat<(v8i32 (X86Movshdup (bc_v8i32 (loadv4i64 addr:$src)))),
	(VMOVSHDUPYrm addr:$src)>;
	def : Pat<(v8i32 (X86Movsldup VR256:$src)),
	(VMOVSLDUPYrr VR256:$src)>;
	def : Pat<(v8i32 (X86Movsldup (bc_v8i32 (loadv4i64 addr:$src)))),
	(VMOVSLDUPYrm addr:$src)>;
	}

	// Integer-typed (v4i32) X86Movshdup and X86Movsldup nodes select the legacy
	// SSE3 instructions. Memory operands are matched as a bitcast of a
	// memopv2i64.
	let Predicates = [UseSSE3] in {
	def : Pat<(v4i32 (X86Movshdup VR128:$src)),
	(MOVSHDUPrr VR128:$src)>;
	def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (memopv2i64 addr:$src)))),
	(MOVSHDUPrm addr:$src)>;
	def : Pat<(v4i32 (X86Movsldup VR128:$src)),
	(MOVSLDUPrr VR128:$src)>;
	def : Pat<(v4i32 (X86Movsldup (bc_v4i32 (memopv2i64 addr:$src)))),
	(MOVSLDUPrm addr:$src)>;
	}

	//===---------------------------------------------------------------------===//
	// SSE3 - Replicate Double FP - MOVDDUP
	//===---------------------------------------------------------------------===//

	// 128-bit MOVDDUP multiclass (opcode 0x12) built on S3DI. The rr form
	// applies X86Movddup to a VR128 source. The rm form takes an f64mem
	// operand and applies X86Movddup to a scalar_to_vector of a loadf64.
	multiclass sse3_replicate_dfp<string OpcodeStr> {
	def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (v2f64 (X86Movddup VR128:$src)))],
	IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
	def rm : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst,
	(v2f64 (X86Movddup
	(scalar_to_vector (loadf64 addr:$src)))))],
	IIC_SSE_MOV_LH>, Sched<[WriteLoad]>;
	}

	// FIXME: Merge with the above class when there are patterns for the ymm
	// version.
	// 256-bit MOVDDUP multiclass (opcode 0x12) built on S3DI. The rm form takes
	// a full f256mem operand read with loadv4f64. No itinerary argument is
	// passed to either definition.
	multiclass sse3_replicate_dfp_y<string OpcodeStr> {
	def rr : S3DI<0x12, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst, (v4f64 (X86Movddup VR256:$src)))]>,
	Sched<[WriteFShuffle]>;
	def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst,
	(v4f64 (X86Movddup (loadv4f64 addr:$src))))]>,
	Sched<[WriteLoad]>;
	}

	// VEX-encoded movddup: 128-bit and 256-bit (VEX_L) instantiations, guarded
	// by HasAVX and NoVLX.
	let Predicates = [HasAVX, NoVLX] in {
	defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG;
	defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG;
	}

	// Legacy SSE3 movddup.
	defm MOVDDUP : sse3_replicate_dfp<"movddup">;


	// Extra X86Movddup selection patterns for the VEX instructions.
	let Predicates = [HasAVX, NoVLX] in {
	// A full loadv2f64 operand also selects the 128-bit memory form.
	// NOTE(review): the trailing Requires<[HasAVX]> looks redundant with the
	// enclosing Predicates list - confirm before removing.
	def : Pat<(X86Movddup (loadv2f64 addr:$src)),
	(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;

	// 256-bit version
	def : Pat<(X86Movddup (loadv4i64 addr:$src)),
	(VMOVDDUPYrm addr:$src)>;
	def : Pat<(X86Movddup (v4i64 VR256:$src)),
	(VMOVDDUPYrr VR256:$src)>;
	}

	// A v2f64 X86VBroadcast of a loadf64 selects VMOVDDUPrm. Each brace-less
	// `let` applies to the single pattern that follows it.
	let Predicates = [HasAVX, NoVLX] in
	def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
	(VMOVDDUPrm addr:$src)>;
	// The v2i64 broadcast of a loadi64 uses VMOVDDUPrm only under HasAVX1Only.
	let Predicates = [HasAVX1Only] in
	def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
	(VMOVDDUPrm addr:$src)>;

	// Legacy SSE3: a full memopv2f64 operand selects MOVDDUPrm.
	let Predicates = [UseSSE3] in {
	def : Pat<(X86Movddup (memopv2f64 addr:$src)),
	(MOVDDUPrm addr:$src)>;
	}

	//===---------------------------------------------------------------------===//
	// SSE3 - Move Unaligned Integer
	//===---------------------------------------------------------------------===//

	// lddqu family (opcode 0xF0): each form selects the matching ldu_dq
	// intrinsic on a memory operand. All share the WriteLoad scheduling class.
	let SchedRW = [WriteLoad] in {
	// VEX-encoded 128-bit and 256-bit (VEX_L) forms; no itinerary argument is
	// passed to these.
	let Predicates = [HasAVX] in {
	def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	"vlddqu\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG;
	def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
	"vlddqu\t{$src, $dst\|$dst, $src}",
	[(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
	VEX, VEX_L, VEX_WIG;
	}
	// Legacy SSE3 form.
	def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	"lddqu\t{$src, $dst\|$dst, $src}",
	[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
	IIC_SSE_LDDQU>;
	}

	//===---------------------------------------------------------------------===//
	// SSE3 - Arithmetic
	//===---------------------------------------------------------------------===//

	// ADDSUB multiclass (opcode 0xD0). Defines a register form (rr) and a
	// memory form (rm) over register class RC, both selecting the intrinsic
	// Int. Is2Addr (default 1) selects the two-operand assembly string; 0
	// selects the three-operand string. ld_frag is the load fragment used for
	// the rm memory operand. itins supplies the itineraries and the scheduling
	// class: rr uses itins.rr and itins.Sched, rm uses itins.rm and the folded
	// variant of itins.Sched.
	multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
	X86MemOperand x86memop, OpndItins itins,
	PatFrag ld_frag, bit Is2Addr = 1> {
	// Register-register form.
	def rr : I<0xD0, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
	Sched<[itins.Sched]>;
	// Register-memory form: uses the memory itinerary, not the register one.
	def rm : I<0xD0, MRMSrcMem,
	(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rm>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	// VEX-encoded addsub instructions (three-operand syntax, Is2Addr = 0):
	// 128-bit and 256-bit (VEX_L) forms for packed single (XD prefix) and
	// packed double (PD prefix).
	let Predicates = [HasAVX] in {
	let ExeDomain = SSEPackedSingle in {
	defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
	f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG;
	defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
	f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG;
	}
	let ExeDomain = SSEPackedDouble in {
	defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
	f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG;
	defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
	f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
	}
	}
	// Legacy-encoded ADDSUB instantiations; the destination is tied to $src1.
	let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
	  let ExeDomain = SSEPackedSingle in {
	    defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
	                                f128mem, SSE_ALU_F32P, memopv4f32>, XD;
	  }
	  let ExeDomain = SSEPackedDouble in {
	    defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
	                                f128mem, SSE_ALU_F64P, memopv2f64>, PD;
	  }
	}

	// Select the VEX-encoded 'addsub' instructions for X86Addsub nodes,
	// for both register and folded-load right-hand sides.
	let Predicates = [HasAVX] in {
	  // 128-bit forms.
	  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
	            (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
	  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
	            (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
	  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
	            (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
	  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
	            (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;

	  // 256-bit forms.
	  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
	            (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
	  def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
	            (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
	  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
	            (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
	  def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
	            (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
	}

	// Select the legacy-encoded 'addsub' instructions for X86Addsub nodes;
	// folded loads go through the memop fragments.
	let Predicates = [UseSSE3] in {
	  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
	            (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
	  def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
	            (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
	  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
	            (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
	  def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
	            (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
	}

	//===---------------------------------------------------------------------===//
	// SSE3 Instructions
	//===---------------------------------------------------------------------===//

	// Horizontal ops
	/// S3D_Int - Binary operation built on the S3DI instruction class.
	/// Emits a register form (rr) and a load-folding form (rm) whose second
	/// source is read through `ld_frag`; both select `OpNode` with result
	/// type `vt`. `Is2Addr` picks the two-operand assembly string.
	multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
	                   X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
	                   bit Is2Addr = 1> {
	  def rr : S3DI<o, MRMSrcReg,
	                (outs RC:$dst), (ins RC:$src1, RC:$src2),
	                !if(Is2Addr,
	                    !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	                [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))],
	                IIC_SSE_HADDSUB_RR>,
	           Sched<[WriteFHAdd]>;

	  def rm : S3DI<o, MRMSrcMem,
	                (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	                !if(Is2Addr,
	                    !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	                [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
	                IIC_SSE_HADDSUB_RM>,
	           Sched<[WriteFHAddLd, ReadAfterLd]>;
	}
	/// S3_Int - Same shape as S3D_Int, but built on the S3I instruction class.
	/// Emits a register form (rr) and a load-folding form (rm) whose second
	/// source is read through `ld_frag`; both select `OpNode` with result
	/// type `vt`. `Is2Addr` picks the two-operand assembly string.
	multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
	                  X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
	                  bit Is2Addr = 1> {
	  def rr : S3I<o, MRMSrcReg,
	               (outs RC:$dst), (ins RC:$src1, RC:$src2),
	               !if(Is2Addr,
	                   !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	               [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))],
	               IIC_SSE_HADDSUB_RR>,
	           Sched<[WriteFHAdd]>;

	  def rm : S3I<o, MRMSrcMem,
	               (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	               !if(Is2Addr,
	                   !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                   !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	               [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
	               IIC_SSE_HADDSUB_RM>,
	           Sched<[WriteFHAddLd, ReadAfterLd]>;
	}

	// VEX-encoded horizontal add/sub instantiations (Is2Addr = 0) on VR128 and
	// VR256. The PS forms use S3D_Int, the PD forms use S3_Int.
	let Predicates = [HasAVX] in {
	  let ExeDomain = SSEPackedSingle in {
	    defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
	                            X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG;
	    defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
	                            X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG;
	    defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
	                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
	    defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
	                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
	  }
	  let ExeDomain = SSEPackedDouble in {
	    defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
	                            X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG;
	    defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
	                            X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG;
	    defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
	                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
	    defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
	                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
	  }
	}

	// Legacy-encoded horizontal add/sub instantiations; the destination is
	// tied to $src1 and folded loads go through the memop fragments.
	let Constraints = "$src1 = $dst" in {
	  let ExeDomain = SSEPackedSingle in {
	    defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
	                          memopv4f32>;
	    defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
	                          memopv4f32>;
	  }
	  let ExeDomain = SSEPackedDouble in {
	    defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
	                         memopv2f64>;
	    defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
	                         memopv2f64>;
	  }
	}

	//===---------------------------------------------------------------------===//
	// SSSE3 - Packed Absolute Instructions
	//===---------------------------------------------------------------------===//


	/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
	/// Emits a register form (rr) and a load-folding form (rm) on VR128. The rm
	/// pattern bitconverts the value read through `ld_frag` before applying
	/// `OpNode`.
	multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
	                        SDNode OpNode, PatFrag ld_frag> {
	  def rr : SS38I<opc, MRMSrcReg,
	                 (outs VR128:$dst), (ins VR128:$src),
	                 !strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	                 [(set VR128:$dst, (vt (OpNode VR128:$src)))],
	                 IIC_SSE_PABS_RR>,
	           Sched<[WriteVecALU]>;

	  def rm : SS38I<opc, MRMSrcMem,
	                 (outs VR128:$dst), (ins i128mem:$src),
	                 !strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	                 [(set VR128:$dst,
	                       (vt (OpNode (bitconvert (ld_frag addr:$src)))))],
	                 IIC_SSE_PABS_RM>,
	           Sched<[WriteVecALULd]>;
	}

	/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32},
	/// on VR256. Emits a register form (Yrr) and a load-folding form (Yrm)
	/// whose operand is read with loadv4i64 and bitconverted before applying
	/// `OpNode`.
	multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
	                          SDNode OpNode> {
	  def Yrr : SS38I<opc, MRMSrcReg,
	                  (outs VR256:$dst), (ins VR256:$src),
	                  !strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	                  [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
	            Sched<[WriteVecALU]>;

	  def Yrm : SS38I<opc, MRMSrcMem,
	                  (outs VR256:$dst), (ins i256mem:$src),
	                  !strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	                  [(set VR256:$dst,
	                        (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
	            Sched<[WriteVecALULd]>;
	}

	// VEX-encoded PABS instantiations. The byte/word forms are guarded by
	// NoVLX_Or_NoBWI, the dword forms by NoVLX.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	  defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>,
	                VEX, VEX_WIG;
	  defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>,
	                VEX, VEX_WIG;
	}
	let Predicates = [HasAVX, NoVLX] in
	defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>,
	              VEX, VEX_WIG;
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>,
	                VEX, VEX_L, VEX_WIG;
	  defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>,
	                VEX, VEX_L, VEX_WIG;
	}
	let Predicates = [HasAVX2, NoVLX] in
	defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>,
	              VEX, VEX_L, VEX_WIG;

	// Legacy-encoded PABS instantiations; folded loads use memopv2i64.
	defm PABSB : SS3I_unop_rm<0x1C, "pabsb",
	                          v16i8, abs, memopv2i64>;
	defm PABSW : SS3I_unop_rm<0x1D, "pabsw",
	                          v8i16, abs, memopv2i64>;
	defm PABSD : SS3I_unop_rm<0x1E, "pabsd",
	                          v4i32, abs, memopv2i64>;

	//===---------------------------------------------------------------------===//
	// SSSE3 - Packed Binary Operator Instructions
	//===---------------------------------------------------------------------===//

	// Itinerary bundles (register form, memory form) for the SSSE3 binary
	// operators, each tagged with its scheduling class through `Sched`.
	let Sched = WritePHAdd in {
	  def SSE_PHADDSUBD  : OpndItins<IIC_SSE_PHADDSUBD_RR, IIC_SSE_PHADDSUBD_RM>;
	  def SSE_PHADDSUBSW : OpndItins<IIC_SSE_PHADDSUBSW_RR, IIC_SSE_PHADDSUBSW_RM>;
	  def SSE_PHADDSUBW  : OpndItins<IIC_SSE_PHADDSUBW_RR, IIC_SSE_PHADDSUBW_RM>;
	}
	let Sched = WriteShuffle in {
	  def SSE_PSHUFB : OpndItins<IIC_SSE_PSHUFB_RR, IIC_SSE_PSHUFB_RM>;
	}
	let Sched = WriteVecALU in {
	  def SSE_PSIGN : OpndItins<IIC_SSE_PSIGN_RR, IIC_SSE_PSIGN_RM>;
	}
	// PMULHRSW passes the same itinerary class for both forms.
	let Sched = WriteVecIMul in {
	  def SSE_PMULHRSW : OpndItins<IIC_SSE_PMULHRSW, IIC_SSE_PMULHRSW>;
	}

	/// SS3I_binop_rm - Simple SSSE3 binary op selected from the SDNode `OpNode`.
	///   rr - register form, marked commutable by default.
	///   rm - load-folding form; the value read through `memop_frag` is
	///        bitconverted before use.
	/// `OpVT` types the first source, `DstVT` the result. `itins` supplies the
	/// itineraries and the scheduling class. `Is2Addr` picks the two-operand
	/// assembly string.
	multiclass SS3I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
	                         ValueType DstVT, ValueType OpVT, RegisterClass RC,
	                         PatFrag memop_frag, X86MemOperand x86memop,
	                         OpndItins itins, bit Is2Addr = 1> {
	  let isCommutable = 1 in
	  def rr : SS38I<opc, MRMSrcReg,
	                 (outs RC:$dst), (ins RC:$src1, RC:$src2),
	                 !if(Is2Addr,
	                     !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	                 [(set RC:$dst, (DstVT (OpNode (OpVT RC:$src1), RC:$src2)))],
	                 itins.rr>,
	           Sched<[itins.Sched]>;
	  def rm : SS38I<opc, MRMSrcMem,
	                 (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
	                 !if(Is2Addr,
	                     !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                     !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	                 [(set RC:$dst,
	                       (DstVT (OpNode (OpVT RC:$src1),
	                                      (bitconvert (memop_frag addr:$src2)))))],
	                 itins.rm>,
	           Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	/// SS3I_binop_rm_int - Simple SSSE3 bin op whose type can be v*{i8,i16,i32},
	/// selected from the 128-bit intrinsic `IntId128`.
	///   rr128 - register form, marked commutable by default.
	///   rm128 - load-folding form; the value read through `ld_frag` is
	///           bitconverted before use.
	/// `itins` supplies the register/memory itineraries and the scheduling
	/// class. `Is2Addr` picks the two-operand assembly string.
	multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
	                             Intrinsic IntId128, OpndItins itins,
	                             PatFrag ld_frag, bit Is2Addr = 1> {
	  // Pass itins.rr / itins.rm through to the instruction, as SS3I_binop_rm
	  // does, so the itinerary classes carried by `itins` are actually applied.
	  let isCommutable = 1 in
	  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
	                    (ins VR128:$src1, VR128:$src2),
	                    !if(Is2Addr,
	                        !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	                    [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))],
	                    itins.rr>,
	              Sched<[itins.Sched]>;
	  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
	                    (ins VR128:$src1, i128mem:$src2),
	                    !if(Is2Addr,
	                        !strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	                        !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	                    [(set VR128:$dst,
	                          (IntId128 VR128:$src1,
	                                    (bitconvert (ld_frag addr:$src2))))],
	                    itins.rm>,
	              Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	/// SS3I_binop_rm_int_y - VR256 bin op selected from the intrinsic
	/// `IntId256`, always printed with the three-operand assembly string.
	///   rr256 - register form, marked commutable by default.
	///   rm256 - load-folding form; the operand is read with loadv4i64 and
	///           bitconverted before use.
	/// `Sched` is the foldable scheduling class; rm256 uses its Folded variant.
	multiclass SS3I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
	                               Intrinsic IntId256,
	                               X86FoldableSchedWrite Sched> {
	  let isCommutable = 1 in
	  def rr256 : SS38I<opc, MRMSrcReg,
	                    (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
	                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	                    [(set VR256:$dst, (IntId256 VR256:$src1, VR256:$src2))]>,
	              Sched<[Sched]>;
	  def rm256 : SS38I<opc, MRMSrcMem,
	                    (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
	                    !strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	                    [(set VR256:$dst,
	                          (IntId256 VR256:$src1,
	                                    (bitconvert (loadv4i64 addr:$src2))))]>,
	              Sched<[Sched.Folded, ReadAfterLd]>;
	}

	// VEX-encoded 128-bit instantiations guarded by NoVLX_Or_NoBWI. VPSHUFB
	// and VPMADDUBSW override the multiclass default and are not commutable.
	let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	  let isCommutable = 0 in {
	    defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
	                                    VR128, loadv2i64, i128mem,
	                                    SSE_PSHUFB, 0>,
	                      VEX_4V, VEX_WIG;
	    defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
	                                    v16i8, VR128, loadv2i64, i128mem,
	                                    SSE_PMADD, 0>,
	                      VEX_4V, VEX_WIG;
	  }
	  defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
	                                    VR128, loadv2i64, i128mem,
	                                    SSE_PMULHRSW, 0>,
	                      VEX_4V, VEX_WIG;
	}

	// VEX-encoded 128-bit horizontal add/sub and sign instantiations. None are
	// commutable (isCommutable = 0 overrides the multiclass default).
	let ImmT = NoImm, Predicates = [HasAVX] in {
	  let isCommutable = 0 in {
	    defm VPHADDW  : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
	                                  loadv2i64, i128mem,
	                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
	    defm VPHADDD  : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
	                                  loadv2i64, i128mem,
	                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
	    defm VPHSUBW  : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
	                                  loadv2i64, i128mem,
	                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
	    // VEX_WIG added so VPHSUBD matches every other definition in this group.
	    defm VPHSUBD  : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
	                                  loadv2i64, i128mem,
	                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
	    defm VPSIGNB  : SS3I_binop_rm_int<0x08, "vpsignb",
	                                      int_x86_ssse3_psign_b_128,
	                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
	    defm VPSIGNW  : SS3I_binop_rm_int<0x09, "vpsignw",
	                                      int_x86_ssse3_psign_w_128,
	                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
	    defm VPSIGND  : SS3I_binop_rm_int<0x0A, "vpsignd",
	                                      int_x86_ssse3_psign_d_128,
	                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
	    defm VPHADDSW : SS3I_binop_rm_int<0x03, "vphaddsw",
	                                      int_x86_ssse3_phadd_sw_128,
	                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
	    defm VPHSUBSW : SS3I_binop_rm_int<0x07, "vphsubsw",
	                                      int_x86_ssse3_phsub_sw_128,
	                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
	  }
	}

	// VEX-encoded 256-bit instantiations guarded by NoVLX_Or_NoBWI. VPSHUFBY
	// and VPMADDUBSWY override the multiclass default and are not commutable.
	let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  let isCommutable = 0 in {
	    defm VPSHUFBY    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
	                                     VR256, loadv4i64, i256mem,
	                                     SSE_PSHUFB, 0>,
	                       VEX_4V, VEX_L, VEX_WIG;
	    defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
	                                     v32i8, VR256, loadv4i64, i256mem,
	                                     SSE_PMADD, 0>,
	                       VEX_4V, VEX_L, VEX_WIG;
	  }
	  defm VPMULHRSWY    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
	                                     VR256, loadv4i64, i256mem,
	                                     SSE_PMULHRSW, 0>,
	                       VEX_4V, VEX_L, VEX_WIG;
	}

	// VEX-encoded 256-bit horizontal add/sub and sign instantiations. None are
	// commutable (isCommutable = 0 overrides the multiclass default).
	let ImmT = NoImm, Predicates = [HasAVX2] in {
	  let isCommutable = 0 in {
	    defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
	                                  VR256, loadv4i64, i256mem,
	                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
	    // The dword forms take the dword itineraries (SSE_PHADDSUBD), as their
	    // 128-bit VPHADDD/VPHSUBD counterparts do.
	    defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
	                                  loadv4i64, i256mem,
	                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L, VEX_WIG;
	    defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
	                                  VR256, loadv4i64, i256mem,
	                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
	    // VEX_WIG added so VPHSUBDY matches every other definition in this group.
	    defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
	                                  loadv4i64, i256mem,
	                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_L, VEX_WIG;
	    defm VPSIGNBY : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
	                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
	    defm VPSIGNWY : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
	                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
	    defm VPSIGNDY : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
	                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
	    // NOTE(review): the 128-bit VPHADDSW/VPHSUBSW use SSE_PHADDSUBSW, whose
	    // Sched is WritePHAdd; these 256-bit forms use WriteVecALU. Confirm
	    // that the difference is intended.
	    defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
	                                        int_x86_avx2_phadd_sw,
	                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
	    defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
	                                        int_x86_avx2_phsub_sw,
	                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
	  }
	}

	// Legacy-encoded SSSE3 binary operators. None of these have i8 immediate
	// fields, and every destination is tied to $src1. Everything except
	// PMULHRSW is marked non-commutable.
	let ImmT = NoImm, Constraints = "$src1 = $dst" in {
	  let isCommutable = 0 in {
	    defm PHADDW    : SS3I_binop_rm<0x01, "phaddw", X86hadd, v8i16, v8i16, VR128,
	                                   memopv2i64, i128mem, SSE_PHADDSUBW>;
	    defm PHADDD    : SS3I_binop_rm<0x02, "phaddd", X86hadd, v4i32, v4i32, VR128,
	                                   memopv2i64, i128mem, SSE_PHADDSUBD>;
	    defm PHSUBW    : SS3I_binop_rm<0x05, "phsubw", X86hsub, v8i16, v8i16, VR128,
	                                   memopv2i64, i128mem, SSE_PHADDSUBW>;
	    defm PHSUBD    : SS3I_binop_rm<0x06, "phsubd", X86hsub, v4i32, v4i32, VR128,
	                                   memopv2i64, i128mem, SSE_PHADDSUBD>;
	    defm PSIGNB    : SS3I_binop_rm_int<0x08, "psignb", int_x86_ssse3_psign_b_128,
	                                       SSE_PSIGN, memopv2i64>;
	    defm PSIGNW    : SS3I_binop_rm_int<0x09, "psignw", int_x86_ssse3_psign_w_128,
	                                       SSE_PSIGN, memopv2i64>;
	    defm PSIGND    : SS3I_binop_rm_int<0x0A, "psignd", int_x86_ssse3_psign_d_128,
	                                       SSE_PSIGN, memopv2i64>;
	    defm PSHUFB    : SS3I_binop_rm<0x00, "pshufb", X86pshufb, v16i8, v16i8, VR128,
	                                   memopv2i64, i128mem, SSE_PSHUFB>;
	    defm PHADDSW   : SS3I_binop_rm_int<0x03, "phaddsw",
	                                       int_x86_ssse3_phadd_sw_128,
	                                       SSE_PHADDSUBSW, memopv2i64>;
	    defm PHSUBSW   : SS3I_binop_rm_int<0x07, "phsubsw",
	                                       int_x86_ssse3_phsub_sw_128,
	                                       SSE_PHADDSUBSW, memopv2i64>;
	    defm PMADDUBSW : SS3I_binop_rm<0x04, "pmaddubsw", X86vpmaddubsw, v8i16,
	                                   v16i8, VR128, memopv2i64, i128mem,
	                                   SSE_PMADD>;
	  }
	  defm PMULHRSW    : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
	                                   VR128, memopv2i64, i128mem, SSE_PMULHRSW>;
	}

	//===---------------------------------------------------------------------===//
	// SSSE3 - Packed Align Instruction Patterns
	//===---------------------------------------------------------------------===//

	/// ssse3_palignr - 128-bit PALIGNR-style instruction with a u8 immediate.
	///   rri - register second source.
	///   rmi - memory second source (mayLoad).
	/// Neither form carries a selection pattern; both are marked as having no
	/// side effects. `Is2Addr` picks the two-source assembly string over the
	/// three-source one.
	multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
	  let hasSideEffects = 0 in {
	    def rri : SS3AI<0x0F, MRMSrcReg,
	                    (outs VR128:$dst),
	                    (ins VR128:$src1, VR128:$src2, u8imm:$src3),
	                    !if(Is2Addr,
	                        !strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	                        !strconcat(asm,
	                                   "\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	                    [], IIC_SSE_PALIGNRR>,
	              Sched<[WriteShuffle]>;
	    let mayLoad = 1 in
	    def rmi : SS3AI<0x0F, MRMSrcMem,
	                    (outs VR128:$dst),
	                    (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	                    !if(Is2Addr,
	                        !strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	                        !strconcat(asm,
	                                   "\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	                    [], IIC_SSE_PALIGNRM>,
	              Sched<[WriteShuffleLd, ReadAfterLd]>;
	  }
	}

	/// ssse3_palignr_y - 256-bit counterpart of ssse3_palignr.
	///   Yrri - register second source.
	///   Yrmi - memory second source (mayLoad).
	/// Both forms always use the three-source assembly string; `Is2Addr` is
	/// accepted but not referenced in the body. Neither form carries a
	/// selection pattern.
	multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
	  let hasSideEffects = 0 in {
	    def Yrri : SS3AI<0x0F, MRMSrcReg,
	                     (outs VR256:$dst),
	                     (ins VR256:$src1, VR256:$src2, u8imm:$src3),
	                     !strconcat(asm,
	                                "\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	                     []>,
	               Sched<[WriteShuffle]>;
	    let mayLoad = 1 in
	    def Yrmi : SS3AI<0x0F, MRMSrcMem,
	                     (outs VR256:$dst),
	                     (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
	                     !strconcat(asm,
	                                "\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	                     []>,
	               Sched<[WriteShuffleLd, ReadAfterLd]>;
	  }
	}

	// PALIGNR instantiations: VEX 128-bit, VEX 256-bit, and the legacy form
	// whose destination is tied to $src1.
	let Predicates = [HasAVX] in {
	  defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG;
	}
	let Predicates = [HasAVX2] in {
	  defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG;
	}
	let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in {
	  defm PALIGNR : ssse3_palignr<"palignr">;
	}

	// Select VPALIGNRYrri for X86PAlignr on each listed 256-bit vector type.
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	            (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
	  def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	            (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
	  def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	            (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
	  def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	            (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
	}

	// Select VPALIGNRrri for X86PAlignr on each listed 128-bit vector type.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	  def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	  def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	  def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	  def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	}

	// Select the legacy PALIGNRrri for X86PAlignr on each listed 128-bit
	// vector type.
	let Predicates = [UseSSSE3] in {
	  def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	  def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	  def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	  def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
	            (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
	}

	//===---------------------------------------------------------------------===//
	// SSE3 - Thread synchronization
	//===---------------------------------------------------------------------===//

	// MONITOR/MWAIT definitions; all require HasSSE3 and share the WriteSystem
	// scheduling class.
	let SchedRW = [WriteSystem] in {
	  // Pseudo selected for int_x86_sse3_monitor; flagged for a custom inserter.
	  let usesCustomInserter = 1 in
	  def MONITOR : PseudoI<(outs), (ins i32mem:$src1, GR32:$src2, GR32:$src3),
	                        [(int_x86_sse3_monitor addr:$src1, GR32:$src2, GR32:$src3)]>,
	                Requires<[HasSSE3]>;

	  // Real instruction with implicit register uses and no pattern.
	  let Uses = [EAX, ECX, EDX] in
	  def MONITORrrr : I<0x01, MRM_C8, (outs), (ins), "monitor", [],
	                     IIC_SSE_MONITOR>,
	                   TB, Requires<[HasSSE3]>;

	  let Uses = [ECX, EAX] in
	  def MWAITrr : I<0x01, MRM_C9, (outs), (ins), "mwait",
	                  [(int_x86_sse3_mwait ECX, EAX)],
	                  IIC_SSE_MWAIT>,
	                TB, Requires<[HasSSE3]>;
	} // SchedRW

	// Assembler aliases that accept mwait/monitor written with explicit
	// register operands, one spelling per mode (32-bit vs 64-bit names).
	def : InstAlias<"mwait\t{%eax, %ecx\|ecx, eax}", (MWAITrr)>,
	      Requires<[Not64BitMode]>;
	def : InstAlias<"mwait\t{%rax, %rcx\|rcx, rax}", (MWAITrr)>,
	      Requires<[In64BitMode]>;

	def : InstAlias<"monitor\t{%eax, %ecx, %edx\|edx, ecx, eax}", (MONITORrrr)>,
	      Requires<[Not64BitMode]>;
	def : InstAlias<"monitor\t{%rax, %rcx, %rdx\|rdx, rcx, rax}", (MONITORrrr)>,
	      Requires<[In64BitMode]>;

	//===----------------------------------------------------------------------===//
	// SSE4.1 - Packed Move with Sign/Zero Extend
	//===----------------------------------------------------------------------===//

	/// SS41I_pmovx_rrrm - Register (rr) and memory (rm) forms of one packed
	/// move-with-extend instruction. Neither form carries a selection pattern.
	/// `InRC`/`OutRC` are the source and destination register classes, `MemOp`
	/// the memory operand of the rm form, and `itins` the itineraries and
	/// scheduling class.
	multiclass SS41I_pmovx_rrrm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
	                            RegisterClass OutRC, RegisterClass InRC,
	                            OpndItins itins> {
	  def rr : SS48I<opc, MRMSrcReg,
	                 (outs OutRC:$dst), (ins InRC:$src),
	                 !strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	                 [], itins.rr>,
	           Sched<[itins.Sched]>;

	  def rm : SS48I<opc, MRMSrcMem,
	                 (outs OutRC:$dst), (ins MemOp:$src),
	                 !strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	                 [], itins.rm>,
	           Sched<[itins.Sched.Folded]>;
	}

	/// SS41I_pmovx_rm_all - Expands one extend opcode into three encodings:
	///   NAME     - legacy form on VR128.
	///   V#NAME   - VEX form on VR128, under [HasAVX, prd].
	///   V#NAME#Y - VEX_L form with a VR256 destination and a VR128 source,
	///              under [HasAVX2, prd], using `MemYOp` for its memory form.
	/// Each encoding receives its own itinerary bundle.
	multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
	                              X86MemOperand MemOp, X86MemOperand MemYOp,
	                              OpndItins SSEItins, OpndItins AVXItins,
	                              OpndItins AVX2Itins, Predicate prd> {
	  defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
	  let Predicates = [HasAVX, prd] in {
	    defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
	                                     VR128, VR128, AVXItins>,
	                    VEX, VEX_WIG;
	  }
	  let Predicates = [HasAVX2, prd] in {
	    defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
	                                     VR256, VR128, AVX2Itins>,
	                    VEX, VEX_L, VEX_WIG;
	  }
	}

	/// SS41I_pmovx_rm - Defines the PMOVSX and PMOVZX families for one element
	/// size pair (`OpcodeStr`, e.g. "bw"). The PMOVZX opcode is `opc` + 0x10.
	/// Both families use the same itinerary bundles.
	multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
	                          X86MemOperand MemYOp, Predicate prd> {
	  defm PMOVSX#NAME : SS41I_pmovx_rm_all<opc,
	                                        !strconcat("pmovsx", OpcodeStr),
	                                        MemOp, MemYOp,
	                                        SSE_INTALU_ITINS_SHUFF_P,
	                                        DEFAULT_ITINS_SHUFFLESCHED,
	                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
	  defm PMOVZX#NAME : SS41I_pmovx_rm_all<!add(opc, 0x10),
	                                        !strconcat("pmovzx", OpcodeStr),
	                                        MemOp, MemYOp,
	                                        SSE_INTALU_ITINS_SHUFF_P,
	                                        DEFAULT_ITINS_SHUFFLESCHED,
	                                        DEFAULT_ITINS_SHUFFLESCHED, prd>;
	}

	// One instantiation per element size pair, grouped by memory operand
	// width. Arguments: opcode, suffix, 128-bit-form memory operand,
	// 256-bit-form memory operand, extra predicate.
	defm BW : SS41I_pmovx_rm<0x20, "bw",
	                         i64mem, i128mem, NoVLX_Or_NoBWI>;
	defm WD : SS41I_pmovx_rm<0x23, "wd",
	                         i64mem, i128mem, NoVLX>;
	defm DQ : SS41I_pmovx_rm<0x25, "dq",
	                         i64mem, i128mem, NoVLX>;

	defm BD : SS41I_pmovx_rm<0x21, "bd",
	                         i32mem, i64mem, NoVLX>;
	defm WQ : SS41I_pmovx_rm<0x24, "wq",
	                         i32mem, i64mem, NoVLX>;

	defm BQ : SS41I_pmovx_rm<0x22, "bq",
	                         i16mem, i32mem, NoVLX>;

	// AVX2 Patterns
	/// SS41I_pmovx_avx2_patterns - Selection patterns that map `ExtOp` and the
	/// `ExtTy`-prefixed extending loads onto the 256-bit (`...Y`) extend
	/// instructions named by `OpcPrefix`.
	///
	/// The `...Y` instructions are defined under HasAVX2, so every pattern here
	/// is guarded by HasAVX2 as well rather than HasAVX.
	multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy, SDNode ExtOp> {
	  // Register-Register patterns
	  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  def : Pat<(v16i16 (ExtOp (v16i8 VR128:$src))),
	            (!cast<I>(OpcPrefix#BWYrr) VR128:$src)>;
	  }
	  let Predicates = [HasAVX2, NoVLX] in {
	  def : Pat<(v8i32 (ExtOp (v16i8 VR128:$src))),
	            (!cast<I>(OpcPrefix#BDYrr) VR128:$src)>;
	  def : Pat<(v4i64 (ExtOp (v16i8 VR128:$src))),
	            (!cast<I>(OpcPrefix#BQYrr) VR128:$src)>;

	  def : Pat<(v8i32 (ExtOp (v8i16 VR128:$src))),
	            (!cast<I>(OpcPrefix#WDYrr) VR128:$src)>;
	  def : Pat<(v4i64 (ExtOp (v8i16 VR128:$src))),
	            (!cast<I>(OpcPrefix#WQYrr) VR128:$src)>;

	  def : Pat<(v4i64 (ExtOp (v4i32 VR128:$src))),
	            (!cast<I>(OpcPrefix#DQYrr) VR128:$src)>;
	  }

	  // Simple Register-Memory patterns
	  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  def : Pat<(v16i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
	            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
	  }
	  let Predicates = [HasAVX2, NoVLX] in {
	  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
	            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
	  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
	            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

	  def : Pat<(v8i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
	            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
	  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
	            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

	  def : Pat<(v4i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
	            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
	  }

	  // AVX2 Register-Memory patterns
	  let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  def : Pat<(v16i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
	  def : Pat<(v16i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
	  def : Pat<(v16i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BWYrm) addr:$src)>;
	  }
	  let Predicates = [HasAVX2, NoVLX] in {
	  def : Pat<(v8i32 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
	  def : Pat<(v8i32 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
	  def : Pat<(v8i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
	  def : Pat<(v8i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;

	  def : Pat<(v4i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
	            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
	            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#BQYrm) addr:$src)>;

	  def : Pat<(v8i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
	  def : Pat<(v8i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;
	  def : Pat<(v8i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#WDYrm) addr:$src)>;

	  def : Pat<(v4i64 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;

	  def : Pat<(v4i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
	  def : Pat<(v4i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
	            (!cast<I>(OpcPrefix#DQYrm) addr:$src)>;
	  }
	}

	// Instantiate the 256-bit patterns twice: VPMOVSX with the "s" extending
	// loads and X86vsext, VPMOVZX with the "z" extending loads and X86vzext.
	defm : SS41I_pmovx_avx2_patterns<"VPMOVSX", "s", X86vsext>;
	defm : SS41I_pmovx_avx2_patterns<"VPMOVZX", "z", X86vzext>;

	// SSE4.1/AVX patterns.
	/// SS41I_pmovx_patterns - Selection patterns that map `ExtOp` and the
	/// `ExtTy`-prefixed extending loads onto the 128-bit extend instructions
	/// named by `OpcPrefix`. `ExtLoad16` is the fragment used to match the
	/// 16-bit scalar load feeding the BQ memory form.
	/// The BW patterns are guarded by NoVLX_Or_NoBWI; all others by NoVLX.
	multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
	SDNode ExtOp, PatFrag ExtLoad16> {
	// Register-register patterns.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	def : Pat<(v8i16 (ExtOp (v16i8 VR128:$src))),
	(!cast<I>(OpcPrefix#BWrr) VR128:$src)>;
	}
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4i32 (ExtOp (v16i8 VR128:$src))),
	(!cast<I>(OpcPrefix#BDrr) VR128:$src)>;
	def : Pat<(v2i64 (ExtOp (v16i8 VR128:$src))),
	(!cast<I>(OpcPrefix#BQrr) VR128:$src)>;

	def : Pat<(v4i32 (ExtOp (v8i16 VR128:$src))),
	(!cast<I>(OpcPrefix#WDrr) VR128:$src)>;
	def : Pat<(v2i64 (ExtOp (v8i16 VR128:$src))),
	(!cast<I>(OpcPrefix#WQrr) VR128:$src)>;

	def : Pat<(v2i64 (ExtOp (v4i32 VR128:$src))),
	(!cast<I>(OpcPrefix#DQrr) VR128:$src)>;
	}
	// Extending-load patterns, folded directly into the rm forms.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	def : Pat<(v8i16 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
	(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
	}
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
	(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
	def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi8") addr:$src)),
	(!cast<I>(OpcPrefix#BQrm) addr:$src)>;

	def : Pat<(v4i32 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
	(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
	def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi16") addr:$src)),
	(!cast<I>(OpcPrefix#WQrm) addr:$src)>;

	def : Pat<(v2i64 (!cast<PatFrag>(ExtTy#"extloadvi32") addr:$src)),
	(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
	}
	// ExtOp applied to a loaded vector (scalar_to_vector of a scalar load,
	// vzmovl, vzload, or a full v2i64 load), folded into the rm forms.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
	def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
	(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
	def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
	def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
	def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
	}
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
	(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
	(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BDrm) addr:$src)>;

	// The BQ form matches its scalar load through the ExtLoad16 fragment.
	def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
	(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
	(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#BQrm) addr:$src)>;

	def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
	(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
	def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#WDrm) addr:$src)>;

	def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
	(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
	(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#WQrm) addr:$src)>;

	def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
	(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
	(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
	def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
	(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
	}
	}

	// VEX-encoded instantiations: these keep the HasAVX-based predicates set
	// inside the multiclass.
	defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>;
	defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>;

	// Legacy-encoded instantiations, wrapped in an outer UseSSE41 predicate.
	// NOTE(review): the multiclass body also sets Predicates on every pattern;
	// this presumably relies on the outer `let` taking precedence -- confirm
	// against TableGen's let/defm semantics.
	let Predicates = [UseSSE41] in {
	defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>;
	defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>;
	}

	//===----------------------------------------------------------------------===//
	// SSE4.1 - Extract Instructions
	//===----------------------------------------------------------------------===//

	/// SS41I_extract8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem.
	///   rr - selects X86pextrb of a v16i8 source into GR32orGR64.
	///   mr - stores the truncated (i8) X86pextrb result to memory; marked
	///        mayStore with no other side effects.
	multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
	  def rr : SS4AIi8<opc, MRMDestReg,
	                   (outs GR32orGR64:$dst),
	                   (ins VR128:$src1, u8imm:$src2),
	                   !strconcat(OpcodeStr,
	                              "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	                   [(set GR32orGR64:$dst,
	                         (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
	           Sched<[WriteShuffle]>;
	  let hasSideEffects = 0, mayStore = 1,
	      SchedRW = [WriteShuffleLd, WriteRMW] in
	  def mr : SS4AIi8<opc, MRMDestMem,
	                   (outs),
	                   (ins i8mem:$dst, VR128:$src1, u8imm:$src2),
	                   !strconcat(OpcodeStr,
	                              "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	                   [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
	                                                             imm:$src2)))),
	                           addr:$dst)]>;
	}

	// VEX form, guarded by HasAVX and NoBWI.
	let Predicates = [HasAVX, NoBWI] in {
	  defm VPEXTRB : SS41I_extract8<0x14, "vpextrb">, VEX;
	}

	// Unprefixed form.
	defm PEXTRB : SS41I_extract8<0x14, "pextrb">;


	/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
	/// (plus a pattern-less register-destination form, rr_REV).
	multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
	// rr_REV: register-destination form with an empty pattern list. It is
	// marked codegen-only but ForceDisassemble is set, and it carries
	// FoldGenData pointing at the NAME#ri record.
	let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
	def rr_REV : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
	(ins VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[WriteShuffle]>, FoldGenData<NAME#ri>;

	// mr: matches a store of the X86pextrw result truncated to i16.
	let hasSideEffects = 0, mayStore = 1,
	SchedRW = [WriteShuffleLd, WriteRMW] in
	def mr : SS4AIi8<opc, MRMDestMem, (outs),
	(ins i16mem:$dst, VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
	imm:$src2)))), addr:$dst)]>;
	}

	// VEX form, guarded by HasAVX and NoBWI.
	let Predicates = [HasAVX, NoBWI] in {
	  defm VPEXTRW : SS41I_extract16<0x15, "vpextrw">, VEX;
	}

	// Unprefixed form.
	defm PEXTRW : SS41I_extract16<0x15, "pextrw">;


	/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
	multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
	// rr: matches extractelt of a v4i32 source into a GR32 destination.
	def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
	(ins VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set GR32:$dst,
	(extractelt (v4i32 VR128:$src1), imm:$src2))]>,
	Sched<[WriteShuffle]>;
	// mr: matches a store of the extracted v4i32 element.
	let SchedRW = [WriteShuffleLd, WriteRMW] in
	def mr : SS4AIi8<opc, MRMDestMem, (outs),
	(ins i32mem:$dst, VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(store (extractelt (v4i32 VR128:$src1), imm:$src2),
	addr:$dst)]>;
	}

	// VEX form, guarded by HasAVX and NoDQI.
	let Predicates = [HasAVX, NoDQI] in {
	  defm VPEXTRD : SS41I_extract32<0x16, "vpextrd">, VEX;
	}

	// Unprefixed form.
	defm PEXTRD : SS41I_extract32<0x16, "pextrd">;

	/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
	multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
	// rr: matches extractelt of a v2i64 source into a GR64 destination.
	def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
	(ins VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set GR64:$dst,
	(extractelt (v2i64 VR128:$src1), imm:$src2))]>,
	Sched<[WriteShuffle]>;
	// mr: matches a store of the extracted v2i64 element.
	let SchedRW = [WriteShuffleLd, WriteRMW] in
	def mr : SS4AIi8<opc, MRMDestMem, (outs),
	(ins i64mem:$dst, VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(store (extractelt (v2i64 VR128:$src1), imm:$src2),
	addr:$dst)]>;
	}

	// VEX form (VEX_W set), guarded by HasAVX and NoDQI.
	let Predicates = [HasAVX, NoDQI] in {
	  defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
	}

	// Unprefixed form (REX_W set).
	defm PEXTRQ : SS41I_extract64<0x16, "pextrq">, REX_W;

	/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
	/// destination
	/// The optional itins argument supplies the rr/rm itineraries.
	multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
	OpndItins itins = DEFAULT_ITINS> {
	// rr: the v4f32 source is viewed as v4i32 (bc_v4i32) and the selected
	// element is written to a GR32orGR64 destination.
	def rr : SS4AIi8<opc, MRMDestReg, (outs GR32orGR64:$dst),
	(ins VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set GR32orGR64:$dst,
	(extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))],
	itins.rr>, Sched<[WriteFBlend]>;
	// mr: matches a store of the same integer-typed extracted element.
	let SchedRW = [WriteFBlendLd, WriteRMW] in
	def mr : SS4AIi8<opc, MRMDestMem, (outs),
	(ins f32mem:$dst, VR128:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
	addr:$dst)], itins.rm>;
	}

	// Both forms live in the SSEPackedSingle execution domain.
	let ExeDomain = SSEPackedSingle in {
	  // VEX form, guarded by UseAVX.
	  let Predicates = [UseAVX] in {
	    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
	  }

	  // Unprefixed form with the SSE extract itineraries.
	  defm EXTRACTPS : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
	}

	// The stored value may be typed f32 (a bitconvert of the extracted i32
	// element) rather than i32; select the same EXTRACTPS store forms for it.
	def : Pat<(store (f32 (bitconvert
	                        (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
	                                    imm:$src2))),
	                 addr:$dst),
	          (VEXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
	      Requires<[HasAVX]>;
	def : Pat<(store (f32 (bitconvert
	                        (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
	                                    imm:$src2))),
	                 addr:$dst),
	          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
	      Requires<[UseSSE41]>;

	//===----------------------------------------------------------------------===//
	// SSE4.1 - Insert Instructions
	//===----------------------------------------------------------------------===//

	/// SS41I_insert8 - insert an 8-bit value into a VR128 vector (X86pinsrb).
	/// Is2Addr selects the two-operand asm string (no $src1 printed) versus the
	/// three-operand one.
	multiclass SS41I_insert8<bits<8> opc, string asm, bit Is2Addr = 1> {
	// rr: value to insert comes from a GR32orGR64 register.
	def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, GR32orGR64:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
	Sched<[WriteShuffle]>;
	// rm: value to insert comes from an extloadi8 of the memory operand.
	def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i8mem:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
	imm:$src3))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// VEX three-operand form (Is2Addr = 0), guarded by HasAVX and NoBWI.
	let Predicates = [HasAVX, NoBWI] in {
	  defm VPINSRB : SS41I_insert8<0x20, "vpinsrb", 0>, VEX_4V;
	}

	// Unprefixed two-address form: $src1 is tied to $dst.
	let Constraints = "$src1 = $dst" in {
	  defm PINSRB : SS41I_insert8<0x20, "pinsrb">;
	}

	/// SS41I_insert32 - insert a 32-bit integer element into a v4i32 vector.
	/// Is2Addr selects the two-operand asm string (no $src1 printed) versus the
	/// three-operand one.
	multiclass SS41I_insert32<bits<8> opc, string asm, bit Is2Addr = 1> {
	// rr: element comes from a GR32 register.
	def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, GR32:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
	Sched<[WriteShuffle]>;
	// rm: element comes from a loadi32 of the memory operand.
	def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i32mem:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
	imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// VEX three-operand form (Is2Addr = 0), guarded by HasAVX and NoDQI.
	let Predicates = [HasAVX, NoDQI] in {
	  defm VPINSRD : SS41I_insert32<0x22, "vpinsrd", 0>, VEX_4V;
	}

	// Unprefixed two-address form: $src1 is tied to $dst.
	let Constraints = "$src1 = $dst" in {
	  defm PINSRD : SS41I_insert32<0x22, "pinsrd">;
	}

	/// SS41I_insert64 - insert a 64-bit integer element into a v2i64 vector.
	/// Is2Addr selects the two-operand asm string (no $src1 printed) versus the
	/// three-operand one.
	multiclass SS41I_insert64<bits<8> opc, string asm, bit Is2Addr = 1> {
	// rr: element comes from a GR64 register.
	def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, GR64:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
	Sched<[WriteShuffle]>;
	// rm: element comes from a loadi64 of the memory operand.
	def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i64mem:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
	imm:$src3)))]>, Sched<[WriteShuffleLd, ReadAfterLd]>;
	}

	// VEX three-operand form (Is2Addr = 0, VEX_W set), guarded by HasAVX and
	// NoDQI.
	let Predicates = [HasAVX, NoDQI] in {
	  defm VPINSRQ : SS41I_insert64<0x22, "vpinsrq", 0>, VEX_4V, VEX_W;
	}

	// Unprefixed two-address form (REX_W set): $src1 is tied to $dst.
	let Constraints = "$src1 = $dst" in {
	  defm PINSRQ : SS41I_insert64<0x22, "pinsrq">, REX_W;
	}

	// insertps has a few different modes, there's the first two here below which
	// are optimized inserts that won't zero arbitrary elements in the destination
	// vector. The next one matches the intrinsic and could zero arbitrary elements
	// in the target vector.
	// NOTE(review): only the X86insertps-based rr/rm pair is defined in this
	// multiclass; the "first two"/"next one" wording may be stale - confirm.
	//
	// Is2Addr selects the two-operand asm string (no $src1 printed) versus the
	// three-operand one; itins supplies the rr/rm itineraries.
	multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
	OpndItins itins = DEFAULT_ITINS> {
	// rr: both vector operands are VR128 registers.
	def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
	Sched<[WriteFShuffle]>;
	// rm: second operand is a scalar f32 load wrapped in scalar_to_vector.
	def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, f32mem:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(asm, "\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(asm,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(X86insertps VR128:$src1,
	(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
	imm:$src3))], itins.rm>,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;
	}

	// Both forms live in the SSEPackedSingle execution domain.
	let ExeDomain = SSEPackedSingle in {
	  // VEX three-operand form (Is2Addr = 0), guarded by UseAVX.
	  let Predicates = [UseAVX] in {
	    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
	  }

	  // Unprefixed two-address form: $src1 is tied to $dst.
	  let Constraints = "$src1 = $dst" in {
	    defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
	  }
	}

	let Predicates = [UseSSE41] in {
	  // When the inserted element is a load, or an X86PShufd with immediate 0
	  // of a load, fold that load into INSERTPSrm.
	  def : Pat<(v4f32 (X86insertps
	                     (v4f32 VR128:$src1),
	                     (X86PShufd
	                       (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
	                       (i8 0)),
	                     imm:$src3)),
	            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
	  def : Pat<(v4f32 (X86insertps
	                     (v4f32 VR128:$src1),
	                     (X86PShufd (loadv4f32 addr:$src2), (i8 0)),
	                     imm:$src3)),
	            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
	}

	let Predicates = [UseAVX] in {
	  // When the inserted element is an X86VBroadcast of a load, fold that load
	  // into VINSERTPSrm.
	  def : Pat<(v4f32 (X86insertps
	                     (v4f32 VR128:$src1),
	                     (X86VBroadcast (loadf32 addr:$src2)),
	                     imm:$src3)),
	            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
	  def : Pat<(v4f32 (X86insertps
	                     (v4f32 VR128:$src1),
	                     (X86VBroadcast (loadv4f32 addr:$src2)),
	                     imm:$src3)),
	            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
	}

	//===----------------------------------------------------------------------===//
	// SSE4.1 - Round Instructions
	//===----------------------------------------------------------------------===//

	/// sse41_fp_unop_p - SSE 4.1 packed FP unary operation taking an 8-bit
	/// immediate, selected from an intrinsic.
	///   opcps / opcpd       - opcodes of the "ps" and "pd" forms.
	///   OpcodeStr           - mnemonic stem; "ps" / "pd" is appended.
	///   x86memop / RC       - memory operand and register class of the vector.
	///   mem_frag32 / 64     - load fragments matched by the memory forms.
	///   V4F32Int / V2F64Int - intrinsics matched by the ps / pd forms.
	multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
	X86MemOperand x86memop, RegisterClass RC,
	PatFrag mem_frag32, PatFrag mem_frag64,
	Intrinsic V4F32Int, Intrinsic V2F64Int> {
	let ExeDomain = SSEPackedSingle in {
	// Vector intrinsic operation, reg
	def PSr : SS4AIi8<opcps, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"ps\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
	IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

	// Vector intrinsic operation, mem
	def PSm : SS4AIi8<opcps, MRMSrcMem,
	(outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"ps\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst,
	(V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
	IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
	} // ExeDomain = SSEPackedSingle

	let ExeDomain = SSEPackedDouble in {
	// Vector intrinsic operation, reg
	def PDr : SS4AIi8<opcpd, MRMSrcReg,
	(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"pd\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
	IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;

	// Vector intrinsic operation, mem
	// Uses the memory itinerary, matching PSm (the register itinerary was
	// previously used here).
	def PDm : SS4AIi8<opcpd, MRMSrcMem,
	(outs RC:$dst), (ins x86memop:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"pd\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst,
	(V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
	IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
	} // ExeDomain = SSEPackedDouble
	}

	/// avx_fp_unop_rm - scalar "ss"/"sd" forms with three asm operands plus an
	/// immediate, operating on FR32/FR64. All four defs have empty pattern
	/// lists, so nothing in this multiclass selects them directly.
	multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
	string OpcodeStr> {
	let ExeDomain = GenericDomain, hasSideEffects = 0 in {
	// f32, register source.
	def SSr : SS4AIi8<opcss, MRMSrcReg,
	(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
	!strconcat(OpcodeStr,
	"ss\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[]>, Sched<[WriteFAdd]>;

	// f32, memory source; mayLoad is set explicitly because there is no
	// pattern to infer it from.
	let mayLoad = 1 in
	def SSm : SS4AIi8<opcss, MRMSrcMem,
	(outs FR32:$dst), (ins FR32:$src1, f32mem:$src2, i32u8imm:$src3),
	!strconcat(OpcodeStr,
	"ss\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[]>, Sched<[WriteFAddLd, ReadAfterLd]>;

	// f64, register source.
	def SDr : SS4AIi8<opcsd, MRMSrcReg,
	(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
	!strconcat(OpcodeStr,
	"sd\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[]>, Sched<[WriteFAdd]>;

	// f64, memory source.
	let mayLoad = 1 in
	def SDm : SS4AIi8<opcsd, MRMSrcMem,
	(outs FR64:$dst), (ins FR64:$src1, f64mem:$src2, i32u8imm:$src3),
	!strconcat(OpcodeStr,
	"sd\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
	} // ExeDomain = GenericDomain, hasSideEffects = 0
	}

	/// sse41_fp_unop_s - scalar "ss"/"sd" forms with a single source operand
	/// plus an immediate, operating on FR32/FR64. All four defs have empty
	/// pattern lists, so nothing in this multiclass selects them directly.
	multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
	string OpcodeStr> {
	let ExeDomain = GenericDomain, hasSideEffects = 0 in {
	// f32, register source.
	def SSr : SS4AIi8<opcss, MRMSrcReg,
	(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[WriteFAdd]>;

	// f32, memory source; mayLoad is set explicitly because there is no
	// pattern to infer it from.
	let mayLoad = 1 in
	def SSm : SS4AIi8<opcss, MRMSrcMem,
	(outs FR32:$dst), (ins f32mem:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"ss\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[WriteFAddLd, ReadAfterLd]>;

	// f64, register source.
	def SDr : SS4AIi8<opcsd, MRMSrcReg,
	(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[WriteFAdd]>;

	// f64, memory source.
	let mayLoad = 1 in
	def SDm : SS4AIi8<opcsd, MRMSrcMem,
	(outs FR64:$dst), (ins f64mem:$src1, i32u8imm:$src2),
	!strconcat(OpcodeStr,
	"sd\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
	} // ExeDomain = GenericDomain, hasSideEffects = 0
	}

	/// sse41_fp_binop_s - intrinsic ("_Int") scalar "ss"/"sd" forms operating on
	/// whole VR128 registers. F32Int/F64Int are the intrinsics matched by the
	/// ss/sd forms; Is2Addr selects the two-operand asm string (no $src1
	/// printed) versus the three-operand one. All defs are codegen-only.
	multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
	string OpcodeStr,
	Intrinsic F32Int,
	Intrinsic F64Int, bit Is2Addr = 1> {
	let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
	// ss, register second operand.
	def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"ss\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"ss\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
	Sched<[WriteFAdd]>;

	// ss, memory second operand matched through sse_load_f32.
	def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"ss\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"ss\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
	Sched<[WriteFAddLd, ReadAfterLd]>;

	// sd, register second operand.
	def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
	(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"sd\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"sd\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
	Sched<[WriteFAdd]>;

	// sd, memory second operand matched through sse_load_f64.
	def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
	(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"sd\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"sd\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set VR128:$dst,
	(F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
	Sched<[WriteFAddLd, ReadAfterLd]>;
	} // ExeDomain = GenericDomain, isCodeGenOnly = 1
	}

	// FP round (VEX forms) - vroundss, vroundps, vroundsd, vroundpd
	let Predicates = [HasAVX] in {
	  // Packed intrinsic forms, 128-bit.
	  defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
	                                loadv4f32, loadv2f64,
	                                int_x86_sse41_round_ps,
	                                int_x86_sse41_round_pd>,
	                VEX, VEX_WIG;

	  // Packed intrinsic forms, 256-bit.
	  defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
	                                 loadv8f32, loadv4f64,
	                                 int_x86_avx_round_ps_256,
	                                 int_x86_avx_round_pd_256>,
	                 VEX, VEX_L, VEX_WIG;

	  // Scalar intrinsic forms (three-operand asm, Is2Addr = 0).
	  defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
	                                 int_x86_sse41_round_ss,
	                                 int_x86_sse41_round_sd, 0>,
	                VEX_4V, VEX_LIG, VEX_WIG;

	  // Scalar FR32/FR64 forms without patterns.
	  defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
	}

	// Select the scalar FP rounding nodes to VROUNDSSr/VROUNDSDr under UseAVX.
	// The first instruction operand is an IMPLICIT_DEF; the value to round is
	// the second. Immediate used per node: ffloor 0x9, fceil 0xA, ftrunc 0xB,
	// fnearbyint 0xC, frint 0x4.
	let Predicates = [UseAVX] in {
	// The f32 result type is spelled out here like in every sibling pattern.
	def : Pat<(f32 (ffloor FR32:$src)),
	(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x9))>;
	def : Pat<(f64 (ffloor FR64:$src)),
	(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x9))>;
	def : Pat<(f32 (fnearbyint FR32:$src)),
	(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xC))>;
	def : Pat<(f64 (fnearbyint FR64:$src)),
	(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xC))>;
	def : Pat<(f32 (fceil FR32:$src)),
	(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xA))>;
	def : Pat<(f64 (fceil FR64:$src)),
	(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xA))>;
	def : Pat<(f32 (frint FR32:$src)),
	(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0x4))>;
	def : Pat<(f64 (frint FR64:$src)),
	(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0x4))>;
	def : Pat<(f32 (ftrunc FR32:$src)),
	(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src, (i32 0xB))>;
	def : Pat<(f64 (ftrunc FR64:$src)),
	(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
	}

	// Select the vector FP rounding nodes to the VEX packed round instructions.
	// Immediate used per node: ffloor 0x9, fnearbyint 0xC, fceil 0xA,
	// frint 0x4, ftrunc 0xB.
	let Predicates = [HasAVX] in {
	  // v4f32 -> VROUNDPSr
	  def : Pat<(v4f32 (ffloor VR128:$src)),
	            (VROUNDPSr VR128:$src, (i32 0x9))>;
	  def : Pat<(v4f32 (fnearbyint VR128:$src)),
	            (VROUNDPSr VR128:$src, (i32 0xC))>;
	  def : Pat<(v4f32 (fceil VR128:$src)),
	            (VROUNDPSr VR128:$src, (i32 0xA))>;
	  def : Pat<(v4f32 (frint VR128:$src)),
	            (VROUNDPSr VR128:$src, (i32 0x4))>;
	  def : Pat<(v4f32 (ftrunc VR128:$src)),
	            (VROUNDPSr VR128:$src, (i32 0xB))>;

	  // v2f64 -> VROUNDPDr
	  def : Pat<(v2f64 (ffloor VR128:$src)),
	            (VROUNDPDr VR128:$src, (i32 0x9))>;
	  def : Pat<(v2f64 (fnearbyint VR128:$src)),
	            (VROUNDPDr VR128:$src, (i32 0xC))>;
	  def : Pat<(v2f64 (fceil VR128:$src)),
	            (VROUNDPDr VR128:$src, (i32 0xA))>;
	  def : Pat<(v2f64 (frint VR128:$src)),
	            (VROUNDPDr VR128:$src, (i32 0x4))>;
	  def : Pat<(v2f64 (ftrunc VR128:$src)),
	            (VROUNDPDr VR128:$src, (i32 0xB))>;

	  // v8f32 -> VROUNDYPSr
	  def : Pat<(v8f32 (ffloor VR256:$src)),
	            (VROUNDYPSr VR256:$src, (i32 0x9))>;
	  def : Pat<(v8f32 (fnearbyint VR256:$src)),
	            (VROUNDYPSr VR256:$src, (i32 0xC))>;
	  def : Pat<(v8f32 (fceil VR256:$src)),
	            (VROUNDYPSr VR256:$src, (i32 0xA))>;
	  def : Pat<(v8f32 (frint VR256:$src)),
	            (VROUNDYPSr VR256:$src, (i32 0x4))>;
	  def : Pat<(v8f32 (ftrunc VR256:$src)),
	            (VROUNDYPSr VR256:$src, (i32 0xB))>;

	  // v4f64 -> VROUNDYPDr
	  def : Pat<(v4f64 (ffloor VR256:$src)),
	            (VROUNDYPDr VR256:$src, (i32 0x9))>;
	  def : Pat<(v4f64 (fnearbyint VR256:$src)),
	            (VROUNDYPDr VR256:$src, (i32 0xC))>;
	  def : Pat<(v4f64 (fceil VR256:$src)),
	            (VROUNDYPDr VR256:$src, (i32 0xA))>;
	  def : Pat<(v4f64 (frint VR256:$src)),
	            (VROUNDYPDr VR256:$src, (i32 0x4))>;
	  def : Pat<(v4f64 (ftrunc VR256:$src)),
	            (VROUNDYPDr VR256:$src, (i32 0xB))>;
	}

	// Packed intrinsic forms; memory operands are matched through the memop
	// fragments.
	defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
	                             memopv4f32, memopv2f64,
	                             int_x86_sse41_round_ps,
	                             int_x86_sse41_round_pd>;

	// Scalar FR32/FR64 forms without patterns.
	defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;

	// Scalar intrinsic forms, two-address: $src1 is tied to $dst.
	let Constraints = "$src1 = $dst" in {
	  defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
	                                int_x86_sse41_round_ss,
	                                int_x86_sse41_round_sd>;
	}

	// Select the scalar and 128-bit vector FP rounding nodes to the unprefixed
	// ROUND instructions under UseSSE41. Immediate used per node: ffloor 0x9,
	// fceil 0xA, ftrunc 0xB, fnearbyint 0xC, frint 0x4.
	let Predicates = [UseSSE41] in {
	// Scalar f32/f64 -> ROUNDSSr/ROUNDSDr.
	// The f32 result type is spelled out here like in every sibling pattern.
	def : Pat<(f32 (ffloor FR32:$src)),
	(ROUNDSSr FR32:$src, (i32 0x9))>;
	def : Pat<(f64 (ffloor FR64:$src)),
	(ROUNDSDr FR64:$src, (i32 0x9))>;
	def : Pat<(f32 (fnearbyint FR32:$src)),
	(ROUNDSSr FR32:$src, (i32 0xC))>;
	def : Pat<(f64 (fnearbyint FR64:$src)),
	(ROUNDSDr FR64:$src, (i32 0xC))>;
	def : Pat<(f32 (fceil FR32:$src)),
	(ROUNDSSr FR32:$src, (i32 0xA))>;
	def : Pat<(f64 (fceil FR64:$src)),
	(ROUNDSDr FR64:$src, (i32 0xA))>;
	def : Pat<(f32 (frint FR32:$src)),
	(ROUNDSSr FR32:$src, (i32 0x4))>;
	def : Pat<(f64 (frint FR64:$src)),
	(ROUNDSDr FR64:$src, (i32 0x4))>;
	def : Pat<(f32 (ftrunc FR32:$src)),
	(ROUNDSSr FR32:$src, (i32 0xB))>;
	def : Pat<(f64 (ftrunc FR64:$src)),
	(ROUNDSDr FR64:$src, (i32 0xB))>;

	// v4f32 -> ROUNDPSr.
	def : Pat<(v4f32 (ffloor VR128:$src)),
	(ROUNDPSr VR128:$src, (i32 0x9))>;
	def : Pat<(v4f32 (fnearbyint VR128:$src)),
	(ROUNDPSr VR128:$src, (i32 0xC))>;
	def : Pat<(v4f32 (fceil VR128:$src)),
	(ROUNDPSr VR128:$src, (i32 0xA))>;
	def : Pat<(v4f32 (frint VR128:$src)),
	(ROUNDPSr VR128:$src, (i32 0x4))>;
	def : Pat<(v4f32 (ftrunc VR128:$src)),
	(ROUNDPSr VR128:$src, (i32 0xB))>;

	// v2f64 -> ROUNDPDr.
	def : Pat<(v2f64 (ffloor VR128:$src)),
	(ROUNDPDr VR128:$src, (i32 0x9))>;
	def : Pat<(v2f64 (fnearbyint VR128:$src)),
	(ROUNDPDr VR128:$src, (i32 0xC))>;
	def : Pat<(v2f64 (fceil VR128:$src)),
	(ROUNDPDr VR128:$src, (i32 0xA))>;
	def : Pat<(v2f64 (frint VR128:$src)),
	(ROUNDPDr VR128:$src, (i32 0x4))>;
	def : Pat<(v2f64 (ftrunc VR128:$src)),
	(ROUNDPDr VR128:$src, (i32 0xB))>;
	}

	//===----------------------------------------------------------------------===//
	// SSE4.1 - Packed Bit Test
	//===----------------------------------------------------------------------===//

	// X86ISelLowering lowers to the X86ptest node, primarily from the Intel
	// intrinsic that corresponds to it; the ptest instructions are selected
	// from that node.
	// VEX forms: no register outputs, only EFLAGS is defined.
	let Defs = [EFLAGS], Predicates = [HasAVX] in {
	// 128-bit, register second operand.
	def VPTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
	"vptest\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
	Sched<[WriteVecLogic]>, VEX, VEX_WIG;
	// 128-bit, memory second operand (loadv2i64).
	def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
	"vptest\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
	Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG;

	// 256-bit, register second operand.
	def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
	"vptest\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
	Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG;
	// 256-bit, memory second operand (loadv4i64).
	def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
	"vptest\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
	Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG;
	}

	// Unprefixed ptest forms: no register outputs, only EFLAGS is defined.
	let Defs = [EFLAGS] in {
	// Register second operand.
	def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
	"ptest\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
	Sched<[WriteVecLogic]>;
	// Memory second operand, matched through memopv2i64.
	def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
	"ptest\t{$src2, $src1\|$src1, $src2}",
	[(set EFLAGS, (X86ptest VR128:$src1, (memopv2i64 addr:$src2)))]>,
	Sched<[WriteVecLogicLd, ReadAfterLd]>;
	}

	// The bit test instructions below are AVX only
	// avx_bittest - rr/rm pair selected from X86testp; both set EFLAGS and have
	// no register outputs. RC/x86memop give the operand classes, mem_frag the
	// load fragment folded by the rm form, and vt the value type applied to the
	// register second operand.
	multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
	X86MemOperand x86memop, PatFrag mem_frag, ValueType vt> {
	def rr : SS48I<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1\|$src1, $src2}"),
	[(set EFLAGS, (X86testp RC:$src1, (vt RC:$src2)))]>,
	Sched<[WriteVecLogic]>, VEX;
	def rm : SS48I<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1\|$src1, $src2}"),
	[(set EFLAGS, (X86testp RC:$src1, (mem_frag addr:$src2)))]>,
	Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
	}

	// vtestps / vtestpd in 128-bit and 256-bit (VEX_L) widths; all define EFLAGS.
	let Defs = [EFLAGS], Predicates = [HasAVX] in {
	  let ExeDomain = SSEPackedSingle in {
	    defm VTESTPS  : avx_bittest<0x0E, "vtestps", VR128, f128mem,
	                                loadv4f32, v4f32>;
	    defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem,
	                                loadv8f32, v8f32>,
	                    VEX_L;
	  }

	  let ExeDomain = SSEPackedDouble in {
	    defm VTESTPD  : avx_bittest<0x0F, "vtestpd", VR128, f128mem,
	                                loadv2f64, v2f64>;
	    defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem,
	                                loadv4f64, v4f64>,
	                    VEX_L;
	  }
	}

	//===----------------------------------------------------------------------===//
	// SSE4.1 - Misc Instructions
	//===----------------------------------------------------------------------===//

	// popcnt for 16/32/64-bit operands, each with a register (rr) and a
	// load-folding (rm) form. Every form is selected from ctpop and also
	// defines EFLAGS; all are guarded by HasPOPCNT.
	let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
	// 16-bit forms.
	def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
	"popcnt{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, (ctpop GR16:$src)), (implicit EFLAGS)],
	IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
	OpSize16, XS;
	def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
	"popcnt{w}\t{$src, $dst\|$dst, $src}",
	[(set GR16:$dst, (ctpop (loadi16 addr:$src))),
	(implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
	Sched<[WriteFAddLd]>, OpSize16, XS;

	// 32-bit forms.
	def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
	"popcnt{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (ctpop GR32:$src)), (implicit EFLAGS)],
	IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>,
	OpSize32, XS;

	def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
	"popcnt{l}\t{$src, $dst\|$dst, $src}",
	[(set GR32:$dst, (ctpop (loadi32 addr:$src))),
	(implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
	Sched<[WriteFAddLd]>, OpSize32, XS;

	// 64-bit forms (RI instead of I).
	def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
	"popcnt{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (ctpop GR64:$src)), (implicit EFLAGS)],
	IIC_SSE_POPCNT_RR>, Sched<[WriteFAdd]>, XS;
	def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
	"popcnt{q}\t{$src, $dst\|$dst, $src}",
	[(set GR64:$dst, (ctpop (loadi64 addr:$src))),
	(implicit EFLAGS)], IIC_SSE_POPCNT_RM>,
	Sched<[WriteFAddLd]>, XS;
	}



	// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
	// Both forms are selected from the IntId128 intrinsic. ld_frag is the load
	// fragment folded by the memory form (through a bitconvert), and Sched
	// supplies the scheduling class: Sched for rr128, Sched.Folded for rm128.
	multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
	Intrinsic IntId128, PatFrag ld_frag,
	X86FoldableSchedWrite Sched> {
	def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst, (IntId128 VR128:$src))]>,
	Sched<[Sched]>;
	def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
	(ins i128mem:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst,
	(IntId128 (bitconvert (ld_frag addr:$src))))]>,
	Sched<[Sched.Folded]>;
	}

	// PHMIN has the same profile as PSAD, so it reuses that scheduling model
	// (WriteVecIMul) even though the name is misleading.
	let Predicates = [HasAVX] in {
	  defm VPHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "vphminposuw",
	                                           int_x86_sse41_phminposuw,
	                                           loadv2i64, WriteVecIMul>,
	                     VEX, VEX_WIG;
	}

	defm PHMINPOSUW : SS41I_unop_rm_int_v16<0x41, "phminposuw",
	                                        int_x86_sse41_phminposuw,
	                                        memopv2i64, WriteVecIMul>;

	/// SS48I_binop_rm - Simple SSE41 binary operator.
	/// OpNode is applied to two RC operands and the result is typed OpVT.
	/// memop_frag is the load fragment folded by the rm form (through a
	/// bitconvert). Is2Addr selects the two-operand asm string (no $src1
	/// printed) versus the three-operand one. Only itins.Sched is read from
	/// itins in this multiclass.
	multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
	X86MemOperand x86memop, bit Is2Addr = 1,
	OpndItins itins = SSE_INTALU_ITINS_P> {
	// The register form is always marked commutable.
	let isCommutable = 1 in
	def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>,
	Sched<[itins.Sched]>;
	def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst,
	(OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
	/// types.
	/// The first operand is typed SrcVT and the result DstVT. IsCommutable sets
	/// isCommutable on the register form only. Is2Addr selects the two-operand
	/// asm string (no $src1 printed) versus the three-operand one. Only
	/// itins.Sched is read from itins in this multiclass.
	multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType DstVT, ValueType SrcVT, RegisterClass RC,
	PatFrag memop_frag, X86MemOperand x86memop,
	OpndItins itins,
	bit IsCommutable = 0, bit Is2Addr = 1> {
	let isCommutable = IsCommutable in
	def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
	Sched<[itins.Sched]>;
	// Memory form: the second operand is a bitconvert of the folded load.
	def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
	(bitconvert (memop_frag addr:$src2)))))]>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	// 128-bit VEX dword min/max and vpmuldq, guarded by HasAVX and NoVLX.
	let Predicates = [HasAVX, NoVLX] in {
	  defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMULDQ : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
	                                 VR128, loadv2i64, i128mem,
	                                 SSE_INTMUL_ITINS_P, 1, 0>,
	                 VEX_4V, VEX_WIG;
	}
	// 128-bit VEX byte/word min/max, guarded by HasAVX and NoVLX_Or_NoBWI.
	let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
	  defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	  defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
	                                loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	                 VEX_4V, VEX_WIG;
	}

	// 256-bit VEX dword min/max and vpmuldq, guarded by HasAVX2 and NoVLX.
	let Predicates = [HasAVX2, NoVLX] in {
	  defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
	                                  VR256, loadv4i64, i256mem,
	                                  SSE_INTMUL_ITINS_P, 1, 0>,
	                  VEX_4V, VEX_L, VEX_WIG;
	}
	// 256-bit VEX byte/word min/max, guarded by HasAVX2 and NoVLX_Or_NoBWI.
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	  defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	  defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
	                                 loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	                  VEX_4V, VEX_L, VEX_WIG;
	}

	// Unprefixed two-address min/max and pmuldq: $src1 is tied to $dst and
	// memory operands are matched through memopv2i64.
	let Constraints = "$src1 = $dst" in {
	  defm PMINSB : SS48I_binop_rm<0x38, "pminsb", smin, v16i8, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMINSD : SS48I_binop_rm<0x39, "pminsd", smin, v4i32, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMINUD : SS48I_binop_rm<0x3B, "pminud", umin, v4i32, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", umin, v8i16, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", smax, v16i8, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", smax, v4i32, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", umax, v4i32, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", umax, v8i16, VR128,
	                               memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
	  defm PMULDQ : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
	                                VR128, memopv2i64, i128mem,
	                                SSE_INTMUL_ITINS_P, 1>;
	}

	// 128-bit VEX forms of pmulld (v4i32 mul) and pcmpeqq (v2i64 X86pcmpeq).
	// VPMULLD is additionally gated on NoVLX; VPCMPEQQ requires only HasAVX.
	// Each brace-less `let ... in` applies to the single defm that follows it.
	let Predicates = [HasAVX, NoVLX] in
	defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
	loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
	VEX_4V, VEX_WIG;
	let Predicates = [HasAVX] in
	defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
	loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
	VEX_4V, VEX_WIG;

	// 256-bit (VR256, VEX_L) forms of pmulld (v8i32) and pcmpeqq (v4i64).
	// VPMULLDY is additionally gated on NoVLX; VPCMPEQQY requires only HasAVX2.
	let Predicates = [HasAVX2, NoVLX] in
	defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
	loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
	VEX_4V, VEX_L, VEX_WIG;
	let Predicates = [HasAVX2] in
	defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
	loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
	VEX_4V, VEX_L, VEX_WIG;

	// Non-VEX 128-bit pmulld and pcmpeqq; $src1 is tied to $dst.
	// NOTE(review): PCMPEQQ uses SSE_INTALUQ_ITINS_P whereas the VEX forms
	// VPCMPEQQ/VPCMPEQQY use SSE_INTALU_ITINS_P -- confirm this is intended.
	let Constraints = "$src1 = $dst" in {
	defm PMULLD : SS48I_binop_rm<0x40, "pmulld", mul, v4i32, VR128,
	memopv2i64, i128mem, 1, SSE_PMULLD_ITINS>;
	defm PCMPEQQ : SS48I_binop_rm<0x29, "pcmpeqq", X86pcmpeq, v2i64, VR128,
	memopv2i64, i128mem, 1, SSE_INTALUQ_ITINS_P>;
	}

	/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate,
	/// selected by matching an intrinsic.
	///
	/// Template arguments:
	///   opc        - opcode byte passed to SS4AIi8.
	///   OpcodeStr  - mnemonic prepended to the operand string.
	///   IntId      - intrinsic matched by the selection patterns.
	///   RC         - register class of $dst, $src1 and (for rri) $src2.
	///   memop_frag - load fragment used for $src2 in the rmi pattern; its
	///                result is bitconverted before being passed to IntId.
	///   x86memop   - memory operand type of $src2 in rmi.
	///   Is2Addr    - when set, the asm string omits $src1 (two-operand form);
	///                otherwise the three-operand-plus-immediate form is used.
	///   itins      - supplies the rr/rm itineraries and the Sched write.
	///
	/// Defines `rri` (register source) and `rmi` (memory source).
	multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
	Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
	X86MemOperand x86memop, bit Is2Addr = 1,
	OpndItins itins = DEFAULT_ITINS> {
	// The brace-less `let` applies to rri only.
	let isCommutable = 1 in
	def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set RC:$dst, (IntId RC:$src1, RC:$src2, imm:$src3))], itins.rr>,
	Sched<[itins.Sched]>;
	def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set RC:$dst,
	(IntId RC:$src1,
	(bitconvert (memop_frag addr:$src2)), imm:$src3))], itins.rm>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate,
	/// selected by matching an SDNode.
	///
	/// Same shape as SS41I_binop_rmi_int, except that the patterns match
	/// `OpNode` and wrap the result in the value type `OpVT` instead of
	/// matching an intrinsic.
	///
	/// Template arguments:
	///   opc        - opcode byte passed to SS4AIi8.
	///   OpcodeStr  - mnemonic prepended to the operand string.
	///   OpNode     - SDNode matched by the selection patterns.
	///   OpVT       - value type of the pattern result.
	///   RC         - register class of $dst, $src1 and (for rri) $src2.
	///   memop_frag - load fragment used for $src2 in the rmi pattern.
	///   x86memop   - memory operand type of $src2 in rmi.
	///   Is2Addr    - when set, the asm string omits $src1.
	///   itins      - supplies the rr/rm itineraries and the Sched write.
	multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
	X86MemOperand x86memop, bit Is2Addr = 1,
	OpndItins itins = DEFAULT_ITINS> {
	// The brace-less `let` applies to rri only.
	let isCommutable = 1 in
	def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
	itins.rr>, Sched<[itins.Sched]>;
	def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2, u8imm:$src3),
	!if(Is2Addr,
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $dst\|$dst, $src2, $src3}"),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}")),
	[(set RC:$dst,
	(OpVT (OpNode RC:$src1,
	(bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	// VEX-encoded SSE4.1 binary operations with an 8-bit immediate:
	// vmpsadbw, vblendps/vblendpd (128- and 256-bit), vpblendw, vdpps (128- and
	// 256-bit) and vdppd. All of them pass 0 for Is2Addr, so the asm string
	// lists $src1 explicitly, and all are gated on HasAVX.
	let Predicates = [HasAVX] in {
	// SS41I_binop_rmi_int marks rri as commutable; vmpsadbw is wrapped in
	// isCommutable = 0.
	let isCommutable = 0 in {
	defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
	VR128, loadv2i64, i128mem, 0,
	DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
	}

	let ExeDomain = SSEPackedSingle in {
	defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
	VR128, loadv4f32, f128mem, 0,
	DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
	defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
	VR256, loadv8f32, f256mem, 0,
	DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
	}
	let ExeDomain = SSEPackedDouble in {
	defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
	VR128, loadv2f64, f128mem, 0,
	DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
	defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
	VR256, loadv4f64, f256mem, 0,
	DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
	}
	defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
	VR128, loadv2i64, i128mem, 0,
	DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;

	let ExeDomain = SSEPackedSingle in
	defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
	VR128, loadv4f32, f128mem, 0,
	SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
	// vdppd uses the DPPD itinerary, matching the non-VEX DPPD definition
	// (previously this used the DPPS itinerary).
	let ExeDomain = SSEPackedDouble in
	defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
	VR128, loadv2f64, f128mem, 0,
	SSE_DPPD_ITINS>, VEX_4V, VEX_WIG;
	let ExeDomain = SSEPackedSingle in
	defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
	VR256, loadv8f32, i256mem, 0,
	SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG;
	}

	// 256-bit (VR256, VEX_L) vmpsadbw and vpblendw, gated on HasAVX2.
	let Predicates = [HasAVX2] in {
	// SS41I_binop_rmi_int marks rri as commutable; vmpsadbw is wrapped in
	// isCommutable = 0.
	let isCommutable = 0 in {
	defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
	VR256, loadv4i64, i256mem, 0,
	DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
	}
	defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
	VR256, loadv4i64, i256mem, 0,
	DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
	}

	// Non-VEX 128-bit forms of the immediate-controlled SSE4.1 operations
	// (mpsadbw, blendps/pd, pblendw, dpps, dppd). $src1 is tied to $dst and
	// Is2Addr is 1, so the asm string omits $src1. These use the memop* load
	// fragments.
	let Constraints = "$src1 = $dst" in {
	// SS41I_binop_rmi_int marks rri as commutable; mpsadbw is wrapped in
	// isCommutable = 0.
	let isCommutable = 0 in {
	defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw,
	VR128, memopv2i64, i128mem,
	1, SSE_MPSADBW_ITINS>;
	}
	let ExeDomain = SSEPackedSingle in
	defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
	VR128, memopv4f32, f128mem,
	1, SSE_INTALU_ITINS_FBLEND_P>;
	let ExeDomain = SSEPackedDouble in
	defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
	VR128, memopv2f64, f128mem,
	1, SSE_INTALU_ITINS_FBLEND_P>;
	defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
	VR128, memopv2i64, i128mem,
	1, SSE_INTALU_ITINS_BLEND_P>;
	let ExeDomain = SSEPackedSingle in
	defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
	VR128, memopv4f32, f128mem, 1,
	SSE_DPPS_ITINS>;
	let ExeDomain = SSEPackedDouble in
	defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
	VR128, memopv2f64, f128mem, 1,
	SSE_DPPD_ITINS>;
	}

	// For insertion into the zero index (low half) of a 256-bit vector, it is
	// more efficient to generate a blend with immediate instead of an insert*128.
	//
	// The 128-bit value is first placed in the sub_xmm subregister of an
	// otherwise undefined (IMPLICIT_DEF) 256-bit register, then blended with
	// $src1. The immediates have one bit set per element of the inserted
	// half: 0x3 for 2 x f64, 0xf for 4 x f32.
	let Predicates = [HasAVX] in {
	def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
	(VBLENDPDYrri VR256:$src1,
	(INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0x3)>;
	def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
	(VBLENDPSYrri VR256:$src1,
	(INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	}

	/// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operands
	/// ($dst, $src1, $src2, $src3), where $src3 is a register.
	///
	/// Template arguments:
	///   opc       - opcode byte passed to Ii8Reg.
	///   OpcodeStr - mnemonic prepended to the operand string.
	///   RC        - register class of $dst, $src1, $src3 and (for rr) $src2.
	///   x86memop  - memory operand type of $src2 in rm.
	///   mem_frag  - load fragment used for $src2 in the rm pattern; its
	///               result is bitconverted before being passed to IntId.
	///   IntId     - intrinsic matched by the selection patterns.
	///   Sched     - scheduling write; rm uses its .Folded variant together
	///               with ReadAfterLd.
	///
	/// Defines `rr` and `rm`, both with NoItinerary, the SSEPackedInt domain,
	/// and the TAPD and VEX_4V modifiers.
	multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
	RegisterClass RC, X86MemOperand x86memop,
	PatFrag mem_frag, Intrinsic IntId,
	X86FoldableSchedWrite Sched> {
	def rr : Ii8Reg<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2, RC:$src3),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[(set RC:$dst, (IntId RC:$src1, RC:$src2, RC:$src3))],
	NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
	Sched<[Sched]>;

	def rm : Ii8Reg<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2, RC:$src3),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[(set RC:$dst,
	(IntId RC:$src1, (bitconvert (mem_frag addr:$src2)),
	RC:$src3))],
	NoItinerary, SSEPackedInt>, TAPD, VEX_4V,
	Sched<[Sched.Folded, ReadAfterLd]>;
	}

	// VEX-encoded variable blends (mask supplied in a register operand),
	// gated on HasAVX: vblendvpd/vblendvps in 128- and 256-bit forms and the
	// 128-bit vpblendvb. The floating-point forms override the execution
	// domain and use WriteFVarBlend; vpblendvb uses WriteVarBlend.
	let Predicates = [HasAVX] in {
	let ExeDomain = SSEPackedDouble in {
	defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
	loadv2f64, int_x86_sse41_blendvpd,
	WriteFVarBlend>;
	defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
	loadv4f64, int_x86_avx_blendv_pd_256,
	WriteFVarBlend>, VEX_L;
	} // ExeDomain = SSEPackedDouble
	let ExeDomain = SSEPackedSingle in {
	defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
	loadv4f32, int_x86_sse41_blendvps,
	WriteFVarBlend>;
	defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
	loadv8f32, int_x86_avx_blendv_ps_256,
	WriteFVarBlend>, VEX_L;
	} // ExeDomain = SSEPackedSingle
	defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
	loadv2i64, int_x86_sse41_pblendvb,
	WriteVarBlend>;
	}

	// 256-bit (VR256, VEX_L) vpblendvb, gated on HasAVX2.
	let Predicates = [HasAVX2] in {
	defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
	loadv4i64, int_x86_avx2_pblendvb,
	WriteVarBlend>, VEX_L;
	}

	// Select vselect nodes to the VEX variable-blend instructions.
	// Note the operand order: (vselect mask, src1, src2) becomes
	// (BLENDV src2, src1, mask). 32-bit element types (integer or FP) use
	// VBLENDVPS, 64-bit element types use VBLENDVPD, and v16i8 uses
	// VPBLENDVB.
	let Predicates = [HasAVX] in {
	def : Pat<(v16i8 (vselect (v16i8 VR128:$mask), (v16i8 VR128:$src1),
	(v16i8 VR128:$src2))),
	(VPBLENDVBrr VR128:$src2, VR128:$src1, VR128:$mask)>;
	def : Pat<(v4i32 (vselect (v4i32 VR128:$mask), (v4i32 VR128:$src1),
	(v4i32 VR128:$src2))),
	(VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
	def : Pat<(v4f32 (vselect (v4i32 VR128:$mask), (v4f32 VR128:$src1),
	(v4f32 VR128:$src2))),
	(VBLENDVPSrr VR128:$src2, VR128:$src1, VR128:$mask)>;
	def : Pat<(v2i64 (vselect (v2i64 VR128:$mask), (v2i64 VR128:$src1),
	(v2i64 VR128:$src2))),
	(VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
	def : Pat<(v2f64 (vselect (v2i64 VR128:$mask), (v2f64 VR128:$src1),
	(v2f64 VR128:$src2))),
	(VBLENDVPDrr VR128:$src2, VR128:$src1, VR128:$mask)>;
	// 256-bit variants.
	def : Pat<(v8i32 (vselect (v8i32 VR256:$mask), (v8i32 VR256:$src1),
	(v8i32 VR256:$src2))),
	(VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
	def : Pat<(v8f32 (vselect (v8i32 VR256:$mask), (v8f32 VR256:$src1),
	(v8f32 VR256:$src2))),
	(VBLENDVPSYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
	def : Pat<(v4i64 (vselect (v4i64 VR256:$mask), (v4i64 VR256:$src1),
	(v4i64 VR256:$src2))),
	(VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
	def : Pat<(v4f64 (vselect (v4i64 VR256:$mask), (v4f64 VR256:$src1),
	(v4f64 VR256:$src2))),
	(VBLENDVPDYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
	}

	// 256-bit byte vselect maps to VPBLENDVBY, which is gated on HasAVX2.
	// Operand order is swapped as in the HasAVX patterns:
	// (vselect mask, src1, src2) -> (VPBLENDVBYrr src2, src1, mask).
	let Predicates = [HasAVX2] in {
	def : Pat<(v32i8 (vselect (v32i8 VR256:$mask), (v32i8 VR256:$src1),
	(v32i8 VR256:$src2))),
	(VPBLENDVBYrr VR256:$src2, VR256:$src1, VR256:$mask)>;
	}

	// Patterns
	// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
	// on targets where they have equal performance. These were changed to use
	// blends because blends have better throughput on SandyBridge and Haswell, but
	// movs[s/d] are 1-2 byte shorter instructions.
	//
	// X86vzmovl selection for AVX: the source is blended into a zeroed
	// register (V_SET0 for 128-bit, AVX_SET0 for 256-bit).
	let Predicates = [UseAVX] in {
	let AddedComplexity = 15 in {
	// Move scalar to XMM zero-extended, zeroing a VR128 then do a
	// MOVS{S,D} to the lower bits.
	def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
	(VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
	def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
	(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
	// NOTE(review): immediate 3 with the word blend presumably selects the
	// two 16-bit words making up the low i32 -- confirm.
	def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
	(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
	def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
	(VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;

	// Move low f32 and clear high bits.
	def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
	(VBLENDPSYrri (v8f32 (AVX_SET0)), VR256:$src, (i8 1))>;

	// Move low f64 and clear high bits.
	def : Pat<(v4f64 (X86vzmovl (v4f64 VR256:$src))),
	(VBLENDPDYrri (v4f64 (AVX_SET0)), VR256:$src, (i8 1))>;
	}

	// These will incur an FP/int domain crossing penalty, but it may be the only
	// way without AVX2. Do not add any complexity because we may be able to match
	// more optimal patterns defined earlier in this file.
	def : Pat<(v8i32 (X86vzmovl (v8i32 VR256:$src))),
	(VBLENDPSYrri (v8i32 (AVX_SET0)), VR256:$src, (i8 1))>;
	def : Pat<(v4i64 (X86vzmovl (v4i64 VR256:$src))),
	(VBLENDPDYrri (v4i64 (AVX_SET0)), VR256:$src, (i8 1))>;
	}

	// FIXME: Prefer a movss or movsd over a blendps when optimizing for size or
	// on targets where they have equal performance. These were changed to use
	// blends because blends have better throughput on SandyBridge and Haswell, but
	// movs[s/d] are 1-2 byte shorter instructions.
	//
	// Non-VEX counterparts of the 128-bit X86vzmovl blend patterns, gated on
	// UseSSE41 and given the same AddedComplexity (15) as the UseAVX versions.
	let Predicates = [UseSSE41], AddedComplexity = 15 in {
	// With SSE41 we can use blends for these patterns.
	def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
	(BLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
	def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
	(PBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
	}


	/// SS41I_ternary_int - SSE 4.1 ternary operator whose third operand is
	/// the implicit register XMM0.
	///
	/// The enclosing `let` marks every def as using XMM0 and ties $src1 to
	/// $dst. XMM0 is not part of the `ins` list; it appears literally in the
	/// asm string and as the third argument of the matched intrinsic.
	///
	/// Template arguments:
	///   opc       - opcode byte passed to SS48I.
	///   OpcodeStr - mnemonic prepended to the operand string.
	///   mem_frag  - load fragment used for $src2 in the rm0 pattern; its
	///               result is bitconverted before being passed to IntId.
	///   x86memop  - memory operand type of $src2 in rm0.
	///   IntId     - intrinsic matched by the selection patterns.
	///   itins     - supplies the rr/rm itineraries and the Sched write.
	///
	/// Defines `rr0` (register source) and `rm0` (memory source).
	let Uses = [XMM0], Constraints = "$src1 = $dst" in {
	multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
	X86MemOperand x86memop, Intrinsic IntId,
	OpndItins itins = DEFAULT_ITINS> {
	def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	!strconcat(OpcodeStr,
	"\t{%xmm0, $src2, $dst\|$dst, $src2, xmm0}"),
	[(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
	itins.rr>, Sched<[itins.Sched]>;

	def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, x86memop:$src2),
	!strconcat(OpcodeStr,
	"\t{%xmm0, $src2, $dst\|$dst, $src2, xmm0}"),
	[(set VR128:$dst,
	(IntId VR128:$src1,
	(bitconvert (mem_frag addr:$src2)), XMM0))],
	itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}
	}

	// Non-VEX variable blends that take their mask in the implicit XMM0
	// (see SS41I_ternary_int). The FP forms override the execution domain;
	// PBLENDVB is instantiated without an ExeDomain override.
	let ExeDomain = SSEPackedDouble in
	defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", memopv2f64, f128mem,
	int_x86_sse41_blendvpd,
	DEFAULT_ITINS_FBLENDSCHED>;
	let ExeDomain = SSEPackedSingle in
	defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", memopv4f32, f128mem,
	int_x86_sse41_blendvps,
	DEFAULT_ITINS_FBLENDSCHED>;
	defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
	int_x86_sse41_pblendvb,
	DEFAULT_ITINS_VARBLENDSCHED>;

	// Aliases with the implicit xmm0 argument: accept the two-operand
	// spelling that omits xmm0 and map it to the rr0/rm0 definitions, whose
	// own asm strings spell xmm0 out. The trailing 0 is the final InstAlias
	// argument.
	// NOTE(review): that 0 presumably disables emitting the alias when
	// printing -- confirm against the InstAlias definition.
	def : InstAlias<"blendvpd\t{$src2, $dst\|$dst, $src2}",
	(BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
	def : InstAlias<"blendvpd\t{$src2, $dst\|$dst, $src2}",
	(BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
	def : InstAlias<"blendvps\t{$src2, $dst\|$dst, $src2}",
	(BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
	def : InstAlias<"blendvps\t{$src2, $dst\|$dst, $src2}",
	(BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
	def : InstAlias<"pblendvb\t{$src2, $dst\|$dst, $src2}",
	(PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
	def : InstAlias<"pblendvb\t{$src2, $dst\|$dst, $src2}",
	(PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;

	// Select 128-bit vselect nodes to the non-VEX variable blends. The mask
	// must be in XMM0, and the operand order is swapped:
	// (vselect XMM0, src1, src2) -> (BLENDV*rr0 src2, src1).
	// 32-bit element types use BLENDVPS, 64-bit element types use BLENDVPD,
	// and v16i8 uses PBLENDVB.
	let Predicates = [UseSSE41] in {
	def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
	(v16i8 VR128:$src2))),
	(PBLENDVBrr0 VR128:$src2, VR128:$src1)>;
	def : Pat<(v4i32 (vselect (v4i32 XMM0), (v4i32 VR128:$src1),
	(v4i32 VR128:$src2))),
	(BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
	def : Pat<(v4f32 (vselect (v4i32 XMM0), (v4f32 VR128:$src1),
	(v4f32 VR128:$src2))),
	(BLENDVPSrr0 VR128:$src2, VR128:$src1)>;
	def : Pat<(v2i64 (vselect (v2i64 XMM0), (v2i64 VR128:$src1),
	(v2i64 VR128:$src2))),
	(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
	def : Pat<(v2f64 (vselect (v2i64 XMM0), (v2f64 VR128:$src1),
	(v2f64 VR128:$src2))),
	(BLENDVPDrr0 VR128:$src2, VR128:$src1)>;
	}

	// Non-temporal aligned loads (movntdqa / vmovntdqa). The instruction
	// definitions carry no selection pattern of their own ([]); the Pat
	// records below map alignednontemporalload of each 128/256-bit FP and
	// i64 vector type onto them. AddedComplexity = 400 applies to everything
	// in this scope.
	let AddedComplexity = 400 in { // Prefer non-temporal versions
	let SchedRW = [WriteLoad] in {
	let Predicates = [HasAVX, NoVLX] in
	def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	"vmovntdqa\t{$src, $dst\|$dst, $src}", []>,
	VEX, VEX_WIG;
	let Predicates = [HasAVX2, NoVLX] in
	def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
	"vmovntdqa\t{$src, $dst\|$dst, $src}", []>,
	VEX, VEX_L, VEX_WIG;
	def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
	"movntdqa\t{$src, $dst\|$dst, $src}", []>;
	} // SchedRW

	// 256-bit loads use the VEX_L form.
	let Predicates = [HasAVX2, NoVLX] in {
	def : Pat<(v8f32 (alignednontemporalload addr:$src)),
	(VMOVNTDQAYrm addr:$src)>;
	def : Pat<(v4f64 (alignednontemporalload addr:$src)),
	(VMOVNTDQAYrm addr:$src)>;
	def : Pat<(v4i64 (alignednontemporalload addr:$src)),
	(VMOVNTDQAYrm addr:$src)>;
	}

	// 128-bit loads, VEX encoding.
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4f32 (alignednontemporalload addr:$src)),
	(VMOVNTDQArm addr:$src)>;
	def : Pat<(v2f64 (alignednontemporalload addr:$src)),
	(VMOVNTDQArm addr:$src)>;
	def : Pat<(v2i64 (alignednontemporalload addr:$src)),
	(VMOVNTDQArm addr:$src)>;
	}

	// 128-bit loads, non-VEX encoding.
	let Predicates = [UseSSE41] in {
	def : Pat<(v4f32 (alignednontemporalload addr:$src)),
	(MOVNTDQArm addr:$src)>;
	def : Pat<(v2f64 (alignednontemporalload addr:$src)),
	(MOVNTDQArm addr:$src)>;
	def : Pat<(v2i64 (alignednontemporalload addr:$src)),
	(MOVNTDQArm addr:$src)>;
	}

	} // AddedComplexity

	//===----------------------------------------------------------------------===//
	// SSE4.2 - Compare Instructions
	//===----------------------------------------------------------------------===//

	/// SS42I_binop_rm - Simple SSE 4.2 binary operator
	///
	/// Template arguments:
	///   opc        - opcode byte passed to SS428I.
	///   OpcodeStr  - mnemonic prepended to the operand string.
	///   OpNode     - SDNode matched by the selection patterns.
	///   OpVT       - value type of the pattern result.
	///   RC         - register class of $dst, $src1 and (for rr) $src2.
	///   memop_frag - load fragment used for $src2 in the rm pattern (used
	///                directly, without a bitconvert).
	///   x86memop   - memory operand type of $src2 in rm.
	///   itins      - only its Sched field is used here; no itinerary is
	///                passed to SS428I.
	///   Is2Addr    - when set, the asm string omits $src1.
	///
	/// Defines `rr` and `rm`.
	multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
	X86MemOperand x86memop, OpndItins itins,
	bit Is2Addr = 1> {
	def rr : SS428I<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2)))]>, Sched<[itins.Sched]>;
	def rm : SS428I<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set RC:$dst,
	(OpVT (OpNode RC:$src1, (memop_frag addr:$src2))))]>,
	Sched<[itins.Sched.Folded, ReadAfterLd]>;
	}

	// 128-bit VEX form of pcmpgtq (v2i64 X86pcmpgt); Is2Addr = 0 selects the
	// three-operand asm string.
	let Predicates = [HasAVX] in
	defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
	loadv2i64, i128mem, SSE_INTALU_ITINS_P, 0>,
	VEX_4V, VEX_WIG;

	// 256-bit (VR256, VEX_L) form of vpcmpgtq (v4i64), gated on HasAVX2.
	let Predicates = [HasAVX2] in
	defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
	loadv4i64, i256mem, SSE_INTALU_ITINS_P, 0>,
	VEX_4V, VEX_L, VEX_WIG;

	// Non-VEX pcmpgtq: $src1 is tied to $dst and Is2Addr keeps its default
	// of 1 (two-operand asm string).
	let Constraints = "$src1 = $dst" in
	defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
	memopv2i64, i128mem, SSE_INTALU_ITINS_P>;

	//===----------------------------------------------------------------------===//
	// SSE4.2 - String/text Processing Instructions
	//===----------------------------------------------------------------------===//

	// Packed Compare Implicit Length Strings, Return Mask
	//
	// Pseudo-instructions that carry the selection patterns for
	// int_x86_sse42_pcmpistrm128 and return the result in a VR128 $dst.
	//   asm     - accepted but not referenced inside this multiclass.
	//   ld_frag - load fragment for the memory form; its result is cast
	//             with bc_v16i8.
	// Defines `REG` (register $src2) and `MEM` (memory $src2).
	multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
	def REG : PseudoI<(outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	[(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
	imm:$src3))]>;
	def MEM : PseudoI<(outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	[(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
	(bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
	}

	// Instantiate the pcmpistrm pseudos. They define EFLAGS and are expanded
	// through the custom inserter hook (usesCustomInserter = 1). The AVX
	// variant uses loadv2i64, the SSE4.2 variant uses memopv2i64.
	let Defs = [EFLAGS], usesCustomInserter = 1 in {
	defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
	Requires<[HasAVX]>, VEX_WIG;
	defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
	Requires<[UseSSE42]>;
	}

	// Real pcmpistrm encodings (opcode 0x62). Neither def has an explicit
	// output operand or a selection pattern; results are delivered through
	// the implicit Defs set where the multiclass is instantiated.
	//   asm - mnemonic prepended to the operand string.
	// Defines `rr` and `rm`; rm is marked mayLoad since it has no pattern
	// from which a load could be inferred.
	multiclass pcmpistrm_SS42AI<string asm> {
	def rr : SS42AI<0x62, MRMSrcReg, (outs),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	!strconcat(asm, "\t{$src3, $src2, $src1\|$src1, $src2, $src3}"),
	[]>, Sched<[WritePCmpIStrM]>;
	let mayLoad = 1 in
	def rm :SS42AI<0x62, MRMSrcMem, (outs),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	!strconcat(asm, "\t{$src3, $src2, $src1\|$src1, $src2, $src3}"),
	[]>, Sched<[WritePCmpIStrMLd, ReadAfterLd]>;
	}

	// Real pcmpistrm instructions: implicitly define XMM0 and EFLAGS.
	// The brace-less Predicates `let` applies to the VEX form only.
	let Defs = [XMM0, EFLAGS], hasSideEffects = 0 in {
	let Predicates = [HasAVX] in
	defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
	defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm"> ;
	}

	// Packed Compare Explicit Length Strings, Return Mask
	//
	// Pseudo-instructions that carry the selection patterns for
	// int_x86_sse42_pcmpestrm128. The intrinsic takes five arguments; the
	// second and fourth are the fixed registers EAX and EDX, which is why
	// the explicit operands are numbered $src1, $src3 and $src5.
	//   asm     - accepted but not referenced inside this multiclass.
	//   ld_frag - load fragment for the memory form; its result is cast
	//             with bc_v16i8.
	// Defines `REG` and `MEM`.
	multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
	def REG : PseudoI<(outs VR128:$dst),
	(ins VR128:$src1, VR128:$src3, u8imm:$src5),
	[(set VR128:$dst, (int_x86_sse42_pcmpestrm128
	VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
	def MEM : PseudoI<(outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
	[(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
	(bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
	}

	// Instantiate the pcmpestrm pseudos. They define EFLAGS, implicitly use
	// EAX and EDX, and are expanded through the custom inserter hook.
	let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
	defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
	Requires<[HasAVX]>;
	defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
	Requires<[UseSSE42]>;
	}

	// Real pcmpestrm encodings (opcode 0x60). Neither def has an explicit
	// output operand or a selection pattern; the implicit registers are
	// attached where the multiclass is instantiated.
	//   asm - mnemonic prepended to the operand string.
	// Defines `rr` and `rm`; rm is marked mayLoad since it has no pattern.
	multiclass SS42AI_pcmpestrm<string asm> {
	def rr : SS42AI<0x60, MRMSrcReg, (outs),
	(ins VR128:$src1, VR128:$src3, u8imm:$src5),
	!strconcat(asm, "\t{$src5, $src3, $src1\|$src1, $src3, $src5}"),
	[]>, Sched<[WritePCmpEStrM]>;
	let mayLoad = 1 in
	def rm : SS42AI<0x60, MRMSrcMem, (outs),
	(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
	!strconcat(asm, "\t{$src5, $src3, $src1\|$src1, $src3, $src5}"),
	[]>, Sched<[WritePCmpEStrMLd, ReadAfterLd]>;
	}

	// Real pcmpestrm instructions: implicitly define XMM0 and EFLAGS and
	// implicitly use EAX and EDX. The brace-less Predicates `let` applies to
	// the VEX form only.
	let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
	let Predicates = [HasAVX] in
	defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
	defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
	}

	// Packed Compare Implicit Length Strings, Return Index
	//
	// Pseudo-instructions that carry the selection patterns for the
	// X86pcmpistri node, which produces two results: a GR32 $dst and EFLAGS.
	//   asm     - accepted but not referenced inside this multiclass.
	//   ld_frag - load fragment for the memory form; its result is cast
	//             with bc_v16i8.
	// Defines `REG` and `MEM`.
	multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
	def REG : PseudoI<(outs GR32:$dst),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	[(set GR32:$dst, EFLAGS,
	(X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
	def MEM : PseudoI<(outs GR32:$dst),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	[(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
	(bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
	}

	// Instantiate the pcmpistri pseudos. They define EFLAGS and are expanded
	// through the custom inserter hook.
	let Defs = [EFLAGS], usesCustomInserter = 1 in {
	defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
	Requires<[HasAVX]>, VEX_WIG;
	defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
	Requires<[UseSSE42]>;
	}

	// Real pcmpistri encodings (opcode 0x63). Neither def has an explicit
	// output operand or a selection pattern; the implicit registers are
	// attached where the multiclass is instantiated.
	//   asm - mnemonic prepended to the operand string.
	// Defines `rr` and `rm`; rm is marked mayLoad since it has no pattern.
	multiclass SS42AI_pcmpistri<string asm> {
	def rr : SS42AI<0x63, MRMSrcReg, (outs),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	!strconcat(asm, "\t{$src3, $src2, $src1\|$src1, $src2, $src3}"),
	[]>, Sched<[WritePCmpIStrI]>;
	let mayLoad = 1 in
	def rm : SS42AI<0x63, MRMSrcMem, (outs),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	!strconcat(asm, "\t{$src3, $src2, $src1\|$src1, $src2, $src3}"),
	[]>, Sched<[WritePCmpIStrILd, ReadAfterLd]>;
	}

	// Real pcmpistri instructions: implicitly define ECX and EFLAGS.
	// The brace-less Predicates `let` applies to the VEX form only.
	let Defs = [ECX, EFLAGS], hasSideEffects = 0 in {
	let Predicates = [HasAVX] in
	defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
	defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
	}

	// Packed Compare Explicit Length Strings, Return Index
	//
	// Pseudo-instructions that carry the selection patterns for the
	// X86pcmpestri node, which produces a GR32 $dst and EFLAGS. The node
	// takes five arguments; the second and fourth are the fixed registers
	// EAX and EDX, hence the operand names $src1, $src3 and $src5.
	//   asm     - accepted but not referenced inside this multiclass.
	//   ld_frag - load fragment for the memory form; its result is cast
	//             with bc_v16i8.
	// Defines `REG` and `MEM`.
	multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
	def REG : PseudoI<(outs GR32:$dst),
	(ins VR128:$src1, VR128:$src3, u8imm:$src5),
	[(set GR32:$dst, EFLAGS,
	(X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
	def MEM : PseudoI<(outs GR32:$dst),
	(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
	[(set GR32:$dst, EFLAGS,
	(X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (ld_frag addr:$src3)), EDX,
	imm:$src5))]>;
	}

	// Instantiate the pcmpestri pseudos. They define EFLAGS, implicitly use
	// EAX and EDX, and are expanded through the custom inserter hook.
	let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
	defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
	Requires<[HasAVX]>;
	defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
	Requires<[UseSSE42]>;
	}

	// Real pcmpestri encodings (opcode 0x61). Neither def has an explicit
	// output operand or a selection pattern; the implicit registers are
	// attached where the multiclass is instantiated.
	//   asm - mnemonic prepended to the operand string.
	// Defines `rr` and `rm`; rm is marked mayLoad since it has no pattern.
	multiclass SS42AI_pcmpestri<string asm> {
	def rr : SS42AI<0x61, MRMSrcReg, (outs),
	(ins VR128:$src1, VR128:$src3, u8imm:$src5),
	!strconcat(asm, "\t{$src5, $src3, $src1\|$src1, $src3, $src5}"),
	[]>, Sched<[WritePCmpEStrI]>;
	let mayLoad = 1 in
	def rm : SS42AI<0x61, MRMSrcMem, (outs),
	(ins VR128:$src1, i128mem:$src3, u8imm:$src5),
	!strconcat(asm, "\t{$src5, $src3, $src1\|$src1, $src3, $src5}"),
	[]>, Sched<[WritePCmpEStrILd, ReadAfterLd]>;
	}

	// Real pcmpestri instructions: implicitly define ECX and EFLAGS and
	// implicitly use EAX and EDX. The brace-less Predicates `let` applies to
	// the VEX form only.
	let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
	let Predicates = [HasAVX] in
	defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
	defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
	}

	//===----------------------------------------------------------------------===//
	// SSE4.2 - CRC Instructions
	//===----------------------------------------------------------------------===//

	// No CRC instructions have AVX equivalents

	// CRC32 instruction classes.
	// Each CRC32 instruction takes a register accumulator ($src1, in the
	// same register class as $dst) and a register or memory source ($src2);
	// the variants differ only in the sizes of those two operands.
	//
	// SS42I_crc32r - register-source form.
	//   opc   - opcode byte passed to SS42FI.
	//   asm   - mnemonic prepended to the operand string.
	//   RCOut - register class of $dst and $src1.
	//   RCIn  - register class of $src2.
	//   Int   - pattern operator matched for selection.
	class SS42I_crc32r<bits<8> opc, string asm, RegisterClass RCOut,
	RegisterClass RCIn, SDPatternOperator Int> :
	SS42FI<opc, MRMSrcReg, (outs RCOut:$dst), (ins RCOut:$src1, RCIn:$src2),
	!strconcat(asm, "\t{$src2, $src1\|$src1, $src2}"),
	[(set RCOut:$dst, (Int RCOut:$src1, RCIn:$src2))], IIC_CRC32_REG>,
	Sched<[WriteFAdd]>;

	// SS42I_crc32m - memory-source form of the CRC32 instruction.
	//   opc      - opcode byte passed to SS42FI.
	//   asm      - mnemonic prepended to the operand string.
	//   RCOut    - register class of $dst and $src1.
	//   x86memop - memory operand type of $src2; the pattern loads it with
	//              the generic `load` fragment.
	//   Int      - pattern operator matched for selection.
	class SS42I_crc32m<bits<8> opc, string asm, RegisterClass RCOut,
	X86MemOperand x86memop, SDPatternOperator Int> :
	SS42FI<opc, MRMSrcMem, (outs RCOut:$dst), (ins RCOut:$src1, x86memop:$src2),
	!strconcat(asm, "\t{$src2, $src1\|$src1, $src2}"),
	[(set RCOut:$dst, (Int RCOut:$src1, (load addr:$src2)))],
	IIC_CRC32_MEM>, Sched<[WriteFAddLd, ReadAfterLd]>;

	// CRC32 instruction definitions. $src1 (the running CRC) is tied to $dst.
	// Opcode 0xF0 is used for the 8-bit source forms and 0xF1 for the
	// 16/32/64-bit source forms, which are told apart by the OpSize16,
	// OpSize32 and REX_W modifiers.
	let Constraints = "$src1 = $dst" in {
	def CRC32r32m8 : SS42I_crc32m<0xF0, "crc32{b}", GR32, i8mem,
	int_x86_sse42_crc32_32_8>;
	def CRC32r32r8 : SS42I_crc32r<0xF0, "crc32{b}", GR32, GR8,
	int_x86_sse42_crc32_32_8>;
	def CRC32r32m16 : SS42I_crc32m<0xF1, "crc32{w}", GR32, i16mem,
	int_x86_sse42_crc32_32_16>, OpSize16;
	def CRC32r32r16 : SS42I_crc32r<0xF1, "crc32{w}", GR32, GR16,
	int_x86_sse42_crc32_32_16>, OpSize16;
	def CRC32r32m32 : SS42I_crc32m<0xF1, "crc32{l}", GR32, i32mem,
	int_x86_sse42_crc32_32_32>, OpSize32;
	def CRC32r32r32 : SS42I_crc32r<0xF1, "crc32{l}", GR32, GR32,
	int_x86_sse42_crc32_32_32>, OpSize32;
	def CRC32r64m64 : SS42I_crc32m<0xF1, "crc32{q}", GR64, i64mem,
	int_x86_sse42_crc32_64_64>, REX_W;
	def CRC32r64r64 : SS42I_crc32r<0xF1, "crc32{q}", GR64, GR64,
	int_x86_sse42_crc32_64_64>, REX_W;
	// The 64-bit-accumulator, 8-bit-source forms are given null_frag as the
	// pattern operator, so hasSideEffects (and mayLoad for the memory form)
	// are stated explicitly.
	let hasSideEffects = 0 in {
	let mayLoad = 1 in
	def CRC32r64m8 : SS42I_crc32m<0xF0, "crc32{b}", GR64, i8mem,
	null_frag>, REX_W;
	def CRC32r64r8 : SS42I_crc32r<0xF0, "crc32{b}", GR64, GR8,
	null_frag>, REX_W;
	}
	}

	//===----------------------------------------------------------------------===//
	// SHA-NI Instructions
	//===----------------------------------------------------------------------===//

	// SHAI_binop - two-operand SHA instruction on VR128, optionally with an
	// implicit third XMM0 operand.
	//   Opc       - opcode byte passed to I.
	//   OpcodeStr - mnemonic prepended to the operand string.
	//   IntId     - intrinsic matched by the selection patterns.
	//   UsesXMM0  - when set, the asm string spells out xmm0 and the
	//               pattern passes XMM0 as a third intrinsic argument.
	// Defines `rr` and `rm` (both T8). The memory form loads with
	// memopv2i64 and casts the result with bc_v4i32.
	multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
	bit UsesXMM0 = 0> {
	def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	!if(UsesXMM0,
	!strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst\|$dst, $src2, xmm0}"),
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}")),
	[!if(UsesXMM0,
	(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
	(set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;

	def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2),
	!if(UsesXMM0,
	!strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst\|$dst, $src2, xmm0}"),
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}")),
	[!if(UsesXMM0,
	(set VR128:$dst, (IntId VR128:$src1,
	(bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
	(set VR128:$dst, (IntId VR128:$src1,
	(bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
	}

	// SHA instruction definitions, gated on HasSHA, with $src1 tied to $dst.
	// sha1rnds4 is defined directly because it takes an 8-bit immediate;
	// the others go through SHAI_binop.
	let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
	def SHA1RNDS4rri : Ii8<0xCC, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	"sha1rnds4\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	[(set VR128:$dst,
	(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
	(i8 imm:$src3)))]>, TA;
	def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	"sha1rnds4\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	[(set VR128:$dst,
	(int_x86_sha1rnds4 VR128:$src1,
	(bc_v4i32 (memopv2i64 addr:$src2)),
	(i8 imm:$src3)))]>, TA;

	defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
	defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
	defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;

	// sha256rnds2 is the only SHAI_binop instance with UsesXMM0 = 1; the
	// implicit register use is declared here.
	let Uses=[XMM0] in
	defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;

	defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
	defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
	}

	// Aliases for the implicit-%xmm0 spelling: accept sha256rnds2 written
	// with only two operands and map it to the SHA256RNDS2rr/rm definitions,
	// whose own asm strings spell %xmm0 out explicitly.
	def : InstAlias<"sha256rnds2\t{$src2, $dst\|$dst, $src2}",
	(SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
	def : InstAlias<"sha256rnds2\t{$src2, $dst\|$dst, $src2}",
	(SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;

	//===----------------------------------------------------------------------===//
	// AES-NI Instructions
	//===----------------------------------------------------------------------===//

	// AESI_binop_rm_int - two-source AES instruction on VR128, selected by
	// matching an intrinsic.
	//   opc       - opcode byte passed to AES8I.
	//   OpcodeStr - mnemonic prepended to the operand string.
	//   IntId128  - intrinsic matched by the selection patterns.
	//   ld_frag   - load fragment used for $src2 in the rm pattern.
	//   Is2Addr   - when set, the asm string omits $src1.
	// Defines `rr` (WriteAESDecEnc) and `rm` (WriteAESDecEncLd, ReadAfterLd).
	multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
	PatFrag ld_frag, bit Is2Addr = 1> {
	def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
	Sched<[WriteAESDecEnc]>;
	def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2),
	!if(Is2Addr,
	!strconcat(OpcodeStr, "\t{$src2, $dst\|$dst, $src2}"),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}")),
	[(set VR128:$dst,
	(IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
	Sched<[WriteAESDecEncLd, ReadAfterLd]>;
	}

	// Perform One Round of an AES Encryption/Decryption Flow
	// VEX forms: gated on both HasAVX and HasAES, three-operand asm string
	// (Is2Addr = 0), loadv2i64 for the memory form.
	let Predicates = [HasAVX, HasAES] in {
	defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
	int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
	int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
	int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG;
	defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
	int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
	}

	// Non-VEX AES round instructions: $src1 is tied to $dst, Is2Addr keeps
	// its default of 1, and the memory form uses memopv2i64.
	// NOTE(review): no Predicates are set in this scope; presumably the
	// AES8I class supplies the feature requirement -- confirm.
	let Constraints = "$src1 = $dst" in {
	defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
	int_x86_aesni_aesenc, memopv2i64>;
	defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
	int_x86_aesni_aesenclast, memopv2i64>;
	defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
	int_x86_aesni_aesdec, memopv2i64>;
	defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
	int_x86_aesni_aesdeclast, memopv2i64>;
	}

	// Perform the AES InvMixColumn Transformation
	// VEX forms of aesimc: a single-source operation (one input operand,
	// register or memory), gated on HasAVX and HasAES.
	let Predicates = [HasAVX, HasAES] in {
	def VAESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1),
	"vaesimc\t{$src1, $dst\|$dst, $src1}",
	[(set VR128:$dst,
	(int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
	VEX, VEX_WIG;
	def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
	(ins i128mem:$src1),
	"vaesimc\t{$src1, $dst\|$dst, $src1}",
	[(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
	Sched<[WriteAESIMCLd]>, VEX, VEX_WIG;
	}
	// Non-VEX aesimc: same opcode (0xDB) and intrinsic as the VEX forms; the
	// memory form uses memopv2i64 instead of loadv2i64. Since the operation
	// has a single source, no $src1 = $dst constraint is involved.
	def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1),
	"aesimc\t{$src1, $dst\|$dst, $src1}",
	[(set VR128:$dst,
	(int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>;
	def AESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
	(ins i128mem:$src1),
	"aesimc\t{$src1, $dst\|$dst, $src1}",
	[(set VR128:$dst, (int_x86_aesni_aesimc (memopv2i64 addr:$src1)))]>,
	Sched<[WriteAESIMCLd]>;

	// AES Round Key Generation Assist
	// VEX forms: one vector source (register or memory) plus an 8-bit
	// immediate, gated on HasAVX and HasAES.
	let Predicates = [HasAVX, HasAES] in {
	def VAESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, u8imm:$src2),
	"vaeskeygenassist\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
	Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
	def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
	(ins i128mem:$src1, u8imm:$src2),
	"vaeskeygenassist\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
	Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG;
	}
	// Non-VEX aeskeygenassist: same opcode (0xDF) and intrinsic as the VEX
	// forms; the memory form uses memopv2i64 instead of loadv2i64.
	def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, u8imm:$src2),
	"aeskeygenassist\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
	Sched<[WriteAESKeyGen]>;
	def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
	(ins i128mem:$src1, u8imm:$src2),
	"aeskeygenassist\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst,
	(int_x86_aesni_aeskeygenassist (memopv2i64 addr:$src1), imm:$src2))]>,
	Sched<[WriteAESKeyGenLd]>;

	//===----------------------------------------------------------------------===//
	// PCLMUL Instructions
	//===----------------------------------------------------------------------===//

	// AVX carry-less Multiplication instructions
	// Three-operand register form. The brace-less `let` below applies only to
	// this one def, so only the rr form is marked commutable.
	let isCommutable = 1 in
	def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	"vpclmulqdq\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	[(set VR128:$dst,
	(int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
	Sched<[WriteCLMul]>, VEX_WIG;

	// Memory form: $src2 is read from memory via loadv2i64. Not marked commutable.
	def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	"vpclmulqdq\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
	(loadv2i64 addr:$src2), imm:$src3))]>,
	Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG;

	// Carry-less Multiplication instructions
	// Two-address forms: $src1 is tied to $dst, which is why the asm strings
	// print only $dst, $src2 and the immediate.
	let Constraints = "$src1 = $dst" in {
	// The brace-less `let` applies only to the rr def that follows it.
	let isCommutable = 1 in
	def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2, u8imm:$src3),
	"pclmulqdq\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	[(set VR128:$dst,
	(int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
	IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;

	// Memory form: $src2 is read via memopv2i64.
	def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
	"pclmulqdq\t{$src3, $src2, $dst\|$dst, $src2, $src3}",
	[(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
	(memopv2i64 addr:$src2), imm:$src3))],
	IIC_SSE_PCLMULQDQ_RM>,
	Sched<[WriteCLMulLd, ReadAfterLd]>;
	} // Constraints = "$src1 = $dst"


	// pclmul_alias - Defines four assembler aliases that spell the immediate as
	// part of the mnemonic.
	//   asm   - infix placed between "pclmul"/"vpclmul" and "dq" in the mnemonic.
	//   immop - constant passed as the immediate operand of the real instruction.
	// Each alias is created for the rr and rm forms of both PCLMULQDQ and
	// VPCLMULQDQ.
	// NOTE(review): the trailing 0 is the last InstAlias argument; presumably it
	// keeps the alias from being used when printing - confirm against the
	// InstAlias class definition.
	multiclass pclmul_alias<string asm, int immop> {
	def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst\|$dst, $src}"),
	(PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;

	def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst\|$dst, $src}"),
	(PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;

	def : InstAlias<!strconcat("vpclmul", asm,
	"dq {$src2, $src1, $dst\|$dst, $src1, $src2}"),
	(VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
	0>;

	def : InstAlias<!strconcat("vpclmul", asm,
	"dq {$src2, $src1, $dst\|$dst, $src1, $src2}"),
	(VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
	0>;
	}
	// One instantiation per mnemonic infix / immediate pair.
	defm : pclmul_alias<"hqhq", 0x11>;
	defm : pclmul_alias<"hqlq", 0x01>;
	defm : pclmul_alias<"lqhq", 0x10>;
	defm : pclmul_alias<"lqlq", 0x00>;

	//===----------------------------------------------------------------------===//
	// SSE4A Instructions
	//===----------------------------------------------------------------------===//

	// Everything in this block is selected only when HasSSE4A holds.
	let Predicates = [HasSSE4A] in {

	let ExeDomain = SSEPackedInt in {
	// All four instructions are two-address: $src is tied to $dst.
	let Constraints = "$src = $dst" in {
	// Immediate form of extrq: length and index are given as two 8-bit immediates.
	def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
	(ins VR128:$src, u8imm:$len, u8imm:$idx),
	"extrq\t{$idx, $len, $src\|$src, $len, $idx}",
	[(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
	imm:$idx))]>, PD;
	// Register form of extrq: the second operand is the VR128 $mask.
	def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src, VR128:$mask),
	"extrq\t{$mask, $src\|$src, $mask}",
	[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
	VR128:$mask))]>, PD;

	// Immediate form of insertq: takes a second register plus length and index
	// immediates. Shares opcode 0x78 with EXTRQI but uses the XD prefix.
	def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
	"insertq\t{$idx, $len, $src2, $src\|$src, $src2, $len, $idx}",
	[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
	imm:$len, imm:$idx))]>, XD;
	// Register form of insertq. Shares opcode 0x79 with EXTRQ but uses XD.
	def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src, VR128:$mask),
	"insertq\t{$mask, $src\|$src, $mask}",
	[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
	VR128:$mask))]>, XD;
	}
	} // ExeDomain = SSEPackedInt

	// Non-temporal (unaligned) scalar stores.
	let AddedComplexity = 400 in { // Prefer non-temporal versions
	// Both defs have an empty pattern list, so mayStore is set explicitly; they
	// are selected through the Pat entries that follow.
	let mayStore = 1, SchedRW = [WriteStore] in {
	def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
	"movntss\t{$src, $dst\|$dst, $src}", [], IIC_SSE_MOVNT>, XS;

	def MOVNTSD : I<0x2B, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
	"movntsd\t{$src, $dst\|$dst, $src}", [], IIC_SSE_MOVNT>, XD;
	} // SchedRW

	// The instructions take a VR128 source, so the scalar FR32/FR64 value is
	// first copied into the VR128 register class.
	def : Pat<(nontemporalstore FR32:$src, addr:$dst),
	(MOVNTSS addr:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;

	def : Pat<(nontemporalstore FR64:$src, addr:$dst),
	(MOVNTSD addr:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;

	} // AddedComplexity
	} // HasSSE4A

	//===----------------------------------------------------------------------===//
	// AVX Instructions
	//===----------------------------------------------------------------------===//

	//===----------------------------------------------------------------------===//
	// VBROADCAST - Load from memory and broadcast to all elements of the
	// destination operand
	//
	// avx_broadcast_rm - Memory-source broadcast instruction.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic.
	//   RC        - destination register class.
	//   x86memop  - memory operand type of the source.
	//   VT        - vector type of the broadcast result.
	//   ld_frag   - load fragment applied to the source address.
	//   Sched     - scheduling write class (note: shadows the Sched<> class name
	//               used on the last line).
	class avx_broadcast_rm<bits<8> opc, string OpcodeStr, RegisterClass RC,
	X86MemOperand x86memop, ValueType VT,
	PatFrag ld_frag, SchedWrite Sched> :
	AVX8I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (VT (X86VBroadcast (ld_frag addr:$src))))]>,
	Sched<[Sched]>, VEX;

	// AVX2 adds register forms
	// avx2_broadcast_rr - Register-source broadcast instruction. The source is
	// always a VR128 register.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic.
	//   RC        - destination register class.
	//   ResVT     - vector type of the result.
	//   OpVT      - vector type given to the VR128 source in the pattern.
	//   Sched     - scheduling write class.
	class avx2_broadcast_rr<bits<8> opc, string OpcodeStr, RegisterClass RC,
	ValueType ResVT, ValueType OpVT, SchedWrite Sched> :
	AVX28I<opc, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set RC:$dst, (ResVT (X86VBroadcast (OpVT VR128:$src))))]>,
	Sched<[Sched]>, VEX;

	// Memory-source forms: require HasAVX and NoVLX. The Y variants target VR256
	// and add VEX_L.
	let ExeDomain = SSEPackedSingle, Predicates = [HasAVX, NoVLX] in {
	def VBROADCASTSSrm : avx_broadcast_rm<0x18, "vbroadcastss", VR128,
	f32mem, v4f32, loadf32, WriteLoad>;
	def VBROADCASTSSYrm : avx_broadcast_rm<0x18, "vbroadcastss", VR256,
	f32mem, v8f32, loadf32,
	WriteFShuffleLd>, VEX_L;
	}
	// vbroadcastsd is defined here only with a VR256 destination.
	let ExeDomain = SSEPackedDouble, Predicates = [HasAVX, NoVLX] in
	def VBROADCASTSDYrm : avx_broadcast_rm<0x19, "vbroadcastsd", VR256, f64mem,
	v4f64, loadf64, WriteFShuffleLd>, VEX_L;

	// Register-source forms: same opcodes as above but require HasAVX2.
	let ExeDomain = SSEPackedSingle, Predicates = [HasAVX2, NoVLX] in {
	def VBROADCASTSSrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR128,
	v4f32, v4f32, WriteFShuffle>;
	def VBROADCASTSSYrr : avx2_broadcast_rr<0x18, "vbroadcastss", VR256,
	v8f32, v4f32, WriteFShuffle256>, VEX_L;
	}
	let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
	def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
	v4f64, v2f64, WriteFShuffle256>, VEX_L;

	//===----------------------------------------------------------------------===//
	// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
	// halves of a 256-bit vector.
	//
	// Both defs have an empty pattern list, so mayLoad and hasSideEffects are
	// stated explicitly; selection happens through separate Pat entries.
	// Integer variant, requires HasAVX2.
	let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX2] in
	def VBROADCASTI128 : AVX8I<0x5A, MRMSrcMem, (outs VR256:$dst),
	(ins i128mem:$src),
	"vbroadcasti128\t{$src, $dst\|$dst, $src}", []>,
	Sched<[WriteLoad]>, VEX, VEX_L;

	// Floating-point variant, requires only HasAVX.
	let mayLoad = 1, hasSideEffects = 0, Predicates = [HasAVX] in
	def VBROADCASTF128 : AVX8I<0x1A, MRMSrcMem, (outs VR256:$dst),
	(ins f128mem:$src),
	"vbroadcastf128\t{$src, $dst\|$dst, $src}", []>,
	Sched<[WriteFShuffleLd]>, VEX, VEX_L;

	// Integer 128-bit subvector broadcasts from a load: with AVX2 (and no VLX)
	// every integer element type maps to VBROADCASTI128.
	let Predicates = [HasAVX2, NoVLX] in {
	def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
	(VBROADCASTI128 addr:$src)>;
	def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
	(VBROADCASTI128 addr:$src)>;
	def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
	(VBROADCASTI128 addr:$src)>;
	def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
	(VBROADCASTI128 addr:$src)>;
	}

	// Floating-point subvector broadcasts map to VBROADCASTF128.
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
	(VBROADCASTF128 addr:$src)>;
	def : Pat<(v8f32 (X86SubVBroadcast (loadv4f32 addr:$src))),
	(VBROADCASTF128 addr:$src)>;
	}

	// Under HasAVX1Only the integer types are also mapped to VBROADCASTF128,
	// mirroring the AVX2 patterns above.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
	(VBROADCASTF128 addr:$src)>;
	def : Pat<(v8i32 (X86SubVBroadcast (bc_v4i32 (loadv2i64 addr:$src)))),
	(VBROADCASTF128 addr:$src)>;
	def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
	(VBROADCASTF128 addr:$src)>;
	def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
	(VBROADCASTF128 addr:$src)>;
	}

	//===----------------------------------------------------------------------===//
	// VINSERTF128 - Insert packed floating-point values
	//
	// Both forms have empty pattern lists; hasSideEffects (and mayLoad for the
	// memory form) are therefore set by hand.
	let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
	// Register form: inserts VR128 $src2 into VR256 $src1 as directed by $src3.
	def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, VR128:$src2, u8imm:$src3),
	"vinsertf128\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	[]>, Sched<[WriteFShuffle]>, VEX_4V, VEX_L;
	// Memory form: the 128-bit operand comes from f128mem.
	let mayLoad = 1 in
	def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, f128mem:$src2, u8imm:$src3),
	"vinsertf128\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	[]>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L;
	}

	// To create a 256-bit all ones value, we should produce VCMPTRUEPS
	// with YMM register containing zero.
	// FIXME: Avoid producing vxorps to clear the fake inputs.
	// Applies only under HasAVX1Only. Both compare inputs are AVX_SET0 and the
	// compare immediate is 0xf.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
	}

	// vinsert_lowering - Selection patterns that map vinsert128_insert onto an
	// insert instruction.
	//   InstrStr   - instruction name prefix; "rr" / "rm" is appended and the
	//                result is looked up with !cast<Instruction>.
	//   From       - type of the 128-bit value being inserted.
	//   To         - type of the 256-bit vector being inserted into.
	//   memop_frag - load fragment matched (under a bitconvert) for the rm form.
	// The instruction immediate is computed from the matched node by
	// INSERT_get_vinsert128_imm.
	multiclass vinsert_lowering<string InstrStr, ValueType From, ValueType To,
	PatFrag memop_frag> {
	// Register operand.
	def : Pat<(vinsert128_insert:$ins (To VR256:$src1), (From VR128:$src2),
	(iPTR imm)),
	(!cast<Instruction>(InstrStr#rr) VR256:$src1, VR128:$src2,
	(INSERT_get_vinsert128_imm VR256:$ins))>;
	// Loaded operand, folded into the rm form.
	def : Pat<(vinsert128_insert:$ins (To VR256:$src1),
	(From (bitconvert (memop_frag addr:$src2))),
	(iPTR imm)),
	(!cast<Instruction>(InstrStr#rm) VR256:$src1, addr:$src2,
	(INSERT_get_vinsert128_imm VR256:$ins))>;
	}

	// Floating-point element types use VINSERTF128 whenever AVX is available
	// without VLX.
	let Predicates = [HasAVX, NoVLX] in {
	defm : vinsert_lowering<"VINSERTF128", v4f32, v8f32, loadv4f32>;
	defm : vinsert_lowering<"VINSERTF128", v2f64, v4f64, loadv2f64>;
	}

	// Integer element types use VINSERTF128 only under HasAVX1Only. All four
	// use loadv2i64 as the load fragment.
	let Predicates = [HasAVX1Only] in {
	defm : vinsert_lowering<"VINSERTF128", v2i64, v4i64, loadv2i64>;
	defm : vinsert_lowering<"VINSERTF128", v4i32, v8i32, loadv2i64>;
	defm : vinsert_lowering<"VINSERTF128", v8i16, v16i16, loadv2i64>;
	defm : vinsert_lowering<"VINSERTF128", v16i8, v32i8, loadv2i64>;
	}

	//===----------------------------------------------------------------------===//
	// VEXTRACTF128 - Extract packed floating-point values
	//
	// Both forms have empty pattern lists; hasSideEffects (and mayStore for the
	// memory form) are therefore set by hand.
	let hasSideEffects = 0, ExeDomain = SSEPackedSingle in {
	// Register destination form (MRMDestReg): VR256 source, VR128 result.
	def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
	(ins VR256:$src1, u8imm:$src2),
	"vextractf128\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[]>, Sched<[WriteFShuffle]>, VEX, VEX_L;
	// Memory destination form: no register outputs, $dst is an f128mem operand.
	let mayStore = 1 in
	def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
	(ins f128mem:$dst, VR256:$src1, u8imm:$src2),
	"vextractf128\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[]>, Sched<[WriteStore]>, VEX, VEX_L;
	}

	// vextract_lowering - Selection patterns that map vextract128_extract onto an
	// extract instruction.
	//   InstrStr - instruction name prefix; "rr" / "mr" is appended and the
	//              result is looked up with !cast<Instruction>.
	//   From     - type of the 256-bit source vector.
	//   To       - type of the 128-bit extracted value.
	// The instruction immediate is computed from the matched node by
	// EXTRACT_get_vextract128_imm.
	multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
	// Extract to a register.
	def : Pat<(vextract128_extract:$ext VR256:$src1, (iPTR imm)),
	(To (!cast<Instruction>(InstrStr#rr)
	(From VR256:$src1),
	(EXTRACT_get_vextract128_imm VR128:$ext)))>;
	// Extract followed by a store, folded into the mr form.
	def : Pat<(store (To (vextract128_extract:$ext (From VR256:$src1),
	(iPTR imm))), addr:$dst),
	(!cast<Instruction>(InstrStr#mr) addr:$dst, VR256:$src1,
	(EXTRACT_get_vextract128_imm VR128:$ext))>;
	}

	// AVX1 patterns
	// Floating-point element types use VEXTRACTF128 whenever AVX is available
	// without VLX.
	let Predicates = [HasAVX, NoVLX] in {
	defm : vextract_lowering<"VEXTRACTF128", v8f32, v4f32>;
	defm : vextract_lowering<"VEXTRACTF128", v4f64, v2f64>;
	}

	// Integer element types use VEXTRACTF128 only under HasAVX1Only.
	let Predicates = [HasAVX1Only] in {
	defm : vextract_lowering<"VEXTRACTF128", v4i64, v2i64>;
	defm : vextract_lowering<"VEXTRACTF128", v8i32, v4i32>;
	defm : vextract_lowering<"VEXTRACTF128", v16i16, v8i16>;
	defm : vextract_lowering<"VEXTRACTF128", v32i8, v16i8>;
	}

	//===----------------------------------------------------------------------===//
	// VMASKMOV - Conditional SIMD Packed Loads and Stores
	//
	// avx_movmask_rm - Defines the four forms of a masked move: 128-bit and
	// 256-bit loads (rm, Yrm) and stores (mr, Ymr).
	//   opc_rm    - opcode of the load forms.
	//   opc_mr    - opcode of the store forms.
	//   OpcodeStr - mnemonic shared by all four forms.
	//   IntLd / IntLd256 - intrinsics matched by the 128/256-bit load forms.
	//   IntSt / IntSt256 - intrinsics matched by the 128/256-bit store forms.
	// In every form $src1 is the register passed alongside the address to the
	// intrinsic; the store forms additionally take the stored value in $src2.
	// No Sched<> is attached to any of these defs.
	multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
	Intrinsic IntLd, Intrinsic IntLd256,
	Intrinsic IntSt, Intrinsic IntSt256> {
	def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, f128mem:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
	VEX_4V;
	def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, f256mem:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
	VEX_4V, VEX_L;
	def mr : AVX8I<opc_mr, MRMDestMem, (outs),
	(ins f128mem:$dst, VR128:$src1, VR128:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
	def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
	(ins f256mem:$dst, VR256:$src1, VR256:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
	}

	// Single-precision masked moves: load opcode 0x2C, store opcode 0x2E.
	let ExeDomain = SSEPackedSingle in
	defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
	int_x86_avx_maskload_ps,
	int_x86_avx_maskload_ps_256,
	int_x86_avx_maskstore_ps,
	int_x86_avx_maskstore_ps_256>;
	// Double-precision masked moves: load opcode 0x2D, store opcode 0x2F.
	let ExeDomain = SSEPackedDouble in
	defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
	int_x86_avx_maskload_pd,
	int_x86_avx_maskload_pd_256,
	int_x86_avx_maskstore_pd,
	int_x86_avx_maskstore_pd_256>;

	//===----------------------------------------------------------------------===//
	// VPERMIL - Permute Single and Double Floating-Point Values
	//
	// avx_permil - Defines the four forms of a vpermil instruction.
	//   opc_rm     - opcode of the variable-control forms (rr, rm).
	//   opc_rmi    - opcode of the immediate-control forms (ri, mi).
	//   OpcodeStr  - mnemonic.
	//   RC         - register class of source and destination.
	//   x86memop_f - memory operand type for the data operand of the mi form.
	//   x86memop_i - memory operand type for the control operand of the rm form.
	//   i_frag     - load fragment for the control operand of the rm form.
	//   f_vt       - floating-point vector type of the result.
	//   i_vt       - integer vector type of the control operand.
	// All forms require HasAVX and NoVLX.
	multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
	RegisterClass RC, X86MemOperand x86memop_f,
	X86MemOperand x86memop_i, PatFrag i_frag,
	ValueType f_vt, ValueType i_vt> {
	let Predicates = [HasAVX, NoVLX] in {
	// Control vector in a register.
	def rr : AVX8I<opc_rm, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1, (i_vt RC:$src2))))]>, VEX_4V,
	Sched<[WriteFShuffle]>;
	// Control vector loaded from memory.
	def rm : AVX8I<opc_rm, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop_i:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst, (f_vt (X86VPermilpv RC:$src1,
	(i_vt (bitconvert (i_frag addr:$src2))))))]>, VEX_4V,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;

	// Immediate control, data in a register.
	def ri : AVXAIi8<opc_rmi, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, u8imm:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst, (f_vt (X86VPermilpi RC:$src1, (i8 imm:$src2))))]>, VEX,
	Sched<[WriteFShuffle]>;
	// Immediate control, data loaded from memory (generic `load` fragment).
	def mi : AVXAIi8<opc_rmi, MRMSrcMem, (outs RC:$dst),
	(ins x86memop_f:$src1, u8imm:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set RC:$dst,
	(f_vt (X86VPermilpi (load addr:$src1), (i8 imm:$src2))))]>, VEX,
	Sched<[WriteFShuffleLd]>;
	}// Predicates = [HasAVX, NoVLX]
	}

	// Single-precision instantiations: opcodes 0x0C (variable) / 0x04 (immediate).
	// The Y variants operate on VR256 and add VEX_L.
	let ExeDomain = SSEPackedSingle in {
	defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
	loadv2i64, v4f32, v4i32>;
	defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
	loadv4i64, v8f32, v8i32>, VEX_L;
	}
	// Double-precision instantiations: opcodes 0x0D (variable) / 0x05 (immediate).
	let ExeDomain = SSEPackedDouble in {
	defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
	loadv2i64, v2f64, v2i64>;
	defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
	loadv4i64, v4f64, v4i64>, VEX_L;
	}

	//===----------------------------------------------------------------------===//
	// VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
	//
	// The instruction patterns here cover only v8f32; other element types are
	// handled by separate Pat entries.
	let ExeDomain = SSEPackedSingle in {
	// The brace-less `let` applies only to the rr def that follows it.
	let isCommutable = 1 in
	def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, VR256:$src2, u8imm:$src3),
	"vperm2f128\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	[(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
	(i8 imm:$src3))))]>, VEX_4V, VEX_L,
	Sched<[WriteFShuffle]>;
	// Memory form: $src2 is read via loadv8f32.
	def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
	"vperm2f128\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}",
	[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
	(i8 imm:$src3)))]>, VEX_4V, VEX_L,
	Sched<[WriteFShuffleLd, ReadAfterLd]>;
	}

	// v4f64 variants of X86VPerm2x128 reuse the VPERM2F128 instructions.
	let Predicates = [HasAVX] in {
	def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
	(loadv4f64 addr:$src2), (i8 imm:$imm))),
	(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
	}

	// Under HasAVX1Only the integer element types also map to VPERM2F128.
	let Predicates = [HasAVX1Only] in {
	// Register forms.
	def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;

	// Memory forms: the load is always loadv4i64, bitcast to the element type
	// where that type is not v4i64.
	def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
	(bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
	(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
	(loadv4i64 addr:$src2), (i8 imm:$imm))),
	(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
	(bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
	(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
	(bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
	(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
	}

	//===----------------------------------------------------------------------===//
	// VZERO - Zero YMM registers
	//
	// Note, these instructions do not affect YMM16-YMM31.
	// Both instructions are modelled as implicitly defining YMM0-YMM15. They take
	// no explicit operands and share opcode 0x77; VZEROALL additionally carries
	// VEX_L, VZEROUPPER does not.
	let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
	YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
	// Zero All YMM registers
	def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
	[(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;

	// Zero Upper bits of YMM registers
	def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
	[(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG;
	}

	//===----------------------------------------------------------------------===//
	// Half precision conversion instructions
	//===----------------------------------------------------------------------===//
	// f16c_ph2ps - Defines the register and memory forms of vcvtph2ps.
	//   RC       - destination register class.
	//   x86memop - memory operand type of the source for the rm form.
	//   Int      - intrinsic matched by the rr form.
	multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
	// Register form: the source is always VR128.
	def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
	"vcvtph2ps\t{$src, $dst\|$dst, $src}",
	[(set RC:$dst, (Int VR128:$src))]>,
	T8PD, VEX, Sched<[WriteCvtF2F]>;
	// Memory form: empty pattern list, so mayLoad/hasSideEffects are explicit.
	let hasSideEffects = 0, mayLoad = 1 in
	def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
	"vcvtph2ps\t{$src, $dst\|$dst, $src}", []>, T8PD, VEX,
	Sched<[WriteCvtF2FLd]>;
	}

	// f16c_ps2ph - Defines the register and memory-destination forms of
	// vcvtps2ph.
	//   RC       - source register class.
	//   x86memop - memory operand type of the destination for the mr form.
	//   Int      - intrinsic matched by the rr form.
	// Both forms take an immediate in $src2 (operand type i32u8imm).
	multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
	// Register form: the destination is always VR128.
	def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
	(ins RC:$src1, i32u8imm:$src2),
	"vcvtps2ph\t{$src2, $src1, $dst\|$dst, $src1, $src2}",
	[(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
	TAPD, VEX, Sched<[WriteCvtF2F]>;
	// Memory-destination form: empty pattern list, so mayStore/hasSideEffects
	// are explicit.
	// NOTE(review): SchedRW lists WriteCvtF2FLd (a load-folded class) plus
	// WriteRMW for what is a store form - confirm this is the intended modelling.
	let hasSideEffects = 0, mayStore = 1,
	SchedRW = [WriteCvtF2FLd, WriteRMW] in
	def mr : Ii8<0x1D, MRMDestMem, (outs),
	(ins x86memop:$dst, RC:$src1, i32u8imm:$src2),
	"vcvtps2ph\t{$src2, $src1, $dst\|$dst, $src1, $src2}", []>,
	TAPD, VEX;
	}

	// F16C instruction instantiations and the patterns that fold loads/stores
	// into them.
	let Predicates = [HasF16C] in {
	// The 128-bit variants use an f64mem operand, the 256-bit (Y, VEX_L)
	// variants an f128mem operand.
	defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
	defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
	defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
	defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;

	// Pattern match vcvtph2ps of a scalar i64 load.
	// Three spellings of the loaded operand are accepted: vzmovl_v2i64,
	// vzload_v2i64, and a bitconverted scalar_to_vector of loadi64.
	def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
	(VCVTPH2PSrm addr:$src)>;
	def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
	(VCVTPH2PSrm addr:$src)>;
	def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
	(v2i64 (scalar_to_vector (loadi64 addr:$src))))),
	(VCVTPH2PSrm addr:$src)>;

	// Fold a store of element 0 of the 128-bit conversion result (viewed as
	// f64 or as i64) into the memory-destination form.
	def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
	(int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
	addr:$dst),
	(VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
	def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
	(int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
	addr:$dst),
	(VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
	// Fold a store of the full v8i16 result of the 256-bit conversion.
	def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
	addr:$dst),
	(VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
	}

	// Patterns for matching conversions from float to half-float and vice versa.
	let Predicates = [HasF16C, NoVLX] in {
	// Use MXCSR.RC for rounding instead of explicitly specifying the default
	// rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
	// configurations we support (the default). However, falling back to MXCSR is
	// more consistent with other instructions, which are always controlled by it.
	// It's encoded as 0b100.
	// float -> half: convert in a VR128, move the result to a GPR with
	// VMOVPDI2DIrr, then take its 16-bit subregister. The literal 4 is the
	// rounding immediate described above.
	def : Pat<(fp_to_f16 FR32:$src),
	(i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (VCVTPS2PHrr
	(COPY_TO_REGCLASS FR32:$src, VR128), 4)), sub_16bit))>;

	// half -> float: the GR16 value is widened with MOVSX32rr16, copied into
	// VR128, converted, and the result copied to FR32.
	def : Pat<(f16_to_fp GR16:$src),
	(f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
	(COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)), FR32)) >;

	// float -> half -> float round trip: both conversions stay in VR128, with
	// no GPR move in between.
	def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
	(f32 (COPY_TO_REGCLASS (VCVTPH2PSrr
	(VCVTPS2PHrr (COPY_TO_REGCLASS FR32:$src, VR128), 4)), FR32)) >;
	}

	//===----------------------------------------------------------------------===//
	// AVX2 Instructions
	//===----------------------------------------------------------------------===//

	/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
	///   opc        - opcode byte.
	///   OpcodeStr  - mnemonic.
	///   OpNode     - SDNode matched by both forms.
	///   OpVT       - vector type of the result.
	///   RC         - register class of the operands and result.
	///   memop_frag - load fragment matched (under a bitconvert) by the rmi form.
	///   x86memop   - memory operand type of $src2 in the rmi form.
	multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
	X86MemOperand x86memop> {
	// The brace-less `let` applies only to the rri def that follows it.
	let isCommutable = 1 in
	def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
	(ins RC:$src1, RC:$src2, u8imm:$src3),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))]>,
	Sched<[WriteBlend]>, VEX_4V;
	// Memory form: $src2 is loaded and bitconverted before being matched.
	def rmi : AVX2AIi8<opc, MRMSrcMem, (outs RC:$dst),
	(ins RC:$src1, x86memop:$src2, u8imm:$src3),
	!strconcat(OpcodeStr,
	"\t{$src3, $src2, $src1, $dst\|$dst, $src1, $src2, $src3}"),
	[(set RC:$dst,
	(OpVT (OpNode RC:$src1,
	(bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
	Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
	}

	// 128-bit (v4i32) and 256-bit (v8i32, VEX_L) instantiations of vpblendd,
	// both matching X86Blendi.
	defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
	VR128, loadv2i64, i128mem>;
	defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
	VR256, loadv4i64, i256mem>, VEX_L;

	// For insertion into the zero index (low half) of a 256-bit vector, it is
	// more efficient to generate a blend with immediate instead of an insert*128.
	// Every pattern below widens $src2 into an undefined 256-bit register with
	// INSERT_SUBREG/IMPLICIT_DEF (always typed v8i32, whatever the element type
	// being matched) and blends it into $src1 with immediate 0xf.
	// NOTE(review): 0xf presumably selects the low four dword lanes from the
	// widened $src2 - confirm against the blend immediate semantics.
	let Predicates = [HasAVX2] in {
	def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
	(VPBLENDDYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
	(VPBLENDDYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
	(VPBLENDDYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
	(VPBLENDDYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	}

	// AVX1-only counterpart of the integer insert_subvector-at-index-0 patterns:
	// same widening via INSERT_SUBREG/IMPLICIT_DEF and same 0xf immediate, but
	// the blend instruction is VBLENDPSYrri.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
	(VBLENDPSYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
	(VBLENDPSYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
	(VBLENDPSYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
	(VBLENDPSYrri VR256:$src1,
	(INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	VR128:$src2, sub_xmm), 0xf)>;
	}

	//===----------------------------------------------------------------------===//
	// VPBROADCAST - Load from memory and broadcast to all elements of the
	// destination operand
	//
	// avx2_broadcast - Defines the rr/rm (VR128 result) and Yrr/Yrm (VR256
	// result) forms of an integer broadcast, plus one helper pattern.
	//   opc       - opcode byte shared by all four forms.
	//   OpcodeStr - mnemonic.
	//   x86memop  - memory operand type of the scalar source.
	//   ld_frag   - load fragment for the memory forms.
	//   OpVT128   - 128-bit vector type (result of rr/rm, source of rr/Yrr).
	//   OpVT256   - 256-bit vector type (result of Yrr/Yrm).
	//   prd       - extra predicate combined with HasAVX2.
	multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
	X86MemOperand x86memop, PatFrag ld_frag,
	ValueType OpVT128, ValueType OpVT256, Predicate prd> {
	let Predicates = [HasAVX2, prd] in {
	def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst,
	(OpVT128 (X86VBroadcast (OpVT128 VR128:$src))))]>,
	Sched<[WriteShuffle]>, VEX;
	def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR128:$dst,
	(OpVT128 (X86VBroadcast (ld_frag addr:$src))))]>,
	Sched<[WriteLoad]>, VEX;
	def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst,
	(OpVT256 (X86VBroadcast (OpVT128 VR128:$src))))]>,
	Sched<[WriteShuffle256]>, VEX, VEX_L;
	def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
	!strconcat(OpcodeStr, "\t{$src, $dst\|$dst, $src}"),
	[(set VR256:$dst,
	(OpVT256 (X86VBroadcast (ld_frag addr:$src))))]>,
	Sched<[WriteLoad]>, VEX, VEX_L;

	// Provide a pattern for broadcasting from a source in the same (256-bit)
	// register class: the low xmm subregister is extracted and fed to Yrr.
	def : Pat<(OpVT256 (X86VBroadcast (OpVT256 VR256:$src))),
	(!cast<Instruction>(NAME#"Yrr")
	(OpVT128 (EXTRACT_SUBREG (OpVT256 VR256:$src),sub_xmm)))>;
	}
	}

	// Byte and word broadcasts are gated on NoVLX_Or_NoBWI; dword and qword
	// broadcasts on NoVLX.
	defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
	v16i8, v32i8, NoVLX_Or_NoBWI>;
	defm VPBROADCASTW : avx2_broadcast<0x79, "vpbroadcastw", i16mem, loadi16,
	v8i16, v16i16, NoVLX_Or_NoBWI>;
	defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
	v4i32, v8i32, NoVLX>;
	defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
	v2i64, v4i64, NoVLX>;

	// Extra load-folding patterns for the qword and word broadcasts.
	// NOTE(review): the VPBROADCASTQ instructions are instantiated with NoVLX,
	// while these patterns are gated on NoVLX_Or_NoBWI - confirm the mismatch is
	// intended.
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
	def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
	(VPBROADCASTQrm addr:$src)>;
	def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
	(VPBROADCASTQYrm addr:$src)>;
	// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
	// This means we'll encounter truncated i32 loads; match that here.
	def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
	(VPBROADCASTWrm addr:$src)>;
	def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
	(VPBROADCASTWYrm addr:$src)>;
	// Same as above, but for a truncated zero-extending 16-bit load.
	def : Pat<(v8i16 (X86VBroadcast
	(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
	(VPBROADCASTWrm addr:$src)>;
	def : Pat<(v16i16 (X86VBroadcast
	(i16 (trunc (i32 (zextloadi16 addr:$src)))))),
	(VPBROADCASTWYrm addr:$src)>;
	}

	let Predicates = [HasAVX2, NoVLX] in {
	// Provide patterns for broadcasting from a 256-bit source register: the low
	// xmm subregister is extracted and passed to the VR128-source instruction.
	def : Pat<(v8f32 (X86VBroadcast (v8f32 VR256:$src))),
	(VBROADCASTSSYrr (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src),
	sub_xmm)))>;
	def : Pat<(v4f64 (X86VBroadcast (v4f64 VR256:$src))),
	(VBROADCASTSDYrr (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src),
	sub_xmm)))>;
	}

	let Predicates = [HasAVX2, NoVLX] in {
	// Provide fallback in case the load node that is used in the patterns above
	// is used by additional users, which prevents the pattern selection.
	// The scalar FR32/FR64 value is copied into VR128 and broadcast with the
	// register-source instruction.
	def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
	(VBROADCASTSSrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
	def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
	(VBROADCASTSSYrr (COPY_TO_REGCLASS FR32:$src, VR128))>;
	def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
	(VBROADCASTSDYrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
	}

	// Broadcasts from 8-bit and 16-bit general-purpose registers. The narrow
	// register is first placed into an undefined i32 via INSERT_SUBREG, and that
	// i32 is then copied into VR128 for the register-source broadcast.
	let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
	def : Pat<(v16i8 (X86VBroadcast GR8:$src)),
	(VPBROADCASTBrr (COPY_TO_REGCLASS
	(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	GR8:$src, sub_8bit)),
	VR128))>;
	def : Pat<(v32i8 (X86VBroadcast GR8:$src)),
	(VPBROADCASTBYrr (COPY_TO_REGCLASS
	(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	GR8:$src, sub_8bit)),
	VR128))>;

	def : Pat<(v8i16 (X86VBroadcast GR16:$src)),
	(VPBROADCASTWrr (COPY_TO_REGCLASS
	(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	GR16:$src, sub_16bit)),
	VR128))>;
	def : Pat<(v16i16 (X86VBroadcast GR16:$src)),
	(VPBROADCASTWYrr (COPY_TO_REGCLASS
	(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
	GR16:$src, sub_16bit)),
	VR128))>;
	}
	// Broadcasts from 32-bit and 64-bit general-purpose registers: the GPR is
	// copied straight into VR128, with no INSERT_SUBREG step.
	let Predicates = [HasAVX2, NoVLX] in {
	def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
	(VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
	def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
	(VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
	def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
	(VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
	def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
	(VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
	}

	// AVX1 broadcast patterns
	// Under HasAVX1Only, integer broadcasts from a scalar load are selected to
	// the floating-point memory-source broadcast instructions.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
	(VBROADCASTSSYrm addr:$src)>;
	def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))),
	(VBROADCASTSDYrm addr:$src)>;
	def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
	(VBROADCASTSSrm addr:$src)>;
	}

	// Provide fallback in case the load node that is used in the patterns above
	// is used by additional users, which prevents the pattern selection.
	let Predicates = [HasAVX, NoVLX] in {
	// 128bit broadcasts:
	// A v2f64 broadcast of a scalar f64 is selected to VMOVDDUPrr on the value
	// copied into VR128.
	def : Pat<(v2f64 (X86VBroadcast f64:$src)),
	(VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
	}

	// Register-source broadcast fallbacks for HasAVX1Only. 128-bit results are a
	// single shuffle of the scalar copied into VR128. 256-bit results are built
	// with VINSERTF128rr (immediate 1) from two copies of that same 128-bit
	// shuffle, one placed via INSERT_SUBREG into sub_xmm and one inserted.
	let Predicates = [HasAVX1Only] in {
	// Floating-point sources: VPERMILPSri with immediate 0 for f32, VMOVDDUPrr
	// for f64.
	def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
	(VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
	def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
	(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
	(VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
	(VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
	def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
	(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
	(VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm),
	(VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>;

	// Integer sources: VPSHUFDri with immediate 0 for 32-bit elements and 0x44
	// for 64-bit elements.
	def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
	(VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
	def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
	(VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
	(VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), sub_xmm),
	(VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0), 1)>;
	def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
	(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)),
	(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), sub_xmm),
	(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;

	// Note: the source is written as i64:$src here but GR64:$src in the result.
	def : Pat<(v2i64 (X86VBroadcast i64:$src)),
	(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
	}

	//===----------------------------------------------------------------------===//
	// VPERM - Permute instructions
	//

	// avx2_perm - Defines the 256-bit register (Yrr) and memory (Yrm) forms of a
	// two-source permute matched by X86VPermv.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic.
	//   mem_frag  - load fragment matched (under a bitconvert) by the Yrm form.
	//   OpVT      - vector type of the result.
	//   Sched     - foldable scheduling class; Yrr uses it directly, Yrm uses
	//               its .Folded member together with ReadAfterLd.
	//   memOp     - memory operand type of $src2 in the Yrm form.
	multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
	ValueType OpVT, X86FoldableSchedWrite Sched,
	X86MemOperand memOp> {
	let Predicates = [HasAVX2, NoVLX] in {
	def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, VR256:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set VR256:$dst,
	(OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
	Sched<[Sched]>, VEX_4V, VEX_L;
	def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, memOp:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst\|$dst, $src1, $src2}"),
	[(set VR256:$dst,
	(OpVT (X86VPermv VR256:$src1,
	(bitconvert (mem_frag addr:$src2)))))]>,
	Sched<[Sched.Folded, ReadAfterLd]>, VEX_4V, VEX_L;
	}
	}

	// Variable permutes: VPERMD (opcode 0x36, v8i32 result) and VPERMPS
	// (opcode 0x16, v8f32 result, single-precision execution domain).
	defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256,
	i256mem>;
	let ExeDomain = SSEPackedSingle in
	defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256,
	f256mem>;

	// avx2_perm_imm - 256-bit permute controlled by an 8-bit immediate, selected
	// from X86VPermi. Parameters have the same meaning as in avx2_perm.
	// Emits Yri (register source) and Ymi (memory source); both are only
	// available with AVX2 and without VLX.
	multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
	ValueType OpVT, X86FoldableSchedWrite Sched,
	X86MemOperand memOp> {
	let Predicates = [HasAVX2, NoVLX] in {
	def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR256:$dst,
	(OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
	Sched<[Sched]>, VEX, VEX_L;
	def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
	(ins memOp:$src1, u8imm:$src2),
	!strconcat(OpcodeStr,
	"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR256:$dst,
	(OpVT (X86VPermi (mem_frag addr:$src1),
	(i8 imm:$src2))))]>,
	Sched<[Sched.Folded, ReadAfterLd]>, VEX, VEX_L;
	}
	}

	// Immediate permutes of 64-bit elements: VPERMQ (opcode 0x00, v4i64) and
	// VPERMPD (opcode 0x01, v4f64, double-precision execution domain). Both
	// carry VEX_W.
	defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
	WriteShuffle256, i256mem>, VEX_W;
	let ExeDomain = SSEPackedDouble in
	defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
	WriteFShuffle256, f256mem>, VEX_W;

	//===----------------------------------------------------------------------===//
	// VPERM2I128 - Permute Integer Values in 128-bit chunks
	//
	// Register form: selected for v4i64 X86VPerm2x128 and marked commutable.
	let isCommutable = 1 in
	def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, VR256:$src2, u8imm:$src3),
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
	[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
	(i8 imm:$src3))))]>, Sched<[WriteShuffle256]>,
	VEX_4V, VEX_L;
	// Memory form: the second source is a loadv4i64 folded into the instruction.
	def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
	"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
	[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2),
	(i8 imm:$src3)))]>,
	Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;

	// The instruction definitions only carry the v4i64 pattern; map the other
	// 256-bit integer element types (v8i32, v32i8, v16i16) onto the same
	// instructions.
	let Predicates = [HasAVX2] in {
	// Register/register forms.
	def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
	def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
	(VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;

	// Register/memory forms: the load is matched as loadv4i64 bitcast to the
	// element type of the result.
	def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
	(i8 imm:$imm))),
	(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
	(bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
	(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
	def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
	(i8 imm:$imm))),
	(VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
	}


	//===----------------------------------------------------------------------===//
	// VINSERTI128 - Insert packed integer values
	//
	// Both forms have empty selection patterns here (selection comes from the
	// vinsert_lowering instantiations), so side-effect and load properties are
	// stated explicitly.
	let hasSideEffects = 0 in {
	def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, VR128:$src2, u8imm:$src3),
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
	[]>, Sched<[WriteShuffle256]>, VEX_4V, VEX_L;
	let mayLoad = 1 in
	def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, i128mem:$src2, u8imm:$src3),
	"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
	[]>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
	}

	// Selection patterns for VINSERTI128, one instantiation per 128-bit/256-bit
	// integer vector type pair; the load fragment is loadv2i64 in every case.
	let Predicates = [HasAVX2, NoVLX] in {
	defm : vinsert_lowering<"VINSERTI128", v2i64, v4i64, loadv2i64>;
	defm : vinsert_lowering<"VINSERTI128", v4i32, v8i32, loadv2i64>;
	defm : vinsert_lowering<"VINSERTI128", v8i16, v16i16, loadv2i64>;
	defm : vinsert_lowering<"VINSERTI128", v16i8, v32i8, loadv2i64>;
	}

	//===----------------------------------------------------------------------===//
	// VEXTRACTI128 - Extract packed integer values
	//
	// Register destination form; no selection pattern is attached here
	// (selection comes from the vextract_lowering instantiations).
	def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
	(ins VR256:$src1, u8imm:$src2),
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
	Sched<[WriteShuffle256]>, VEX, VEX_L;
	// Memory destination form; with an empty pattern the store property must be
	// declared explicitly.
	let hasSideEffects = 0, mayStore = 1 in
	def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
	(ins i128mem:$dst, VR256:$src1, u8imm:$src2),
	"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
	Sched<[WriteStore]>, VEX, VEX_L;

	// Selection patterns for VEXTRACTI128, one instantiation per 256-bit/128-bit
	// integer vector type pair.
	let Predicates = [HasAVX2, NoVLX] in {
	defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
	defm : vextract_lowering<"VEXTRACTI128", v8i32, v4i32>;
	defm : vextract_lowering<"VEXTRACTI128", v16i16, v8i16>;
	defm : vextract_lowering<"VEXTRACTI128", v32i8, v16i8>;
	}

	//===----------------------------------------------------------------------===//
	// VPMASKMOV - Conditional SIMD Integer Packed Loads and Stores
	//
	// avx2_pmovmask - masked integer load/store instructions selected from
	// intrinsics.
	//   OpcodeStr          - mnemonic; the operand list is appended to it.
	//   IntLd128/IntLd256  - intrinsics matched by the 128/256-bit loads.
	//   IntSt128/IntSt256  - intrinsics matched by the 128/256-bit stores.
	// Emits rm / Yrm (loads, opcode 0x8c) and mr / Ymr (stores, opcode 0x8e).
	// In every form $src1 is the mask operand passed to the intrinsic.
	multiclass avx2_pmovmask<string OpcodeStr,
	Intrinsic IntLd128, Intrinsic IntLd256,
	Intrinsic IntSt128, Intrinsic IntSt256> {
	def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2),
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
	def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, i256mem:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
	VEX_4V, VEX_L;
	def mr : AVX28I<0x8e, MRMDestMem, (outs),
	(ins i128mem:$dst, VR128:$src1, VR128:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
	def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
	(ins i256mem:$dst, VR256:$src1, VR256:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
	}

	// Instantiate the masked moves for 32-bit (VPMASKMOVD) and 64-bit
	// (VPMASKMOVQ, VEX_W) elements, bound to the int_x86_avx2_maskload_* and
	// int_x86_avx2_maskstore_* intrinsics.
	defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
	int_x86_avx2_maskload_d,
	int_x86_avx2_maskload_d_256,
	int_x86_avx2_maskstore_d,
	int_x86_avx2_maskstore_d_256>;
	defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
	int_x86_avx2_maskload_q,
	int_x86_avx2_maskload_q_256,
	int_x86_avx2_maskstore_q,
	int_x86_avx2_maskstore_q_256>, VEX_W;

	// maskmov_lowering - map the X86mstore / X86mload nodes onto masked move
	// instructions.
	//   InstrStr - instruction name prefix; "mr" (store) or "rm" (load) is appended.
	//   RC       - register class of the data and mask operands.
	//   VT       - value type being loaded or stored.
	//   MaskVT   - value type of the mask.
	//   BlendStr - blend instruction name prefix; "rr" is appended.
	//   ZeroVT   - type of the all-zeros vector matched as a pass-through value.
	multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
	ValueType MaskVT, string BlendStr, ValueType ZeroVT> {
	// masked store
	def: Pat<(X86mstore addr:$ptr, (MaskVT RC:$mask), (VT RC:$src)),
	(!cast<Instruction>(InstrStr#"mr") addr:$ptr, RC:$mask, RC:$src)>;
	// masked load
	// Pass-through value is undef: the load instruction alone is used.
	def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), undef)),
	(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
	// Pass-through value is all zeros: also mapped to the load instruction alone.
	def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask),
	(VT (bitconvert (ZeroVT immAllZerosV))))),
	(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr)>;
	// Arbitrary pass-through value: load, then blend the loaded value with
	// $src0 using the same mask.
	def: Pat<(VT (X86mload addr:$ptr, (MaskVT RC:$mask), (VT RC:$src0))),
	(!cast<Instruction>(BlendStr#"rr")
	RC:$src0,
	(!cast<Instruction>(InstrStr#"rm") RC:$mask, addr:$ptr),
	RC:$mask)>;
	}
	// Floating-point masked loads/stores use VMASKMOVPS/PD whenever AVX is
	// available.
	let Predicates = [HasAVX] in {
	defm : maskmov_lowering<"VMASKMOVPS", VR128, v4f32, v4i32, "VBLENDVPS", v4i32>;
	defm : maskmov_lowering<"VMASKMOVPD", VR128, v2f64, v2i64, "VBLENDVPD", v4i32>;
	defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8f32, v8i32, "VBLENDVPSY", v8i32>;
	defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4f64, v4i64, "VBLENDVPDY", v8i32>;
	}
	let Predicates = [HasAVX1Only] in {
	// Masked load/store of i32/i64 vectors is not supported here; use the
	// ps/pd versions instead.
	defm : maskmov_lowering<"VMASKMOVPSY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
	defm : maskmov_lowering<"VMASKMOVPDY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
	defm : maskmov_lowering<"VMASKMOVPS", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
	defm : maskmov_lowering<"VMASKMOVPD", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
	}
	// With AVX2 the integer VPMASKMOVD/Q instructions are used for integer
	// vectors; the blend for a non-trivial pass-through is still VBLENDVPS/PD.
	let Predicates = [HasAVX2] in {
	defm : maskmov_lowering<"VPMASKMOVDY", VR256, v8i32, v8i32, "VBLENDVPSY", v8i32>;
	defm : maskmov_lowering<"VPMASKMOVQY", VR256, v4i64, v4i64, "VBLENDVPDY", v8i32>;
	defm : maskmov_lowering<"VPMASKMOVD", VR128, v4i32, v4i32, "VBLENDVPS", v4i32>;
	defm : maskmov_lowering<"VPMASKMOVQ", VR128, v2i64, v2i64, "VBLENDVPD", v4i32>;
	}

	//===----------------------------------------------------------------------===//
	// SubVector Broadcasts
	// Provide fallback in case the load node that is used in the patterns above
	// is used by additional users, which prevents the pattern selection.

	// Integer subvector broadcast from a register with AVX2: the 128-bit source
	// is placed in the low half with INSERT_SUBREG and inserted again with
	// VINSERTI128 (immediate 1).
	let Predicates = [HasAVX2, NoVLX] in {
	def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
	(VINSERTI128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v2i64 VR128:$src), 1)>;
	def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
	(VINSERTI128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v4i32 VR128:$src), 1)>;
	def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
	(VINSERTI128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v8i16 VR128:$src), 1)>;
	def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
	(VINSERTI128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v16i8 VR128:$src), 1)>;
	}

	// Floating-point subvector broadcast from a register: same construction as
	// the integer case, using VINSERTF128.
	let Predicates = [HasAVX, NoVLX] in {
	def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128:$src))),
	(VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v2f64 VR128:$src), 1)>;
	def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128:$src))),
	(VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v4f32 VR128:$src), 1)>;
	}

	// Integer subvector broadcast when only AVX1 is available: VINSERTF128 is
	// used for the integer vector types as well.
	let Predicates = [HasAVX1Only] in {
	def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128:$src))),
	(VINSERTF128rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v2i64 VR128:$src), 1)>;
	def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128:$src))),
	(VINSERTF128rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v4i32 VR128:$src), 1)>;
	def : Pat<(v16i16 (X86SubVBroadcast (v8i16 VR128:$src))),
	(VINSERTF128rr (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v8i16 VR128:$src), 1)>;
	def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128:$src))),
	(VINSERTF128rr (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm),
	(v16i8 VR128:$src), 1)>;
	}

	//===----------------------------------------------------------------------===//
	// Variable Bit Shifts
	//
	// avx2_var_shift - per-element variable shift, 128-bit and 256-bit forms.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic; the operand list is appended to it.
	//   OpNode    - DAG node selected by the instruction.
	//   vt128     - value type of the 128-bit forms.
	//   vt256     - value type of the 256-bit forms.
	// Emits rr / rm (128-bit) and Yrr / Yrm (256-bit). The memory forms match a
	// loadv2i64 / loadv4i64 bitcast to the operand type.
	multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
	ValueType vt128, ValueType vt256> {
	def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst),
	(ins VR128:$src1, VR128:$src2),
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR128:$dst,
	(vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>,
	VEX_4V, Sched<[WriteVarVecShift]>;
	def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst),
	(ins VR128:$src1, i128mem:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR128:$dst,
	(vt128 (OpNode VR128:$src1,
	(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
	VEX_4V, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
	def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
	(ins VR256:$src1, VR256:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR256:$dst,
	(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
	VEX_4V, VEX_L, Sched<[WriteVarVecShift]>;
	def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
	(ins VR256:$src1, i256mem:$src2),
	!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
	[(set VR256:$dst,
	(vt256 (OpNode VR256:$src1,
	(vt256 (bitconvert (loadv4i64 addr:$src2))))))]>,
	VEX_4V, VEX_L, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
	}

	// Instantiate the variable shifts for the generic shl / srl / sra nodes.
	// Only a 32-bit element form of the arithmetic right shift (VPSRAVD) is
	// instantiated here.
	let Predicates = [HasAVX2, NoVLX] in {
	defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
	defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W;
	defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>;
	defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W;
	defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>;

	// The target-specific X86vsrav node is selected to the same VPSRAVD
	// instructions as the generic sra node above.
	def : Pat<(v4i32 (X86vsrav VR128:$src1, VR128:$src2)),
	(VPSRAVDrr VR128:$src1, VR128:$src2)>;
	def : Pat<(v4i32 (X86vsrav VR128:$src1,
	(bitconvert (loadv2i64 addr:$src2)))),
	(VPSRAVDrm VR128:$src1, addr:$src2)>;
	def : Pat<(v8i32 (X86vsrav VR256:$src1, VR256:$src2)),
	(VPSRAVDYrr VR256:$src1, VR256:$src2)>;
	def : Pat<(v8i32 (X86vsrav VR256:$src1,
	(bitconvert (loadv4i64 addr:$src2)))),
	(VPSRAVDYrm VR256:$src1, addr:$src2)>;
	}



	//===----------------------------------------------------------------------===//
	// VGATHER - GATHER Operations
	// avx2_gather - gather instruction definitions without selection patterns.
	//   opc       - opcode byte.
	//   OpcodeStr - mnemonic; the operand list is appended to it.
	//   RC256     - register class used by the Yrm form for data and mask.
	//   memop128  - memory operand class of the rm form.
	//   memop256  - memory operand class of the Yrm form.
	// Each form has two outputs ($dst and $mask_wb); the users of this
	// multiclass tie them to $src1 and $mask through Constraints.
	multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
	X86MemOperand memop128, X86MemOperand memop256> {
	def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
	(ins VR128:$src1, memop128:$src2, VR128:$mask),
	!strconcat(OpcodeStr,
	// "{A|B}" separates the two assembler-syntax variants with a bare '|'.
	"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
	[]>, VEX;
	def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
	(ins RC256:$src1, memop256:$src2, RC256:$mask),
	!strconcat(OpcodeStr,
	"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
	[]>, VEX, VEX_L;
	}

	// Instantiate the gathers. The definitions have no patterns, so mayLoad and
	// hasSideEffects are given explicitly. The constraints tie $src1 to $dst and
	// $mask to $mask_wb and mark both outputs as early-clobber.
	let mayLoad = 1, hasSideEffects = 0, Constraints
	= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
	in {
	// Integer gathers. The Q-index/D-data forms use VR128 for the "256-bit"
	// variant as well.
	defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
	defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
	defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
	defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;

	// Double-precision gathers.
	let ExeDomain = SSEPackedDouble in {
	defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
	defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
	}

	// Single-precision gathers.
	let ExeDomain = SSEPackedSingle in {
	defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
	defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
	}
	}

	//===----------------------------------------------------------------------===//
	// Extra selection patterns for FR128, f128, f128mem

	// f128 values live in FR128; every pattern below copies them to VR128, uses
	// a packed-single instruction, and (where a value is produced) copies the
	// result back to FR128.

	// movaps is shorter than movdqa. movaps is in SSE and movdqa is in SSE2.
	def : Pat<(store (f128 FR128:$src), addr:$dst),
	(MOVAPSmr addr:$dst, (COPY_TO_REGCLASS (f128 FR128:$src), VR128))>;

	def : Pat<(loadf128 addr:$src),
	(COPY_TO_REGCLASS (MOVAPSrm addr:$src), FR128)>;

	// andps is shorter than andpd or pand. andps is SSE and andpd/pand are in SSE2
	// AND: X86fand with a folded load, X86fand on registers, and the generic
	// 'and' node.
	def : Pat<(X86fand FR128:$src1, (loadf128 addr:$src2)),
	(COPY_TO_REGCLASS
	(ANDPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
	FR128)>;

	def : Pat<(X86fand FR128:$src1, FR128:$src2),
	(COPY_TO_REGCLASS
	(ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
	(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

	def : Pat<(and FR128:$src1, FR128:$src2),
	(COPY_TO_REGCLASS
	(ANDPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
	(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

	// OR: same three shapes, using ORPS.
	def : Pat<(X86for FR128:$src1, (loadf128 addr:$src2)),
	(COPY_TO_REGCLASS
	(ORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
	FR128)>;

	def : Pat<(X86for FR128:$src1, FR128:$src2),
	(COPY_TO_REGCLASS
	(ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
	(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

	def : Pat<(or FR128:$src1, FR128:$src2),
	(COPY_TO_REGCLASS
	(ORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
	(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

	// XOR: same three shapes, using XORPS.
	def : Pat<(X86fxor FR128:$src1, (loadf128 addr:$src2)),
	(COPY_TO_REGCLASS
	(XORPSrm (COPY_TO_REGCLASS FR128:$src1, VR128), f128mem:$src2),
	FR128)>;

	def : Pat<(X86fxor FR128:$src1, FR128:$src2),
	(COPY_TO_REGCLASS
	(XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
	(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;

	def : Pat<(xor FR128:$src1, FR128:$src2),
	(COPY_TO_REGCLASS
	(XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
	(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
	Index: head/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp (revision 322320)
	@@ -1,1060 +1,1060 @@
	//===-- ArgumentPromotion.cpp - Promote by-reference arguments ------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This pass promotes "by reference" arguments to be "by value" arguments. In
	// practice, this means looking for internal functions that have pointer
	// arguments. If it can prove, through the use of alias analysis, that an
	// argument is only loaded, then it can pass the value into the function
	// instead of the address of the value. This can cause recursive simplification
	// of code and lead to the elimination of allocas (especially in C++ template
	// code like the STL).
	//
	// This pass also handles aggregate arguments that are passed into a function,
	// scalarizing them if the elements of the aggregate are only loaded. Note that
	// by default it refuses to scalarize aggregates which would require passing in
	// more than three operands to the function, because passing thousands of
	// operands for a large array or structure is unprofitable! This limit can be
	// configured or disabled, however.
	//
	// Note that this transformation could also be done for arguments that are only
	// stored to (returning the value instead), but does not currently. This case
	// would be best handled when and if LLVM begins supporting multiple return
	// values from functions.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/IPO/ArgumentPromotion.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/Optional.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/Analysis/AliasAnalysis.h"
	#include "llvm/Analysis/AssumptionCache.h"
	#include "llvm/Analysis/BasicAliasAnalysis.h"
	#include "llvm/Analysis/CallGraph.h"
	#include "llvm/Analysis/CallGraphSCCPass.h"
	#include "llvm/Analysis/LazyCallGraph.h"
	#include "llvm/Analysis/Loads.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/IR/CFG.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DebugInfo.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/IPO.h"
	#include <set>
	using namespace llvm;

	// Debug/statistics category name for this file.
	#define DEBUG_TYPE "argpromotion"

	// Counters incremented by doPromotion while it builds the new prototype.
	STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
	STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
	STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
	STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");

	/// A vector used to hold the indices of a single GEP instruction
	typedef std::vector<uint64_t> IndicesVector;

	/// DoPromotion - This method actually performs the promotion of the specified
	/// arguments, and returns the new function. At this point, we know that it's
	/// safe to do so.
	///
	/// \param F the function whose prototype is rewritten; its body is spliced
	///   into the new function and its call sites are replaced.
	/// \param ArgsToPromote pointer arguments to replace by the values loaded
	///   from them (one new parameter per accessed element); arguments in this
	///   set that have no uses are dropped.
	/// \param ByValArgsToTransform byval struct arguments to expand into one
	///   parameter per struct element.
	/// \param ReplaceCallSite optional callback invoked with (old, new) for every
	///   rewritten call site, before the old call is erased.
	/// \returns the newly created function, which has taken over F's name.
	static Function *
	doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
	SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
	Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
	ReplaceCallSite) {

	// Start by computing a new prototype for the function, which is the same as
	// the old function, but has modified arguments.
	FunctionType *FTy = F->getFunctionType();
	std::vector<Type *> Params;

	typedef std::set<std::pair<Type *, IndicesVector>> ScalarizeTable;

	// ScalarizedElements - If we are promoting a pointer that has elements
	// accessed out of it, keep track of which elements are accessed so that we
	// can add one argument for each.
	//
	// Arguments that are directly loaded will have a zero element value here, to
	// handle cases where there are both a direct load and GEP accesses.
	//
	std::map<Argument *, ScalarizeTable> ScalarizedElements;

	// OriginalLoads - Keep track of a representative load instruction from the
	// original function so that we can tell the alias analysis implementation
	// what the new GEP/Load instructions we are inserting look like.
	// We need to keep the original loads for each argument and the elements
	// of the argument that are accessed.
	std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;

	// Attribute - Keep track of the parameter attributes for the arguments
	// that we are not promoting. For the ones that we do promote, the parameter
	// attributes are lost
	SmallVector<AttributeSet, 8> ArgAttrVec;
	AttributeList PAL = F->getAttributes();

	// First, determine the new argument list
	unsigned ArgNo = 0;
	for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
	++I, ++ArgNo) {
	if (ByValArgsToTransform.count(&*I)) {
	// Simple byval argument? Just add all the struct element types.
	Type *AgTy = cast<PointerType>(I->getType())->getElementType();
	StructType *STy = cast<StructType>(AgTy);
	Params.insert(Params.end(), STy->element_begin(), STy->element_end());
	ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
	AttributeSet());
	++NumByValArgsPromoted;
	} else if (!ArgsToPromote.count(&*I)) {
	// Unchanged argument
	Params.push_back(I->getType());
	ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
	} else if (I->use_empty()) {
	// Dead argument (which are always marked as promotable)
	++NumArgumentsDead;

	// There may be remaining metadata uses of the argument for things like
	// llvm.dbg.value. Replace them with undef.
	I->replaceAllUsesWith(UndefValue::get(I->getType()));
	} else {
	// Okay, this is being promoted. This means that the only uses are loads
	// or GEPs which are only used by loads

	// In this table, we will track which indices are loaded from the argument
	// (where direct loads are tracked as no indices).
	ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
	for (User *U : I->users()) {
	Instruction *UI = cast<Instruction>(U);
	Type *SrcTy;
	if (LoadInst *L = dyn_cast<LoadInst>(UI))
	SrcTy = L->getType();
	else
	SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
	IndicesVector Indices;
	Indices.reserve(UI->getNumOperands() - 1);
	// Since loads will only have a single operand, and GEPs only a single
	// non-index operand, this will record direct loads without any indices,
	// and gep+loads with the GEP indices.
	for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
	II != IE; ++II)
	Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
	// GEPs with a single 0 index can be merged with direct loads
	if (Indices.size() == 1 && Indices.front() == 0)
	Indices.clear();
	ArgIndices.insert(std::make_pair(SrcTy, Indices));
	LoadInst *OrigLoad;
	if (LoadInst *L = dyn_cast<LoadInst>(UI))
	OrigLoad = L;
	else
	// Take any load, we will use it only to update Alias Analysis
	OrigLoad = cast<LoadInst>(UI->user_back());
	OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
	}

	// Add a parameter to the function for each element passed in.
	for (const auto &ArgIndex : ArgIndices) {
	// not allowed to dereference ->begin() if size() is 0
	Params.push_back(GetElementPtrInst::getIndexedType(
	cast<PointerType>(I->getType()->getScalarType())->getElementType(),
	ArgIndex.second));
	ArgAttrVec.push_back(AttributeSet());
	assert(Params.back());
	}

	if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
	++NumArgumentsPromoted;
	else
	++NumAggregatesPromoted;
	}
	}

	Type *RetTy = FTy->getReturnType();

	// Construct the new function type using the new arguments.
	FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());

	// Create the new function body and insert it into the module.
	Function *NF = Function::Create(NFTy, F->getLinkage(), F->getName());
	NF->copyAttributesFrom(F);

	// Patch the pointer to LLVM function in debug info descriptor.
	NF->setSubprogram(F->getSubprogram());
	F->setSubprogram(nullptr);

	DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
	<< "From: " << *F);

	// Recompute the parameter attributes list based on the new arguments for
	// the function.
	NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
	PAL.getRetAttributes(), ArgAttrVec));
	ArgAttrVec.clear();

	F->getParent()->getFunctionList().insert(F->getIterator(), NF);
	NF->takeName(F);

	// Loop over all of the callers of the function, transforming the call sites
	// to pass in the loaded pointers.
	//
	SmallVector<Value *, 16> Args;
	while (!F->use_empty()) {
	CallSite CS(F->user_back());
	assert(CS.getCalledFunction() == F);
	Instruction *Call = CS.getInstruction();
	const AttributeList &CallPAL = CS.getAttributes();

	// Loop over the operands, inserting GEP and loads in the caller as
	// appropriate.
	CallSite::arg_iterator AI = CS.arg_begin();
	ArgNo = 0;
	for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
	++I, ++AI, ++ArgNo)
	if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
	Args.push_back(*AI); // Unmodified argument
	ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
	} else if (ByValArgsToTransform.count(&*I)) {
	// Emit a GEP and load for each element of the struct.
	Type *AgTy = cast<PointerType>(I->getType())->getElementType();
	StructType *STy = cast<StructType>(AgTy);
	Value *Idxs[2] = {
	ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
	for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
	Value *Idx = GetElementPtrInst::Create(
	STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
	// TODO: Tell AA about the new values?
	Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call));
	ArgAttrVec.push_back(AttributeSet());
	}
	} else if (!I->use_empty()) {
	// Non-dead argument: insert GEPs and loads as appropriate.
	ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
	// Store the Value* version of the indices in here, but declare it now
	// for reuse.
	std::vector<Value *> Ops;
	for (const auto &ArgIndex : ArgIndices) {
	Value *V = *AI;
	LoadInst *OrigLoad =
	OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
	if (!ArgIndex.second.empty()) {
	Ops.reserve(ArgIndex.second.size());
	Type *ElTy = V->getType();
	for (auto II : ArgIndex.second) {
	// Use i32 to index structs, and i64 for others (pointers/arrays).
	// This satisfies GEP constraints.
	Type *IdxTy =
	(ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
	: Type::getInt64Ty(F->getContext()));
	Ops.push_back(ConstantInt::get(IdxTy, II));
	// Keep track of the type we're currently indexing.
	if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
	ElTy = ElPTy->getElementType();
	else
	ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
	}
	// And create a GEP to extract those indices.
	V = GetElementPtrInst::Create(ArgIndex.first, V, Ops,
	V->getName() + ".idx", Call);
	Ops.clear();
	}
	// Since we're replacing a load make sure we take the alignment
	// of the previous load.
	LoadInst *newLoad = new LoadInst(V, V->getName() + ".val", Call);
	newLoad->setAlignment(OrigLoad->getAlignment());
	// Transfer the AA info too.
	AAMDNodes AAInfo;
	OrigLoad->getAAMetadata(AAInfo);
	newLoad->setAAMetadata(AAInfo);

	Args.push_back(newLoad);
	ArgAttrVec.push_back(AttributeSet());
	}
	}

	// Push any varargs arguments on the list.
	for (; AI != CS.arg_end(); ++AI, ++ArgNo) {
	Args.push_back(*AI);
	ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
	}

	SmallVector<OperandBundleDef, 1> OpBundles;
	CS.getOperandBundlesAsDefs(OpBundles);

	// Build the replacement call or invoke immediately before the old one.
	CallSite NewCS;
	if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
	NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
	Args, OpBundles, "", Call);
	} else {
	auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call);
	NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
	NewCS = NewCall;
	}
	NewCS.setCallingConv(CS.getCallingConv());
	NewCS.setAttributes(
	AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
	CallPAL.getRetAttributes(), ArgAttrVec));
	NewCS->setDebugLoc(Call->getDebugLoc());
	uint64_t W;
	if (Call->extractProfTotalWeight(W))
	NewCS->setProfWeight(W);
	Args.clear();
	ArgAttrVec.clear();

	// Update the callgraph to know that the callsite has been transformed.
	if (ReplaceCallSite)
	(*ReplaceCallSite)(CS, NewCS);

	if (!Call->use_empty()) {
	Call->replaceAllUsesWith(NewCS.getInstruction());
	NewCS->takeName(Call);
	}

	// Finally, remove the old call from the program, reducing the use-count of
	// F.
	Call->eraseFromParent();
	}

	const DataLayout &DL = F->getParent()->getDataLayout();

	// Since we have now created the new function, splice the body of the old
	// function right into the new function, leaving the old rotting hulk of the
	// function empty.
	NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());

	// Loop over the argument list, transferring uses of the old arguments over to
	// the new arguments, also transferring over the names as well.
	//
	for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
	I2 = NF->arg_begin();
	I != E; ++I) {
	if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
	// If this is an unmodified argument, move the name and users over to the
	// new version.
	I->replaceAllUsesWith(&*I2);
	I2->takeName(&*I);
	++I2;
	continue;
	}

	if (ByValArgsToTransform.count(&*I)) {
	// In the callee, we create an alloca, and store each of the new incoming
	// arguments into the alloca.
	Instruction *InsertPt = &NF->begin()->front();

	// Just add all the struct element types.
	Type *AgTy = cast<PointerType>(I->getType())->getElementType();
	Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
	- "", InsertPt);
	+ I->getParamAlignment(), "", InsertPt);
	StructType *STy = cast<StructType>(AgTy);
	Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
	nullptr};

	for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
	Value *Idx = GetElementPtrInst::Create(
	AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
	InsertPt);
	I2->setName(I->getName() + "." + Twine(i));
	new StoreInst(&*I2++, Idx, InsertPt);
	}

	// Anything that used the arg should now use the alloca.
	I->replaceAllUsesWith(TheAlloca);
	TheAlloca->takeName(&*I);

	// If the alloca is used in a call, we must clear the tail flag since
	// the callee now uses an alloca from the caller.
	for (User *U : TheAlloca->users()) {
	CallInst *Call = dyn_cast<CallInst>(U);
	if (!Call)
	continue;
	Call->setTailCall(false);
	}
	continue;
	}

	if (I->use_empty())
	continue;

	// Otherwise, if we promoted this argument, then all users are load
	// instructions (or GEPs with only load users), and all loads should be
	// using the new argument that we added.
	ScalarizeTable &ArgIndices = ScalarizedElements[&*I];

	while (!I->use_empty()) {
	if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
	assert(ArgIndices.begin()->second.empty() &&
	"Load element should sort to front!");
	I2->setName(I->getName() + ".val");
	LI->replaceAllUsesWith(&*I2);
	LI->eraseFromParent();
	DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
	<< "' in function '" << F->getName() << "'\n");
	} else {
	GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
	IndicesVector Operands;
	Operands.reserve(GEP->getNumIndices());
	for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
	II != IE; ++II)
	Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());

	// GEPs with a single 0 index can be merged with direct loads
	if (Operands.size() == 1 && Operands.front() == 0)
	Operands.clear();

	// Walk the index table and the new arguments in lock step until the entry
	// matching this GEP is found.
	// NOTE(review): the loop condition dereferences It before the assert
	// compares it against end().
	Function::arg_iterator TheArg = I2;
	for (ScalarizeTable::iterator It = ArgIndices.begin();
	It->second != Operands; ++It, ++TheArg) {
	assert(It != ArgIndices.end() && "GEP not handled??");
	}

	std::string NewName = I->getName();
	for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
	NewName += "." + utostr(Operands[i]);
	}
	NewName += ".val";
	TheArg->setName(NewName);

	DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
	<< "' of function '" << NF->getName() << "'\n");

	// All of the uses must be load instructions. Replace them all with
	// the argument specified by ArgNo.
	while (!GEP->use_empty()) {
	LoadInst *L = cast<LoadInst>(GEP->user_back());
	L->replaceAllUsesWith(&*TheArg);
	L->eraseFromParent();
	}
	GEP->eraseFromParent();
	}
	}

	// Increment I2 past all of the arguments added for this promoted pointer.
	std::advance(I2, ArgIndices.size());
	}

	return NF;
	}

	/// Return true when, at every call site of the function that owns \p Arg,
	/// the actual value passed in \p Arg's position is accepted by
	/// isDereferenceablePointer(). All users of the function are expected to be
	/// direct calls; this is asserted, not checked.
	static bool allCallersPassInValidPointerForArgument(Argument *Arg) {
	  Function *Fn = Arg->getParent();
	  const unsigned Position = Arg->getArgNo();
	  const DataLayout &Layout = Fn->getParent()->getDataLayout();

	  // Walk every call site; a single pointer that cannot be proven
	  // dereferenceable is enough to give up.
	  for (User *FnUser : Fn->users()) {
	    CallSite Site(FnUser);
	    assert(Site && "Should only have direct calls!");

	    Value *Actual = Site.getArgument(Position);
	    if (isDereferenceablePointer(Actual, Layout))
	      continue;
	    return false;
	  }
	  return true;
	}

	/// Tell whether \p Prefix is a leading subsequence of \p Longer: \p Longer
	/// must have at least as many elements as \p Prefix, and the first
	/// Prefix.size() elements of \p Longer must match \p Prefix one by one.
	///
	/// Two equal vectors therefore count as a prefix of each other.
	static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
	  // A candidate that is longer than the target can never be its prefix; the
	  // size test also keeps std::equal from reading past the end of Longer.
	  const bool Fits = Prefix.size() <= Longer.size();
	  return Fits && std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
	}

	/// Report whether \p Set holds \p Indices itself or some prefix of it.
	static bool prefixIn(const IndicesVector &Indices,
	                     std::set<IndicesVector> &Set) {
	  // upper_bound yields the first entry ordered after Indices; stepping back
	  // once (when possible) lands on the last entry ordered at or before
	  // Indices, which is where a prefix of Indices would sit if one exists.
	  auto Candidate = Set.upper_bound(Indices);
	  if (Candidate != Set.begin())
	    --Candidate;

	  // No candidate at all (e.g. the set is empty).
	  if (Candidate == Set.end())
	    return false;

	  // A load is safe as soon as any prefix of its operands is safe to load.
	  return isPrefix(*Candidate, Indices);
	}

	/// Mark the given indices (ToMark) as safe in the given set of indices
	/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
	/// is already a prefix of ToMark in Safe, ToMark is implicitly marked safe
	/// already and nothing is inserted. Furthermore, any entries that ToMark is
	/// itself a prefix of are removed from Safe (since they are implicitly safe
	/// because of ToMark now).
	static void markIndicesSafe(const IndicesVector &ToMark,
	std::set<IndicesVector> &Safe) {
	std::set<IndicesVector>::iterator Low;
	Low = Safe.upper_bound(ToMark);
	// Only step back when there is an element before Low; this also guards
	// against the case where Safe is empty.
	if (Low != Safe.begin())
	Low--;
	// If an element smaller than or equal to ToMark exists, Low now points at
	// the last such element. This means it points to a prefix of ToMark
	// (possibly ToMark itself), if such a prefix exists.
	if (Low != Safe.end()) {
	if (isPrefix(*Low, ToMark))
	// If there is already a prefix of these indices (or exactly these
	// indices) marked as safe, don't bother adding these indices
	return;

	// Increment Low, so we can use it as an "insert before" hint
	++Low;
	}
	// Insert ToMark, using Low as the position hint.
	Low = Safe.insert(Low, ToMark);
	++Low;
	// If we are a prefix of longer index list(s), remove those. The iterator
	// is advanced before erasing so that it never refers to the erased element.
	std::set<IndicesVector>::iterator End = Safe.end();
	while (Low != End && isPrefix(ToMark, *Low)) {
	std::set<IndicesVector>::iterator Remove = Low;
	++Low;
	Safe.erase(Remove);
	}
	}

	/// isSafeToPromoteArgument - As you might guess from the name of this method,
	/// it checks to see if it is both safe and useful to promote the argument.
	/// This method limits promotion of aggregates to only promote up to
	/// \p MaxElements elements of the aggregate (0 means no limit) in order to
	/// avoid exploding the number of arguments passed in.
	///
	/// \param Arg               the pointer argument under consideration.
	/// \param isByValOrInAlloca when true, loads with first index 0 are treated
	///                          as always safe without inspecting the callers.
	/// \param AAR               alias analysis used to prove the pointee is not
	///                          modified between function entry and each load.
	/// \param MaxElements       upper bound on distinct index lists to promote.
	/// \returns true if every use is a simple load (directly or through a
	/// constant-index GEP), every such load is safe to perform unconditionally,
	/// and nothing on the way to the loads may modify the loaded location.
	///
	/// Note: dead GEPs of \p Arg are erased as a side effect.
	static bool isSafeToPromoteArgument(Argument *Arg, bool isByValOrInAlloca,
	                                    AAResults &AAR, unsigned MaxElements) {
	  typedef std::set<IndicesVector> GEPIndicesSet;

	  // Quick exit for unused arguments
	  if (Arg->use_empty())
	    return true;

	  // We can only promote this argument if all of the uses are loads, or are GEP
	  // instructions (with constant indices) that are subsequently loaded.
	  //
	  // Promoting the argument causes it to be loaded in the caller
	  // unconditionally. This is only safe if we can prove that either the load
	  // would have happened in the callee anyway (ie, there is a load in the entry
	  // block) or the pointer passed in at every call site is guaranteed to be
	  // valid.
	  // In the former case, invalid loads can happen, but would have happened
	  // anyway, in the latter case, invalid loads won't happen. This prevents us
	  // from introducing an invalid load that wouldn't have happened in the
	  // original code.
	  //
	  // This set will contain all sets of indices that are loaded in the entry
	  // block, and thus are safe to unconditionally load in the caller.
	  //
	  // This optimization is also safe for InAlloca parameters, because it verifies
	  // that the address isn't captured.
	  GEPIndicesSet SafeToUnconditionallyLoad;

	  // This set contains all the sets of indices that we are planning to promote.
	  // This makes it possible to limit the number of arguments added.
	  GEPIndicesSet ToPromote;

	  // If the pointer is always valid, any load with first index 0 is valid.
	  if (isByValOrInAlloca || allCallersPassInValidPointerForArgument(Arg))
	    SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));

	  // First, iterate the entry block and mark loads of (geps of) arguments as
	  // safe.
	  BasicBlock &EntryBlock = Arg->getParent()->front();
	  // Declare this here so we can reuse it
	  IndicesVector Indices;
	  for (Instruction &I : EntryBlock)
	    if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
	      Value *V = LI->getPointerOperand();
	      if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
	        V = GEP->getPointerOperand();
	        if (V == Arg) {
	          // This load actually loads (part of) Arg? Check the indices then.
	          Indices.reserve(GEP->getNumIndices());
	          for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
	               II != IE; ++II)
	            if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
	              Indices.push_back(CI->getSExtValue());
	            else
	              // We found a non-constant GEP index for this argument? Bail out
	              // right away, can't promote this argument at all.
	              return false;

	          // Indices checked out, mark them as safe
	          markIndicesSafe(Indices, SafeToUnconditionallyLoad);
	          Indices.clear();
	        }
	      } else if (V == Arg) {
	        // Direct loads are equivalent to a GEP with a single 0 index.
	        markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
	      }
	    }

	  // Now, iterate all uses of the argument to see if there are any uses that are
	  // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
	  SmallVector<LoadInst *, 16> Loads;
	  IndicesVector Operands;
	  for (Use &U : Arg->uses()) {
	    User *UR = U.getUser();
	    Operands.clear();
	    if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
	      // Don't hack volatile/atomic loads
	      if (!LI->isSimple())
	        return false;
	      Loads.push_back(LI);
	      // Direct loads are equivalent to a GEP with a zero index and then a load.
	      Operands.push_back(0);
	    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
	      if (GEP->use_empty()) {
	        // Dead GEP's cause trouble later. Just remove them if we run into
	        // them.
	        GEP->eraseFromParent();
	        // TODO: This runs the above loop over and over again for dead GEPs
	        // Couldn't we just do increment the UI iterator earlier and erase the
	        // use?
	        return isSafeToPromoteArgument(Arg, isByValOrInAlloca, AAR,
	                                       MaxElements);
	      }

	      // Ensure that all of the indices are constants.
	      for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
	           ++i)
	        if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
	          Operands.push_back(C->getSExtValue());
	        else
	          return false; // Not a constant operand GEP!

	      // Ensure that the only users of the GEP are load instructions.
	      for (User *GEPU : GEP->users())
	        if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
	          // Don't hack volatile/atomic loads
	          if (!LI->isSimple())
	            return false;
	          Loads.push_back(LI);
	        } else {
	          // Other uses than load?
	          return false;
	        }
	    } else {
	      return false; // Not a load or a GEP.
	    }

	    // Now, see if it is safe to promote this load / loads of this GEP. Loading
	    // is safe if Operands, or a prefix of Operands, is marked as safe.
	    if (!prefixIn(Operands, SafeToUnconditionallyLoad))
	      return false;

	    // See if we are already promoting a load with these indices. If not, check
	    // to make sure that we aren't promoting too many elements. If so, nothing
	    // to do.
	    if (ToPromote.find(Operands) == ToPromote.end()) {
	      if (MaxElements > 0 && ToPromote.size() == MaxElements) {
	        DEBUG(dbgs() << "argpromotion not promoting argument '"
	                     << Arg->getName()
	                     << "' because it would require adding more "
	                     << "than " << MaxElements
	                     << " arguments to the function.\n");
	        // We limit aggregate promotion to only promoting up to a fixed number
	        // of elements of the aggregate.
	        return false;
	      }
	      ToPromote.insert(std::move(Operands));
	    }
	  }

	  if (Loads.empty())
	    return true; // No users, this is a dead argument.

	  // Okay, now we know that the argument is only used by load instructions and
	  // it is safe to unconditionally perform all of them. Use alias analysis to
	  // check to see if the pointer is guaranteed to not be modified from entry of
	  // the function to each of the load instructions.

	  // Because there could be several/many load instructions, remember which
	  // blocks we know to be transparent to the load.
	  df_iterator_default_set<BasicBlock *, 16> TranspBlocks;

	  for (LoadInst *Load : Loads) {
	    // Check to see if the load is invalidated from the start of the block to
	    // the load itself.
	    BasicBlock *BB = Load->getParent();

	    MemoryLocation Loc = MemoryLocation::get(Load);
	    if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, MRI_Mod))
	      return false; // Pointer is invalidated!

	    // Now check every path from the entry block to the load for transparency.
	    // To do this, we perform a depth first search on the inverse CFG from the
	    // loading block.
	    for (BasicBlock *P : predecessors(BB)) {
	      for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
	        if (AAR.canBasicBlockModify(*TranspBB, Loc))
	          return false;
	    }
	  }

	  // If the path from the entry of the function to each load is free of
	  // instructions that potentially invalidate the load, we can make the
	  // transformation!
	  return true;
	}

	/// \brief Returns true only when \p type provably contains no padding:
	/// its storage size equals its alloc size and, for composite types, the same
	/// holds recursively with struct members laid out back to back.
	static bool isDenselyPacked(Type *type, const DataLayout &DL) {
	  // Without size information nothing can be proven; answer conservatively.
	  if (!type->isSized())
	    return false;

	  // A difference between storage size and alloc size means trailing padding.
	  // Example: x86_fp80 on x86-64 has size 80 but alloc size 128.
	  const bool HasTailPadding =
	      DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type);
	  if (HasTailPadding)
	    return false;

	  // Non-composite types have no members left to inspect.
	  if (!isa<CompositeType>(type))
	    return true;

	  // Homogeneous sequential types are dense exactly when their element is.
	  if (SequentialType *Sequence = dyn_cast<SequentialType>(type))
	    return isDenselyPacked(Sequence->getElementType(), DL);

	  // Structs: each member must be dense and must begin exactly where the
	  // previous member's allocation ended.
	  StructType *Record = cast<StructType>(type);
	  const StructLayout *RecordLayout = DL.getStructLayout(Record);
	  uint64_t ExpectedOffset = 0;
	  const unsigned MemberCount = Record->getNumElements();
	  for (unsigned Member = 0; Member < MemberCount; ++Member) {
	    Type *MemberTy = Record->getElementType(Member);
	    if (!isDenselyPacked(MemberTy, DL) ||
	        ExpectedOffset != RecordLayout->getElementOffsetInBits(Member))
	      return false;
	    ExpectedOffset += DL.getTypeAllocSizeInBits(MemberTy);
	  }

	  return true;
	}

	/// \brief Checks if the padding bytes of an argument could be accessed.
	///
	/// Follows \p arg through GEPs and PHI nodes. Returns true (conservatively)
	/// as soon as any derived pointer has a user other than a GEP, PHI, load or
	/// store, or when one of the tracked pointers is itself the value being
	/// stored. Returns false only if every use is one of those and no tracked
	/// pointer is stored.
	///
	/// \p arg must carry the byval attribute (asserted).
	static bool canPaddingBeAccessed(Argument *arg) {

	  assert(arg->hasByValAttr());

	  // Track all the pointers to the argument to make sure they are not captured.
	  SmallPtrSet<Value *, 16> PtrValues;
	  PtrValues.insert(arg);

	  // Track all of the stores.
	  SmallVector<StoreInst *, 16> Stores;

	  // Scan through the uses recursively to make sure the pointer is always used
	  // sanely.
	  SmallVector<Value *, 16> WorkList;
	  WorkList.insert(WorkList.end(), arg->user_begin(), arg->user_end());
	  while (!WorkList.empty()) {
	    Value *V = WorkList.back();
	    WorkList.pop_back();
	    if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
	      // Only expand a derived pointer the first time it is seen; this also
	      // terminates the walk on PHI cycles.
	      if (PtrValues.insert(V).second)
	        WorkList.insert(WorkList.end(), V->user_begin(), V->user_end());
	    } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
	      Stores.push_back(Store);
	    } else if (!isa<LoadInst>(V)) {
	      return true;
	    }
	  }

	  // Check to make sure the pointers aren't captured
	  for (StoreInst *Store : Stores)
	    if (PtrValues.count(Store->getValueOperand()))
	      return true;

	  return false;
	}

	/// promoteArguments - This method checks the specified function to see if
	/// there are any promotable arguments and if it is safe to promote the
	/// function (for example, all callers are direct). If safe to promote some
	/// arguments, it calls the doPromotion method.
	///
	/// \param F               candidate function; must have local linkage and
	///                        must not be variadic, otherwise nullptr is returned.
	/// \param AARGetter       callback yielding alias analysis results for \p F.
	/// \param MaxElements     maximum number of elements to expand per argument,
	///                        or 0 for unlimited.
	/// \param ReplaceCallSite optional callback forwarded to doPromotion.
	/// \returns the function produced by doPromotion, or nullptr when nothing
	/// was promoted.
	///
	/// Note: sret attributes on pointer arguments are replaced by noalias on
	/// \p F and on its call sites once the direct-caller checks have passed,
	/// even if no argument ends up being promoted.
	static Function *
	promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
	                 unsigned MaxElements,
	                 Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
	                     ReplaceCallSite) {
	  // Make sure that it is local to this module.
	  if (!F->hasLocalLinkage())
	    return nullptr;

	  // Don't promote arguments for variadic functions. Adding, removing, or
	  // changing non-pack parameters can change the classification of pack
	  // parameters. Frontends encode that classification at the call site in the
	  // IR, while in the callee the classification is determined dynamically based
	  // on the number of registers consumed so far.
	  if (F->isVarArg())
	    return nullptr;

	  // First check: see if there are any pointer arguments! If not, quick exit.
	  SmallVector<Argument *, 16> PointerArgs;
	  for (Argument &I : F->args())
	    if (I.getType()->isPointerTy())
	      PointerArgs.push_back(&I);
	  if (PointerArgs.empty())
	    return nullptr;

	  // Second check: make sure that all callers are direct callers. We can't
	  // transform functions that have indirect callers. Also see if the function
	  // is self-recursive.
	  bool isSelfRecursive = false;
	  for (Use &U : F->uses()) {
	    CallSite CS(U.getUser());
	    // Must be a direct call.
	    if (CS.getInstruction() == nullptr || !CS.isCallee(&U))
	      return nullptr;

	    if (CS.getInstruction()->getParent()->getParent() == F)
	      isSelfRecursive = true;
	  }

	  const DataLayout &DL = F->getParent()->getDataLayout();

	  AAResults &AAR = AARGetter(*F);

	  // Check to see which arguments are promotable. If an argument is promotable,
	  // add it to ArgsToPromote.
	  SmallPtrSet<Argument *, 8> ArgsToPromote;
	  SmallPtrSet<Argument *, 8> ByValArgsToTransform;
	  for (Argument *PtrArg : PointerArgs) {
	    Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();

	    // Replace sret attribute with noalias. This reduces register pressure by
	    // avoiding a register copy.
	    if (PtrArg->hasStructRetAttr()) {
	      unsigned ArgNo = PtrArg->getArgNo();
	      F->removeParamAttr(ArgNo, Attribute::StructRet);
	      F->addParamAttr(ArgNo, Attribute::NoAlias);
	      for (Use &U : F->uses()) {
	        CallSite CS(U.getUser());
	        CS.removeParamAttr(ArgNo, Attribute::StructRet);
	        CS.addParamAttr(ArgNo, Attribute::NoAlias);
	      }
	    }

	    // If this is a byval argument, and if the aggregate type is small, just
	    // pass the elements, which is always safe, if the passed value is densely
	    // packed or if we can prove the padding bytes are never accessed. This does
	    // not apply to inalloca.
	    bool isSafeToPromote =
	        PtrArg->hasByValAttr() &&
	        (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
	    if (isSafeToPromote) {
	      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
	        if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
	          DEBUG(dbgs() << "argpromotion disable promoting argument '"
	                       << PtrArg->getName()
	                       << "' because it would require adding more"
	                       << " than " << MaxElements
	                       << " arguments to the function.\n");
	          continue;
	        }

	        // If all the elements are single-value types, we can promote it.
	        bool AllSimple = true;
	        for (const auto *EltTy : STy->elements()) {
	          if (!EltTy->isSingleValueType()) {
	            AllSimple = false;
	            break;
	          }
	        }

	        // Safe to transform, don't even bother trying to "promote" it.
	        // Passing the elements as a scalar will allow sroa to hack on
	        // the new alloca we introduce.
	        if (AllSimple) {
	          ByValArgsToTransform.insert(PtrArg);
	          continue;
	        }
	      }
	    }

	    // If the argument is a recursive type and we're in a recursive
	    // function, we could end up infinitely peeling the function argument.
	    if (isSelfRecursive) {
	      if (StructType *STy = dyn_cast<StructType>(AgTy)) {
	        bool RecursiveType = false;
	        for (const auto *EltTy : STy->elements()) {
	          if (EltTy == PtrArg->getType()) {
	            RecursiveType = true;
	            break;
	          }
	        }
	        if (RecursiveType)
	          continue;
	      }
	    }

	    // Otherwise, see if we can promote the pointer to its value.
	    if (isSafeToPromoteArgument(PtrArg, PtrArg->hasByValOrInAllocaAttr(), AAR,
	                                MaxElements))
	      ArgsToPromote.insert(PtrArg);
	  }

	  // No promotable pointer arguments.
	  if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
	    return nullptr;

	  return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
	}

	/// Run argument promotion over every function of the SCC \p C, repeating
	/// until a full sweep promotes nothing. Each promoted function replaces its
	/// original in the call graph node and the original is erased.
	///
	/// \returns PreservedAnalyses::all() when no function was changed, otherwise
	/// PreservedAnalyses::none(). The element limit passed to promoteArguments
	/// is fixed at 3 here.
	PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
	                                             CGSCCAnalysisManager &AM,
	                                             LazyCallGraph &CG,
	                                             CGSCCUpdateResult &UR) {
	  bool Changed = false, LocalChange;

	  // Iterate until we stop promoting from this SCC.
	  do {
	    LocalChange = false;

	    for (LazyCallGraph::Node &N : C) {
	      Function &OldF = N.getFunction();

	      FunctionAnalysisManager &FAM =
	          AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
	      // FIXME: This lambda must only be used with this function. We should
	      // skip the lambda and just get the AA results directly.
	      auto AARGetter = [&](Function &F) -> AAResults & {
	        assert(&F == &OldF && "Called with an unexpected function!");
	        return FAM.getResult<AAManager>(F);
	      };

	      Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
	      if (!NewF)
	        continue;
	      LocalChange = true;

	      // Directly substitute the functions in the call graph. Note that this
	      // requires the old function to be completely dead and completely
	      // replaced by the new function. It does no call graph updates, it merely
	      // swaps out the particular function mapped to a particular node in the
	      // graph.
	      C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
	      OldF.eraseFromParent();
	    }

	    Changed |= LocalChange;
	  } while (LocalChange);

	  if (!Changed)
	    return PreservedAnalyses::all();

	  return PreservedAnalyses::none();
	}

	namespace {
	/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
	///
	/// Legacy pass-manager wrapper built on CallGraphSCCPass; the per-SCC work is
	/// done in runOnSCC, defined out of line.
	struct ArgPromotion : public CallGraphSCCPass {
	// Declare the analyses this pass requires (assumption cache, target library
	// info, alias analysis results), then defer to the base class.
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<AssumptionCacheTracker>();
	AU.addRequired<TargetLibraryInfoWrapperPass>();
	getAAResultsAnalysisUsage(AU);
	CallGraphSCCPass::getAnalysisUsage(AU);
	}

	bool runOnSCC(CallGraphSCC &SCC) override;
	static char ID; // Pass identification, replacement for typeid
	// MaxElements defaults to 3. Construction also runs the pass initializer
	// against the global pass registry.
	explicit ArgPromotion(unsigned MaxElements = 3)
	: CallGraphSCCPass(ID), MaxElements(MaxElements) {
	initializeArgPromotionPass(*PassRegistry::getPassRegistry());
	}

	private:
	// Presumably keeps the base-class doInitialization overloads visible next
	// to the CallGraph overload declared below -- confirm against Pass.h.
	using llvm::Pass::doInitialization;
	bool doInitialization(CallGraph &CG) override;
	/// The maximum number of elements to expand, or 0 for unlimited.
	unsigned MaxElements;
	};
	}

	// Storage for the static pass identifier declared in ArgPromotion.
	char ArgPromotion::ID = 0;
	// Pass registration under the command-line name "argpromotion". The
	// dependency entries name the assumption cache, call graph and target
	// library info passes.
	INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
	"Promote 'by reference' arguments to scalars", false,
	false)
	INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
	INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
	"Promote 'by reference' arguments to scalars", false, false)

	/// Factory for the legacy argument promotion pass. Heap-allocates a new
	/// ArgPromotion configured with \p MaxElements and returns it as a Pass
	/// pointer. NOTE(review): ownership presumably passes to the caller or pass
	/// manager -- nothing in this function frees it.
	Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
	return new ArgPromotion(MaxElements);
	}

	/// Legacy pass-manager entry point. Tries to promote arguments of every
	/// function in \p SCC, repeating until a full sweep changes nothing, and
	/// keeps the CallGraph and the SCC being iterated in sync with each
	/// replacement.
	///
	/// \returns true if at least one function was promoted.
	bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
	  if (skipSCC(SCC))
	    return false;

	  // Get the callgraph information that we need to update to reflect our
	  // changes.
	  CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();

	  LegacyAARGetter AARGetter(*this);

	  bool Changed = false, LocalChange;

	  // Iterate until we stop promoting from this SCC.
	  do {
	    LocalChange = false;
	    // Attempt to promote arguments from all functions in this SCC.
	    for (CallGraphNode *OldNode : SCC) {
	      Function *OldF = OldNode->getFunction();
	      if (!OldF)
	        continue;

	      // Invoked for every rewritten call: re-point the caller's call graph
	      // edge from the old call site to the new one.
	      auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) {
	        Function *Caller = OldCS.getInstruction()->getParent()->getParent();
	        CallGraphNode *NewCalleeNode =
	            CG.getOrInsertFunction(NewCS.getCalledFunction());
	        CallGraphNode *CallerNode = CG[Caller];
	        CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
	      };

	      if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
	                                            {ReplaceCallSite})) {
	        LocalChange = true;

	        // Update the call graph for the newly promoted function.
	        CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
	        NewNode->stealCalledFunctionsFrom(OldNode);
	        // Drop the old node when nothing references it any more; otherwise
	        // keep the old function around with external linkage.
	        if (OldNode->getNumReferences() == 0)
	          delete CG.removeFunctionFromModule(OldNode);
	        else
	          OldF->setLinkage(Function::ExternalLinkage);

	        // And update the SCC we're iterating as well.
	        SCC.ReplaceNode(OldNode, NewNode);
	      }
	    }
	    // Remember that we changed something.
	    Changed |= LocalChange;
	  } while (LocalChange);

	  return Changed;
	}

	/// Adds no initialization of its own: forwards to the CallGraphSCCPass
	/// implementation and returns its result unchanged.
	bool ArgPromotion::doInitialization(CallGraph &CG) {
	return CallGraphSCCPass::doInitialization(CG);
	}
	Index: head/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp (revision 322320)
	@@ -1,3011 +1,3012 @@
	//===-- AddressSanitizer.cpp - memory error detector ------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file is a part of AddressSanitizer, an address sanity checker.
	// Details of the algorithm:
	// http://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/ADT/ArrayRef.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DepthFirstIterator.h"
	#include "llvm/ADT/SetVector.h"
	#include "llvm/ADT/SmallSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/ADT/Twine.h"
	#include "llvm/Analysis/MemoryBuiltins.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/Analysis/ValueTracking.h"
	#include "llvm/IR/Argument.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/DIBuilder.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/Dominators.h"
	#include "llvm/IR/Function.h"
	#include "llvm/IR/IRBuilder.h"
	#include "llvm/IR/InlineAsm.h"
	#include "llvm/IR/InstVisitor.h"
	#include "llvm/IR/IntrinsicInst.h"
	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/MDBuilder.h"
	#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"
	#include "llvm/MC/MCSectionMachO.h"
	#include "llvm/Support/CommandLine.h"
	#include "llvm/Support/DataTypes.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/Endian.h"
	#include "llvm/Support/ScopedPrinter.h"
	#include "llvm/Support/SwapByteOrder.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/Instrumentation.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
	#include "llvm/Transforms/Utils/BasicBlockUtils.h"
	#include "llvm/Transforms/Utils/Cloning.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include "llvm/Transforms/Utils/ModuleUtils.h"
	#include "llvm/Transforms/Utils/PromoteMemToReg.h"
	#include <algorithm>
	#include <iomanip>
	#include <limits>
	#include <sstream>
	#include <string>
	#include <system_error>

	using namespace llvm;

	#define DEBUG_TYPE "asan"

	// Default shadow mapping parameters. The mapping used by this file is
	// Shadow = (Mem >> scale) + offset (see the asan-mapping-* flags below).
	// NOTE(review): the constant names suggest one offset per target/pointer
	// width; the code that selects among them is outside this chunk.
	static const uint64_t kDefaultShadowScale = 3;
	static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
	static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
	// All-ones sentinel; used below as the 64-bit Windows offset, where the
	// shadow is dynamically allocated.
	static const uint64_t kDynamicShadowSentinel = ~(uint64_t)0;
	static const uint64_t kIOSShadowOffset32 = 1ULL << 30;
	static const uint64_t kIOSSimShadowOffset32 = 1ULL << 30;
	static const uint64_t kIOSSimShadowOffset64 = kDefaultShadowOffset64;
	static const uint64_t kSmallX86_64ShadowOffset = 0x7FFF8000; // < 2G.
	static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
	static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 41;
	static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
	static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
	static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
	static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
	static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
	static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
	static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
	static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
	// The shadow memory space is dynamically allocated.
	static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;

	// Size bounds for stack mallocs and the magic values that tag stack frames.
	static const size_t kMinStackMallocSize = 1 << 6; // 64B
	static const size_t kMaxStackMallocSize = 1 << 16; // 64K
	static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
	static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;

	// Symbol names (or name prefixes/templates) of the module constructor and
	// destructor and of the runtime entry points referenced by this file.
	static const char *const kAsanModuleCtorName = "asan.module_ctor";
	static const char *const kAsanModuleDtorName = "asan.module_dtor";
	static const uint64_t kAsanCtorAndDtorPriority = 1;
	static const char *const kAsanReportErrorTemplate = "__asan_report_";
	static const char *const kAsanRegisterGlobalsName = "__asan_register_globals";
	static const char *const kAsanUnregisterGlobalsName =
	"__asan_unregister_globals";
	static const char *const kAsanRegisterImageGlobalsName =
	"__asan_register_image_globals";
	static const char *const kAsanUnregisterImageGlobalsName =
	"__asan_unregister_image_globals";
	static const char *const kAsanRegisterElfGlobalsName =
	"__asan_register_elf_globals";
	static const char *const kAsanUnregisterElfGlobalsName =
	"__asan_unregister_elf_globals";
	static const char *const kAsanPoisonGlobalsName = "__asan_before_dynamic_init";
	static const char *const kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init";
	static const char *const kAsanInitName = "__asan_init";
	static const char *const kAsanVersionCheckName =
	"__asan_version_mismatch_check_v8";
	static const char *const kAsanPtrCmp = "__sanitizer_ptr_cmp";
	static const char *const kAsanPtrSub = "__sanitizer_ptr_sub";
	static const char *const kAsanHandleNoReturnName = "__asan_handle_no_return";
	static const int kMaxAsanStackMallocSizeClass = 10;
	static const char *const kAsanStackMallocNameTemplate = "__asan_stack_malloc_";
	static const char *const kAsanStackFreeNameTemplate = "__asan_stack_free_";
	static const char *const kAsanGenPrefix = "__asan_gen_";
	static const char *const kODRGenPrefix = "__odr_asan_gen_";
	static const char *const kSanCovGenPrefix = "__sancov_gen_";
	static const char *const kAsanSetShadowPrefix = "__asan_set_shadow_";
	static const char *const kAsanPoisonStackMemoryName =
	"__asan_poison_stack_memory";
	static const char *const kAsanUnpoisonStackMemoryName =
	"__asan_unpoison_stack_memory";

	// ASan version script has __asan_* wildcard. Triple underscore prevents a
	// linker (gold) warning about attempting to export a local symbol.
	static const char *const kAsanGlobalsRegisteredFlagName =
	"___asan_globals_registered";

	static const char *const kAsanOptionDetectUseAfterReturn =
	"__asan_option_detect_stack_use_after_return";

	static const char *const kAsanShadowMemoryDynamicAddress =
	"__asan_shadow_memory_dynamic_address";

	static const char *const kAsanAllocaPoison = "__asan_alloca_poison";
	static const char *const kAsanAllocasUnpoison = "__asan_allocas_unpoison";

	// Access sizes are powers of two: 1, 2, 4, 8, 16.
	static const size_t kNumberOfAccessSizes = 5;

	// NOTE(review): presumably the redzone size used around dynamic allocas;
	// the use site is outside this chunk.
	static const unsigned kAllocaRzSize = 32;

	// Command-line flags. All of them are cl::Hidden; the default of each one is
	// the value given to cl::init.
	static cl::opt<bool> ClEnableKasan(
	"asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),
	cl::Hidden, cl::init(false));
	static cl::opt<bool> ClRecover(
	"asan-recover",
	cl::desc("Enable recovery mode (continue-after-error)."),
	cl::Hidden, cl::init(false));

	// This flag may need to be replaced with -f[no-]asan-reads.
	static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
	cl::desc("instrument read instructions"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClInstrumentWrites(
	"asan-instrument-writes", cl::desc("instrument write instructions"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClInstrumentAtomics(
	"asan-instrument-atomics",
	cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
	cl::init(true));
	static cl::opt<bool> ClAlwaysSlowPath(
	"asan-always-slow-path",
	cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
	cl::init(false));
	static cl::opt<bool> ClForceDynamicShadow(
	"asan-force-dynamic-shadow",
	cl::desc("Load shadow address into a local variable for each function"),
	cl::Hidden, cl::init(false));

	// This flag limits the number of instructions to be instrumented
	// in any given BB. Normally, this should be set to unlimited (INT_MAX),
	// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
	// set it to 10000.
	static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
	"asan-max-ins-per-bb", cl::init(10000),
	cl::desc("maximal number of instructions to instrument in any given BB"),
	cl::Hidden);
	// This flag may need to be replaced with -f[no]asan-stack.
	static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
	cl::Hidden, cl::init(true));
	static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
	"asan-max-inline-poisoning-size",
	cl::desc(
	"Inline shadow poisoning for blocks up to the given size in bytes."),
	cl::Hidden, cl::init(64));
	static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
	cl::desc("Check stack-use-after-return"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
	cl::desc("Create redzones for byval "
	"arguments (extra copy "
	"required)"), cl::Hidden,
	cl::init(true));
	static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
	cl::desc("Check stack-use-after-scope"),
	cl::Hidden, cl::init(false));
	// This flag may need to be replaced with -f[no]asan-globals.
	static cl::opt<bool> ClGlobals("asan-globals",
	cl::desc("Handle global objects"), cl::Hidden,
	cl::init(true));
	static cl::opt<bool> ClInitializers("asan-initialization-order",
	cl::desc("Handle C++ initializer order"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClInvalidPointerPairs(
	"asan-detect-invalid-pointer-pair",
	cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
	cl::init(false));
	static cl::opt<unsigned> ClRealignStack(
	"asan-realign-stack",
	cl::desc("Realign stack to the value of this flag (power of two)"),
	cl::Hidden, cl::init(32));
	static cl::opt<int> ClInstrumentationWithCallsThreshold(
	"asan-instrumentation-with-call-threshold",
	cl::desc(
	"If the function being instrumented contains more than "
	"this number of memory accesses, use callbacks instead of "
	"inline checks (-1 means never use callbacks)."),
	cl::Hidden, cl::init(7000));
	static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
	"asan-memory-access-callback-prefix",
	cl::desc("Prefix for memory access callbacks"), cl::Hidden,
	cl::init("__asan_"));
	static cl::opt<bool>
	ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
	cl::desc("instrument dynamic allocas"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClSkipPromotableAllocas(
	"asan-skip-promotable-allocas",
	cl::desc("Do not instrument promotable allocas"), cl::Hidden,
	cl::init(true));

	// These flags allow changing the shadow mapping.
	// The shadow mapping looks like
	// Shadow = (Mem >> scale) + offset
	// Both values only take effect when the flag is given explicitly:
	// getShadowMapping checks getNumOccurrences() > 0 before using them, so the
	// cl::init(0) defaults below are never applied as a scale or offset.
	static cl::opt<int> ClMappingScale("asan-mapping-scale",
	cl::desc("scale of asan shadow mapping"),
	cl::Hidden, cl::init(0));
	static cl::opt<unsigned long long> ClMappingOffset(
	"asan-mapping-offset",
	cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"), cl::Hidden,
	cl::init(0));

	// Optimization flags. Not user visible, used mostly for testing
	// and benchmarking the tool.
	static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClOptSameTemp(
	"asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClOptGlobals("asan-opt-globals",
	cl::desc("Don't instrument scalar globals"),
	cl::Hidden, cl::init(true));
	static cl::opt<bool> ClOptStack(
	"asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
	cl::Hidden, cl::init(false));

	static cl::opt<bool> ClDynamicAllocaStack(
	"asan-stack-dynamic-alloca",
	cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
	cl::init(true));

	static cl::opt<uint32_t> ClForceExperiment(
	"asan-force-experiment",
	cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
	cl::init(0));

	static cl::opt<bool>
	ClUsePrivateAliasForGlobals("asan-use-private-alias",
	cl::desc("Use private aliases for global"
	" variables"),
	cl::Hidden, cl::init(false));

	// AND-ed with the UseGlobalsGC constructor argument of the module pass.
	static cl::opt<bool>
	ClUseGlobalsGC("asan-globals-live-support",
	cl::desc("Use linker features to support dead "
	"code stripping of globals"),
	cl::Hidden, cl::init(true));

	// This is on by default even though there is a bug in gold:
	// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
	// The module pass only honors it when its UseGlobalsGC argument is true.
	static cl::opt<bool>
	ClWithComdat("asan-with-comdat",
	cl::desc("Place ASan constructors in comdat sections"),
	cl::Hidden, cl::init(true));

	// Debug flags. All hidden; the integer flags default to 0 (or -1 for the
	// min/max instruction bounds) and ClDebugFunc has no initial value.
	static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
	cl::init(0));
	// Non-zero makes FunctionStackPoisoner::runOnFunction dump the function
	// through DEBUG(dbgs() << F) after processing.
	static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
	cl::Hidden, cl::init(0));
	static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
	cl::desc("Debug func"));
	static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
	cl::Hidden, cl::init(-1));
	static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
	cl::Hidden, cl::init(-1));

	// Pass statistics counters: instrumented reads/writes, and accesses to
	// globals/stack variables that were optimized (per their descriptions).
	STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
	STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
	STATISTIC(NumOptimizedAccessesToGlobalVar,
	"Number of optimized accesses to global vars");
	STATISTIC(NumOptimizedAccessesToStackVar,
	"Number of optimized accesses to stack vars");

	namespace {
	/// Frontend-provided metadata for source location.
	struct LocationMetadata {
	StringRef Filename;
	int LineNo;
	int ColumnNo;

	LocationMetadata() : Filename(), LineNo(0), ColumnNo(0) {}

	bool empty() const { return Filename.empty(); }

	void parse(MDNode *MDN) {
	assert(MDN->getNumOperands() == 3);
	MDString *DIFilename = cast<MDString>(MDN->getOperand(0));
	Filename = DIFilename->getString();
	LineNo =
	mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
	ColumnNo =
	mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
	}
	};

	/// Per-global information read from the "llvm.asan.globals" named metadata.
	class GlobalsMetadata {
	public:
	  /// What is known about one global: source location, name, and two flags.
	  struct Entry {
	    Entry() : SourceLoc(), Name(), IsDynInit(false), IsBlacklisted(false) {}
	    LocationMetadata SourceLoc;
	    StringRef Name;
	    bool IsDynInit;
	    bool IsBlacklisted;
	  };

	  GlobalsMetadata() : inited_(false) {}

	  /// Drops all entries and allows init() to be called again.
	  void reset() {
	    Entries.clear();
	    inited_ = false;
	  }

	  /// Populates the table from \p M. Must not be called twice without an
	  /// intervening reset(). A module without the named metadata leaves the
	  /// table empty.
	  void init(Module &M) {
	    assert(!inited_);
	    inited_ = true;
	    NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
	    if (Globals == nullptr)
	      return;
	    for (auto MDN : Globals->operands()) {
	      // Each node holds the global followed by the four fields of "Entry".
	      assert(MDN->getNumOperands() == 5);
	      auto *GV = mdconst::extract_or_null<GlobalVariable>(MDN->getOperand(0));
	      // A null operand means the optimizer removed the global entirely.
	      if (GV == nullptr)
	        continue;
	      // An entry may already exist if GV was merged with another global;
	      // in that case the flags below accumulate instead of being reset.
	      Entry &Info = Entries[GV];
	      if (auto *LocNode = cast_or_null<MDNode>(MDN->getOperand(1)))
	        Info.SourceLoc.parse(LocNode);
	      if (auto *NameNode = cast_or_null<MDString>(MDN->getOperand(2)))
	        Info.Name = NameNode->getString();
	      if (mdconst::extract<ConstantInt>(MDN->getOperand(3))->isOne())
	        Info.IsDynInit = true;
	      if (mdconst::extract<ConstantInt>(MDN->getOperand(4))->isOne())
	        Info.IsBlacklisted = true;
	    }
	  }

	  /// Returns a copy of the entry recorded for \p G, or a default-constructed
	  /// Entry when nothing was recorded.
	  Entry get(GlobalVariable *G) const {
	    auto Pos = Entries.find(G);
	    if (Pos == Entries.end())
	      return Entry();
	    return Pos->second;
	  }

	private:
	  bool inited_;
	  DenseMap<GlobalVariable *, Entry> Entries;
	};

	/// This struct defines the shadow mapping using the rule:
	/// shadow = (mem >> Scale) ADD-or-OR Offset.
	struct ShadowMapping {
	// Right-shift amount applied to the application address.
	int Scale;
	// Constant combined with the shifted address; may hold
	// kDynamicShadowSentinel (see getShadowMapping).
	uint64_t Offset;
	// True when the offset is OR-ed in, false when it is added
	// (memToShadow picks CreateOr vs. CreateAdd from this).
	bool OrShadowOffset;
	};

	/// Computes the shadow mapping (scale, offset, OR-vs-ADD) for a target.
	///
	/// \p TargetTriple selects the default offset by OS/architecture.
	/// \p LongSize chooses the 32-bit table when equal to 32; any other value
	/// takes the 64-bit branch. \p IsKasan only matters for Linux/x86_64, where
	/// it selects the kernel offset. Command-line flags are applied afterwards
	/// and override the defaults.
	static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
	bool IsKasan) {
	bool IsAndroid = TargetTriple.isAndroid();
	bool IsIOS = TargetTriple.isiOS() \|\| TargetTriple.isWatchOS();
	bool IsFreeBSD = TargetTriple.isOSFreeBSD();
	bool IsPS4CPU = TargetTriple.isPS4CPU();
	bool IsLinux = TargetTriple.isOSLinux();
	bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 \|\|
	TargetTriple.getArch() == llvm::Triple::ppc64le;
	bool IsSystemZ = TargetTriple.getArch() == llvm::Triple::systemz;
	bool IsX86 = TargetTriple.getArch() == llvm::Triple::x86;
	bool IsX86_64 = TargetTriple.getArch() == llvm::Triple::x86_64;
	bool IsMIPS32 = TargetTriple.getArch() == llvm::Triple::mips \|\|
	TargetTriple.getArch() == llvm::Triple::mipsel;
	bool IsMIPS64 = TargetTriple.getArch() == llvm::Triple::mips64 \|\|
	TargetTriple.getArch() == llvm::Triple::mips64el;
	bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64;
	bool IsWindows = TargetTriple.isOSWindows();
	bool IsFuchsia = TargetTriple.isOSFuchsia();

	ShadowMapping Mapping;

	// The chains below are order-sensitive: the first matching platform wins.
	if (LongSize == 32) {
	// Android is always PIE, which means that the beginning of the address
	// space is always available.
	if (IsAndroid)
	Mapping.Offset = 0;
	else if (IsMIPS32)
	Mapping.Offset = kMIPS32_ShadowOffset32;
	else if (IsFreeBSD)
	Mapping.Offset = kFreeBSD_ShadowOffset32;
	else if (IsIOS)
	// If we're targeting iOS and x86, the binary is built for iOS simulator.
	Mapping.Offset = IsX86 ? kIOSSimShadowOffset32 : kIOSShadowOffset32;
	else if (IsWindows)
	Mapping.Offset = kWindowsShadowOffset32;
	else
	Mapping.Offset = kDefaultShadowOffset32;
	} else { // LongSize == 64
	// Fuchsia is always PIE, which means that the beginning of the address
	// space is always available.
	if (IsFuchsia)
	Mapping.Offset = 0;
	else if (IsPPC64)
	Mapping.Offset = kPPC64_ShadowOffset64;
	else if (IsSystemZ)
	Mapping.Offset = kSystemZ_ShadowOffset64;
	else if (IsFreeBSD)
	Mapping.Offset = kFreeBSD_ShadowOffset64;
	else if (IsPS4CPU)
	Mapping.Offset = kPS4CPU_ShadowOffset64;
	else if (IsLinux && IsX86_64) {
	if (IsKasan)
	Mapping.Offset = kLinuxKasan_ShadowOffset64;
	else
	Mapping.Offset = kSmallX86_64ShadowOffset;
	} else if (IsWindows && IsX86_64) {
	Mapping.Offset = kWindowsShadowOffset64;
	} else if (IsMIPS64)
	Mapping.Offset = kMIPS64_ShadowOffset64;
	else if (IsIOS)
	// If we're targeting iOS and x86, the binary is built for iOS simulator.
	// We are using dynamic shadow offset on the 64-bit devices.
	Mapping.Offset =
	IsX86_64 ? kIOSSimShadowOffset64 : kDynamicShadowSentinel;
	else if (IsAArch64)
	Mapping.Offset = kAArch64_ShadowOffset64;
	else
	Mapping.Offset = kDefaultShadowOffset64;
	}

	// -asan-force-dynamic-shadow replaces whatever default was chosen above.
	if (ClForceDynamicShadow) {
	Mapping.Offset = kDynamicShadowSentinel;
	}

	// Explicit -asan-mapping-scale / -asan-mapping-offset win over everything,
	// including the forced dynamic shadow above.
	Mapping.Scale = kDefaultShadowScale;
	if (ClMappingScale.getNumOccurrences() > 0) {
	Mapping.Scale = ClMappingScale;
	}

	if (ClMappingOffset.getNumOccurrences() > 0) {
	Mapping.Offset = ClMappingOffset;
	}

	// OR-ing the shadow offset is more efficient (at least on x86) if the
	// offset is a power of two, but on ppc64 we have to use add since the
	// shadow offset is not necessarily 1/8-th of the address space. On SystemZ,
	// we could OR the constant in a single instruction, but it's more
	// efficient to load it once and use indexed addressing.
	// Note that the power-of-two test below also accepts an offset of zero.
	Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
	!(Mapping.Offset & (Mapping.Offset - 1)) &&
	Mapping.Offset != kDynamicShadowSentinel;

	return Mapping;
	}

	/// Returns the redzone size, in bytes, used for stack and globals at the
	/// given shadow scale: 2^MappingScale, but never less than 32. Scales 6 and
	/// 7 therefore yield 64 and 128 bytes respectively.
	static size_t RedzoneSizeForScale(int MappingScale) {
	  const unsigned MinRedzone = 32U;
	  const unsigned GranuleSize = 1U << MappingScale;
	  return GranuleSize > MinRedzone ? GranuleSize : MinRedzone;
	}

	/// AddressSanitizer: instrument the code in module to find memory bugs.
	///
	/// Function pass; requires the dominator tree and target library info.
	/// NOTE(review): several declarations below spell Value / Instruction /
	/// Function without a '*' (e.g. isInterestingMemoryAccess, memToShadow,
	/// AsanMemmove), while the out-of-line definitions dereference those
	/// parameters and return nullptr. The pointer declarators look lost in this
	/// copy of the file -- confirm against the upstream source.
	struct AddressSanitizer : public FunctionPass {
	// Each constructor flag is OR-ed with its command-line counterpart, so the
	// command line can enable a feature but never disable one requested here.
	explicit AddressSanitizer(bool CompileKernel = false, bool Recover = false,
	bool UseAfterScope = false)
	: FunctionPass(ID), CompileKernel(CompileKernel \|\| ClEnableKasan),
	Recover(Recover \|\| ClRecover),
	UseAfterScope(UseAfterScope \|\| ClUseAfterScope),
	LocalDynamicShadow(nullptr) {
	initializeAddressSanitizerPass(*PassRegistry::getPassRegistry());
	}
	StringRef getPassName() const override {
	return "AddressSanitizerFunctionPass";
	}
	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<DominatorTreeWrapperPass>();
	AU.addRequired<TargetLibraryInfoWrapperPass>();
	}
	/// Returns the alloc size of the allocated type multiplied by the array
	/// size (1 for non-array allocas). Asserts that an array size is a
	/// ConstantInt, so this must not be called on variable-sized allocas.
	uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
	uint64_t ArraySize = 1;
	if (AI.isArrayAllocation()) {
	const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
	assert(CI && "non-constant array size");
	ArraySize = CI->getZExtValue();
	}
	Type *Ty = AI.getAllocatedType();
	uint64_t SizeInBytes =
	AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
	return SizeInBytes * ArraySize;
	}
	/// Check if we want (and can) handle this alloca.
	bool isInterestingAlloca(const AllocaInst &AI);

	/// If it is an interesting memory access, return the PointerOperand
	/// and set IsWrite/Alignment. Otherwise return nullptr.
	/// MaybeMask is an output parameter for the mask Value, if we're looking at a
	/// masked load/store.
	Value isInterestingMemoryAccess(Instruction I, bool *IsWrite,
	uint64_t TypeSize, unsigned Alignment,
	Value **MaybeMask = nullptr);
	void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I,
	bool UseCalls, const DataLayout &DL);
	void instrumentPointerComparisonOrSubtraction(Instruction *I);
	void instrumentAddress(Instruction OrigIns, Instruction InsertBefore,
	Value *Addr, uint32_t TypeSize, bool IsWrite,
	Value *SizeArgument, bool UseCalls, uint32_t Exp);
	void instrumentUnusualSizeOrAlignment(Instruction *I,
	Instruction InsertBefore, Value Addr,
	uint32_t TypeSize, bool IsWrite,
	Value *SizeArgument, bool UseCalls,
	uint32_t Exp);
	Value createSlowPathCmp(IRBuilder<> &IRB, Value AddrLong,
	Value *ShadowValue, uint32_t TypeSize);
	Instruction generateCrashCode(Instruction InsertBefore, Value *Addr,
	bool IsWrite, size_t AccessSizeIndex,
	Value *SizeArgument, uint32_t Exp);
	void instrumentMemIntrinsic(MemIntrinsic *MI);
	Value memToShadow(Value Shadow, IRBuilder<> &IRB);
	bool runOnFunction(Function &F) override;
	bool maybeInsertAsanInitAtFunctionEntry(Function &F);
	void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
	void markEscapedLocalAllocas(Function &F);
	bool doInitialization(Module &M) override;
	bool doFinalization(Module &M) override;
	static char ID; // Pass identification, replacement for typeid

	// Dereferences DT without a null check.
	DominatorTree &getDominatorTree() const { return *DT; }

	private:
	void initializeCallbacks(Module &M);

	bool LooksLikeCodeInBug11395(Instruction *I);
	bool GlobalIsLinkerInitialized(GlobalVariable *G);
	bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
	uint64_t TypeSize) const;

	/// Helper to cleanup per-function state.
	/// On construction it asserts that ProcessedAllocas is empty and
	/// LocalDynamicShadow is null; on destruction it resets both.
	struct FunctionStateRAII {
	AddressSanitizer *Pass;
	FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
	assert(Pass->ProcessedAllocas.empty() &&
	"last pass forgot to clear cache");
	assert(!Pass->LocalDynamicShadow);
	}
	~FunctionStateRAII() {
	Pass->LocalDynamicShadow = nullptr;
	Pass->ProcessedAllocas.clear();
	}
	};

	LLVMContext *C;
	Triple TargetTriple;
	int LongSize;
	bool CompileKernel;
	bool Recover;
	bool UseAfterScope;
	Type *IntptrTy;
	ShadowMapping Mapping;
	DominatorTree *DT;
	Function *AsanHandleNoReturnFunc;
	Function AsanPtrCmpFunction, AsanPtrSubFunction;
	// This array is indexed by AccessIsWrite, Experiment and log2(AccessSize).
	Function *AsanErrorCallback[2][2][kNumberOfAccessSizes];
	Function *AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
	// This array is indexed by AccessIsWrite and Experiment.
	Function *AsanErrorCallbackSized[2][2];
	Function *AsanMemoryAccessCallbackSized[2][2];
	Function AsanMemmove, AsanMemcpy, *AsanMemset;
	InlineAsm *EmptyAsm;
	// Per-function shadow base; when non-null, memToShadow uses it instead of
	// the constant Mapping.Offset. Reset by FunctionStateRAII.
	Value *LocalDynamicShadow;
	GlobalsMetadata GlobalsMD;
	// Cache of isInterestingAlloca() verdicts; cleared by FunctionStateRAII.
	DenseMap<const AllocaInst *, bool> ProcessedAllocas;

	friend struct FunctionStackPoisoner;
	};

	/// Module-level half of AddressSanitizer (pass name
	/// "AddressSanitizerModule"); its private interface deals with globals
	/// instrumentation and the module ctor/dtor.
	/// NOTE(review): CreateMetadataGlobal and SetComdatForGlobalMetadata spell
	/// GlobalVariable / Constant without '*'; the pointer declarators look lost
	/// in this copy of the file -- confirm against the upstream source.
	class AddressSanitizerModule : public ModulePass {
	public:
	// CompileKernel and Recover are OR-ed with their command-line flags;
	// UseGlobalsGC is AND-ed with its flag, so either side can turn it off.
	explicit AddressSanitizerModule(bool CompileKernel = false,
	bool Recover = false,
	bool UseGlobalsGC = true)
	: ModulePass(ID), CompileKernel(CompileKernel \|\| ClEnableKasan),
	Recover(Recover \|\| ClRecover),
	UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
	// Not a typo: ClWithComdat is almost completely pointless without
	// ClUseGlobalsGC (because then it only works on modules without
	// globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
	// and both suffer from gold PR19002 for which UseGlobalsGC constructor
	// argument is designed as workaround. Therefore, disable both
	// ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
	// do globals-gc.
	UseCtorComdat(UseGlobalsGC && ClWithComdat) {}
	bool runOnModule(Module &M) override;
	static char ID; // Pass identification, replacement for typeid
	StringRef getPassName() const override { return "AddressSanitizerModule"; }

	private:
	void initializeCallbacks(Module &M);

	bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
	// One variant per object format (COFF / ELF / Mach-O), plus a variant
	// using a metadata array, judging by the names.
	void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
	ArrayRef<GlobalVariable *> ExtendedGlobals,
	ArrayRef<Constant *> MetadataInitializers);
	void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
	ArrayRef<GlobalVariable *> ExtendedGlobals,
	ArrayRef<Constant *> MetadataInitializers,
	const std::string &UniqueModuleId);
	void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
	ArrayRef<GlobalVariable *> ExtendedGlobals,
	ArrayRef<Constant *> MetadataInitializers);
	void
	InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
	ArrayRef<GlobalVariable *> ExtendedGlobals,
	ArrayRef<Constant *> MetadataInitializers);

	GlobalVariable CreateMetadataGlobal(Module &M, Constant Initializer,
	StringRef OriginalName);
	void SetComdatForGlobalMetadata(GlobalVariable G, GlobalVariable Metadata,
	StringRef InternalSuffix);
	IRBuilder<> CreateAsanModuleDtor(Module &M);

	bool ShouldInstrumentGlobal(GlobalVariable *G);
	bool ShouldUseMachOGlobalsSection() const;
	StringRef getGlobalMetadataSection() const;
	void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
	void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
	// Minimum redzone for a global: RedzoneSizeForScale at the current scale.
	size_t MinRedzoneSizeForGlobal() const {
	return RedzoneSizeForScale(Mapping.Scale);
	}

	GlobalsMetadata GlobalsMD;
	bool CompileKernel;
	bool Recover;
	bool UseGlobalsGC;
	bool UseCtorComdat;
	Type *IntptrTy;
	LLVMContext *C;
	Triple TargetTriple;
	ShadowMapping Mapping;
	Function *AsanPoisonGlobals;
	Function *AsanUnpoisonGlobals;
	Function *AsanRegisterGlobals;
	Function *AsanUnregisterGlobals;
	Function *AsanRegisterImageGlobals;
	Function *AsanUnregisterImageGlobals;
	Function *AsanRegisterElfGlobals;
	Function *AsanUnregisterElfGlobals;

	// Only these two have in-class initializers; the pointers above are left
	// uninitialized by the constructor.
	Function *AsanCtorFunction = nullptr;
	Function *AsanDtorFunction = nullptr;
	};

	// Stack poisoning does not play well with exception handling.
	// When an exception is thrown, we essentially bypass the code
	// that unpoisons the stack. This is why the run-time library has
	// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
	// stack in the interceptor. This however does not work inside the
	// actual function which catches the exception. Most likely because the
	// compiler hoists the load of the shadow value somewhere too high.
	// This causes asan to report a non-existing bug on 453.povray.
	// It sounds like an LLVM bug.
	//
	// The visitor below walks one function, collects its allocas, exits,
	// stackrestore/localescape/lifetime intrinsics and call-site properties,
	// and then hands them to processDynamicAllocas()/processStaticAllocas().
	// NOTE(review): several declarations in this struct spell Value / Function /
	// AllocaInst / BasicBlock without '*' although they are used as pointers;
	// the declarators look lost in this copy -- confirm against upstream.
	struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
	Function &F;
	AddressSanitizer &ASan;
	DIBuilder DIB;
	LLVMContext *C;
	Type *IntptrTy;
	Type *IntptrPtrTy;
	ShadowMapping Mapping;

	SmallVector<AllocaInst *, 16> AllocaVec;
	SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
	// Function exits: returns, resumes and cleanup returns.
	SmallVector<Instruction *, 8> RetVec;
	unsigned StackAlignment;

	Function *AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
	*AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
	Function *AsanSetShadowFunc[0x100] = {};
	Function AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc;
	Function AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc;

	// Stores a place and arguments of poisoning/unpoisoning call for alloca.
	struct AllocaPoisonCall {
	IntrinsicInst *InsBefore;
	AllocaInst *AI;
	uint64_t Size;
	bool DoPoison;
	};
	SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
	SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;

	SmallVector<AllocaInst *, 1> DynamicAllocaVec;
	SmallVector<IntrinsicInst *, 1> StackRestoreVec;
	AllocaInst *DynamicAllocaLayout = nullptr;
	IntrinsicInst *LocalEscapeCall = nullptr;

	// Maps Value to an AllocaInst from which the Value is originated.
	typedef DenseMap<Value , AllocaInst > AllocaForValueMapTy;
	AllocaForValueMapTy AllocaForValue;

	bool HasNonEmptyInlineAsm = false;
	bool HasReturnsTwiceCall = false;
	// Detached call to ASan.EmptyAsm, used only as a comparison template in
	// visitCallSite.
	std::unique_ptr<CallInst> EmptyInlineAsm;

	FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
	: F(F),
	ASan(ASan),
	DIB(F.getParent(), /*AllowUnresolved*/ false),
	C(ASan.C),
	IntptrTy(ASan.IntptrTy),
	IntptrPtrTy(PointerType::get(IntptrTy, 0)),
	Mapping(ASan.Mapping),
	StackAlignment(1 << Mapping.Scale),
	EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {}

	/// Runs the stack instrumentation for F. Returns false when stack handling
	/// is disabled or no interesting alloca was found, true otherwise.
	bool runOnFunction() {
	if (!ClStack) return false;

	- if (ClRedzoneByvalArgs) copyArgsPassedByValToAllocas();
	+ if (ClRedzoneByvalArgs && Mapping.Offset != kDynamicShadowSentinel)
	+ copyArgsPassedByValToAllocas();

	// Collect alloca, ret, lifetime instructions etc.
	for (BasicBlock BB : depth_first(&F.getEntryBlock())) visit(BB);

	if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;

	initializeCallbacks(*F.getParent());

	processDynamicAllocas();
	processStaticAllocas();

	if (ClDebugStack) {
	DEBUG(dbgs() << F);
	}
	return true;
	}

	// Arguments marked with the "byval" attribute are implicitly copied without
	// using an alloca instruction. To produce redzones for those arguments, we
	// copy them a second time into memory allocated with an alloca instruction.
	void copyArgsPassedByValToAllocas();

	// Finds all Alloca instructions and puts
	// poisoned red zones around all of them.
	// Then unpoison everything back before the function returns.
	void processStaticAllocas();
	void processDynamicAllocas();

	void createDynamicAllocasInitStorage();

	// ----------------------- Visitors.
	/// \brief Collect all Ret instructions.
	void visitReturnInst(ReturnInst &RI) { RetVec.push_back(&RI); }

	/// \brief Collect all Resume instructions.
	void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }

	/// \brief Collect all CleanupReturnInst instructions.
	void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }

	/// Emits, before \p InstBefore, a call to AsanAllocasUnpoisonFunc with the
	/// loaded DynamicAllocaLayout and an address derived from \p SavedStack.
	void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
	Value *SavedStack) {
	IRBuilder<> IRB(InstBefore);
	Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy);
	// When we insert _asan_allocas_unpoison before @llvm.stackrestore, we
	// need to adjust extracted SP to compute the address of the most recent
	// alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
	// this purpose.
	if (!isa<ReturnInst>(InstBefore)) {
	Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration(
	InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
	{IntptrTy});

	Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});

	DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
	DynamicAreaOffset);
	}

	IRB.CreateCall(AsanAllocasUnpoisonFunc,
	{IRB.CreateLoad(DynamicAllocaLayout), DynamicAreaPtr});
	}

	// Unpoison dynamic allocas redzones.
	void unpoisonDynamicAllocas() {
	for (auto &Ret : RetVec)
	unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout);

	for (auto &StackRestoreInst : StackRestoreVec)
	unpoisonDynamicAllocasBeforeInst(StackRestoreInst,
	StackRestoreInst->getOperand(0));
	}

	// Deploy and poison redzones around dynamic alloca call. To do this, we
	// should replace this call with another one with changed parameters and
	// replace all its uses with new address, so
	// addr = alloca type, old_size, align
	// is replaced by
	// new_size = (old_size + additional_size) * sizeof(type)
	// tmp = alloca i8, new_size, max(align, 32)
	// addr = tmp + 32 (first 32 bytes are for the left redzone).
	// Additional_size is added to make new memory allocation contain not only
	// requested memory, but also left, partial and right redzones.
	void handleDynamicAllocaCall(AllocaInst *AI);

	/// \brief Collect Alloca instructions we want (and can) handle.
	void visitAllocaInst(AllocaInst &AI) {
	if (!ASan.isInterestingAlloca(AI)) {
	if (AI.isStaticAlloca()) {
	// Skip over allocas that are present before the first instrumented
	// alloca, we don't want to move those around.
	if (AllocaVec.empty())
	return;

	StaticAllocasToMoveUp.push_back(&AI);
	}
	return;
	}

	StackAlignment = std::max(StackAlignment, AI.getAlignment());
	if (!AI.isStaticAlloca())
	DynamicAllocaVec.push_back(&AI);
	else
	AllocaVec.push_back(&AI);
	}

	/// \brief Collect lifetime intrinsic calls to check for use-after-scope
	/// errors.
	/// Also records stackrestore and localescape intrinsics, regardless of
	/// whether use-after-scope checking is enabled.
	void visitIntrinsicInst(IntrinsicInst &II) {
	Intrinsic::ID ID = II.getIntrinsicID();
	if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
	if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
	if (!ASan.UseAfterScope)
	return;
	if (ID != Intrinsic::lifetime_start && ID != Intrinsic::lifetime_end)
	return;
	// Found lifetime intrinsic, add ASan instrumentation if necessary.
	ConstantInt *Size = dyn_cast<ConstantInt>(II.getArgOperand(0));
	// dyn_cast yields null when the size operand is not a ConstantInt; there
	// is no known size to poison in that case, so skip the intrinsic instead
	// of dereferencing a null pointer.
	if (!Size) return;
	// If size argument is undefined, don't do anything.
	if (Size->isMinusOne()) return;
	// Check that size doesn't saturate uint64_t and can
	// be stored in IntptrTy.
	const uint64_t SizeValue = Size->getValue().getLimitedValue();
	if (SizeValue == ~0ULL \|\|
	!ConstantInt::isValueValidForType(IntptrTy, SizeValue))
	return;
	// Find alloca instruction that corresponds to llvm.lifetime argument.
	AllocaInst *AI = findAllocaForValue(II.getArgOperand(1));
	if (!AI \|\| !ASan.isInterestingAlloca(*AI))
	return;
	// lifetime.end poisons the alloca, lifetime.start unpoisons it.
	bool DoPoison = (ID == Intrinsic::lifetime_end);
	AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
	if (AI->isStaticAlloca())
	StaticAllocaPoisonCallVec.push_back(APC);
	else if (ClInstrumentDynamicAllocas)
	DynamicAllocaPoisonCallVec.push_back(APC);
	}

	/// Records whether the function contains inline asm other than the empty
	/// template, and whether it contains a call that can return twice.
	void visitCallSite(CallSite CS) {
	Instruction *I = CS.getInstruction();
	if (CallInst *CI = dyn_cast<CallInst>(I)) {
	HasNonEmptyInlineAsm \|=
	CI->isInlineAsm() && !CI->isIdenticalTo(EmptyInlineAsm.get());
	HasReturnsTwiceCall \|= CI->canReturnTwice();
	}
	}

	// ---------------------- Helpers.
	void initializeCallbacks(Module &M);

	/// True when \p I dominates every instruction collected in RetVec.
	bool doesDominateAllExits(const Instruction *I) const {
	for (auto Ret : RetVec) {
	if (!ASan.getDominatorTree().dominates(I, Ret)) return false;
	}
	return true;
	}

	/// Finds alloca where the value comes from.
	AllocaInst findAllocaForValue(Value V);

	// Copies bytes from ShadowBytes into shadow memory for indexes where
	// ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
	// ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
	void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
	IRBuilder<> &IRB, Value *ShadowBase);
	void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
	size_t Begin, size_t End, IRBuilder<> &IRB,
	Value *ShadowBase);
	void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
	ArrayRef<uint8_t> ShadowBytes, size_t Begin,
	size_t End, IRBuilder<> &IRB, Value *ShadowBase);

	void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);

	Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
	bool Dynamic);
	PHINode createPHI(IRBuilder<> &IRB, Value Cond, Value *ValueIfTrue,
	Instruction ThenTerm, Value ValueIfFalse);
	};

	} // anonymous namespace

	// Pass identity and registration for the function pass under the name
	// "asan", declaring its dependencies on the dominator tree and target
	// library info (matching getAnalysisUsage).
	char AddressSanitizer::ID = 0;
	INITIALIZE_PASS_BEGIN(
	AddressSanitizer, "asan",
	"AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
	false)
	INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_END(
	AddressSanitizer, "asan",
	"AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
	false)
	/// Factory for the AddressSanitizer function pass. The returned pass is
	/// heap-allocated with `new`; the caller receives the raw pointer.
	/// Asserts that \p CompileKernel is only requested together with \p Recover.
	FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
	bool Recover,
	bool UseAfterScope) {
	assert(!CompileKernel \|\| Recover);
	return new AddressSanitizer(CompileKernel, Recover, UseAfterScope);
	}

	// Pass identity and registration for the module pass under the name
	// "asan-module".
	// NOTE(review): the two adjacent string literals concatenate without a
	// separator, so the description ends in "bugs.ModulePass" -- confirm whether
	// a space was intended.
	char AddressSanitizerModule::ID = 0;
	INITIALIZE_PASS(
	AddressSanitizerModule, "asan-module",
	"AddressSanitizer: detects use-after-free and out-of-bounds bugs."
	"ModulePass",
	false, false)
	/// Factory for the AddressSanitizer module pass. The returned pass is
	/// heap-allocated with `new`; the caller receives the raw pointer.
	/// Asserts that \p CompileKernel is only requested together with \p Recover.
	ModulePass *llvm::createAddressSanitizerModulePass(bool CompileKernel,
	bool Recover,
	bool UseGlobalsGC) {
	assert(!CompileKernel \|\| Recover);
	return new AddressSanitizerModule(CompileKernel, Recover, UseGlobalsGC);
	}

	/// Converts an access size given in bits into an index: the number of
	/// trailing zero bits of the size in bytes. Asserts that the index is below
	/// kNumberOfAccessSizes.
	static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
	  const uint32_t SizeInBytes = TypeSize / 8;
	  size_t Res = countTrailingZeros(SizeInBytes);
	  assert(Res < kNumberOfAccessSizes);
	  return Res;
	}

	/// \brief Emit \p Str into \p M as a private constant global so that it can
	/// be handed to the run-time library.
	///
	/// The global is named with kAsanGenPrefix. When \p AllowMerging is set it
	/// is additionally marked unnamed_addr so identical strings can be merged.
	static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str,
	                                                    bool AllowMerging) {
	  Constant *Init = ConstantDataArray::getString(M.getContext(), Str);
	  // Module-local strings get private linkage.
	  GlobalVariable *StrGV = new GlobalVariable(
	      M, Init->getType(), true, GlobalValue::PrivateLinkage, Init,
	      kAsanGenPrefix);
	  if (AllowMerging) {
	    StrGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
	  }
	  // Strings may not be merged w/o setting align 1.
	  StrGV->setAlignment(1);
	  return StrGV;
	}

	/// \brief Emit a private constant global holding the source location \p MD
	/// as an anonymous struct { file-name global, i32 line, i32 column }.
	static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
	                                                       LocationMetadata MD) {
	  // The file-name global is created first, before the integer constants.
	  Constant *FileName = createPrivateGlobalForString(M, MD.Filename, true);
	  Constant *Line =
	      ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo);
	  Constant *Column =
	      ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo);
	  Constant *Fields[] = {FileName, Line, Column};
	  auto Packed = ConstantStruct::getAnon(Fields);
	  auto LocGV =
	      new GlobalVariable(M, Packed->getType(), true,
	                         GlobalValue::PrivateLinkage, Packed, kAsanGenPrefix);
	  LocGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
	  return LocGV;
	}

	/// \brief Tell whether \p G was emitted by a trusted compiler pass rather
	/// than by user code, judging by its name alone.
	///
	/// Returns true for names starting with kAsanGenPrefix, kSanCovGenPrefix or
	/// kODRGenPrefix, and for the gcov counter array "__llvm_gcov_ctr".
	static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
	  // Do not instrument asan globals (nor sancov / ODR-indicator ones).
	  if (G->getName().startswith(kAsanGenPrefix))
	    return true;
	  if (G->getName().startswith(kSanCovGenPrefix))
	    return true;
	  if (G->getName().startswith(kODRGenPrefix))
	    return true;

	  // Do not instrument gcov counter arrays.
	  return G->getName() == "__llvm_gcov_ctr";
	}

	/// Emits IR through \p IRB that turns an application address into its shadow
	/// address: the address is shifted right by Mapping.Scale and, unless
	/// Mapping.Offset is zero, combined with the shadow base.
	/// NOTE(review): the body returns IRBuilder results and assigns pointers to
	/// ShadowBase, so the return type and \p Shadow are presumably `Value *`;
	/// the '*' appears to be missing in this copy -- confirm against upstream.
	Value AddressSanitizer::memToShadow(Value Shadow, IRBuilder<> &IRB) {
	// Shadow >> scale
	Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
	if (Mapping.Offset == 0) return Shadow;
	// (Shadow >> scale) | offset, or + offset when OR-ing is not enabled.
	// The base is the per-function LocalDynamicShadow when one is set, and the
	// constant Mapping.Offset otherwise.
	Value *ShadowBase;
	if (LocalDynamicShadow)
	ShadowBase = LocalDynamicShadow;
	else
	ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
	if (Mapping.OrShadowOffset)
	return IRB.CreateOr(Shadow, ShadowBase);
	else
	return IRB.CreateAdd(Shadow, ShadowBase);
	}

	/// Instrument memset/memmove/memcpy: insert, right before \p MI, a call to
	/// AsanMemmove / AsanMemcpy / AsanMemset with the intrinsic's first three
	/// operands (pointers cast to i8*, length cast to IntptrTy, memset value cast
	/// to i32), then erase \p MI itself.
	void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
	  IRBuilder<> Builder(MI);
	  if (isa<MemTransferInst>(MI)) {
	    // Casts are created in operand order: destination, source, length.
	    Value *Dst =
	        Builder.CreatePointerCast(MI->getOperand(0), Builder.getInt8PtrTy());
	    Value *Src =
	        Builder.CreatePointerCast(MI->getOperand(1), Builder.getInt8PtrTy());
	    Value *Len = Builder.CreateIntCast(MI->getOperand(2), IntptrTy, false);
	    Builder.CreateCall(isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
	                       {Dst, Src, Len});
	  } else if (isa<MemSetInst>(MI)) {
	    // Casts are created in operand order: destination, fill value, length.
	    Value *Dst =
	        Builder.CreatePointerCast(MI->getOperand(0), Builder.getInt8PtrTy());
	    Value *Fill =
	        Builder.CreateIntCast(MI->getOperand(1), Builder.getInt32Ty(), false);
	    Value *Len = Builder.CreateIntCast(MI->getOperand(2), IntptrTy, false);
	    Builder.CreateCall(AsanMemset, {Dst, Fill, Len});
	  }
	  // The intrinsic is removed even if neither branch above matched.
	  MI->eraseFromParent();
	}

	/// Decide whether this alloca should (and can) be instrumented. The verdict
	/// is cached per alloca in ProcessedAllocas and reused on later queries.
	bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
	  auto Cached = ProcessedAllocas.find(&AI);
	  if (Cached != ProcessedAllocas.end())
	    return Cached->getSecond();

	  bool IsInteresting = true;
	  if (!AI.getAllocatedType()->isSized()) {
	    IsInteresting = false;
	  } else if (AI.isStaticAlloca() && getAllocaSizeInBytes(AI) == 0) {
	    // alloca() may be called with 0 size, ignore it. The size is only
	    // computed for static allocas.
	    IsInteresting = false;
	  } else if (ClSkipPromotableAllocas && isAllocaPromotable(&AI)) {
	    // We are only interested in allocas not promotable to registers.
	    // Promotable allocas are common under -O0.
	    IsInteresting = false;
	  } else if (AI.isUsedWithInAlloca()) {
	    // inalloca allocas are not treated as static, and we don't want
	    // dynamic alloca instrumentation for them as well.
	    IsInteresting = false;
	  } else if (AI.isSwiftError()) {
	    // swifterror allocas are register promoted by ISel.
	    IsInteresting = false;
	  }

	  ProcessedAllocas[&AI] = IsInteresting;
	  return IsInteresting;
	}

	/// Decide whether \p I is a memory access that should be instrumented.
	///
	/// Loads, stores, atomic RMW / cmpxchg instructions and calls to the
	/// llvm.masked.load.* / llvm.masked.store.* intrinsics are recognized, each
	/// gated by the matching ClInstrumentReads / ClInstrumentWrites /
	/// ClInstrumentAtomics option.
	///
	/// \param I          instruction to classify.
	/// \param IsWrite    [out] set to true when the access writes memory.
	/// \param TypeSize   [out] store size of the accessed type, in bits.
	/// \param Alignment  [out] alignment of the access; 0 for atomics, 1 for a
	///                   masked intrinsic whose alignment operand is not a
	///                   ConstantInt.
	/// \param MaybeMask  [out, may be null] receives the mask operand of a masked
	///                   load/store; not written for any other kind of access.
	/// \returns the pointer operand of the access, or nullptr when the
	///          instruction must not be instrumented. The out-parameters are only
	///          written once a supported instruction kind has been matched.
	Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
	                                                   bool *IsWrite,
	                                                   uint64_t *TypeSize,
	                                                   unsigned *Alignment,
	                                                   Value **MaybeMask) {
	  // Skip memory accesses inserted by another instrumentation.
	  if (I->getMetadata("nosanitize")) return nullptr;

	  // Do not instrument the load fetching the dynamic shadow address.
	  if (LocalDynamicShadow == I)
	    return nullptr;

	  Value *PtrOperand = nullptr;
	  const DataLayout &DL = I->getModule()->getDataLayout();
	  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
	    if (!ClInstrumentReads) return nullptr;
	    *IsWrite = false;
	    *TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
	    *Alignment = LI->getAlignment();
	    PtrOperand = LI->getPointerOperand();
	  } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
	    if (!ClInstrumentWrites) return nullptr;
	    *IsWrite = true;
	    *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
	    *Alignment = SI->getAlignment();
	    PtrOperand = SI->getPointerOperand();
	  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
	    if (!ClInstrumentAtomics) return nullptr;
	    *IsWrite = true;
	    *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
	    *Alignment = 0;
	    PtrOperand = RMW->getPointerOperand();
	  } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
	    if (!ClInstrumentAtomics) return nullptr;
	    *IsWrite = true;
	    *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
	    *Alignment = 0;
	    PtrOperand = XCHG->getPointerOperand();
	  } else if (auto CI = dyn_cast<CallInst>(I)) {
	    auto *F = dyn_cast<Function>(CI->getCalledValue());
	    if (F && (F->getName().startswith("llvm.masked.load.") ||
	              F->getName().startswith("llvm.masked.store."))) {
	      unsigned OpOffset = 0;
	      if (F->getName().startswith("llvm.masked.store.")) {
	        if (!ClInstrumentWrites)
	          return nullptr;
	        // Masked store has an initial operand for the value.
	        OpOffset = 1;
	        *IsWrite = true;
	      } else {
	        if (!ClInstrumentReads)
	          return nullptr;
	        *IsWrite = false;
	      }

	      auto BasePtr = CI->getOperand(0 + OpOffset);
	      auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
	      *TypeSize = DL.getTypeStoreSizeInBits(Ty);
	      if (auto AlignmentConstant =
	              dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
	        *Alignment = (unsigned)AlignmentConstant->getZExtValue();
	      else
	        *Alignment = 1; // No alignment guarantees. We probably got Undef
	      if (MaybeMask)
	        *MaybeMask = CI->getOperand(2 + OpOffset);
	      PtrOperand = BasePtr;
	    }
	  }

	  if (PtrOperand) {
	    // Do not instrument accesses from different address spaces; we cannot deal
	    // with them.
	    Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
	    if (PtrTy->getPointerAddressSpace() != 0)
	      return nullptr;

	    // Ignore swifterror addresses.
	    // swifterror memory addresses are mem2reg promoted by instruction
	    // selection. As such they cannot have regular uses like an instrumentation
	    // function and it makes no sense to track them as memory.
	    if (PtrOperand->isSwiftError())
	      return nullptr;
	  }

	  // Treat memory accesses to promotable allocas as non-interesting since they
	  // will not cause memory violations. This greatly speeds up the instrumented
	  // executable at -O0.
	  if (ClSkipPromotableAllocas)
	    if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand))
	      return isInterestingAlloca(*AI) ? AI : nullptr;

	  return PtrOperand;
	}

	/// Returns true if \p V has pointer type or is a ptrtoint instruction.
	static bool isPointerOperand(Value *V) {
	  return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
	}

	// Heuristic only: it can report both false positives and false negatives.
	// Getting this right would need help from the frontend.
	//
	// An instruction qualifies when it is a relational integer compare or an
	// integer subtraction and both of its operands look like pointers.
	static bool isInterestingPointerComparisonOrSubtraction(Instruction *I) {
	  bool IsCandidate = false;
	  if (auto *Compare = dyn_cast<ICmpInst>(I))
	    IsCandidate = Compare->isRelational();
	  else if (auto *BinOp = dyn_cast<BinaryOperator>(I))
	    IsCandidate = BinOp->getOpcode() == Instruction::Sub;

	  if (!IsCandidate)
	    return false;

	  if (!isPointerOperand(I->getOperand(0)))
	    return false;
	  return isPointerOperand(I->getOperand(1));
	}

	/// Returns true when \p G has an initializer that is not marked as dynamic
	/// in the globals metadata.
	bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
	  // A global without any initializer is assumed to be dynamically
	  // initialized in another TU.
	  if (!G->hasInitializer())
	    return false;

	  // Globals without dynamic initialization need no instrumentation.
	  return !GlobalsMD.get(G).IsDynInit;
	}

	/// Insert, right before \p I, a call to the pointer-compare or
	/// pointer-subtract callback, passing both operands of \p I. Operands of
	/// pointer type are first cast to IntptrTy.
	void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
	    Instruction *I) {
	  IRBuilder<> IRB(I);
	  Function *Callback =
	      isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;

	  Value *Args[2] = {I->getOperand(0), I->getOperand(1)};
	  for (unsigned Idx = 0; Idx != 2; ++Idx) {
	    Value *Operand = Args[Idx];
	    if (Operand->getType()->isPointerTy())
	      Args[Idx] = IRB.CreatePointerCast(Operand, IntptrTy);
	  }

	  IRB.CreateCall(Callback, Args);
	}

	static void doInstrumentAddress(AddressSanitizer Pass, Instruction I,
	Instruction InsertBefore, Value Addr,
	unsigned Alignment, unsigned Granularity,
	uint32_t TypeSize, bool IsWrite,
	Value *SizeArgument, bool UseCalls,
	uint32_t Exp) {
	// Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check
	// if the data is properly aligned.
	if ((TypeSize == 8 \|\| TypeSize == 16 \|\| TypeSize == 32 \|\| TypeSize == 64 \|\|
	TypeSize == 128) &&
	(Alignment >= Granularity \|\| Alignment == 0 \|\| Alignment >= TypeSize / 8))
	return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
	nullptr, UseCalls, Exp);
	Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
	IsWrite, nullptr, UseCalls, Exp);
	}

	static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
	const DataLayout &DL, Type *IntptrTy,
	Value Mask, Instruction I,
	Value *Addr, unsigned Alignment,
	unsigned Granularity, uint32_t TypeSize,
	bool IsWrite, Value *SizeArgument,
	bool UseCalls, uint32_t Exp) {
	auto *VTy = cast<PointerType>(Addr->getType())->getElementType();
	uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
	unsigned Num = VTy->getVectorNumElements();
	auto Zero = ConstantInt::get(IntptrTy, 0);
	for (unsigned Idx = 0; Idx < Num; ++Idx) {
	Value *InstrumentedAddress = nullptr;
	Instruction *InsertBefore = I;
	if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
	// dyn_cast as we might get UndefValue
	if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
	if (Masked->isZero())
	// Mask is constant false, so no instrumentation needed.
	continue;
	// If we have a true or undef value, fall through to doInstrumentAddress
	// with InsertBefore == I
	}
	} else {
	IRBuilder<> IRB(I);
	Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
	TerminatorInst *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
	InsertBefore = ThenTerm;
	}

	IRBuilder<> IRB(InsertBefore);
	InstrumentedAddress =
	IRB.CreateGEP(Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
	doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
	Granularity, ElemTypeSize, IsWrite, SizeArgument,
	UseCalls, Exp);
	}
	}

	/// Instrument one memory operation.
	///
	/// \p I must be an access that isInterestingMemoryAccess() accepts (asserted).
	/// Accesses that are provably in bounds of a global or a stack variable are
	/// skipped when the corresponding ClOpt* options are enabled. All other
	/// accesses are counted and handed to the masked or the plain instrumentation
	/// path.
	void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
	                                     Instruction *I, bool UseCalls,
	                                     const DataLayout &DL) {
	  bool IsWrite = false;
	  unsigned Alignment = 0;
	  uint64_t TypeSize = 0;
	  Value *MaybeMask = nullptr;
	  Value *Addr =
	      isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask);
	  assert(Addr);

	  // Optimization experiments.
	  // The experiments can be used to evaluate potential optimizations that remove
	  // instrumentation (assess false negatives). Instead of completely removing
	  // some instrumentation, you set Exp to a non-zero value (mask of optimization
	  // experiments that want to remove instrumentation of this instruction).
	  // If Exp is non-zero, this pass will emit special calls into runtime
	  // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
	  // make runtime terminate the program in a special way (with a different
	  // exit status). Then you run the new compiler on a buggy corpus, collect
	  // the special terminations (ideally, you don't see them at all -- no false
	  // negatives) and make the decision on the optimization.
	  uint32_t Exp = ClForceExperiment;

	  if (ClOpt && ClOptGlobals) {
	    // If initialization order checking is disabled, a simple access to a
	    // dynamically initialized global is always valid.
	    GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL));
	    if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
	        isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
	      NumOptimizedAccessesToGlobalVar++;
	      return;
	    }
	  }

	  if (ClOpt && ClOptStack) {
	    // A direct inbounds access to a stack variable is always valid.
	    if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
	        isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
	      NumOptimizedAccessesToStackVar++;
	      return;
	    }
	  }

	  if (IsWrite)
	    NumInstrumentedWrites++;
	  else
	    NumInstrumentedReads++;

	  unsigned Granularity = 1 << Mapping.Scale;
	  if (MaybeMask) {
	    instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr,
	                                Alignment, Granularity, TypeSize, IsWrite,
	                                nullptr, UseCalls, Exp);
	  } else {
	    doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize,
	                        IsWrite, nullptr, UseCalls, Exp);
	  }
	}

	/// Emit the error-report call before \p InsertBefore.
	///
	/// The callback is chosen by \p IsWrite, by whether an experiment id is set
	/// (\p Exp != 0, which adds an i32 argument), and by whether \p SizeArgument
	/// is given (sized callback) or not (callback indexed by
	/// \p AccessSizeIndex).
	///
	/// \returns the report call instruction.
	Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
	                                                 Value *Addr, bool IsWrite,
	                                                 size_t AccessSizeIndex,
	                                                 Value *SizeArgument,
	                                                 uint32_t Exp) {
	  IRBuilder<> IRB(InsertBefore);
	  Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
	  CallInst *Call = nullptr;
	  if (SizeArgument) {
	    if (Exp == 0)
	      Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
	                            {Addr, SizeArgument});
	    else
	      Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
	                            {Addr, SizeArgument, ExpVal});
	  } else {
	    if (Exp == 0)
	      Call =
	          IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
	    else
	      Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
	                            {Addr, ExpVal});
	  }

	  // We don't do Call->setDoesNotReturn() because the BB already has
	  // UnreachableInst at the end.
	  // This EmptyAsm is required to avoid callback merge.
	  IRB.CreateCall(EmptyAsm, {});
	  return Call;
	}

	/// Build the slow-path comparison for an access smaller than the shadow
	/// granularity.
	///
	/// Computes the offset of the last accessed byte within its granule,
	/// truncates or extends it to the shadow value's type, and returns the signed
	/// comparison `offset >= ShadowValue`.
	///
	/// \param TypeSize  access size in bits.
	Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
	                                           Value *ShadowValue,
	                                           uint32_t TypeSize) {
	  size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
	  // Addr & (Granularity - 1)
	  Value *LastAccessedByte =
	      IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
	  // (Addr & (Granularity - 1)) + size - 1
	  if (TypeSize / 8 > 1)
	    LastAccessedByte = IRB.CreateAdd(
	        LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
	  // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
	  LastAccessedByte =
	      IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
	  // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
	  return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
	}

	void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
	Instruction InsertBefore, Value Addr,
	uint32_t TypeSize, bool IsWrite,
	Value *SizeArgument, bool UseCalls,
	uint32_t Exp) {
	IRBuilder<> IRB(InsertBefore);
	Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
	size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);

	if (UseCalls) {
	if (Exp == 0)
	IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
	AddrLong);
	else
	IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
	{AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
	return;
	}

	Type *ShadowTy =
	IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
	Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
	Value *ShadowPtr = memToShadow(AddrLong, IRB);
	Value *CmpVal = Constant::getNullValue(ShadowTy);
	Value *ShadowValue =
	IRB.CreateLoad(IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));

	Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
	size_t Granularity = 1ULL << Mapping.Scale;
	TerminatorInst *CrashTerm = nullptr;

	if (ClAlwaysSlowPath \|\| (TypeSize < 8 * Granularity)) {
	// We use branch weights for the slow path check, to indicate that the slow
	// path is rarely taken. This seems to be the case for SPEC benchmarks.
	TerminatorInst *CheckTerm = SplitBlockAndInsertIfThen(
	Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
	assert(cast<BranchInst>(CheckTerm)->isUnconditional());
	BasicBlock *NextBB = CheckTerm->getSuccessor(0);
	IRB.SetInsertPoint(CheckTerm);
	Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
	if (Recover) {
	CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
	} else {
	BasicBlock *CrashBlock =
	BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
	CrashTerm = new UnreachableInst(*C, CrashBlock);
	BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
	ReplaceInstWithInst(CheckTerm, NewTerm);
	}
	} else {
	CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
	}

	Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
	AccessSizeIndex, SizeArgument, Exp);
	Crash->setDebugLoc(OrigIns->getDebugLoc());
	}

	// Instrument unusual size or unusual alignment.
	// We can not do it with a single check, so we do 1-byte check for the first
	// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
	// to report the actual access size.
	void AddressSanitizer::instrumentUnusualSizeOrAlignment(
	Instruction I, Instruction InsertBefore, Value *Addr, uint32_t TypeSize,
	bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
	IRBuilder<> IRB(InsertBefore);
	Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
	Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
	if (UseCalls) {
	if (Exp == 0)
	IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
	{AddrLong, Size});
	else
	IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
	{AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
	} else {
	Value *LastByte = IRB.CreateIntToPtr(
	IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
	Addr->getType());
	instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
	instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
	}
	}

	/// Wrap the body of \p GlobalInit with poison/unpoison calls: a poison call
	/// taking the module name at the first insertion point of the entry block,
	/// and an unpoison call in front of every return.
	void AddressSanitizerModule::poisonOneInitializer(Function &GlobalInit,
	                                                  GlobalValue *ModuleName) {
	  BasicBlock &EntryBB = GlobalInit.front();
	  IRBuilder<> IRB(&EntryBB, EntryBB.getFirstInsertionPt());

	  // Poison all external globals before the initializer starts running.
	  Value *ModuleNameArg = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
	  IRB.CreateCall(AsanPoisonGlobals, ModuleNameArg);

	  // Unpoison all globals again right before each return.
	  for (auto &BB : GlobalInit.getBasicBlockList()) {
	    auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
	    if (!Ret)
	      continue;
	    CallInst::Create(AsanUnpoisonGlobals, "", Ret);
	  }
	}

	/// Add poison/unpoison calls to every constructor listed in
	/// llvm.global_ctors, except the ASan module constructor itself and
	/// constructors whose priority is at or below kAsanCtorAndDtorPriority.
	///
	/// Does nothing when llvm.global_ctors is absent or its initializer is not a
	/// ConstantArray.
	void AddressSanitizerModule::createInitializerPoisonCalls(
	    Module &M, GlobalValue *ModuleName) {
	  GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
	  if (!GV)
	    return;

	  ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
	  if (!CA)
	    return;

	  for (Use &OP : CA->operands()) {
	    if (isa<ConstantAggregateZero>(OP)) continue;
	    ConstantStruct *CS = cast<ConstantStruct>(OP);

	    // Must have a function or null ptr.
	    if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
	      if (F->getName() == kAsanModuleCtorName) continue;
	      // The priority is dereferenced unconditionally, so use cast<> (which
	      // asserts on a type mismatch) rather than dyn_cast<> (which would hand
	      // back a null pointer to dereference).
	      ConstantInt *Priority = cast<ConstantInt>(CS->getOperand(0));
	      // Don't instrument CTORs that will run before asan.module_ctor.
	      if (Priority->getLimitedValue() <= kAsanCtorAndDtorPriority) continue;
	      poisonOneInitializer(*F, ModuleName);
	    }
	  }
	}

	/// Decide whether global \p G should receive a redzone.
	///
	/// Returns false for blacklisted, unsized, uninitialized, compiler-generated,
	/// COMDAT, thread-local or over-aligned globals, for linkages other than
	/// external/private/internal, and for globals placed in a number of special
	/// sections (see the individual checks below).
	bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
	  Type *Ty = G->getValueType();
	  DEBUG(dbgs() << "GLOBAL: " << *G << "\n");

	  if (GlobalsMD.get(G).IsBlacklisted) return false;
	  if (!Ty->isSized()) return false;
	  if (!G->hasInitializer()) return false;
	  if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
	  // Touch only those globals that will not be defined in other modules.
	  // Don't handle ODR linkage types and COMDATs since other modules may be built
	  // without ASan.
	  if (G->getLinkage() != GlobalVariable::ExternalLinkage &&
	      G->getLinkage() != GlobalVariable::PrivateLinkage &&
	      G->getLinkage() != GlobalVariable::InternalLinkage)
	    return false;
	  if (G->hasComdat()) return false;
	  // Two problems with thread-locals:
	  //   - The address of the main thread's copy can't be computed at link-time.
	  //   - Need to poison all copies, not just the main thread's one.
	  if (G->isThreadLocal()) return false;
	  // For now, just ignore this Global if the alignment is large.
	  if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;

	  if (G->hasSection()) {
	    StringRef Section = G->getSection();

	    // Globals from llvm.metadata aren't emitted, do not instrument them.
	    if (Section == "llvm.metadata") return false;
	    // Do not instrument globals from special LLVM sections.
	    if (Section.find("__llvm") != StringRef::npos ||
	        Section.find("__LLVM") != StringRef::npos)
	      return false;

	    // Do not instrument function pointers to initialization and termination
	    // routines: dynamic linker will not properly handle redzones.
	    if (Section.startswith(".preinit_array") ||
	        Section.startswith(".init_array") ||
	        Section.startswith(".fini_array")) {
	      return false;
	    }

	    // Callbacks put into the CRT initializer/terminator sections
	    // should not be instrumented.
	    // See https://code.google.com/p/address-sanitizer/issues/detail?id=305
	    // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
	    if (Section.startswith(".CRT")) {
	      DEBUG(dbgs() << "Ignoring a global initializer callback: " << *G << "\n");
	      return false;
	    }

	    if (TargetTriple.isOSBinFormatMachO()) {
	      StringRef ParsedSegment, ParsedSection;
	      unsigned TAA = 0, StubSize = 0;
	      bool TAAParsed;
	      std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
	          Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
	      assert(ErrorCode.empty() && "Invalid section specifier.");

	      // Ignore the globals from the __OBJC section. The ObjC runtime assumes
	      // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
	      // them.
	      if (ParsedSegment == "__OBJC" ||
	          (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
	        DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
	        return false;
	      }
	      // See http://code.google.com/p/address-sanitizer/issues/detail?id=32
	      // Constant CFString instances are compiled in the following way:
	      //  -- the string buffer is emitted into
	      //     __TEXT,__cstring,cstring_literals
	      //  -- the constant NSConstantString structure referencing that buffer
	      //     is placed into __DATA,__cfstring
	      // Therefore there's no point in placing redzones into __DATA,__cfstring.
	      // Moreover, it causes the linker to crash on OS X 10.7
	      if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
	        DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
	        return false;
	      }
	      // The linker merges the contents of cstring_literals and removes the
	      // trailing zeroes.
	      if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
	        DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
	        return false;
	      }
	    }
	  }

	  return true;
	}

	// Mach-O targets can carry the global metadata in its own section so the
	// linker is able to dead-strip it properly. Only recent ld64 versions
	// support that, hence the OS version gates below.
	bool AddressSanitizerModule::ShouldUseMachOGlobalsSection() const {
	  if (!TargetTriple.isOSBinFormatMachO())
	    return false;

	  // macOS 10.11+, iOS (or tvOS) 9+, watchOS 2+.
	  return (TargetTriple.isMacOSX() &&
	          !TargetTriple.isMacOSXVersionLT(10, 11)) ||
	         (TargetTriple.isiOS() && !TargetTriple.isOSVersionLT(9)) ||
	         (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2));
	}

	/// Name of the section that holds the per-global metadata for the target's
	/// object format. Any format other than COFF, ELF and Mach-O is unsupported.
	StringRef AddressSanitizerModule::getGlobalMetadataSection() const {
	  const auto Format = TargetTriple.getObjectFormat();
	  if (Format == Triple::COFF)
	    return ".ASAN$GL";
	  if (Format == Triple::ELF)
	    return "asan_globals";
	  if (Format == Triple::MachO)
	    return "__DATA,__asan_globals,regular";
	  llvm_unreachable("unsupported object format");
	}

	/// Declare (or look up) the runtime functions used by the module pass in
	/// \p M and force external linkage on each of them.
	void AddressSanitizerModule::initializeCallbacks(Module &M) {
	  IRBuilder<> IRB(*C);
	  Type *VoidTy = IRB.getVoidTy();

	  // Every runtime entry point gets external linkage once it is declared.
	  auto AsExternal = [](Function *Callback) {
	    Callback->setLinkage(Function::ExternalLinkage);
	    return Callback;
	  };

	  // Poisoning and unpoisoning of globals around dynamic initializers.
	  AsanPoisonGlobals = AsExternal(checkSanitizerInterfaceFunction(
	      M.getOrInsertFunction(kAsanPoisonGlobalsName, VoidTy, IntptrTy)));
	  AsanUnpoisonGlobals = AsExternal(checkSanitizerInterfaceFunction(
	      M.getOrInsertFunction(kAsanUnpoisonGlobalsName, VoidTy)));

	  // Registration / unregistration of an array of globals.
	  AsanRegisterGlobals = AsExternal(
	      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	          kAsanRegisterGlobalsName, VoidTy, IntptrTy, IntptrTy)));
	  AsanUnregisterGlobals = AsExternal(
	      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	          kAsanUnregisterGlobalsName, VoidTy, IntptrTy, IntptrTy)));

	  // Functions that find the globals of a shared object and then invoke the
	  // (un)register function on them.
	  AsanRegisterImageGlobals = AsExternal(checkSanitizerInterfaceFunction(
	      M.getOrInsertFunction(kAsanRegisterImageGlobalsName, VoidTy, IntptrTy)));
	  AsanUnregisterImageGlobals = AsExternal(checkSanitizerInterfaceFunction(
	      M.getOrInsertFunction(kAsanUnregisterImageGlobalsName, VoidTy,
	                            IntptrTy)));

	  AsanRegisterElfGlobals = AsExternal(
	      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	          kAsanRegisterElfGlobalsName, VoidTy, IntptrTy, IntptrTy, IntptrTy)));
	  AsanUnregisterElfGlobals = AsExternal(
	      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	          kAsanUnregisterElfGlobalsName, VoidTy, IntptrTy, IntptrTy,
	          IntptrTy)));
	}

	// Put the metadata and the instrumented global in the same group. This ensures
	// that the metadata is discarded if the instrumented global is discarded.
	void AddressSanitizerModule::SetComdatForGlobalMetadata(
	GlobalVariable G, GlobalVariable Metadata, StringRef InternalSuffix) {
	Module &M = *G->getParent();
	Comdat *C = G->getComdat();
	if (!C) {
	if (!G->hasName()) {
	// If G is unnamed, it must be internal. Give it an artificial name
	// so we can put it in a comdat.
	assert(G->hasLocalLinkage());
	G->setName(Twine(kAsanGenPrefix) + "_anon_global");
	}

	if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
	std::string Name = G->getName();
	Name += InternalSuffix;
	C = M.getOrInsertComdat(Name);
	} else {
	C = M.getOrInsertComdat(G->getName());
	}

	// Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF.
	if (TargetTriple.isOSBinFormatCOFF())
	C->setSelectionKind(Comdat::NoDuplicates);
	G->setComdat(C);
	}

	assert(G->hasComdat());
	Metadata->setComdat(G->getComdat());
	}

	// Build a standalone metadata global initialized with Initializer, named
	// "__asan_global_" + OriginalName (without the LLVM mangling escape), and
	// place it in the ASan global registration section for the target.
	GlobalVariable *
	AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
	                                             StringRef OriginalName) {
	  // Internal linkage on Mach-O, private linkage everywhere else.
	  GlobalValue::LinkageTypes Linkage = GlobalVariable::PrivateLinkage;
	  if (TargetTriple.isOSBinFormatMachO())
	    Linkage = GlobalVariable::InternalLinkage;

	  GlobalVariable *Result = new GlobalVariable(
	      M, Initializer->getType(), false, Linkage, Initializer,
	      Twine("__asan_global_") +
	          GlobalValue::dropLLVMManglingEscape(OriginalName));
	  Result->setSection(getGlobalMetadataSection());
	  return Result;
	}

	/// Create the internal-linkage module destructor function, consisting of a
	/// single block that returns, and store it in AsanDtorFunction.
	///
	/// \returns a builder positioned at the return instruction, so that calls
	///          created through it are inserted before the return.
	IRBuilder<> AddressSanitizerModule::CreateAsanModuleDtor(Module &M) {
	  AsanDtorFunction =
	      Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
	                       GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
	  BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);

	  return IRBuilder<>(ReturnInst::Create(*C, AsanDtorBB));
	}

	/// COFF flavour of global registration: create one metadata global per
	/// instrumented global, align it to its own size, and put it in the same
	/// comdat as the global it describes.
	void AddressSanitizerModule::InstrumentGlobalsCOFF(
	    IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
	    ArrayRef<Constant *> MetadataInitializers) {
	  assert(ExtendedGlobals.size() == MetadataInitializers.size());
	  auto &DL = M.getDataLayout();

	  for (size_t Idx = 0; Idx < ExtendedGlobals.size(); ++Idx) {
	    GlobalVariable *Original = ExtendedGlobals[Idx];
	    Constant *Init = MetadataInitializers[Idx];
	    GlobalVariable *Descriptor =
	        CreateMetadataGlobal(M, Init, Original->getName());

	    // The MSVC linker always pads when linking incrementally. Aligning each
	    // struct to its own size copes with that; the size must therefore be a
	    // power of two.
	    unsigned StructSize = DL.getTypeAllocSize(Init->getType());
	    assert(isPowerOf2_32(StructSize) &&
	           "global metadata will not be padded appropriately");
	    Descriptor->setAlignment(StructSize);

	    SetComdatForGlobalMetadata(Original, Descriptor, "");
	  }
	}

	/// ELF flavour of global registration: create one metadata global per
	/// instrumented global (tied to it via !associated metadata and a shared
	/// comdat), then emit register / unregister calls that receive the
	/// registered-flag and the start/stop symbols of the metadata section.
	void AddressSanitizerModule::InstrumentGlobalsELF(
	    IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
	    ArrayRef<Constant *> MetadataInitializers,
	    const std::string &UniqueModuleId) {
	  assert(ExtendedGlobals.size() == MetadataInitializers.size());

	  SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
	  for (size_t Idx = 0; Idx < ExtendedGlobals.size(); ++Idx) {
	    GlobalVariable *Original = ExtendedGlobals[Idx];
	    GlobalVariable *Descriptor = CreateMetadataGlobal(
	        M, MetadataInitializers[Idx], Original->getName());
	    Descriptor->setMetadata(
	        LLVMContext::MD_associated,
	        MDNode::get(M.getContext(), ValueAsMetadata::get(Original)));
	    MetadataGlobals[Idx] = Descriptor;

	    SetComdatForGlobalMetadata(Original, Descriptor, UniqueModuleId);
	  }

	  // The new metadata globals go into llvm.compiler.used so that they stay
	  // alive during LTO.
	  if (!MetadataGlobals.empty())
	    appendToCompilerUsed(M, MetadataGlobals);

	  // RegisteredFlag has two jobs: it can be handed to dladdr() to find the
	  // loaded image containing it, and it records whether registration already
	  // happened so that it is not done twice.
	  //
	  // Common linkage ensures that there is only one global per shared library.
	  GlobalVariable *RegisteredFlag = new GlobalVariable(
	      M, IntptrTy, false, GlobalVariable::CommonLinkage,
	      ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
	  RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);

	  // Start and stop symbols of the metadata section.
	  GlobalVariable *SectionStart = new GlobalVariable(
	      M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
	      "__start_" + getGlobalMetadataSection());
	  SectionStart->setVisibility(GlobalVariable::HiddenVisibility);
	  GlobalVariable *SectionStop = new GlobalVariable(
	      M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
	      "__stop_" + getGlobalMetadataSection());
	  SectionStop->setVisibility(GlobalVariable::HiddenVisibility);

	  // Both runtime calls take the same three addresses.
	  Value *RuntimeArgs[] = {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
	                          IRB.CreatePointerCast(SectionStart, IntptrTy),
	                          IRB.CreatePointerCast(SectionStop, IntptrTy)};

	  // Register the globals with the runtime.
	  IRB.CreateCall(AsanRegisterElfGlobals, RuntimeArgs);

	  // Unregister them again at the end, e.g., when a shared library gets
	  // closed.
	  IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
	  IRB_Dtor.CreateCall(AsanUnregisterElfGlobals, RuntimeArgs);
	}

	/// Mach-O flavour of global registration: for every instrumented global,
	/// create its metadata global plus a "liveness binder" struct that pairs the
	/// first field of the metadata initializer with the metadata's address, then
	/// emit image-level register / unregister calls.
	void AddressSanitizerModule::InstrumentGlobalsMachO(
	    IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
	    ArrayRef<Constant *> MetadataInitializers) {
	  assert(ExtendedGlobals.size() == MetadataInitializers.size());

	  // On recent Mach-O platforms a structure binds the liveness of the global
	  // variable to its metadata struct. The binders are collected so they can be
	  // added to llvm.compiler.used.
	  StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
	  SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());

	  for (size_t Idx = 0; Idx < ExtendedGlobals.size(); ++Idx) {
	    GlobalVariable *Original = ExtendedGlobals[Idx];
	    Constant *Init = MetadataInitializers[Idx];
	    GlobalVariable *Descriptor =
	        CreateMetadataGlobal(M, Init, Original->getName());

	    // Emitting the metadata this way lets the linker strip dead globals
	    // properly.
	    Constant *FirstField = Init->getAggregateElement(0u);
	    Constant *DescriptorAddr =
	        ConstantExpr::getPointerCast(Descriptor, IntptrTy);
	    auto Binder = ConstantStruct::get(LivenessTy, FirstField, DescriptorAddr);
	    GlobalVariable *Liveness = new GlobalVariable(
	        M, LivenessTy, false, GlobalVariable::InternalLinkage, Binder,
	        Twine("__asan_binder_") + Original->getName());
	    Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
	    LivenessGlobals[Idx] = Liveness;
	  }

	  // The liveness globals go into llvm.compiler.used so that they stay alive
	  // during LTO. The alternative would be to have the linker handle the LTO
	  // symbols, but the current libLTO API does not expose the section of each
	  // symbol.
	  if (!LivenessGlobals.empty())
	    appendToCompilerUsed(M, LivenessGlobals);

	  // RegisteredFlag has two jobs: it can be handed to dladdr() to find the
	  // loaded image containing it, and it records whether registration already
	  // happened so that it is not done twice.
	  //
	  // common linkage ensures that there is only one global per shared library.
	  GlobalVariable *RegisteredFlag = new GlobalVariable(
	      M, IntptrTy, false, GlobalVariable::CommonLinkage,
	      ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
	  RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);

	  // Both runtime calls take the address of the flag.
	  Value *FlagAddr = IRB.CreatePointerCast(RegisteredFlag, IntptrTy);
	  IRB.CreateCall(AsanRegisterImageGlobals, {FlagAddr});

	  // Unregister the globals again at the end, e.g., when a shared library
	  // gets closed.
	  IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
	  IRB_Dtor.CreateCall(AsanUnregisterImageGlobals, {FlagAddr});
	}

	/// Fallback flavour of global registration: emit a single internal array
	/// holding all metadata structs and pass its address and element count to
	/// the register / unregister runtime calls. Requires at least one global.
	void AddressSanitizerModule::InstrumentGlobalsWithMetadataArray(
	    IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
	    ArrayRef<Constant *> MetadataInitializers) {
	  assert(ExtendedGlobals.size() == MetadataInitializers.size());
	  unsigned Count = ExtendedGlobals.size();
	  assert(Count > 0);

	  // Without a custom metadata section, the metadata structures are emitted
	  // as one array.
	  ArrayType *MetadataArrayTy =
	      ArrayType::get(MetadataInitializers[0]->getType(), Count);
	  auto MetadataArray = new GlobalVariable(
	      M, MetadataArrayTy, false, GlobalVariable::InternalLinkage,
	      ConstantArray::get(MetadataArrayTy, MetadataInitializers), "");

	  // Both runtime calls take (array address, element count).
	  Value *RuntimeArgs[] = {IRB.CreatePointerCast(MetadataArray, IntptrTy),
	                          ConstantInt::get(IntptrTy, Count)};
	  IRB.CreateCall(AsanRegisterGlobals, RuntimeArgs);

	  // Unregister the globals again at the end, e.g., when a shared library
	  // gets closed.
	  IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
	  IRB_Dtor.CreateCall(AsanUnregisterGlobals, RuntimeArgs);
	}

	// Replaces every global variable accepted by ShouldInstrumentGlobal with a new
	// variable that has a trailing redzone, and builds one metadata descriptor per
	// replaced global. The descriptors are then handed, together with IRB, to a
	// binary-format-specific helper (ELF, COFF, Mach-O, or a plain metadata array).
	//
	// Sets *CtorComdat to true if the global registration code emitted into the
	// asan constructor is comdat-compatible.
	//
	// Returns false when no global needed instrumentation, true otherwise.
	bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
	                                               bool *CtorComdat) {
	  *CtorComdat = false;
	  GlobalsMD.init(M);

	  SmallVector<GlobalVariable *, 16> GlobalsToChange;

	  for (auto &G : M.globals()) {
	    if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G);
	  }

	  size_t n = GlobalsToChange.size();
	  if (n == 0) {
	    // Nothing was emitted into the constructor, so it is trivially
	    // comdat-compatible.
	    *CtorComdat = true;
	    return false;
	  }

	  auto &DL = M.getDataLayout();

	  // A global is described by a structure
	  //   size_t beg;
	  //   size_t size;
	  //   size_t size_with_redzone;
	  //   const char *name;
	  //   const char *module_name;
	  //   size_t has_dynamic_init;
	  //   void *source_location;
	  //   size_t odr_indicator;
	  // We initialize an array of such structures and pass it to a run-time call.
	  StructType *GlobalStructTy =
	      StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
	                      IntptrTy, IntptrTy, IntptrTy);
	  SmallVector<GlobalVariable *, 16> NewGlobals(n);
	  SmallVector<Constant *, 16> Initializers(n);

	  bool HasDynamicallyInitializedGlobals = false;

	  // We shouldn't merge same module names, as this string serves as unique
	  // module ID in runtime.
	  GlobalVariable *ModuleName = createPrivateGlobalForString(
	      M, M.getModuleIdentifier(), /*AllowMerging*/ false);

	  for (size_t i = 0; i < n; i++) {
	    static const uint64_t kMaxGlobalRedzone = 1 << 18;
	    GlobalVariable *G = GlobalsToChange[i];

	    auto MD = GlobalsMD.get(G);
	    StringRef NameForGlobal = G->getName();
	    // Create string holding the global name (use global name from metadata
	    // if it's available, otherwise just write the name of global variable).
	    GlobalVariable *Name = createPrivateGlobalForString(
	        M, MD.Name.empty() ? NameForGlobal : MD.Name,
	        /*AllowMerging*/ true);

	    Type *Ty = G->getValueType();
	    uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
	    uint64_t MinRZ = MinRedzoneSizeForGlobal();
	    // MinRZ <= RZ <= kMaxGlobalRedzone
	    // and trying to make RZ to be ~ 1/4 of SizeInBytes.
	    uint64_t RZ = std::max(
	        MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ));
	    uint64_t RightRedzoneSize = RZ;
	    // Round up to MinRZ so that payload + redzone is a multiple of MinRZ.
	    if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
	    assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
	    Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);

	    // The replacement is a two-field struct: the original payload followed by
	    // a zero-initialized byte array that serves as the redzone.
	    StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
	    Constant *NewInitializer = ConstantStruct::get(
	        NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));

	    // Create a new global variable with enough space for a redzone.
	    GlobalValue::LinkageTypes Linkage = G->getLinkage();
	    if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
	      Linkage = GlobalValue::InternalLinkage;
	    GlobalVariable *NewGlobal =
	        new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
	                           "", G, G->getThreadLocalMode());
	    NewGlobal->copyAttributesFrom(G);
	    NewGlobal->setAlignment(MinRZ);

	    // Move null-terminated C strings to "__asan_cstring" section on Darwin.
	    if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
	        G->isConstant()) {
	      auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
	      if (Seq && Seq->isCString())
	        NewGlobal->setSection("__TEXT,__asan_cstring,regular");
	    }

	    // Transfer the debug info. The payload starts at offset zero so we can
	    // copy the debug info over as is.
	    SmallVector<DIGlobalVariableExpression *, 1> GVs;
	    G->getDebugInfo(GVs);
	    for (auto *GV : GVs)
	      NewGlobal->addDebugInfo(GV);

	    // Existing users are redirected to field 0 (the payload) of the new
	    // struct before the old global is erased.
	    Value *Indices2[2];
	    Indices2[0] = IRB.getInt32(0);
	    Indices2[1] = IRB.getInt32(0);

	    G->replaceAllUsesWith(
	        ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
	    NewGlobal->takeName(G);
	    G->eraseFromParent();
	    NewGlobals[i] = NewGlobal;

	    Constant *SourceLoc;
	    if (!MD.SourceLoc.empty()) {
	      auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
	      SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
	    } else {
	      SourceLoc = ConstantInt::get(IntptrTy, 0);
	    }

	    Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
	    GlobalValue *InstrumentedGlobal = NewGlobal;

	    bool CanUsePrivateAliases =
	        TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
	        TargetTriple.isOSBinFormatWasm();
	    if (CanUsePrivateAliases && ClUsePrivateAliasForGlobals) {
	      // Create local alias for NewGlobal to avoid crash on ODR between
	      // instrumented and non-instrumented libraries.
	      auto *GA = GlobalAlias::create(GlobalValue::InternalLinkage,
	                                     NameForGlobal + M.getName(), NewGlobal);

	      // With local aliases, we need to provide another externally visible
	      // symbol __odr_asan_XXX to detect ODR violation.
	      auto *ODRIndicatorSym =
	          new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
	                             Constant::getNullValue(IRB.getInt8Ty()),
	                             kODRGenPrefix + NameForGlobal, nullptr,
	                             NewGlobal->getThreadLocalMode());

	      // Set meaningful attributes for indicator symbol.
	      ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
	      ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
	      ODRIndicatorSym->setAlignment(1);
	      ODRIndicator = ODRIndicatorSym;
	      InstrumentedGlobal = GA;
	    }

	    // Field order must match the descriptor layout documented above.
	    Constant *Initializer = ConstantStruct::get(
	        GlobalStructTy,
	        ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
	        ConstantInt::get(IntptrTy, SizeInBytes),
	        ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
	        ConstantExpr::getPointerCast(Name, IntptrTy),
	        ConstantExpr::getPointerCast(ModuleName, IntptrTy),
	        ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
	        ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));

	    if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;

	    DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");

	    Initializers[i] = Initializer;
	  }

	  // An empty id means "do not take the ELF-specific path".
	  std::string ELFUniqueModuleId =
	      (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
	                                                        : "";

	  if (!ELFUniqueModuleId.empty()) {
	    InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
	    *CtorComdat = true;
	  } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
	    InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
	  } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
	    InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
	  } else {
	    InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
	  }

	  // Create calls for poisoning before initializers run and unpoisoning after.
	  if (HasDynamicallyInitializedGlobals)
	    createInitializerPoisonCalls(M, ModuleName);

	  DEBUG(dbgs() << M);
	  return true;
	}

	// Module-level entry point. Caches per-module state (context, pointer-sized
	// integer type, target triple, shadow mapping), declares the runtime
	// callbacks, creates the asan module constructor, optionally instruments
	// globals, and registers the constructor (and the destructor, if one was
	// created) in the module's global ctor/dtor lists.
	//
	// Returns false immediately when compiling for the kernel; otherwise returns
	// whether InstrumentGlobals reported a change.
	bool AddressSanitizerModule::runOnModule(Module &M) {
	  C = &(M.getContext());
	  int LongSize = M.getDataLayout().getPointerSizeInBits();
	  IntptrTy = Type::getIntNTy(*C, LongSize);
	  TargetTriple = Triple(M.getTargetTriple());
	  Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
	  initializeCallbacks(M);

	  if (CompileKernel)
	    return false;

	  // Create a module constructor. A destructor is created lazily because not all
	  // platforms, and not all modules need it.
	  std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
	      M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
	      /*InitArgs=*/{}, kAsanVersionCheckName);

	  bool CtorComdat = true;
	  bool Changed = false;
	  // TODO(glider): temporarily disabled globals instrumentation for KASan.
	  if (ClGlobals) {
	    // Registration code is inserted right before the constructor's terminator.
	    IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
	    Changed |= InstrumentGlobals(IRB, M, &CtorComdat);
	  }

	  // Put the constructor and destructor in comdat if both
	  // (1) global instrumentation is not TU-specific
	  // (2) target is ELF.
	  if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
	    AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
	    appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority,
	                        AsanCtorFunction);
	    if (AsanDtorFunction) {
	      AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
	      appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority,
	                          AsanDtorFunction);
	    }
	  } else {
	    appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
	    if (AsanDtorFunction)
	      appendToGlobalDtors(M, AsanDtorFunction, kAsanCtorAndDtorPriority);
	  }

	  return Changed;
	}

	// Declares (or looks up) in M the runtime functions used by the function
	// pass and caches them in member fields: the error-report and memory-access
	// callbacks (sized and fixed-size variants, with and without the extra "exp"
	// argument), the memmove/memcpy/memset wrappers, the no-return handler, the
	// pointer compare/subtract checkers, and an empty inline-asm blob.
	void AddressSanitizer::initializeCallbacks(Module &M) {
	  IRBuilder<> IRB(*C);
	  // Create __asan_report* callbacks.
	  // IsWrite, TypeSize and Exp are encoded in the function name.
	  for (int Exp = 0; Exp < 2; Exp++) {
	    for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
	      const std::string TypeStr = AccessIsWrite ? "store" : "load";
	      const std::string ExpStr = Exp ? "exp_" : "";
	      const std::string SuffixStr = CompileKernel ? "N" : "_n";
	      const std::string EndingStr = Recover ? "_noabort" : "";

	      // Args2: (addr, size) for the sized callbacks.
	      // Args1: (addr) for the fixed-size callbacks.
	      SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
	      SmallVector<Type *, 2> Args1{1, IntptrTy};
	      if (Exp) {
	        // The "exp_" variants take one trailing i32 argument.
	        Type *ExpType = Type::getInt32Ty(*C);
	        Args2.push_back(ExpType);
	        Args1.push_back(ExpType);
	      }
	      AsanErrorCallbackSized[AccessIsWrite][Exp] =
	          checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	              kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
	                  EndingStr,
	              FunctionType::get(IRB.getVoidTy(), Args2, false)));

	      AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
	          checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	              ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
	              FunctionType::get(IRB.getVoidTy(), Args2, false)));

	      // One callback per power-of-two access size: 1, 2, 4, ...
	      for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
	           AccessSizeIndex++) {
	        const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
	        AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
	            checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	                kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
	                FunctionType::get(IRB.getVoidTy(), Args1, false)));

	        AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
	            checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	                ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
	                FunctionType::get(IRB.getVoidTy(), Args1, false)));
	      }
	    }
	  }

	  // In kernel mode the memory intrinsics are referenced without a prefix.
	  const std::string MemIntrinCallbackPrefix =
	      CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
	  AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	      MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
	      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
	  AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	      MemIntrinCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
	      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
	  AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	      MemIntrinCallbackPrefix + "memset", IRB.getInt8PtrTy(),
	      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy));

	  AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction(
	      M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy()));

	  AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	      kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy));
	  AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	      kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy));
	  // We insert an empty inline asm after __asan_report* to avoid callback merge.
	  EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
	                            StringRef(""), StringRef(""),
	                            /*hasSideEffects=*/true);
	}

	// virtual
	// Caches the per-module state used by the function pass: globals metadata,
	// LLVM context, pointer width, pointer-sized integer type, target triple and
	// shadow mapping. Always returns true.
	bool AddressSanitizer::doInitialization(Module &M) {
	  // None of these fields has been read yet, so they can be set unconditionally.
	  GlobalsMD.init(M);

	  auto &Ctx = M.getContext();
	  C = &Ctx;

	  LongSize = M.getDataLayout().getPointerSizeInBits();
	  IntptrTy = Type::getIntNTy(Ctx, LongSize);

	  TargetTriple = Triple(M.getTargetTriple());
	  Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);

	  return true;
	}

	// Resets the globals metadata set up in doInitialization. Always returns
	// false.
	bool AddressSanitizer::doFinalization(Module &M) {
	  GlobalsMD.reset();

	  const bool ModuleChanged = false;
	  return ModuleChanged;
	}

	// Inserts a call to the asan init function at the very start of F when F's
	// name contains " load]". Returns true if the call was inserted.
	//
	// Rationale: the ObjC runtime invokes the +load method of every NSObject
	// descendant before any static constructor runs, so the asan runtime has to
	// be initialized from within such methods before shadow memory is touched.
	// Skipping these methods altogether is not an option, because they may call
	// other instrumented functions.
	bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
	  const bool LooksLikeObjCLoad =
	      F.getName().find(" load]") != std::string::npos;
	  if (!LooksLikeObjCLoad)
	    return false;

	  Function *InitFn =
	      declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
	  IRBuilder<> EntryBuilder(&F.front(), F.front().begin());
	  EntryBuilder.CreateCall(InitFn, {});
	  return true;
	}

	// When the shadow mapping uses the dynamic-offset sentinel, loads the global
	// named kAsanShadowMemoryDynamicAddress before F's first instruction and
	// stores the loaded value in LocalDynamicShadow. Does nothing otherwise.
	void AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
	  const bool NeedsDynamicShadow = Mapping.Offset == kDynamicShadowSentinel;
	  if (NeedsDynamicShadow) {
	    Instruction &FirstInst = F.front().front();
	    IRBuilder<> EntryBuilder(&FirstInst);
	    Value *ShadowAddressGlobal = F.getParent()->getOrInsertGlobal(
	        kAsanShadowMemoryDynamicAddress, IntptrTy);
	    LocalDynamicShadow = EntryBuilder.CreateLoad(ShadowAddressGlobal);
	  }
	}

	// Pre-marks every alloca passed to llvm.localescape as uninteresting by
	// recording it in ProcessedAllocas with the value false.
	//
	// This must run before any alloca has been processed. Doing the check up
	// front avoids iterating the use list in isInterestingAlloca, which would be
	// algorithmically slower.
	void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
	  assert(ProcessedAllocas.empty() && "must process localescape before allocas");

	  // No declaration of llvm.localescape in the module means no call to it.
	  Module *ParentModule = F.getParent();
	  if (!ParentModule->getFunction("llvm.localescape")) return;

	  // A call to llvm.localescape can only appear in the entry block, and there
	  // is at most one, so stop after the first match.
	  for (Instruction &Candidate : F.getEntryBlock()) {
	    IntrinsicInst *EscapeCall = dyn_cast<IntrinsicInst>(&Candidate);
	    if (!EscapeCall ||
	        EscapeCall->getIntrinsicID() != Intrinsic::localescape)
	      continue;

	    // Every argument of the call is an alloca we must leave alone.
	    for (Value *Operand : EscapeCall->arg_operands()) {
	      AllocaInst *Escaped = dyn_cast<AllocaInst>(Operand->stripPointerCasts());
	      assert(Escaped && Escaped->isStaticAlloca() &&
	             "non-static alloca arg to localescape");
	      ProcessedAllocas[Escaped] = false;
	    }
	    break;
	  }
	}

	// Function-level entry point. Collects the memory accesses, memory
	// intrinsics, pointer comparisons/subtractions and no-return calls of F,
	// instruments them, and then runs the stack poisoner over F.
	//
	// Returns true if F was modified.
	bool AddressSanitizer::runOnFunction(Function &F) {
	  if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
	  if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
	  if (F.getName().startswith("__asan_")) return false;

	  bool FunctionModified = false;

	  // If needed, insert __asan_init before checking for SanitizeAddress attr.
	  // This function needs to be called even if the function body is not
	  // instrumented.
	  if (maybeInsertAsanInitAtFunctionEntry(F))
	    FunctionModified = true;

	  // Leave if the function doesn't need instrumentation.
	  if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;

	  DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");

	  initializeCallbacks(*F.getParent());
	  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();

	  FunctionStateRAII CleanupObj(this);

	  maybeInsertDynamicShadowAtFunctionEntry(F);

	  // We can't instrument allocas used with llvm.localescape. Only static allocas
	  // can be passed to that intrinsic.
	  markEscapedLocalAllocas(F);

	  // We want to instrument every address only once per basic block (unless there
	  // are calls between uses).
	  SmallSet<Value *, 16> TempsToInstrument;
	  SmallVector<Instruction *, 16> ToInstrument;
	  SmallVector<Instruction *, 8> NoReturnCalls;
	  SmallVector<BasicBlock *, 16> AllBlocks;
	  SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
	  int NumAllocas = 0;
	  bool IsWrite;
	  unsigned Alignment;
	  uint64_t TypeSize;
	  const TargetLibraryInfo *TLI =
	      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();

	  // Fill the set of memory operations to instrument.
	  for (auto &BB : F) {
	    AllBlocks.push_back(&BB);
	    TempsToInstrument.clear();
	    int NumInsnsPerBB = 0;
	    for (auto &Inst : BB) {
	      // NOTE(review): this returns false even when __asan_init or the dynamic
	      // shadow load was already inserted above - confirm that is intended.
	      if (LooksLikeCodeInBug11395(&Inst)) return false;
	      Value *MaybeMask = nullptr;
	      if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
	                                                  &Alignment, &MaybeMask)) {
	        if (ClOpt && ClOptSameTemp) {
	          // If we have a mask, skip instrumentation if we've already
	          // instrumented the full object. But don't add to TempsToInstrument
	          // because we might get another load/store with a different mask.
	          if (MaybeMask) {
	            if (TempsToInstrument.count(Addr))
	              continue; // We've seen this (whole) temp in the current BB.
	          } else {
	            if (!TempsToInstrument.insert(Addr).second)
	              continue; // We've seen this temp in the current BB.
	          }
	        }
	      } else if (ClInvalidPointerPairs &&
	                 isInterestingPointerComparisonOrSubtraction(&Inst)) {
	        PointerComparisonsOrSubtracts.push_back(&Inst);
	        continue;
	      } else if (isa<MemIntrinsic>(Inst)) {
	        // ok, take it.
	      } else {
	        if (isa<AllocaInst>(Inst)) NumAllocas++;
	        CallSite CS(&Inst);
	        if (CS) {
	          // A call inside BB.
	          TempsToInstrument.clear();
	          if (CS.doesNotReturn()) NoReturnCalls.push_back(CS.getInstruction());
	        }
	        if (CallInst *CI = dyn_cast<CallInst>(&Inst))
	          maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
	        continue;
	      }
	      ToInstrument.push_back(&Inst);
	      NumInsnsPerBB++;
	      // Cap the number of instrumented instructions per basic block.
	      if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
	    }
	  }

	  // Use callbacks instead of inline checks in kernel mode, or when the
	  // function has more instrumentation points than the configured threshold.
	  bool UseCalls =
	      CompileKernel ||
	      (ClInstrumentationWithCallsThreshold >= 0 &&
	       ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
	  const DataLayout &DL = F.getParent()->getDataLayout();
	  ObjectSizeOpts ObjSizeOpts;
	  ObjSizeOpts.RoundToAlign = true;
	  ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);

	  // Instrument.
	  int NumInstrumented = 0;
	  for (auto Inst : ToInstrument) {
	    // A negative ClDebugMin or ClDebugMax disables the debug window;
	    // otherwise only instructions whose index falls inside
	    // [ClDebugMin, ClDebugMax] are instrumented.
	    if (ClDebugMin < 0 || ClDebugMax < 0 ||
	        (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
	      if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment))
	        instrumentMop(ObjSizeVis, Inst, UseCalls,
	                      F.getParent()->getDataLayout());
	      else
	        instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
	    }
	    NumInstrumented++;
	  }

	  FunctionStackPoisoner FSP(F, *this);
	  bool ChangedStack = FSP.runOnFunction();

	  // We must unpoison the stack before every NoReturn call (throw, _exit, etc).
	  // See e.g. http://code.google.com/p/address-sanitizer/issues/detail?id=37
	  for (auto CI : NoReturnCalls) {
	    IRBuilder<> IRB(CI);
	    IRB.CreateCall(AsanHandleNoReturnFunc, {});
	  }

	  for (auto Inst : PointerComparisonsOrSubtracts) {
	    instrumentPointerComparisonOrSubtraction(Inst);
	    NumInstrumented++;
	  }

	  if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
	    FunctionModified = true;

	  DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
	               << F << "\n");

	  return FunctionModified;
	}

	// Workaround for bug 11395: we don't want to instrument stack in functions
	// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
	// FIXME: remove once the bug 11395 is fixed.
	//
	// Returns true only when pointers are 32 bits wide and I is an inline-asm
	// call with more than five argument operands.
	bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
	  if (LongSize != 32) return false;
	  CallInst *CI = dyn_cast<CallInst>(I);
	  if (!CI || !CI->isInlineAsm()) return false;
	  if (CI->getNumArgOperands() <= 5) return false;
	  // We have inline assembly with quite a few arguments.
	  return true;
	}

	// Declares (or looks up) in M the runtime functions the stack poisoner calls
	// and caches them: per-size-class fake-stack malloc/free, the scoped
	// poison/unpoison pair (only when use-after-scope is enabled), one shadow
	// setter per supported shadow byte value, and the dynamic-alloca
	// poison/unpoison pair.
	void FunctionStackPoisoner::initializeCallbacks(Module &M) {
	  IRBuilder<> IRB(*C);

	  // Fake-stack allocation and release, one pair per size class.
	  for (int SizeClass = 0; SizeClass <= kMaxAsanStackMallocSizeClass;
	       SizeClass++) {
	    std::string ClassSuffix = itostr(SizeClass);
	    AsanStackMallocFunc[SizeClass] =
	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	            kAsanStackMallocNameTemplate + ClassSuffix, IntptrTy, IntptrTy));
	    AsanStackFreeFunc[SizeClass] =
	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	            kAsanStackFreeNameTemplate + ClassSuffix, IRB.getVoidTy(),
	            IntptrTy, IntptrTy));
	  }

	  if (ASan.UseAfterScope) {
	    AsanPoisonStackMemoryFunc =
	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	            kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy));
	    AsanUnpoisonStackMemoryFunc =
	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	            kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy,
	            IntptrTy));
	  }

	  // The setter name is the prefix followed by the byte value as two
	  // zero-padded hex digits.
	  for (size_t ShadowByte : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
	    std::ostringstream SetterName;
	    SetterName << kAsanSetShadowPrefix;
	    SetterName << std::setw(2) << std::setfill('0') << std::hex << ShadowByte;
	    AsanSetShadowFunc[ShadowByte] =
	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	            SetterName.str(), IRB.getVoidTy(), IntptrTy, IntptrTy));
	  }

	  AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	      kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
	  AsanAllocasUnpoisonFunc =
	      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
	          kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
	}

	// Emits inline stores that write ShadowBytes[Begin, End) to shadow memory at
	// ShadowBase + index, skipping positions whose ShadowMask entry is zero at
	// the start of a store. Does nothing when Begin >= End.
	//
	// Each store is a power-of-two number of bytes, at most
	// min(sizeof(uint64_t), pointer size in bytes), and is emitted with
	// alignment 1.
	void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
	                                               ArrayRef<uint8_t> ShadowBytes,
	                                               size_t Begin, size_t End,
	                                               IRBuilder<> &IRB,
	                                               Value *ShadowBase) {
	  if (Begin >= End)
	    return;

	  const size_t LargestStoreSizeInBytes =
	      std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);

	  const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();

	  // Poison given range in shadow using the largest store size without leading
	  // and trailing zeros in ShadowMask. Zeros never change, so they need neither
	  // poisoning nor un-poisoning. Still we don't mind if some of them get into a
	  // middle of a store.
	  for (size_t i = Begin; i < End;) {
	    if (!ShadowMask[i]) {
	      assert(!ShadowBytes[i]);
	      ++i;
	      continue;
	    }

	    size_t StoreSizeInBytes = LargestStoreSizeInBytes;
	    // Fit store size into the range.
	    while (StoreSizeInBytes > End - i)
	      StoreSizeInBytes /= 2;

	    // Minimize store size by trimming trailing zeros.
	    for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
	      while (j <= StoreSizeInBytes / 2)
	        StoreSizeInBytes /= 2;
	    }

	    // Pack the bytes into one integer in the target's byte order so that a
	    // single wide store lays them out in shadow memory in index order.
	    uint64_t Val = 0;
	    for (size_t j = 0; j < StoreSizeInBytes; j++) {
	      if (IsLittleEndian)
	        Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
	      else
	        Val = (Val << 8) | ShadowBytes[i + j];
	    }

	    Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
	    Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
	    IRB.CreateAlignedStore(
	        Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()), 1);

	    i += StoreSizeInBytes;
	  }
	}

	// Convenience overload: copies the whole of ShadowBytes to shadow memory by
	// forwarding to the range overload with [0, ShadowMask.size()).
	void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
	                                         ArrayRef<uint8_t> ShadowBytes,
	                                         IRBuilder<> &IRB, Value *ShadowBase) {
	  const size_t WholeRangeEnd = ShadowMask.size();
	  copyToShadow(ShadowMask, ShadowBytes, /*Begin=*/0, WholeRangeEnd, IRB,
	               ShadowBase);
	}

	// Writes ShadowBytes[Begin, End) to shadow memory at ShadowBase + index.
	// Runs of identical bytes that are at least ClMaxInlinePoisoningSize long and
	// have an AsanSetShadowFunc entry are emitted as a single call to that
	// function; everything else is emitted through copyToShadowInline.
	// ShadowMask and ShadowBytes must have the same size.
	void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
	ArrayRef<uint8_t> ShadowBytes,
	size_t Begin, size_t End,
	IRBuilder<> &IRB, Value *ShadowBase) {
	assert(ShadowMask.size() == ShadowBytes.size());
	// [Begin, Done) has already been emitted; [Done, i) is pending inline output.
	size_t Done = Begin;
	// i is the start of the candidate run and j is one past its end. The step
	// "i = j++" moves i to the old j and leaves j one ahead of it, so a plain
	// `continue` advances by exactly one position.
	for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
	if (!ShadowMask[i]) {
	assert(!ShadowBytes[i]);
	continue;
	}
	uint8_t Val = ShadowBytes[i];
	// No setter callback for this value: leave it for the inline path.
	if (!AsanSetShadowFunc[Val])
	continue;

	// Skip same values.
	for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
	}

	// The run [i, j) is long enough for a call. First flush the pending bytes
	// before it inline, then emit the call with (address, length).
	if (j - i >= ClMaxInlinePoisoningSize) {
	copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
	IRB.CreateCall(AsanSetShadowFunc[Val],
	{IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
	ConstantInt::get(IntptrTy, j - i)});
	Done = j;
	}
	}

	// Emit whatever is left after the last call inline.
	copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
	}

	// Fake stack allocator (asan_fake_stack.h) has 11 size classes
	// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
	//
	// Returns the index of the smallest size class whose capacity
	// (kMinStackMallocSize doubled once per class) is at least LocalStackSize.
	static int StackMallocSizeClass(uint64_t LocalStackSize) {
	  assert(LocalStackSize <= kMaxStackMallocSize);
	  int SizeClass = 0;
	  uint64_t ClassCapacity = kMinStackMallocSize;
	  while (LocalStackSize > ClassCapacity) {
	    ++SizeClass;
	    ClassCapacity *= 2;
	  }
	  return SizeClass;
	}

	void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
	BasicBlock &FirstBB = *F.begin();
	IRBuilder<> IRB(&FirstBB, FirstBB.getFirstInsertionPt());
	const DataLayout &DL = F.getParent()->getDataLayout();
	for (Argument &Arg : F.args()) {
	if (Arg.hasByValAttr()) {
	Type *Ty = Arg.getType()->getPointerElementType();
	unsigned Align = Arg.getParamAlignment();
	if (Align == 0) Align = DL.getABITypeAlignment(Ty);

	const std::string &Name = Arg.hasName() ? Arg.getName().str() :
	"Arg" + llvm::to_string(Arg.getArgNo());
	AllocaInst *AI = IRB.CreateAlloca(Ty, nullptr, Twine(Name) + ".byval");
	AI->setAlignment(Align);
	Arg.replaceAllUsesWith(AI);

	uint64_t AllocSize = DL.getTypeAllocSize(Ty);
	IRB.CreateMemCpy(AI, &Arg, AllocSize, Align);
	}
	}
	}

	// Creates a two-input PHI of type IntptrTy at IRB's insertion point.
	// ValueIfFalse flows in from the block that contains Cond, and ValueIfTrue
	// flows in from the block that contains ThenTerm. Cond must be an
	// Instruction.
	PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
	                                          Value *ValueIfTrue,
	                                          Instruction *ThenTerm,
	                                          Value *ValueIfFalse) {
	  PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
	  BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
	  PHI->addIncoming(ValueIfFalse, CondBlock);
	  BasicBlock *ThenBlock = ThenTerm->getParent();
	  PHI->addIncoming(ValueIfTrue, ThenBlock);
	  return PHI;
	}

	// Creates the alloca named "MyAlloca" that backs a frame of L.FrameSize bytes
	// and returns its address cast to IntptrTy. With Dynamic set the alloca is an
	// i8 allocation with an i64 element count; otherwise it is a single
	// [FrameSize x i8] array, which must be a static alloca. The alignment is the
	// larger of L.FrameAlignment and ClRealignStack.
	Value *FunctionStackPoisoner::createAllocaForLayout(
	    IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
	  AllocaInst *Frame;
	  if (!Dynamic) {
	    Type *FrameArrayTy = ArrayType::get(IRB.getInt8Ty(), L.FrameSize);
	    Frame = IRB.CreateAlloca(FrameArrayTy, nullptr, "MyAlloca");
	    assert(Frame->isStaticAlloca());
	  } else {
	    Frame = IRB.CreateAlloca(IRB.getInt8Ty(),
	                             ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
	                             "MyAlloca");
	  }

	  // ClRealignStack must be zero or a power of two.
	  assert((ClRealignStack & (ClRealignStack - 1)) == 0);
	  size_t RequiredAlignment =
	      std::max(L.FrameAlignment, (size_t)ClRealignStack);
	  Frame->setAlignment(RequiredAlignment);

	  return IRB.CreatePointerCast(Frame, IntptrTy);
	}

	void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
	BasicBlock &FirstBB = *F.begin();
	IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin()));
	DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr);
	IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout);
	DynamicAllocaLayout->setAlignment(32);
	}

	// Instruments the dynamic allocas collected in DynamicAllocaVec. Does nothing
	// when dynamic-alloca instrumentation is disabled or there are none.
	// Otherwise it emits the recorded poison/unpoison requests, creates the
	// tracking slot, handles each dynamic alloca, and finally emits the
	// unpoisoning code.
	void FunctionStackPoisoner::processDynamicAllocas() {
	  if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
	    assert(DynamicAllocaPoisonCallVec.empty());
	    return;
	  }

	  // Insert poison calls for lifetime intrinsics for dynamic allocas.
	  for (const auto &APC : DynamicAllocaPoisonCallVec) {
	    assert(APC.InsBefore);
	    assert(APC.AI);
	    assert(ASan.isInterestingAlloca(*APC.AI));
	    assert(!APC.AI->isStaticAlloca());

	    IRBuilder<> IRB(APC.InsBefore);
	    poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
	    // Dynamic allocas will be unpoisoned unconditionally below in
	    // unpoisonDynamicAllocas.
	  }

	  // Handle dynamic allocas.
	  createDynamicAllocasInitStorage();
	  for (auto &AI : DynamicAllocaVec)
	    handleDynamicAllocaCall(AI);
	  unpoisonDynamicAllocas();
	}

	void FunctionStackPoisoner::processStaticAllocas() {
	if (AllocaVec.empty()) {
	assert(StaticAllocaPoisonCallVec.empty());
	return;
	}

	int StackMallocIdx = -1;
	DebugLoc EntryDebugLocation;
	if (auto SP = F.getSubprogram())
	EntryDebugLocation = DebugLoc::get(SP->getScopeLine(), 0, SP);

	Instruction *InsBefore = AllocaVec[0];
	IRBuilder<> IRB(InsBefore);
	IRB.SetCurrentDebugLocation(EntryDebugLocation);

	// Make sure non-instrumented allocas stay in the entry block. Otherwise,
	// debug info is broken, because only entry-block allocas are treated as
	// regular stack slots.
	auto InsBeforeB = InsBefore->getParent();
	assert(InsBeforeB == &F.getEntryBlock());
	for (auto *AI : StaticAllocasToMoveUp)
	if (AI->getParent() == InsBeforeB)
	AI->moveBefore(InsBefore);

	// If we have a call to llvm.localescape, keep it in the entry block.
	if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);

	SmallVector<ASanStackVariableDescription, 16> SVD;
	SVD.reserve(AllocaVec.size());
	for (AllocaInst *AI : AllocaVec) {
	ASanStackVariableDescription D = {AI->getName().data(),
	ASan.getAllocaSizeInBytes(*AI),
	0,
	AI->getAlignment(),
	AI,
	0,
	0};
	SVD.push_back(D);
	}

	// Minimal header size (left redzone) is 4 pointers,
	// i.e. 32 bytes on 64-bit platforms and 16 bytes in 32-bit platforms.
	size_t MinHeaderSize = ASan.LongSize / 2;
	const ASanStackFrameLayout &L =
	ComputeASanStackFrameLayout(SVD, 1ULL << Mapping.Scale, MinHeaderSize);

	// Build AllocaToSVDMap for ASanStackVariableDescription lookup.
	DenseMap<const AllocaInst , ASanStackVariableDescription > AllocaToSVDMap;
	for (auto &Desc : SVD)
	AllocaToSVDMap[Desc.AI] = &Desc;

	// Update SVD with information from lifetime intrinsics.
	for (const auto &APC : StaticAllocaPoisonCallVec) {
	assert(APC.InsBefore);
	assert(APC.AI);
	assert(ASan.isInterestingAlloca(*APC.AI));
	assert(APC.AI->isStaticAlloca());

	ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
	Desc.LifetimeSize = Desc.Size;
	if (const DILocation *FnLoc = EntryDebugLocation.get()) {
	if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
	if (LifetimeLoc->getFile() == FnLoc->getFile())
	if (unsigned Line = LifetimeLoc->getLine())
	Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
	}
	}
	}

	auto DescriptionString = ComputeASanStackFrameDescription(SVD);
	DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
	uint64_t LocalStackSize = L.FrameSize;
	bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
	LocalStackSize <= kMaxStackMallocSize;
	bool DoDynamicAlloca = ClDynamicAllocaStack;
	// Don't do dynamic alloca or stack malloc if:
	// 1) There is inline asm: too often it makes assumptions on which registers
	// are available.
	// 2) There is a returns_twice call (typically setjmp), which is
	// optimization-hostile, and doesn't play well with introduced indirect
	// register-relative calculation of local variable addresses.
	DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
	DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;

	Value *StaticAlloca =
	DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);

	Value *FakeStack;
	Value *LocalStackBase;

	if (DoStackMalloc) {
	// void *FakeStack = __asan_option_detect_stack_use_after_return
	// ? __asan_stack_malloc_N(LocalStackSize)
	// : nullptr;
	// void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
	Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
	kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
	Value *UseAfterReturnIsEnabled =
	IRB.CreateICmpNE(IRB.CreateLoad(OptionDetectUseAfterReturn),
	Constant::getNullValue(IRB.getInt32Ty()));
	Instruction *Term =
	SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
	IRBuilder<> IRBIf(Term);
	IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
	StackMallocIdx = StackMallocSizeClass(LocalStackSize);
	assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
	Value *FakeStackValue =
	IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
	ConstantInt::get(IntptrTy, LocalStackSize));
	IRB.SetInsertPoint(InsBefore);
	IRB.SetCurrentDebugLocation(EntryDebugLocation);
	FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
	ConstantInt::get(IntptrTy, 0));

	Value *NoFakeStack =
	IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
	Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
	IRBIf.SetInsertPoint(Term);
	IRBIf.SetCurrentDebugLocation(EntryDebugLocation);
	Value *AllocaValue =
	DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
	IRB.SetInsertPoint(InsBefore);
	IRB.SetCurrentDebugLocation(EntryDebugLocation);
	LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
	} else {
	// void *FakeStack = nullptr;
	// void *LocalStackBase = alloca(LocalStackSize);
	FakeStack = ConstantInt::get(IntptrTy, 0);
	LocalStackBase =
	DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
	}

	// Replace Alloca instructions with base+offset.
	for (const auto &Desc : SVD) {
	AllocaInst *AI = Desc.AI;
	Value *NewAllocaPtr = IRB.CreateIntToPtr(
	IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
	AI->getType());
	replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB, DIExpression::NoDeref);
	AI->replaceAllUsesWith(NewAllocaPtr);
	}

	// The left-most redzone has enough space for at least 4 pointers.
	// Write the Magic value to redzone[0].
	Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
	IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
	BasePlus0);
	// Write the frame description constant to redzone[1].
	Value *BasePlus1 = IRB.CreateIntToPtr(
	IRB.CreateAdd(LocalStackBase,
	ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
	IntptrPtrTy);
	GlobalVariable *StackDescriptionGlobal =
	createPrivateGlobalForString(*F.getParent(), DescriptionString,
	/AllowMerging/ true);
	Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
	IRB.CreateStore(Description, BasePlus1);
	// Write the PC to redzone[2].
	Value *BasePlus2 = IRB.CreateIntToPtr(
	IRB.CreateAdd(LocalStackBase,
	ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
	IntptrPtrTy);
	IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);

	const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);

	// Poison the stack red zones at the entry.
	Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
	// As mask we must use most poisoned case: red zones and after scope.
	// As bytes we can use either the same or just red zones only.
	copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);

	if (!StaticAllocaPoisonCallVec.empty()) {
	const auto &ShadowInScope = GetShadowBytes(SVD, L);

	// Poison static allocas near lifetime intrinsics.
	for (const auto &APC : StaticAllocaPoisonCallVec) {
	const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
	assert(Desc.Offset % L.Granularity == 0);
	size_t Begin = Desc.Offset / L.Granularity;
	size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;

	IRBuilder<> IRB(APC.InsBefore);
	copyToShadow(ShadowAfterScope,
	APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
	IRB, ShadowBase);
	}
	}

	SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
	SmallVector<uint8_t, 64> ShadowAfterReturn;

	// (Un)poison the stack before all ret instructions.
	for (auto Ret : RetVec) {
	IRBuilder<> IRBRet(Ret);
	// Mark the current frame as retired.
	IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
	BasePlus0);
	if (DoStackMalloc) {
	assert(StackMallocIdx >= 0);
	// if FakeStack != 0 // LocalStackBase == FakeStack
	// // In use-after-return mode, poison the whole stack frame.
	// if StackMallocIdx <= 4
	// // For small sizes inline the whole thing:
	// memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
	// **SavedFlagPtr(FakeStack) = 0
	// else
	// __asan_stack_free_N(FakeStack, LocalStackSize)
	// else
	// <This is not a fake stack; unpoison the redzones>
	Value *Cmp =
	IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
	TerminatorInst ThenTerm, ElseTerm;
	SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);

	IRBuilder<> IRBPoison(ThenTerm);
	if (StackMallocIdx <= 4) {
	int ClassSize = kMinStackMallocSize << StackMallocIdx;
	ShadowAfterReturn.resize(ClassSize / L.Granularity,
	kAsanStackUseAfterReturnMagic);
	copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
	ShadowBase);
	Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
	FakeStack,
	ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
	Value *SavedFlagPtr = IRBPoison.CreateLoad(
	IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
	IRBPoison.CreateStore(
	Constant::getNullValue(IRBPoison.getInt8Ty()),
	IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
	} else {
	// For larger frames call __asan_stack_free_*.
	IRBPoison.CreateCall(
	AsanStackFreeFunc[StackMallocIdx],
	{FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
	}

	IRBuilder<> IRBElse(ElseTerm);
	copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
	} else {
	copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
	}
	}

	// We are done. Remove the old unused alloca instructions.
	for (auto AI : AllocaVec) AI->eraseFromParent();
	}

	/// Emit a call to the ASan runtime that poisons (DoPoison is true) or
	/// unpoisons (DoPoison is false) the Size bytes starting at V. The call is
	/// inserted at IRB's current insertion point.
	void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
	                                         IRBuilder<> &IRB, bool DoPoison) {
	  // The address is handed to the runtime as a pointer-sized integer.
	  Value *AddrAsInt = IRB.CreatePointerCast(V, IntptrTy);
	  Value *ByteCount = ConstantInt::get(IntptrTy, Size);

	  // No inline shadow stores for now; the runtime entry point does the work.
	  auto *RuntimeFn =
	      DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc;
	  IRB.CreateCall(RuntimeFn, {AddrAsInt, ByteCount});
	}

	// Handling llvm.lifetime intrinsics for a given %alloca:
	// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
	// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
	// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
	// could be poisoned by previous llvm.lifetime.end instruction, as the
	// variable may go in and out of scope several times, e.g. in loops).
	// (3) if we poisoned at least one %alloca in a function,
	// unpoison the whole stack frame at function exit.

	/// Find the alloca that V is derived from by looking through casts,
	/// getelementptr instructions and PHI nodes.
	///
	/// Returns the alloca if V is an alloca accepted by ASan.isInterestingAlloca(),
	/// or is derived (through the instructions above) from exactly one such
	/// alloca; returns nullptr otherwise. Lookups for non-alloca values are cached
	/// in AllocaForValue.
	AllocaInst *FunctionStackPoisoner::findAllocaForValue(Value *V) {
	  if (AllocaInst *AI = dyn_cast<AllocaInst>(V))
	    // We're interested only in allocas we can handle.
	    return ASan.isInterestingAlloca(*AI) ? AI : nullptr;
	  // See if we've already calculated (or started to calculate) alloca for a
	  // given value.
	  AllocaForValueMapTy::iterator I = AllocaForValue.find(V);
	  if (I != AllocaForValue.end()) return I->second;
	  // Store 0 while we're calculating alloca for value V to avoid
	  // infinite recursion if the value references itself.
	  AllocaForValue[V] = nullptr;
	  AllocaInst *Res = nullptr;
	  if (CastInst *CI = dyn_cast<CastInst>(V))
	    Res = findAllocaForValue(CI->getOperand(0));
	  else if (PHINode *PN = dyn_cast<PHINode>(V)) {
	    for (Value *IncValue : PN->incoming_values()) {
	      // Allow self-referencing phi-nodes.
	      if (IncValue == PN) continue;
	      AllocaInst *IncValueAI = findAllocaForValue(IncValue);
	      // AI for incoming values should exist and should all be equal.
	      if (IncValueAI == nullptr || (Res != nullptr && IncValueAI != Res))
	        return nullptr;
	      Res = IncValueAI;
	    }
	  } else if (GetElementPtrInst *EP = dyn_cast<GetElementPtrInst>(V)) {
	    Res = findAllocaForValue(EP->getPointerOperand());
	  } else {
	    DEBUG(dbgs() << "Alloca search canceled on unknown instruction: " << *V << "\n");
	  }
	  // A failed search leaves the nullptr placeholder in the cache.
	  if (Res) AllocaForValue[V] = Res;
	  return Res;
	}

	/// Replace the dynamic alloca AI with a larger i8 alloca that has room for a
	/// left redzone, an optional partial redzone and a right redzone, emit the
	/// __asan_alloca_poison call for it, record the new alloca's address in
	/// DynamicAllocaLayout, and redirect all users of AI to the user-visible part
	/// of the new allocation. AI is erased.
	void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
	  IRBuilder<> IRB(AI);

	  const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());
	  const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;

	  Value *RzSizeC = ConstantInt::get(IntptrTy, kAllocaRzSize);
	  Value *RzMaskC = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
	  Value *ZeroC = Constant::getNullValue(IntptrTy);

	  // The alloca allocates "array size" elements of the allocated type, so the
	  // byte size requested by the user is ArraySize * ElementSize.
	  const unsigned ElementSize =
	      F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
	  Value *ElementCount =
	      IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false);
	  Value *OldSize =
	      IRB.CreateMul(ElementCount, ConstantInt::get(IntptrTy, ElementSize));

	  // PartialSize = OldSize & (kAllocaRzSize - 1)
	  Value *PartialSize = IRB.CreateAnd(OldSize, RzMaskC);

	  // Misalign = kAllocaRzSize - PartialSize
	  Value *Misalign = IRB.CreateSub(RzSizeC, PartialSize);

	  // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0
	  Value *NeedsPadding = IRB.CreateICmpNE(Misalign, RzSizeC);
	  Value *PartialPadding = IRB.CreateSelect(NeedsPadding, Misalign, ZeroC);

	  // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize:
	  // Align bytes hold the left redzone, PartialPadding the possible partial
	  // redzone, and kAllocaRzSize the right redzone.
	  Value *FixedExtra = ConstantInt::get(IntptrTy, Align + kAllocaRzSize);
	  Value *AdditionalChunkSize = IRB.CreateAdd(FixedExtra, PartialPadding);

	  Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);

	  // Emit the replacement alloca with the enlarged size and alignment.
	  AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
	  NewAlloca->setAlignment(Align);

	  // NewAddress = Address + Align (skip over the left redzone).
	  Value *RawAddress = IRB.CreatePtrToInt(NewAlloca, IntptrTy);
	  Value *NewAddress =
	      IRB.CreateAdd(RawAddress, ConstantInt::get(IntptrTy, Align));

	  // Poison the redzones of the freshly created alloca.
	  IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});

	  // Remember the most recent alloca's address in DynamicAllocaLayout; it is
	  // needed later for unpoisoning.
	  Value *LayoutValue = IRB.CreatePtrToInt(NewAlloca, IntptrTy);
	  IRB.CreateStore(LayoutValue, DynamicAllocaLayout);

	  Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());

	  // Every user of the old alloca's address now sees NewAddressPtr.
	  AI->replaceAllUsesWith(NewAddressPtr);

	  // The old alloca is dead; remove it.
	  AI->eraseFromParent();
	}

	// isSafeAccess returns true if Addr is always inbounds with respect to its
	// base object, e.g. a field access or an array access with a constant
	// inbounds index. It returns false whenever ObjSizeVis cannot determine both
	// the object size and the offset of Addr within it.
	bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
	                                    Value *Addr, uint64_t TypeSize) const {
	  SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
	  if (!ObjSizeVis.bothKnown(SizeOffset))
	    return false;

	  const uint64_t ObjectSize = SizeOffset.first.getZExtValue();
	  const int64_t Offset = SizeOffset.second.getSExtValue();

	  // The offset is measured from the base pointer, so it must not be negative.
	  if (Offset < 0)
	    return false;

	  // The access must start inside the object (unsigned comparison).
	  const uint64_t Start = uint64_t(Offset);
	  if (ObjectSize < Start)
	    return false;

	  // The remaining bytes must cover the TypeSize / 8 bytes being accessed.
	  return ObjectSize - Start >= TypeSize / 8;
	}
	Index: head/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Transforms/Scalar/SCCP.cpp (revision 322320)
	@@ -1,1983 +1,1984 @@
	//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements sparse conditional constant propagation and merging:
	//
	// Specifically, this:
	// * Assumes values are constant unless proven otherwise
	// * Assumes BasicBlocks are dead unless proven otherwise
	// * Proves values to be constant, and replaces them with constants
	// * Proves conditional branches to be unconditional
	//
	//===----------------------------------------------------------------------===//

	#include "llvm/Transforms/IPO/SCCP.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/PointerIntPair.h"
	#include "llvm/ADT/SmallPtrSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/ADT/Statistic.h"
	#include "llvm/Analysis/ConstantFolding.h"
	#include "llvm/Analysis/GlobalsModRef.h"
	#include "llvm/Analysis/TargetLibraryInfo.h"
	#include "llvm/IR/CallSite.h"
	#include "llvm/IR/Constants.h"
	#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/InstVisitor.h"
	#include "llvm/IR/Instructions.h"
	#include "llvm/Pass.h"
	#include "llvm/Support/Debug.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/raw_ostream.h"
	#include "llvm/Transforms/IPO.h"
	#include "llvm/Transforms/Scalar.h"
	#include "llvm/Transforms/Scalar/SCCP.h"
	#include "llvm/Transforms/Utils/Local.h"
	#include <algorithm>
	using namespace llvm;

	#define DEBUG_TYPE "sccp"

	STATISTIC(NumInstRemoved, "Number of instructions removed");
	STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");

	STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
	STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
	STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");

	namespace {
	/// LatticeVal class - This class represents the different lattice values that
	/// an LLVM value may occupy. It is a simple class with value semantics: the
	/// lattice state and the associated Constant pointer are packed together in a
	/// single PointerIntPair.
	///
	class LatticeVal {
	  enum LatticeValueTy {
	    /// unknown - This LLVM Value has no known value yet.
	    unknown,

	    /// constant - This LLVM Value has a specific constant value.
	    constant,

	    /// forcedconstant - This LLVM Value was thought to be undef until
	    /// ResolvedUndefsIn. This is treated just like 'constant', but if merged
	    /// with another (different) constant, it goes to overdefined, instead of
	    /// asserting.
	    forcedconstant,

	    /// overdefined - This instruction is not known to be constant, and we know
	    /// it has a value.
	    overdefined
	  };

	  /// Val: This stores the current lattice value along with the Constant* for
	  /// the constant if this is a 'constant' or 'forcedconstant' value.
	  PointerIntPair<Constant *, 2, LatticeValueTy> Val;

	  LatticeValueTy getLatticeValue() const {
	    return Val.getInt();
	  }

	public:
	  /// A default-constructed value is 'unknown' with no constant attached.
	  LatticeVal() : Val(nullptr, unknown) {}

	  bool isUnknown() const { return getLatticeValue() == unknown; }
	  /// True for both 'constant' and 'forcedconstant'.
	  bool isConstant() const {
	    return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
	  }
	  bool isOverdefined() const { return getLatticeValue() == overdefined; }

	  /// getConstant - Return the stored constant. Only valid when isConstant().
	  Constant *getConstant() const {
	    assert(isConstant() && "Cannot get the constant of a non-constant!");
	    return Val.getPointer();
	  }

	  /// markOverdefined - Return true if this is a change in status.
	  bool markOverdefined() {
	    if (isOverdefined())
	      return false;

	    Val.setInt(overdefined);
	    return true;
	  }

	  /// markConstant - Return true if this is a change in status.
	  ///
	  /// An unknown value becomes 'constant' V. A 'forcedconstant' value stays as
	  /// it is when V matches the forced constant and drops to overdefined
	  /// otherwise.
	  bool markConstant(Constant *V) {
	    if (getLatticeValue() == constant) { // Constant but not forcedconstant.
	      assert(getConstant() == V && "Marking constant with different value");
	      return false;
	    }

	    if (isUnknown()) {
	      Val.setInt(constant);
	      assert(V && "Marking constant with NULL");
	      Val.setPointer(V);
	    } else {
	      assert(getLatticeValue() == forcedconstant &&
	             "Cannot move from overdefined to constant!");
	      // Stay at forcedconstant if the constant is the same.
	      if (V == getConstant()) return false;

	      // Otherwise, we go to overdefined. Assumptions made based on the
	      // forced value are possibly wrong. Assuming this is another constant
	      // could expose a contradiction.
	      Val.setInt(overdefined);
	    }
	    return true;
	  }

	  /// getConstantInt - If this is a constant with a ConstantInt value, return it
	  /// otherwise return null.
	  ConstantInt *getConstantInt() const {
	    if (isConstant())
	      return dyn_cast<ConstantInt>(getConstant());
	    return nullptr;
	  }

	  /// getBlockAddress - If this is a constant with a BlockAddress value, return
	  /// it, otherwise return null.
	  BlockAddress *getBlockAddress() const {
	    if (isConstant())
	      return dyn_cast<BlockAddress>(getConstant());
	    return nullptr;
	  }

	  /// markForcedConstant - Move an unknown value to 'forcedconstant' V. Must
	  /// only be called on an unknown value.
	  void markForcedConstant(Constant *V) {
	    assert(isUnknown() && "Can't force a defined value!");
	    Val.setInt(forcedconstant);
	    Val.setPointer(V);
	  }
	};
	} // end anonymous namespace.


	namespace {

	//===----------------------------------------------------------------------===//
	//
	/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
	/// Constant Propagation.
	///
	/// Clients seed the solver (MarkBlockExecutable, TrackValueOfGlobalVariable,
	/// AddTrackedFunction, AddArgumentTrackedFunction), call Solve() and
	/// ResolvedUndefsIn(), then read the results back through the getters.
	///
	class SCCPSolver : public InstVisitor<SCCPSolver> {
	  const DataLayout &DL;
	  const TargetLibraryInfo *TLI;
	  SmallPtrSet<BasicBlock*, 8> BBExecutable; // The BBs that are executable.
	  DenseMap<Value*, LatticeVal> ValueState;  // The state each value is in.

	  /// StructValueState - This maintains ValueState for values that have
	  /// StructType, for example for formal arguments, calls, insertelement, etc.
	  ///
	  DenseMap<std::pair<Value*, unsigned>, LatticeVal> StructValueState;

	  /// GlobalValue - If we are tracking any values for the contents of a global
	  /// variable, we keep a mapping from the constant accessor to the element of
	  /// the global, to the currently known value. If the value becomes
	  /// overdefined, its entry is simply removed from this map.
	  DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;

	  /// TrackedRetVals - If we are tracking arguments into and the return
	  /// value out of a function, it will have an entry in this map, indicating
	  /// what the known return value for the function is.
	  DenseMap<Function*, LatticeVal> TrackedRetVals;

	  /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
	  /// that return multiple values.
	  DenseMap<std::pair<Function*, unsigned>, LatticeVal> TrackedMultipleRetVals;

	  /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
	  /// represented here for efficient lookup.
	  SmallPtrSet<Function*, 16> MRVFunctionsTracked;

	  /// TrackingIncomingArguments - This is the set of functions for whose
	  /// arguments we make optimistic assumptions about and try to prove as
	  /// constants.
	  SmallPtrSet<Function*, 16> TrackingIncomingArguments;

	  /// The reason for two worklists is that overdefined is the lowest state
	  /// on the lattice, and moving things to overdefined as fast as possible
	  /// makes SCCP converge much faster.
	  ///
	  /// By having a separate worklist, we accomplish this because everything
	  /// possibly overdefined will become overdefined at the soonest possible
	  /// point.
	  SmallVector<Value*, 64> OverdefinedInstWorkList;
	  SmallVector<Value*, 64> InstWorkList;


	  SmallVector<BasicBlock*, 64> BBWorkList; // The BasicBlock work list

	  /// KnownFeasibleEdges - Entries in this set are edges which have already had
	  /// PHI nodes retriggered. An edge is a (source block, destination block)
	  /// pair of pointers.
	  typedef std::pair<BasicBlock*, BasicBlock*> Edge;
	  DenseSet<Edge> KnownFeasibleEdges;
	public:
	  SCCPSolver(const DataLayout &DL, const TargetLibraryInfo *tli)
	      : DL(DL), TLI(tli) {}

	  /// MarkBlockExecutable - This method can be used by clients to mark all of
	  /// the blocks that are known to be intrinsically live in the processed unit.
	  ///
	  /// This returns true if the block was not considered live before.
	  bool MarkBlockExecutable(BasicBlock *BB) {
	    if (!BBExecutable.insert(BB).second)
	      return false;
	    DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
	    BBWorkList.push_back(BB); // Add the block to the work list!
	    return true;
	  }

	  /// TrackValueOfGlobalVariable - Clients can use this method to
	  /// inform the SCCPSolver that it should track loads and stores to the
	  /// specified global variable if it can. This is only legal to call if
	  /// performing Interprocedural SCCP.
	  void TrackValueOfGlobalVariable(GlobalVariable *GV) {
	    // We only track the contents of scalar globals.
	    if (GV->getValueType()->isSingleValueType()) {
	      LatticeVal &IV = TrackedGlobals[GV];
	      // An undef initializer leaves the tracked value unknown.
	      if (!isa<UndefValue>(GV->getInitializer()))
	        IV.markConstant(GV->getInitializer());
	    }
	  }

	  /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
	  /// and out of the specified function (which cannot have its address taken),
	  /// this method must be called.
	  void AddTrackedFunction(Function *F) {
	    // Add an entry, F -> undef.
	    if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
	      // Struct returns are tracked per element.
	      MRVFunctionsTracked.insert(F);
	      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
	        TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
	                                                     LatticeVal()));
	    } else
	      TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
	  }

	  /// AddArgumentTrackedFunction - Record F in TrackingIncomingArguments.
	  void AddArgumentTrackedFunction(Function *F) {
	    TrackingIncomingArguments.insert(F);
	  }

	  /// Solve - Solve for constants and executable blocks.
	  ///
	  void Solve();

	  /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
	  /// that branches on undef values cannot reach any of their successors.
	  /// However, this is not a safe assumption. After we solve dataflow, this
	  /// method should be used to handle this. If this returns true, the solver
	  /// should be rerun.
	  bool ResolvedUndefsIn(Function &F);

	  bool isBlockExecutable(BasicBlock *BB) const {
	    return BBExecutable.count(BB);
	  }

	  /// getStructLatticeValueFor - Return one lattice value per element of the
	  /// struct-typed value V. Every element must already be in StructValueState.
	  std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
	    std::vector<LatticeVal> StructValues;
	    auto *STy = dyn_cast<StructType>(V->getType());
	    assert(STy && "getStructLatticeValueFor() can be called only on structs");
	    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	      auto I = StructValueState.find(std::make_pair(V, i));
	      assert(I != StructValueState.end() && "Value not in valuemap!");
	      StructValues.push_back(I->second);
	    }
	    return StructValues;
	  }

	  /// getLatticeValueFor - Return the lattice value recorded for V, which must
	  /// already be in ValueState.
	  LatticeVal getLatticeValueFor(Value *V) const {
	    DenseMap<Value*, LatticeVal>::const_iterator I = ValueState.find(V);
	    assert(I != ValueState.end() && "V is not in valuemap!");
	    return I->second;
	  }

	  /// getTrackedRetVals - Get the inferred return value map.
	  ///
	  const DenseMap<Function*, LatticeVal> &getTrackedRetVals() {
	    return TrackedRetVals;
	  }

	  /// getTrackedGlobals - Get and return the set of inferred initializers for
	  /// global variables.
	  const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
	    return TrackedGlobals;
	  }

	  /// getMRVFunctionsTracked - Get the set of functions which return multiple
	  /// values tracked by the pass. Returned by const reference, like the other
	  /// getters, so the set is not copied on every call.
	  const SmallPtrSet<Function *, 16> &getMRVFunctionsTracked() {
	    return MRVFunctionsTracked;
	  }

	  /// markOverdefined - Mark the specified value overdefined. This
	  /// works with both scalars and structs.
	  void markOverdefined(Value *V) {
	    if (auto *STy = dyn_cast<StructType>(V->getType()))
	      for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
	        markOverdefined(getStructValueState(V, i), V);
	    else
	      markOverdefined(ValueState[V], V);
	  }

	  // isStructLatticeConstant - Return true if all the lattice values
	  // corresponding to elements of the structure are not overdefined,
	  // false otherwise. F must have an entry in TrackedMultipleRetVals for
	  // every element of STy.
	  bool isStructLatticeConstant(Function *F, StructType *STy) {
	    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	      const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
	      assert(It != TrackedMultipleRetVals.end());
	      LatticeVal LV = It->second;
	      if (LV.isOverdefined())
	        return false;
	    }
	    return true;
	  }

	private:
	  // pushToWorkList - Helper for markConstant/markForcedConstant/markOverdefined.
	  // Overdefined values go on their own work list.
	  void pushToWorkList(LatticeVal &IV, Value *V) {
	    if (IV.isOverdefined())
	      return OverdefinedInstWorkList.push_back(V);
	    InstWorkList.push_back(V);
	  }

	  // markConstant - Make a value be marked as "constant". If the value
	  // is not already a constant, add it to the instruction work list so that
	  // the users of the instruction are updated later.
	  //
	  void markConstant(LatticeVal &IV, Value *V, Constant *C) {
	    if (!IV.markConstant(C)) return;
	    DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
	    pushToWorkList(IV, V);
	  }

	  void markConstant(Value *V, Constant *C) {
	    assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
	    markConstant(ValueState[V], V, C);
	  }

	  // markForcedConstant - Force the (non-struct) value V to the constant C and
	  // queue it so that its users are updated later.
	  void markForcedConstant(Value *V, Constant *C) {
	    assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
	    LatticeVal &IV = ValueState[V];
	    IV.markForcedConstant(C);
	    DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
	    pushToWorkList(IV, V);
	  }


	  // markOverdefined - Make a value be marked as "overdefined". If the
	  // value is not already overdefined, add it to the overdefined instruction
	  // work list so that the users of the instruction are updated later.
	  void markOverdefined(LatticeVal &IV, Value *V) {
	    if (!IV.markOverdefined()) return;

	    DEBUG(dbgs() << "markOverdefined: ";
	          if (auto *F = dyn_cast<Function>(V))
	            dbgs() << "Function '" << F->getName() << "'\n";
	          else
	            dbgs() << *V << '\n');
	    // Only instructions go on the work list
	    pushToWorkList(IV, V);
	  }

	  // mergeInValue - Merge MergeWithV into IV, the lattice value of V. Nothing
	  // happens if IV is already overdefined or MergeWithV is unknown; an
	  // overdefined or conflicting MergeWithV makes IV overdefined; an unknown IV
	  // takes MergeWithV's constant.
	  void mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
	    if (IV.isOverdefined() || MergeWithV.isUnknown())
	      return;  // Noop.
	    if (MergeWithV.isOverdefined())
	      return markOverdefined(IV, V);
	    if (IV.isUnknown())
	      return markConstant(IV, V, MergeWithV.getConstant());
	    if (IV.getConstant() != MergeWithV.getConstant())
	      return markOverdefined(IV, V);
	  }

	  void mergeInValue(Value *V, LatticeVal MergeWithV) {
	    assert(!V->getType()->isStructTy() &&
	           "non-structs should use markConstant");
	    mergeInValue(ValueState[V], V, MergeWithV);
	  }


	  /// getValueState - Return the LatticeVal object that corresponds to the
	  /// value. This function handles the case when the value hasn't been seen yet
	  /// by properly seeding constants etc.
	  LatticeVal &getValueState(Value *V) {
	    assert(!V->getType()->isStructTy() && "Should use getStructValueState");

	    std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I =
	        ValueState.insert(std::make_pair(V, LatticeVal()));
	    LatticeVal &LV = I.first->second;

	    if (!I.second)
	      return LV;  // Common case, already in the map.

	    if (auto *C = dyn_cast<Constant>(V)) {
	      // Undef values remain unknown.
	      if (!isa<UndefValue>(V))
	        LV.markConstant(C);          // Constants are constant
	    }

	    // All others are underdefined by default.
	    return LV;
	  }

	  /// getStructValueState - Return the LatticeVal object that corresponds to the
	  /// value/field pair. This function handles the case when the value hasn't
	  /// been seen yet by properly seeding constants etc.
	  LatticeVal &getStructValueState(Value *V, unsigned i) {
	    assert(V->getType()->isStructTy() && "Should use getValueState");
	    assert(i < cast<StructType>(V->getType())->getNumElements() &&
	           "Invalid element #");

	    std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator,
	              bool> I = StructValueState.insert(
	                  std::make_pair(std::make_pair(V, i), LatticeVal()));
	    LatticeVal &LV = I.first->second;

	    if (!I.second)
	      return LV;  // Common case, already in the map.

	    if (auto *C = dyn_cast<Constant>(V)) {
	      Constant *Elt = C->getAggregateElement(i);

	      if (!Elt)
	        LV.markOverdefined();      // Unknown sort of constant.
	      else if (isa<UndefValue>(Elt))
	        ; // Undef values remain unknown.
	      else
	        LV.markConstant(Elt);      // Constants are constant.
	    }

	    // All others are underdefined by default.
	    return LV;
	  }


	  /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
	  /// work list if it is not already executable.
	  void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
	    if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
	      return;  // This edge is already known to be executable!

	    if (!MarkBlockExecutable(Dest)) {
	      // If the destination is already executable, we just made an edge
	      // feasible that wasn't before.  Revisit the PHI nodes in the block
	      // because they have potentially new operands.
	      DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
	            << " -> " << Dest->getName() << '\n');

	      // Walk the leading run of PHI nodes; stop at the first non-PHI.
	      PHINode *PN;
	      for (BasicBlock::iterator I = Dest->begin();
	           (PN = dyn_cast<PHINode>(I)); ++I)
	        visitPHINode(*PN);
	    }
	  }

	  // getFeasibleSuccessors - Return a vector of booleans to indicate which
	  // successors are reachable from a given terminator instruction.
	  //
	  void getFeasibleSuccessors(TerminatorInst &TI, SmallVectorImpl<bool> &Succs);

	  // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
	  // block to the 'To' basic block is currently feasible.
	  //
	  bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);

	  // OperandChangedState - This method is invoked on all of the users of an
	  // instruction that was just changed state somehow.  Based on this
	  // information, we need to update the specified user of this instruction.
	  //
	  void OperandChangedState(Instruction *I) {
	    if (BBExecutable.count(I->getParent()))   // Inst is executable?
	      visit(*I);
	  }

	private:
	  friend class InstVisitor<SCCPSolver>;

	  // visit implementations - Something changed in this instruction.  Either an
	  // operand made a transition, or the instruction is newly executable.  Change
	  // the value type of I to reflect these changes if appropriate.
	  void visitPHINode(PHINode &I);

	  // Terminators
	  void visitReturnInst(ReturnInst &I);
	  void visitTerminatorInst(TerminatorInst &TI);

	  void visitCastInst(CastInst &I);
	  void visitSelectInst(SelectInst &I);
	  void visitBinaryOperator(Instruction &I);
	  void visitCmpInst(CmpInst &I);
	  void visitExtractValueInst(ExtractValueInst &EVI);
	  void visitInsertValueInst(InsertValueInst &IVI);
	  void visitCatchSwitchInst(CatchSwitchInst &CPI) {
	    markOverdefined(&CPI);
	    visitTerminatorInst(CPI);
	  }

	  // Instructions that cannot be folded away.
	  void visitStoreInst(StoreInst &I);
	  void visitLoadInst(LoadInst &I);
	  void visitGetElementPtrInst(GetElementPtrInst &I);
	  void visitCallInst(CallInst &I) {
	    visitCallSite(&I);
	  }
	  void visitInvokeInst(InvokeInst &II) {
	    visitCallSite(&II);
	    visitTerminatorInst(II);
	  }
	  void visitCallSite(CallSite CS);
	  void visitResumeInst(TerminatorInst &I) { /*returns void*/ }
	  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
	  void visitFenceInst(FenceInst &I) { /*returns void*/ }
	  void visitInstruction(Instruction &I) {
	    // All the instructions we don't do any special handling for just
	    // go to overdefined.
	    DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
	    markOverdefined(&I);
	  }
	};

	} // end anonymous namespace


	// getFeasibleSuccessors - Fill Succs with one boolean per successor of TI;
	// an entry is true when that successor is currently reachable. Succs is
	// resized to the successor count first. While the controlling value of a
	// conditional branch, switch or indirectbr is still unknown, no entry is set.
	//
	void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
	                                       SmallVectorImpl<bool> &Succs) {
	  const unsigned NumSuccs = TI.getNumSuccessors();
	  Succs.resize(NumSuccs);

	  if (auto *BI = dyn_cast<BranchInst>(&TI)) {
	    if (BI->isUnconditional()) {
	      Succs[0] = true;
	      return;
	    }

	    LatticeVal CondState = getValueState(BI->getCondition());
	    if (ConstantInt *CondC = CondState.getConstantInt()) {
	      // A constant condition selects exactly one successor.
	      Succs[CondC->isZero()] = true;
	    } else if (!CondState.isUnknown()) {
	      // Overdefined conditions, and unfoldable constant conditions, mean
	      // the branch could go either way.
	      Succs[1] = true;
	      Succs[0] = true;
	    }
	    return;
	  }

	  // Every successor of an unwinding instruction is executable.
	  if (TI.isExceptional()) {
	    Succs.assign(NumSuccs, true);
	    return;
	  }

	  if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
	    if (SI->getNumCases() == 0) {
	      Succs[0] = true;
	      return;
	    }

	    LatticeVal CondState = getValueState(SI->getCondition());
	    if (ConstantInt *CondC = CondState.getConstantInt()) {
	      Succs[SI->findCaseValue(CondC)->getSuccessorIndex()] = true;
	    } else if (!CondState.isUnknown()) {
	      // Overdefined condition: all destinations are executable.
	      Succs.assign(NumSuccs, true);
	    }
	    return;
	  }

	  // For an indirect branch whose address is a blockaddress, only the
	  // matching target is marked executable.
	  if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
	    // Casts are folded by visitCastInst.
	    LatticeVal AddrState = getValueState(IBR->getAddress());
	    BlockAddress *Addr = AddrState.getBlockAddress();
	    if (!Addr) {
	      // Overdefined address: all destinations are executable.
	      if (!AddrState.isUnknown())
	        Succs.assign(NumSuccs, true);
	      return;
	    }

	    BasicBlock *Target = Addr->getBasicBlock();
	    assert(Addr->getFunction() == Target->getParent() &&
	           "Block address of a different function ?");
	    for (unsigned Idx = 0; Idx < IBR->getNumSuccessors(); ++Idx) {
	      if (IBR->getDestination(Idx) != Target)
	        continue;
	      // This is the target.
	      Succs[Idx] = true;
	      return;
	    }

	    // If the destination is not in the IBR successor list, the program has
	    // undefined behavior. It is ok to assume no successor is executable.
	    return;
	  }

	  DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
	  llvm_unreachable("SCCP: Don't know how to handle this terminator!");
	}


	// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
	// block to the 'To' basic block is currently feasible.
	//
	bool SCCPSolver::isEdgeFeasible(BasicBlock From, BasicBlock To) {
	assert(BBExecutable.count(To) && "Dest should always be alive!");

	// Make sure the source basic block is executable!!
	if (!BBExecutable.count(From)) return false;

	// Check to make sure this edge itself is actually feasible now.
	TerminatorInst *TI = From->getTerminator();
	if (auto *BI = dyn_cast<BranchInst>(TI)) {
	if (BI->isUnconditional())
	return true;

	LatticeVal BCValue = getValueState(BI->getCondition());

	// Overdefined condition variables mean the branch could go either way,
	// undef conditions mean that neither edge is feasible yet.
	ConstantInt *CI = BCValue.getConstantInt();
	if (!CI)
	return !BCValue.isUnknown();

	// Constant condition variables mean the branch can only go a single way.
	return BI->getSuccessor(CI->isZero()) == To;
	}

	// Unwinding instructions successors are always executable.
	if (TI->isExceptional())
	return true;

	if (auto *SI = dyn_cast<SwitchInst>(TI)) {
	if (SI->getNumCases() < 1)
	return true;

	LatticeVal SCValue = getValueState(SI->getCondition());
	ConstantInt *CI = SCValue.getConstantInt();

	if (!CI)
	return !SCValue.isUnknown();

	return SI->findCaseValue(CI)->getCaseSuccessor() == To;
	}

	// In case of indirect branch and its address is a blockaddress, we mark
	// the target as executable.
	if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
	LatticeVal IBRValue = getValueState(IBR->getAddress());
	BlockAddress *Addr = IBRValue.getBlockAddress();

	if (!Addr)
	return !IBRValue.isUnknown();

	// At this point, the indirectbr is branching on a blockaddress.
	return Addr->getBasicBlock() == To;
	}

	DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
	llvm_unreachable("SCCP: Don't know how to handle this terminator!");
	}

	// visit Implementations - Something changed in this instruction, either an
	// operand made a transition, or the instruction is newly executable. Change
	// the value type of I to reflect these changes if appropriate. This method
	// makes sure to do the following actions:
	//
	// 1. If a phi node merges two constants in, and has conflicting value coming
	// from different branches, or if the PHI node merges in an overdefined
	// value, then the PHI node becomes overdefined.
	// 2. If a phi node merges only constants in, and they all agree on value, the
	// PHI node becomes a constant value equal to that.
	// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
	// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
	// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
	// 6. If a conditional branch has a value that is constant, make the selected
	// destination executable
	// 7. If a conditional branch has a value that is overdefined, make all
	// successors executable.
	//
	void SCCPSolver::visitPHINode(PHINode &PN) {
	  // Struct-typed PHIs are not tracked per element here; give up on them.
	  // TODO: We could do a lot better than this if code actually uses this.
	  if (PN.getType()->isStructTy())
	    return markOverdefined(&PN);

	  // Already overdefined: nothing further can change for this PHI.
	  if (getValueState(&PN).isOverdefined())
	    return;

	  // PHIs with a very large number of inputs are unlikely to ever fold to a
	  // constant and are slow to rescan, so treat them as overdefined up front.
	  const unsigned NumIncoming = PN.getNumIncomingValues();
	  if (NumIncoming > 64)
	    return markOverdefined(&PN);

	  // Merge every incoming value that arrives over a feasible edge:
	  //  - any overdefined input makes the PHI overdefined;
	  //  - two different constants make the PHI overdefined;
	  //  - a single agreed-upon constant becomes the PHI's value;
	  //  - with no usable inputs the PHI stays unknown.
	  Constant *Agreed = nullptr;
	  for (unsigned Idx = 0; Idx != NumIncoming; ++Idx) {
	    LatticeVal Incoming = getValueState(PN.getIncomingValue(Idx));

	    // Unknown inputs, and inputs on infeasible edges, do not influence the
	    // result.
	    if (Incoming.isUnknown() ||
	        !isEdgeFeasible(PN.getIncomingBlock(Idx), PN.getParent()))
	      continue;

	    if (Incoming.isOverdefined())
	      return markOverdefined(&PN);

	    Constant *C = Incoming.getConstant();
	    if (!Agreed)
	      Agreed = C; // First usable constant seen.
	    else if (C != Agreed)
	      return markOverdefined(&PN); // Conflicting constants.
	  }

	  // Either all usable inputs agreed on one constant, or there were none (in
	  // which case the PHI is left unknown).
	  if (Agreed)
	    markConstant(&PN, Agreed);
	}

	void SCCPSolver::visitReturnInst(ReturnInst &I) {
	  // 'ret void' has no value to merge.
	  if (I.getNumOperands() == 0)
	    return;

	  Function *Parent = I.getParent()->getParent();
	  Value *RetVal = I.getOperand(0);

	  // Non-struct return: if this function's return value is tracked, merge the
	  // returned value's state into it and stop.
	  if (!TrackedRetVals.empty() && !RetVal->getType()->isStructTy()) {
	    auto Tracked = TrackedRetVals.find(Parent);
	    if (Tracked != TrackedRetVals.end()) {
	      mergeInValue(Tracked->second, Parent, getValueState(RetVal));
	      return;
	    }
	  }

	  // Struct return: merge each element into the per-element tracked state, but
	  // only for functions registered as multi-return-value tracked.
	  if (TrackedMultipleRetVals.empty())
	    return;

	  auto *STy = dyn_cast<StructType>(RetVal->getType());
	  if (!STy)
	    return;
	  if (!MRVFunctionsTracked.count(Parent))
	    return;

	  for (unsigned Elt = 0, NumElts = STy->getNumElements(); Elt != NumElts;
	       ++Elt)
	    mergeInValue(TrackedMultipleRetVals[std::make_pair(Parent, Elt)], Parent,
	                 getStructValueState(RetVal, Elt));
	}

	void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
	  // Ask which successors are currently feasible; one flag per successor.
	  SmallVector<bool, 16> Feasible;
	  getFeasibleSuccessors(TI, Feasible);

	  BasicBlock *Src = TI.getParent();

	  // Every feasible successor edge becomes executable.
	  for (unsigned Succ = 0, N = Feasible.size(); Succ != N; ++Succ) {
	    if (!Feasible[Succ])
	      continue;
	    markEdgeExecutable(Src, TI.getSuccessor(Succ));
	  }
	}

	void SCCPSolver::visitCastInst(CastInst &I) {
	  LatticeVal SrcState = getValueState(I.getOperand(0));

	  // An overdefined operand makes the cast overdefined as well.
	  if (SrcState.isOverdefined()) {
	    markOverdefined(&I);
	    return;
	  }

	  // Operand not yet resolved: leave the cast untouched.
	  if (!SrcState.isConstant())
	    return;

	  // Constant operand: fold the cast. A fold that yields undef leaves the
	  // result state unchanged.
	  Constant *Folded = ConstantFoldCastOperand(
	      I.getOpcode(), SrcState.getConstant(), I.getType(), DL);
	  if (!isa<UndefValue>(Folded))
	    markConstant(&I, Folded);
	}


	void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
	  // Structs nested in structs are not tracked, and neither are extractions
	  // that go through more than one level of aggregate.
	  if (EVI.getType()->isStructTy() || EVI.getNumIndices() != 1)
	    return markOverdefined(&EVI);

	  // Only struct aggregates are tracked element-wise; anything else (i.e. an
	  // array) is overdefined.
	  Value *Agg = EVI.getAggregateOperand();
	  if (!Agg->getType()->isStructTy())
	    return markOverdefined(&EVI);

	  // Merge the state of the selected struct element into the result.
	  unsigned Field = *EVI.idx_begin();
	  LatticeVal FieldState = getStructValueState(Agg, Field);
	  mergeInValue(getValueState(&EVI), &EVI, FieldState);
	}

	void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
	  // Only single-index insertions into a struct are modelled; everything else
	  // is marked overdefined.
	  auto *STy = dyn_cast<StructType>(IVI.getType());
	  if (!STy || IVI.getNumIndices() != 1)
	    return markOverdefined(&IVI);

	  Value *Agg = IVI.getAggregateOperand();
	  const unsigned InsertIdx = *IVI.idx_begin();

	  // Compute the state of each element of the resulting struct.
	  for (unsigned Elt = 0, NumElts = STy->getNumElements(); Elt != NumElts;
	       ++Elt) {
	    if (Elt == InsertIdx) {
	      // This is the element being replaced: take the inserted value's state,
	      // unless it is itself a struct (structs in structs are not tracked).
	      Value *Inserted = IVI.getInsertedValueOperand();
	      if (Inserted->getType()->isStructTy()) {
	        markOverdefined(getStructValueState(&IVI, Elt), &IVI);
	      } else {
	        LatticeVal InsertedState = getValueState(Inserted);
	        mergeInValue(getStructValueState(&IVI, Elt), &IVI, InsertedState);
	      }
	      continue;
	    }

	    // Every other element passes through from the source aggregate.
	    LatticeVal Passthrough = getStructValueState(Agg, Elt);
	    mergeInValue(getStructValueState(&IVI, Elt), &IVI, Passthrough);
	  }
	}

	void SCCPSolver::visitSelectInst(SelectInst &I) {
	  // Struct-typed selects are not modelled; mark them overdefined.
	  // TODO: We could do a lot better than this if code actually uses this.
	  if (I.getType()->isStructTy())
	    return markOverdefined(&I);

	  // Wait until the condition has some state.
	  LatticeVal Cond = getValueState(I.getCondition());
	  if (Cond.isUnknown())
	    return;

	  // Known integer condition: the result is exactly the chosen arm.
	  if (ConstantInt *KnownCond = Cond.getConstantInt()) {
	    Value *Chosen;
	    if (KnownCond->isZero())
	      Chosen = I.getFalseValue();
	    else
	      Chosen = I.getTrueValue();
	    mergeInValue(&I, getValueState(Chosen));
	    return;
	  }

	  // The condition is overdefined (or a constant we cannot evaluate). The
	  // result may still be better than overdefined depending on the two arms.
	  LatticeVal TrueState = getValueState(I.getTrueValue());
	  LatticeVal FalseState = getValueState(I.getFalseValue());

	  // select ?, C, C -> C.
	  const bool BothConstant = TrueState.isConstant() && FalseState.isConstant();
	  if (BothConstant && TrueState.getConstant() == FalseState.getConstant())
	    return markConstant(&I, FalseState.getConstant());

	  // select ?, undef, X -> X.
	  if (TrueState.isUnknown())
	    return mergeInValue(&I, FalseState);

	  // select ?, X, undef -> X.
	  if (FalseState.isUnknown())
	    return mergeInValue(&I, TrueState);

	  markOverdefined(&I);
	}

	// Handle Binary Operators.
	//
	// Folds the operation when both operands are constant, waits while neither
	// operand is overdefined, and otherwise tries a few absorbing-element
	// identities (0 / Y, X & 0, X * 0, X | -1) before marking the result
	// overdefined.
	void SCCPSolver::visitBinaryOperator(Instruction &I) {
	  // Take copies of the operand states before grabbing a reference into
	  // ValueState for the result.
	  LatticeVal V1State = getValueState(I.getOperand(0));
	  LatticeVal V2State = getValueState(I.getOperand(1));

	  LatticeVal &IV = ValueState[&I];
	  if (IV.isOverdefined()) return;

	  if (V1State.isConstant() && V2State.isConstant()) {
	    Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
	                                    V2State.getConstant());
	    // X op Y -> undef.
	    if (isa<UndefValue>(C))
	      return;
	    return markConstant(IV, &I, C);
	  }

	  // If something is undef, wait for it to resolve.
	  if (!V1State.isOverdefined() && !V2State.isOverdefined())
	    return;

	  // Otherwise, one of our operands is overdefined. Try to produce something
	  // better than overdefined with some tricks.
	  // If this is 0 / Y, it doesn't matter that the second operand is
	  // overdefined, and we can replace it with zero.
	  if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
	    if (V1State.isConstant() && V1State.getConstant()->isNullValue())
	      return markConstant(IV, &I, V1State.getConstant());

	  // If this is:
	  // -> AND/MUL with 0
	  // -> OR with -1
	  // it doesn't matter that the other operand is overdefined.
	  if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
	      I.getOpcode() == Instruction::Or) {
	    // Pick whichever operand is not overdefined, if any.
	    LatticeVal *NonOverdefVal = nullptr;
	    if (!V1State.isOverdefined())
	      NonOverdefVal = &V1State;
	    else if (!V2State.isOverdefined())
	      NonOverdefVal = &V2State;

	    if (NonOverdefVal) {
	      // Still unknown: wait for it to resolve before deciding.
	      if (NonOverdefVal->isUnknown())
	        return;

	      if (I.getOpcode() == Instruction::And ||
	          I.getOpcode() == Instruction::Mul) {
	        // X and 0 = 0
	        // X * 0 = 0
	        if (NonOverdefVal->getConstant()->isNullValue())
	          return markConstant(IV, &I, NonOverdefVal->getConstant());
	      } else {
	        // X or -1 = -1
	        if (ConstantInt *CI = NonOverdefVal->getConstantInt())
	          if (CI->isMinusOne())
	            return markConstant(IV, &I, NonOverdefVal->getConstant());
	      }
	    }
	  }

	  markOverdefined(&I);
	}

	// Handle comparison instructions: fold when both operands are constant,
	// wait while neither operand is overdefined, otherwise go overdefined.
	void SCCPSolver::visitCmpInst(CmpInst &I) {
	  LatticeVal LHS = getValueState(I.getOperand(0));
	  LatticeVal RHS = getValueState(I.getOperand(1));

	  LatticeVal &Result = ValueState[&I];
	  if (Result.isOverdefined())
	    return;

	  if (LHS.isConstant() && RHS.isConstant()) {
	    Constant *Folded = ConstantExpr::getCompare(
	        I.getPredicate(), LHS.getConstant(), RHS.getConstant());
	    // A fold to undef leaves the result state as it is.
	    if (!isa<UndefValue>(Folded))
	      markConstant(Result, &I, Folded);
	    return;
	  }

	  // Only give up once at least one operand is overdefined; otherwise keep
	  // waiting for the unknown operand(s) to resolve.
	  if (LHS.isOverdefined() || RHS.isOverdefined())
	    markOverdefined(&I);
	}

	// Handle getelementptr instructions. When every operand is a known constant
	// the instruction is folded into a getelementptr ConstantExpr.
	//
	void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
	  if (ValueState[&I].isOverdefined())
	    return;

	  const unsigned NumOps = I.getNumOperands();
	  SmallVector<Constant *, 8> ConstOps;
	  ConstOps.reserve(NumOps);

	  // Collect the constant for each operand, bailing out as soon as one is
	  // unresolved (wait) or overdefined (result is overdefined).
	  for (unsigned OpIdx = 0; OpIdx != NumOps; ++OpIdx) {
	    LatticeVal OpState = getValueState(I.getOperand(OpIdx));
	    if (OpState.isUnknown())
	      return;
	    if (OpState.isOverdefined())
	      return markOverdefined(&I);

	    assert(OpState.isConstant() && "Unknown state!");
	    ConstOps.push_back(OpState.getConstant());
	  }

	  // Operand 0 is the base pointer; the remaining operands are the indices.
	  Constant *Base = ConstOps[0];
	  auto Idxs = makeArrayRef(ConstOps.begin() + 1, ConstOps.end());
	  Constant *Folded =
	      ConstantExpr::getGetElementPtr(I.getSourceElementType(), Base, Idxs);
	  if (!isa<UndefValue>(Folded))
	    markConstant(&I, Folded);
	}

	// Handle store instructions. A store to a tracked global merges the stored
	// value's state into the state recorded for that global; all other stores
	// are ignored.
	void SCCPSolver::visitStoreInst(StoreInst &SI) {
	  // If this store is of a struct, ignore it.
	  if (SI.getOperand(0)->getType()->isStructTy())
	    return;

	  // Only stores whose pointer operand is directly a global variable matter,
	  // and only while some globals are still being tracked.
	  if (TrackedGlobals.empty())
	    return;
	  auto *GV = dyn_cast<GlobalVariable>(SI.getOperand(1));
	  if (!GV)
	    return;

	  DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
	  if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;

	  // Get the value we are storing into the global, then merge it.
	  mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
	  if (I->second.isOverdefined())
	    TrackedGlobals.erase(I); // No need to keep tracking this!
	}


	// Handle load instructions. If the operand is a constant pointer to a constant
	// global, we can replace the load with the loaded constant value!
	// Loads from tracked globals take on the state recorded for that global.
	void SCCPSolver::visitLoadInst(LoadInst &I) {
	  // If this load is of a struct, just mark the result overdefined.
	  if (I.getType()->isStructTy())
	    return markOverdefined(&I);

	  LatticeVal PtrVal = getValueState(I.getOperand(0));
	  if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!

	  LatticeVal &IV = ValueState[&I];
	  if (IV.isOverdefined()) return;

	  // A non-constant pointer or a volatile access cannot be folded.
	  if (!PtrVal.isConstant() || I.isVolatile())
	    return markOverdefined(IV, &I);

	  Constant *Ptr = PtrVal.getConstant();

	  // load null is undefined.
	  if (isa<ConstantPointerNull>(Ptr) && I.getPointerAddressSpace() == 0)
	    return;

	  // Transform load (constant global) into the value loaded.
	  if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
	    if (!TrackedGlobals.empty()) {
	      // If we are tracking this global, merge in the known value for it.
	      DenseMap<GlobalVariable*, LatticeVal>::iterator It =
	          TrackedGlobals.find(GV);
	      if (It != TrackedGlobals.end()) {
	        mergeInValue(IV, &I, It->second);
	        return;
	      }
	    }
	  }

	  // Transform load from a constant into a constant if possible.
	  if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
	    // A fold to undef leaves the load's state unchanged.
	    if (isa<UndefValue>(C))
	      return;
	    return markConstant(IV, &I, C);
	  }

	  // Otherwise we cannot say for certain what value this load will produce.
	  // Bail out.
	  markOverdefined(IV, &I);
	}

	// Handle call and invoke sites.
	//
	// Calls to untracked callees (indirect, external, or not registered for
	// tracking) are constant-folded when possible and otherwise marked
	// overdefined. For tracked callees, actual arguments are merged into the
	// formal arguments and the tracked return value(s) are merged into the call's
	// result.
	void SCCPSolver::visitCallSite(CallSite CS) {
	  Function *F = CS.getCalledFunction();
	  Instruction *I = CS.getInstruction();

	  // The common case is that we aren't tracking the callee, either because we
	  // are not doing interprocedural analysis or the callee is indirect, or is
	  // external. Handle these cases first.
	  if (!F || F->isDeclaration()) {
	CallOverdefined:
	    // Void return and not tracking callee, just bail.
	    if (I->getType()->isVoidTy()) return;

	    // Otherwise, if we have a single return value case, and if the function is
	    // a declaration, maybe we can constant fold it.
	    if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
	        canConstantFoldCallTo(CS, F)) {

	      SmallVector<Constant*, 8> Operands;
	      for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
	           AI != E; ++AI) {
	        LatticeVal State = getValueState(*AI);

	        if (State.isUnknown())
	          return; // Operands are not resolved yet.
	        if (State.isOverdefined())
	          return markOverdefined(I);
	        assert(State.isConstant() && "Unknown state!");
	        Operands.push_back(State.getConstant());
	      }

	      if (getValueState(I).isOverdefined())
	        return;

	      // If we can constant fold this, mark the result of the call as a
	      // constant.
	      if (Constant *C = ConstantFoldCall(CS, F, Operands, TLI)) {
	        // call -> undef.
	        if (isa<UndefValue>(C))
	          return;
	        return markConstant(I, C);
	      }
	    }

	    // Otherwise, we don't know anything about this call, mark it overdefined.
	    return markOverdefined(I);
	  }

	  // If this is a local function that doesn't have its address taken, mark its
	  // entry block executable and merge in the actual arguments to the call into
	  // the formal arguments of the function.
	  if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
	    MarkBlockExecutable(&F->front());

	    // Propagate information from this call site into the callee.
	    CallSite::arg_iterator CAI = CS.arg_begin();
	    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
	         AI != E; ++AI, ++CAI) {
	      // If this argument is byval, and if the function is not readonly, there
	      // will be an implicit copy formed of the input aggregate.
	      if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
	        markOverdefined(&*AI);
	        continue;
	      }

	      // Note: `&*AI` yields the Argument the iterator refers to and `*CAI`
	      // the actual call operand; the iterators themselves are not values.
	      if (auto *STy = dyn_cast<StructType>(AI->getType())) {
	        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	          LatticeVal CallArg = getStructValueState(*CAI, i);
	          mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
	        }
	      } else {
	        mergeInValue(&*AI, getValueState(*CAI));
	      }
	    }
	  }

	  // Propagate the callee's tracked return value(s) into this call's result.
	  if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
	    // Struct return: requires the callee to be multi-return-value tracked.
	    if (!MRVFunctionsTracked.count(F))
	      goto CallOverdefined; // Not tracking this callee.

	    // If we are tracking this callee, propagate the result of the function
	    // into this call site.
	    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
	      mergeInValue(getStructValueState(I, i), I,
	                   TrackedMultipleRetVals[std::make_pair(F, i)]);
	  } else {
	    // Single/zero return value: see if we're tracking the function.
	    DenseMap<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
	    if (TFRVI == TrackedRetVals.end())
	      goto CallOverdefined; // Not tracking this callee.

	    // If so, propagate the return value of the callee into this call result.
	    mergeInValue(I, TFRVI->second);
	  }
	}

	// Run the solver to a fixed point: keep draining the overdefined-instruction,
	// instruction, and basic-block work lists until all three are empty.
	void SCCPSolver::Solve() {
	  // Process the work lists until they are empty!
	  while (!BBWorkList.empty() || !InstWorkList.empty() ||
	         !OverdefinedInstWorkList.empty()) {
	    // Process the overdefined instruction's work list first, which drives other
	    // things to overdefined more quickly.
	    while (!OverdefinedInstWorkList.empty()) {
	      Value *I = OverdefinedInstWorkList.pop_back_val();

	      DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');

	      // "I" got into the work list because it either made the transition from
	      // bottom to constant, or to overdefined.
	      //
	      // Anything on this worklist that is overdefined need not be visited
	      // since all of its users will have already been marked as overdefined
	      // Update all of the users of this instruction's value.
	      //
	      for (User *U : I->users())
	        if (auto *UI = dyn_cast<Instruction>(U))
	          OperandChangedState(UI);
	    }

	    // Process the instruction work list.
	    while (!InstWorkList.empty()) {
	      Value *I = InstWorkList.pop_back_val();

	      DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');

	      // "I" got into the work list because it made the transition from undef to
	      // constant.
	      //
	      // Anything on this worklist that is overdefined need not be visited
	      // since all of its users will have already been marked as overdefined.
	      // Update all of the users of this instruction's value.
	      //
	      // Struct-typed values are always propagated: their state is kept per
	      // element, so the scalar overdefined check is not applied to them.
	      if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
	        for (User *U : I->users())
	          if (auto *UI = dyn_cast<Instruction>(U))
	            OperandChangedState(UI);
	    }

	    // Process the basic block work list.
	    while (!BBWorkList.empty()) {
	      BasicBlock *BB = BBWorkList.back();
	      BBWorkList.pop_back();

	      DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');

	      // Notify all instructions in this basic block that they are newly
	      // executable.
	      visit(BB);
	    }
	  }
	}

	/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
	/// that branches on undef values cannot reach any of their successors.
	/// However, this is not a safe assumption. After we solve dataflow, this
	/// method should be used to handle this. If this returns true, the solver
	/// should be rerun.
	///
	/// This method handles this by finding an unresolved branch and marking one
	/// of the edges from the block as being feasible, even though the condition
	/// doesn't say it would otherwise be. This allows SCCP to find the rest of the
	/// CFG and only slightly pessimizes the analysis results (by marking one,
	/// potentially infeasible, edge feasible). This cannot usefully modify the
	/// constraints on the condition of the branch, as that would impact other users
	/// of the value.
	///
	/// This scan also checks for values that use undefs, whose results are actually
	/// defined. For example, 'zext i8 undef to i32' should produce all zeros
	/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
	/// even if X isn't defined.
	///
	/// At most one scalar value or terminator is resolved per call: the function
	/// returns true immediately after forcing it, so the caller can re-solve
	/// before the next one is considered.
	bool SCCPSolver::ResolvedUndefsIn(Function &F) {
	  for (BasicBlock &BB : F) {
	    // Only blocks the solver found executable are examined.
	    if (!BBExecutable.count(&BB))
	      continue;

	    for (Instruction &I : BB) {
	      // Look for instructions which produce undef values.
	      if (I.getType()->isVoidTy()) continue;

	      if (auto *STy = dyn_cast<StructType>(I.getType())) {
	        // Only a few things that can be structs matter for undef.

	        // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
	        if (CallSite CS = CallSite(&I))
	          if (Function *F = CS.getCalledFunction())
	            if (MRVFunctionsTracked.count(F))
	              continue;

	        // extractvalue and insertvalue don't need to be marked; they are
	        // tracked as precisely as their operands.
	        if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
	          continue;

	        // Send the results of everything else to overdefined. We could be
	        // more precise than this but it isn't worth bothering.
	        // Note that this path does not return true; scanning continues.
	        for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
	          LatticeVal &LV = getStructValueState(&I, i);
	          if (LV.isUnknown())
	            markOverdefined(LV, &I);
	        }
	        continue;
	      }

	      // Scalar results: only values that are still unknown need resolving.
	      LatticeVal &LV = getValueState(&I);
	      if (!LV.isUnknown()) continue;

	      // extractvalue is safe; check here because the argument is a struct.
	      if (isa<ExtractValueInst>(I))
	        continue;

	      // Compute the operand LatticeVals, for convenience below.
	      // Anything taking a struct is conservatively assumed to require
	      // overdefined markings.
	      if (I.getOperand(0)->getType()->isStructTy()) {
	        markOverdefined(&I);
	        return true;
	      }
	      LatticeVal Op0LV = getValueState(I.getOperand(0));
	      LatticeVal Op1LV;
	      if (I.getNumOperands() == 2) {
	        if (I.getOperand(1)->getType()->isStructTy()) {
	          markOverdefined(&I);
	          return true;
	        }

	        Op1LV = getValueState(I.getOperand(1));
	      }
	      // If this is an instruction whose result is defined even if the input is
	      // not fully defined, propagate the information.
	      Type *ITy = I.getType();
	      switch (I.getOpcode()) {
	      case Instruction::Add:
	      case Instruction::Sub:
	      case Instruction::Trunc:
	      case Instruction::FPTrunc:
	      case Instruction::BitCast:
	        break; // Any undef -> undef
	      case Instruction::FSub:
	      case Instruction::FAdd:
	      case Instruction::FMul:
	      case Instruction::FDiv:
	      case Instruction::FRem:
	        // Floating-point binary operation: be conservative.
	        if (Op0LV.isUnknown() && Op1LV.isUnknown())
	          markForcedConstant(&I, Constant::getNullValue(ITy));
	        else
	          markOverdefined(&I);
	        return true;
	      case Instruction::ZExt:
	      case Instruction::SExt:
	      case Instruction::FPToUI:
	      case Instruction::FPToSI:
	      case Instruction::FPExt:
	      case Instruction::PtrToInt:
	      case Instruction::IntToPtr:
	      case Instruction::SIToFP:
	      case Instruction::UIToFP:
	        // undef -> 0; some outputs are impossible
	        markForcedConstant(&I, Constant::getNullValue(ITy));
	        return true;
	      case Instruction::Mul:
	      case Instruction::And:
	        // Both operands undef -> undef
	        if (Op0LV.isUnknown() && Op1LV.isUnknown())
	          break;
	        // undef * X -> 0. X could be zero.
	        // undef & X -> 0. X could be zero.
	        markForcedConstant(&I, Constant::getNullValue(ITy));
	        return true;

	      case Instruction::Or:
	        // Both operands undef -> undef
	        if (Op0LV.isUnknown() && Op1LV.isUnknown())
	          break;
	        // undef | X -> -1. X could be -1.
	        markForcedConstant(&I, Constant::getAllOnesValue(ITy));
	        return true;

	      case Instruction::Xor:
	        // undef ^ undef -> 0; strictly speaking, this is not strictly
	        // necessary, but we try to be nice to people who expect this
	        // behavior in simple cases
	        if (Op0LV.isUnknown() && Op1LV.isUnknown()) {
	          markForcedConstant(&I, Constant::getNullValue(ITy));
	          return true;
	        }
	        // undef ^ X -> undef
	        break;

	      case Instruction::SDiv:
	      case Instruction::UDiv:
	      case Instruction::SRem:
	      case Instruction::URem:
	        // X / undef -> undef. No change.
	        // X % undef -> undef. No change.
	        if (Op1LV.isUnknown()) break;

	        // X / 0 -> undef. No change.
	        // X % 0 -> undef. No change.
	        if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue())
	          break;

	        // undef / X -> 0. X could be maxint.
	        // undef % X -> 0. X could be 1.
	        markForcedConstant(&I, Constant::getNullValue(ITy));
	        return true;

	      case Instruction::AShr:
	        // X >>a undef -> undef.
	        if (Op1LV.isUnknown()) break;

	        // Shifting by the bitwidth or more is undefined.
	        if (Op1LV.isConstant()) {
	          if (auto *ShiftAmt = Op1LV.getConstantInt())
	            if (ShiftAmt->getLimitedValue() >=
	                ShiftAmt->getType()->getScalarSizeInBits())
	              break;
	        }

	        // undef >>a X -> 0
	        markForcedConstant(&I, Constant::getNullValue(ITy));
	        return true;
	      case Instruction::LShr:
	      case Instruction::Shl:
	        // X << undef -> undef.
	        // X >> undef -> undef.
	        if (Op1LV.isUnknown()) break;

	        // Shifting by the bitwidth or more is undefined.
	        if (Op1LV.isConstant()) {
	          if (auto *ShiftAmt = Op1LV.getConstantInt())
	            if (ShiftAmt->getLimitedValue() >=
	                ShiftAmt->getType()->getScalarSizeInBits())
	              break;
	        }

	        // undef << X -> 0
	        // undef >> X -> 0
	        markForcedConstant(&I, Constant::getNullValue(ITy));
	        return true;
	      case Instruction::Select:
	        // Select has three operands, so Op1LV was not filled in above.
	        Op1LV = getValueState(I.getOperand(1));
	        // undef ? X : Y -> X or Y. There could be commonality between X/Y.
	        if (Op0LV.isUnknown()) {
	          if (!Op1LV.isConstant()) // Pick the constant one if there is any.
	            Op1LV = getValueState(I.getOperand(2));
	        } else if (Op1LV.isUnknown()) {
	          // c ? undef : undef -> undef. No change.
	          Op1LV = getValueState(I.getOperand(2));
	          if (Op1LV.isUnknown())
	            break;
	          // Otherwise, c ? undef : x -> x.
	        } else {
	          // Leave Op1LV as Operand(1)'s LatticeValue.
	        }

	        if (Op1LV.isConstant())
	          markForcedConstant(&I, Op1LV.getConstant());
	        else
	          markOverdefined(&I);
	        return true;
	      case Instruction::Load:
	        // A load here means one of two things: a load of undef from a global,
	        // a load from an unknown pointer. Either way, having it return undef
	        // is okay.
	        break;
	      case Instruction::ICmp:
	        // X == undef -> undef. Other comparisons get more complicated.
	        if (cast<ICmpInst>(&I)->isEquality())
	          break;
	        markOverdefined(&I);
	        return true;
	      case Instruction::Call:
	      case Instruction::Invoke: {
	        // There are two reasons a call can have an undef result
	        // 1. It could be tracked.
	        // 2. It could be constant-foldable.
	        // Because of the way we solve return values, tracked calls must
	        // never be marked overdefined in ResolvedUndefsIn.
	        if (Function *F = CallSite(&I).getCalledFunction())
	          if (TrackedRetVals.count(F))
	            break;

	        // If the call is constant-foldable, we mark it overdefined because
	        // we do not know what return values are valid.
	        markOverdefined(&I);
	        return true;
	      }
	      default:
	        // If we don't know what should happen here, conservatively mark it
	        // overdefined.
	        markOverdefined(&I);
	        return true;
	      }
	    }

	    // Check to see if we have a branch or switch on an undefined value. If so
	    // we force the branch to go one way or the other to make the successor
	    // values live. It doesn't really matter which way we force it.
	    TerminatorInst *TI = BB.getTerminator();
	    if (auto *BI = dyn_cast<BranchInst>(TI)) {
	      if (!BI->isConditional()) continue;
	      if (!getValueState(BI->getCondition()).isUnknown())
	        continue;

	      // If the input to SCCP is actually branch on undef, fix the undef to
	      // false.
	      if (isa<UndefValue>(BI->getCondition())) {
	        BI->setCondition(ConstantInt::getFalse(BI->getContext()));
	        markEdgeExecutable(&BB, TI->getSuccessor(1));
	        return true;
	      }

	      // Otherwise, it is a branch on a symbolic value which is currently
	      // considered to be undef. Handle this by forcing the input value to the
	      // branch to false.
	      markForcedConstant(BI->getCondition(),
	                         ConstantInt::getFalse(TI->getContext()));
	      return true;
	    }

	    if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
	      // Indirect branch with no successor ?. Its ok to assume it branches
	      // to no target.
	      if (IBR->getNumSuccessors() < 1)
	        continue;

	      if (!getValueState(IBR->getAddress()).isUnknown())
	        continue;

	      // If the input to SCCP is actually branch on undef, fix the undef to
	      // the first successor of the indirect branch.
	      if (isa<UndefValue>(IBR->getAddress())) {
	        IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
	        markEdgeExecutable(&BB, IBR->getSuccessor(0));
	        return true;
	      }

	      // Otherwise, it is a branch on a symbolic value which is currently
	      // considered to be undef. Handle this by forcing the input value to the
	      // branch to the first successor.
	      markForcedConstant(IBR->getAddress(),
	                         BlockAddress::get(IBR->getSuccessor(0)));
	      return true;
	    }

	    if (auto *SI = dyn_cast<SwitchInst>(TI)) {
	      // A switch with no cases, or with a resolved condition, needs nothing.
	      if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
	        continue;

	      // If the input to SCCP is actually switch on undef, fix the undef to
	      // the first constant.
	      if (isa<UndefValue>(SI->getCondition())) {
	        SI->setCondition(SI->case_begin()->getCaseValue());
	        markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
	        return true;
	      }

	      // Otherwise force the symbolic condition to the first case's value.
	      markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
	      return true;
	    }
	  }

	  return false;
	}

	// Replace every use of V with the constant the solver computed for it.
	//
	// Returns false, leaving V untouched, when V (or any element of a
	// struct-typed V) is overdefined. Values or elements that are not constant
	// are replaced with undef of the appropriate type. Returns true after the
	// uses have been rewritten.
	static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
	  Constant *Const = nullptr;
	  if (V->getType()->isStructTy()) {
	    std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
	    if (any_of(IVs, [](const LatticeVal &LV) { return LV.isOverdefined(); }))
	      return false;
	    std::vector<Constant *> ConstVals;
	    // The type was just checked to be a struct, so cast<> (which asserts)
	    // is the right tool rather than an unchecked dyn_cast<>.
	    auto *ST = cast<StructType>(V->getType());
	    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
	      // Named EltLV so that it does not shadow the parameter V.
	      LatticeVal EltLV = IVs[i];
	      ConstVals.push_back(EltLV.isConstant()
	                              ? EltLV.getConstant()
	                              : UndefValue::get(ST->getElementType(i)));
	    }
	    Const = ConstantStruct::get(ST, ConstVals);
	  } else {
	    LatticeVal IV = Solver.getLatticeValueFor(V);
	    if (IV.isOverdefined())
	      return false;
	    Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
	  }
	  assert(Const && "Constant is nullptr here!");
	  // Dereference both so the values themselves are printed, not their
	  // addresses.
	  DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');

	  // Replaces all of the uses of a variable with uses of the constant.
	  V->replaceAllUsesWith(Const);
	  return true;
	}

	// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
	// and return true if the function was modified.
	//
	// The solver is seeded with the entry block executable and every argument
	// overdefined, then run repeatedly until ResolvedUndefsIn reports nothing
	// left to resolve. Afterwards the contents of non-executable blocks are
	// removed and instructions in executable blocks are replaced by their
	// computed constants where possible.
	//
	static bool runSCCP(Function &F, const DataLayout &DL,
	                    const TargetLibraryInfo *TLI) {
	  DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
	  SCCPSolver Solver(DL, TLI);

	  // Mark the first block of the function as being executable.
	  Solver.MarkBlockExecutable(&F.front());

	  // Mark all arguments to the function as being overdefined.
	  for (Argument &AI : F.args())
	    Solver.markOverdefined(&AI);

	  // Solve for constants.
	  bool ResolvedUndefs = true;
	  while (ResolvedUndefs) {
	    Solver.Solve();
	    DEBUG(dbgs() << "RESOLVING UNDEFs\n");
	    ResolvedUndefs = Solver.ResolvedUndefsIn(F);
	  }

	  bool MadeChanges = false;

	  // If we decided that there are basic blocks that are dead in this function,
	  // delete their contents now. Note that we cannot actually delete the blocks,
	  // as we cannot modify the CFG of the function.

	  for (BasicBlock &BB : F) {
	    if (!Solver.isBlockExecutable(&BB)) {
	      DEBUG(dbgs() << " BasicBlock Dead:" << BB);

	      ++NumDeadBlocks;
	      NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB);

	      MadeChanges = true;
	      continue;
	    }

	    // Iterate over all of the instructions in a function, replacing them with
	    // constants if we have found them to be of constant values.
	    //
	    for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
	      // Advance the iterator before touching the instruction, because the
	      // instruction may be erased below. `&*BI++` yields a pointer to the
	      // instruction the iterator referred to before the increment.
	      Instruction *Inst = &*BI++;
	      if (Inst->getType()->isVoidTy() || isa<TerminatorInst>(Inst))
	        continue;

	      if (tryToReplaceWithConstant(Solver, Inst)) {
	        if (isInstructionTriviallyDead(Inst))
	          Inst->eraseFromParent();
	        // Hey, we just changed something!
	        MadeChanges = true;
	        ++NumInstRemoved;
	      }
	    }
	  }

	  return MadeChanges;
	}

	// New pass manager entry point: run SCCP on F and report which analyses
	// remain valid afterwards.
	PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
	  const DataLayout &DL = F.getParent()->getDataLayout();
	  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);

	  const bool Changed = runSCCP(F, DL, &TLI);
	  if (!Changed)
	    return PreservedAnalyses::all(); // Nothing was modified.

	  // The function was modified; only GlobalsAA is declared preserved.
	  PreservedAnalyses Preserved;
	  Preserved.preserve<GlobalsAA>();
	  return Preserved;
	}

	namespace {
	//===--------------------------------------------------------------------===//
	//
	/// SCCPLegacyPass - FunctionPass wrapper that drives the SCCPSolver through
	/// runSCCP() to perform per-function Sparse Conditional Constant Propagation.
	///
	class SCCPLegacyPass : public FunctionPass {
	public:
	  static char ID; // Pass identification, replacement for typeid

	  SCCPLegacyPass() : FunctionPass(ID) {
	    initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
	  }

	  // Requires target library info; declares GlobalsAA as preserved.
	  void getAnalysisUsage(AnalysisUsage &AU) const override {
	    AU.addRequired<TargetLibraryInfoWrapperPass>();
	    AU.addPreserved<GlobalsAAWrapperPass>();
	  }

	  // runOnFunction - Run the Sparse Conditional Constant Propagation
	  // algorithm, and return true if the function was modified.
	  //
	  bool runOnFunction(Function &F) override {
	    if (skipFunction(F))
	      return false;

	    const DataLayout &DL = F.getParent()->getDataLayout();
	    const TargetLibraryInfo &TLI =
	        getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
	    return runSCCP(F, DL, &TLI);
	  }
	};
	} // end anonymous namespace

	// Definition of the static pass identifier declared in SCCPLegacyPass.
	char SCCPLegacyPass::ID = 0;
	// Pass registration for SCCPLegacyPass under the command-line name "sccp",
	// with TargetLibraryInfoWrapperPass listed as a dependency (it is the
	// analysis the pass requires in getAnalysisUsage).
	// NOTE(review): the meaning of the two trailing `false` arguments is defined
	// by the INITIALIZE_PASS_* macros, which are not visible here -- confirm
	// against their definition.
	INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
	"Sparse Conditional Constant Propagation", false, false)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
	"Sparse Conditional Constant Propagation", false, false)

	// createSCCPPass - This is the public interface to this file.
	FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }

	/// Return true if some use of \p GV may expose its address or otherwise
	/// cannot be reasoned about:
	///  - a store that writes GV itself as the stored value, or any volatile
	///    store involving GV;
	///  - a call/invoke where GV is used as anything other than the callee;
	///  - a volatile load;
	///  - any other kind of user, except BlockAddress (which takes the address
	///    of a label, not of the function).
	/// Returns false when every use is one of the benign forms.
	static bool AddressIsTaken(const GlobalValue *GV) {
	  // Delete any dead constantexpr klingons.
	  GV->removeDeadConstantUsers();

	  for (const Use &U : GV->uses()) {
	    const User *UR = U.getUser();
	    if (const auto *SI = dyn_cast<StoreInst>(UR)) {
	      if (SI->getOperand(0) == GV || SI->isVolatile())
	        return true; // Storing addr of GV.
	    } else if (isa<InvokeInst>(UR) || isa<CallInst>(UR)) {
	      // Make sure we are calling the function, not passing the address.
	      ImmutableCallSite CS(cast<Instruction>(UR));
	      if (!CS.isCallee(&U))
	        return true;
	    } else if (const auto *LI = dyn_cast<LoadInst>(UR)) {
	      if (LI->isVolatile())
	        return true;
	    } else if (isa<BlockAddress>(UR)) {
	      // blockaddress doesn't take the address of the function, it takes addr
	      // of label.
	    } else {
	      return true;
	    }
	  }
	  return false;
	}

	/// Collect into \p ReturnsToZap every return instruction of \p F whose
	/// returned operand is not already undef.
	///
	/// Nothing is collected unless F has local linkage and is absent from
	/// \p AddressTakenFunctions. Operand 0 of each return is read
	/// unconditionally, so callers must not pass functions returning void.
	static void findReturnsToZap(Function &F,
	                             SmallPtrSet<Function *, 32> &AddressTakenFunctions,
	                             SmallVector<ReturnInst *, 8> &ReturnsToZap) {
	  // We can only do this if we know that nothing else can call the function.
	  if (!F.hasLocalLinkage() || AddressTakenFunctions.count(&F))
	    return;

	  for (BasicBlock &BB : F)
	    if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
	      if (!isa<UndefValue>(RI->getOperand(0)))
	        ReturnsToZap.push_back(RI);
	}

	/// Run interprocedural SCCP over module \p M.
	///
	/// Phases, in order:
	///  1. Seed the solver: register tracked functions, mark entry blocks of
	///     externally reachable functions executable and their arguments
	///     overdefined, and register trackable global variables.
	///  2. Iterate Solve()/ResolvedUndefsIn() until no more undefs are resolved.
	///  3. Replace arguments and instructions with the constants found, turn
	///     non-executable blocks into unreachable and erase them.
	///  4. Rewrite returns of functions with known return values to return undef.
	///  5. Delete tracked globals (and the stores that remain to them).
	///
	/// Returns MadeChanges, which this function sets only when a dead block is
	/// cut or an instruction is replaced with a constant.
	static bool runIPSCCP(Module &M, const DataLayout &DL,
	const TargetLibraryInfo *TLI) {
	SCCPSolver Solver(DL, TLI);

	// AddressTakenFunctions - This set keeps track of the address-taken functions
	// that are in the input. As IPSCCP runs through and simplifies code,
	// functions that were address taken can end up losing their
	// address-taken-ness. Because of this, we keep track of their addresses from
	// the first pass so we can use them for the later simplification pass.
	SmallPtrSet<Function*, 32> AddressTakenFunctions;

	// Loop over all functions, marking arguments to those with their addresses
	// taken or that are external as overdefined.
	//
	for (Function &F : M) {
	if (F.isDeclaration())
	continue;

	// If this is an exact definition of this function, then we can propagate
	// information about its result into callsites of it.
	// Don't touch naked functions. They may contain asm returning a
	// value we don't see, so we may end up interprocedurally propagating
	// the return value incorrectly.
	if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked))
	Solver.AddTrackedFunction(&F);

	// If this function only has direct calls that we can see, we can track its
	// arguments and return value aggressively, and can assume it is not called
	// unless we see evidence to the contrary.
	if (F.hasLocalLinkage()) {
	if (F.hasAddressTaken()) {
	AddressTakenFunctions.insert(&F);
	}
	else {
	Solver.AddArgumentTrackedFunction(&F);
	continue;
	}
	}

	// Assume the function is called.
	Solver.MarkBlockExecutable(&F.front());

	// Assume nothing about the incoming arguments.
	for (Argument &AI : F.args())
	Solver.markOverdefined(&AI);
	}

	// Loop over global variables. We inform the solver about any internal global
	// variables that do not have their 'addresses taken'. If they don't have
	// their addresses taken, we can propagate constants through them.
	for (GlobalVariable &G : M.globals())
	- if (!G.isConstant() && G.hasLocalLinkage() && !AddressIsTaken(&G))
	+ if (!G.isConstant() && G.hasLocalLinkage() &&
	+ G.hasDefinitiveInitializer() && !AddressIsTaken(&G))
	Solver.TrackValueOfGlobalVariable(&G);

	// Solve for constants.
	bool ResolvedUndefs = true;
	while (ResolvedUndefs) {
	Solver.Solve();

	DEBUG(dbgs() << "RESOLVING UNDEFS\n");
	ResolvedUndefs = false;
	for (Function &F : M)
	ResolvedUndefs |= Solver.ResolvedUndefsIn(F);
	}

	bool MadeChanges = false;

	// Iterate over all of the instructions in the module, replacing them with
	// constants if we have found them to be of constant values.
	//
	SmallVector<BasicBlock*, 512> BlocksToErase;

	for (Function &F : M) {
	if (F.isDeclaration())
	continue;

	if (Solver.isBlockExecutable(&F.front()))
	for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;
	++AI)
	if (!AI->use_empty() && tryToReplaceWithConstant(Solver, &*AI))
	++IPNumArgsElimed;

	for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
	if (!Solver.isBlockExecutable(&*BB)) {
	DEBUG(dbgs() << " BasicBlock Dead:" << *BB);

	++NumDeadBlocks;
	NumInstRemoved +=
	changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false);

	MadeChanges = true;

	// The entry block is never queued for erasure.
	if (&*BB != &F.front())
	BlocksToErase.push_back(&*BB);
	continue;
	}

	for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
	// Advance the iterator before Inst can be erased below.
	Instruction *Inst = &*BI++;
	if (Inst->getType()->isVoidTy())
	continue;
	if (tryToReplaceWithConstant(Solver, Inst)) {
	if (!isa<CallInst>(Inst) && !isa<TerminatorInst>(Inst))
	Inst->eraseFromParent();
	// Hey, we just changed something!
	MadeChanges = true;
	++IPNumInstRemoved;
	}
	}
	}

	// Now that all instructions in the function are constant folded, erase dead
	// blocks, because we can now use ConstantFoldTerminator to get rid of
	// in-edges.
	for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
	// If there are any PHI nodes in this successor, drop entries for BB now.
	BasicBlock *DeadBB = BlocksToErase[i];
	for (Value::user_iterator UI = DeadBB->user_begin(),
	UE = DeadBB->user_end();
	UI != UE;) {
	// Grab the user and then increment the iterator early, as the user
	// will be deleted. Step past all adjacent uses from the same user.
	auto *I = dyn_cast<Instruction>(*UI);
	do { ++UI; } while (UI != UE && *UI == I);

	// Ignore blockaddress users; BasicBlock's dtor will handle them.
	if (!I) continue;

	bool Folded = ConstantFoldTerminator(I->getParent());
	assert(Folded &&
	"Expect TermInst on constantint or blockaddress to be folded");
	(void) Folded;
	}

	// Finally, delete the basic block.
	F.getBasicBlockList().erase(DeadBB);
	}
	BlocksToErase.clear();
	}

	// If we inferred constant or undef return values for a function, we replaced
	// all call uses with the inferred value. This means we don't need to bother
	// actually returning anything from the function. Replace all return
	// instructions with return undef.
	//
	// Do this in two stages: first identify the functions we should process, then
	// actually zap their returns. This is important because we can only do this
	// if the address of the function isn't taken. In cases where a return is the
	// last use of a function, the order of processing functions would affect
	// whether other functions are optimizable.
	SmallVector<ReturnInst*, 8> ReturnsToZap;

	const DenseMap<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
	for (const auto &I : RV) {
	Function *F = I.first;
	if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
	continue;
	findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
	}

	for (const auto &F : Solver.getMRVFunctionsTracked()) {
	assert(F->getReturnType()->isStructTy() &&
	"The return type should be a struct");
	StructType *STy = cast<StructType>(F->getReturnType());
	if (Solver.isStructLatticeConstant(F, STy))
	findReturnsToZap(*F, AddressTakenFunctions, ReturnsToZap);
	}

	// Zap all returns which we've identified as zap to change.
	for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
	Function *F = ReturnsToZap[i]->getParent()->getParent();
	ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
	}

	// If we inferred constant or undef values for global variables, we can
	// delete the global and any stores that remain to it.
	const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
	for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
	E = TG.end(); I != E; ++I) {
	GlobalVariable *GV = I->first;
	assert(!I->second.isOverdefined() &&
	"Overdefined values should have been taken out of the map!");
	DEBUG(dbgs() << "Found that GV '" << GV->getName() << "' is constant!\n");
	// Every remaining user is cast to StoreInst, i.e. only stores are expected
	// to be left at this point.
	while (!GV->use_empty()) {
	StoreInst *SI = cast<StoreInst>(GV->user_back());
	SI->eraseFromParent();
	}
	M.getGlobalList().erase(GV);
	++IPNumGlobalConst;
	}

	return MadeChanges;
	}

	PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
	  // Collect the module-level inputs for the interprocedural solver.
	  const DataLayout &Layout = M.getDataLayout();
	  auto &LibInfo = AM.getResult<TargetLibraryAnalysis>(M);

	  // A changed module preserves nothing; an untouched one preserves everything.
	  const bool Changed = runIPSCCP(M, Layout, &LibInfo);
	  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
	}

	namespace {
	//===--------------------------------------------------------------------===//
	//
	/// IPSCCP Class - This class implements interprocedural Sparse Conditional
	/// Constant Propagation.
	///
	class IPSCCPLegacyPass : public ModulePass {
	public:
	static char ID;

	IPSCCPLegacyPass() : ModulePass(ID) {
	initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
	}

	bool runOnModule(Module &M) override {
	if (skipModule(M))
	return false;
	const DataLayout &DL = M.getDataLayout();
	const TargetLibraryInfo *TLI =
	&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
	return runIPSCCP(M, DL, TLI);
	}

	void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.addRequired<TargetLibraryInfoWrapperPass>();
	}
	};
	} // end anonymous namespace

	// Definition of the static pass identifier declared in IPSCCPLegacyPass,
	// followed by the pass initialization macros. The pass is set up with the
	// argument string "ipsccp" and names TargetLibraryInfoWrapperPass as a
	// dependency, matching what getAnalysisUsage() requires.
	char IPSCCPLegacyPass::ID = 0;
	INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
	"Interprocedural Sparse Conditional Constant Propagation",
	false, false)
	INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
	INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
	"Interprocedural Sparse Conditional Constant Propagation",
	false, false)

	// createIPSCCPPass - This is the public interface to this file. Returns a
	// newly allocated IPSCCPLegacyPass.
	ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
	Index: head/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp
	===================================================================
	--- head/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp (revision 322319)
	+++ head/contrib/llvm/lib/Transforms/Utils/CloneModule.cpp (revision 322320)
	@@ -1,200 +1,201 @@
	//===- CloneModule.cpp - Clone an entire module ---------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the CloneModule interface which makes a copy of an
	// entire module.
	//
	//===----------------------------------------------------------------------===//

	#include "llvm-c/Core.h"
	#include "llvm/IR/Constant.h"
	#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Module.h"
	#include "llvm/Transforms/Utils/Cloning.h"
	#include "llvm/Transforms/Utils/ValueMapper.h"
	using namespace llvm;

	static void copyComdat(GlobalObject Dst, const GlobalObject Src) {
	const Comdat *SC = Src->getComdat();
	if (!SC)
	return;
	Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
	DC->setSelectionKind(SC->getSelectionKind());
	Dst->setComdat(DC);
	}

	/// This is not as easy as it might seem because we have to worry about making
	/// copies of global variables and functions, and making their (initializers and
	/// references, respectively) refer to the right globals.
	///
	std::unique_ptr<Module> llvm::CloneModule(const Module *M) {
	  // The caller does not want the old-to-new value mapping, so use a local
	  // scratch map and delegate to the overload that takes one.
	  ValueToValueMapTy ScratchMap;
	  return CloneModule(M, ScratchMap);
	}

	std::unique_ptr<Module> llvm::CloneModule(const Module *M,
	                                          ValueToValueMapTy &VMap) {
	  // With no filter supplied, every global's definition is cloned.
	  auto CloneEverything = [](const GlobalValue *GV) { return true; };
	  return CloneModule(M, VMap, CloneEverything);
	}

	/// Clone \p M into a new module, recording every old-to-new global mapping
	/// in \p VMap.
	///
	/// \p ShouldCloneDefinition is queried per global: when it returns false the
	/// global is left as an external declaration in the new module (aliases are
	/// replaced by an external function or variable of the alias' value type).
	///
	/// The work is done in two sweeps: first create empty shells for all global
	/// variables, functions and aliases so that they can reference each other,
	/// then fill in initializers, bodies, aliasees and named metadata.
	std::unique_ptr<Module> llvm::CloneModule(
	const Module *M, ValueToValueMapTy &VMap,
	function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
	// First off, we need to create the new module.
	std::unique_ptr<Module> New =
	llvm::make_unique<Module>(M->getModuleIdentifier(), M->getContext());
	New->setDataLayout(M->getDataLayout());
	New->setTargetTriple(M->getTargetTriple());
	New->setModuleInlineAsm(M->getModuleInlineAsm());

	// Loop over all of the global variables, making corresponding globals in the
	// new module. Here we add them to the VMap and to the new Module. We
	// don't worry about attributes or initializers, they will come later.
	//
	for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
	I != E; ++I) {
	GlobalVariable *GV = new GlobalVariable(*New,
	I->getValueType(),
	I->isConstant(), I->getLinkage(),
	(Constant*) nullptr, I->getName(),
	(GlobalVariable*) nullptr,
	I->getThreadLocalMode(),
	I->getType()->getAddressSpace());
	GV->copyAttributesFrom(&*I);
	VMap[&*I] = GV;
	}

	// Loop over the functions in the module, making external functions as before
	for (const Function &I : *M) {
	Function *NF = Function::Create(cast<FunctionType>(I.getValueType()),
	I.getLinkage(), I.getName(), New.get());
	NF->copyAttributesFrom(&I);
	VMap[&I] = NF;
	}

	// Loop over the aliases in the module
	for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
	I != E; ++I) {
	if (!ShouldCloneDefinition(&*I)) {
	// An alias cannot act as an external reference, so we need to create
	// either a function or a global variable depending on the value type.
	// FIXME: Once pointee types are gone we can probably pick one or the
	// other.
	GlobalValue *GV;
	if (I->getValueType()->isFunctionTy())
	GV = Function::Create(cast<FunctionType>(I->getValueType()),
	GlobalValue::ExternalLinkage, I->getName(),
	New.get());
	else
	GV = new GlobalVariable(
	*New, I->getValueType(), false, GlobalValue::ExternalLinkage,
	nullptr, I->getName(), nullptr,
	I->getThreadLocalMode(), I->getType()->getAddressSpace());
	VMap[&*I] = GV;
	// We do not copy attributes (mainly because copying between different
	// kinds of globals is forbidden), but this is generally not required for
	// correctness.
	continue;
	}
	auto *GA = GlobalAlias::create(I->getValueType(),
	I->getType()->getPointerAddressSpace(),
	I->getLinkage(), I->getName(), New.get());
	GA->copyAttributesFrom(&*I);
	VMap[&*I] = GA;
	}

	// Now that all of the things that global variable initializer can refer to
	// have been created, loop through and copy the global variable referrers
	// over... We also set the attributes on the global now.
	//
	for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
	I != E; ++I) {
	if (I->isDeclaration())
	continue;

	GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]);
	if (!ShouldCloneDefinition(&*I)) {
	// Skip after setting the correct linkage for an external reference.
	GV->setLinkage(GlobalValue::ExternalLinkage);
	continue;
	}
	if (I->hasInitializer())
	GV->setInitializer(MapValue(I->getInitializer(), VMap));

	SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
	I->getAllMetadata(MDs);
	for (auto MD : MDs)
	- GV->addMetadata(MD.first, *MapMetadata(MD.second, VMap));
	+ GV->addMetadata(MD.first,
	+ *MapMetadata(MD.second, VMap, RF_MoveDistinctMDs));

	copyComdat(GV, &*I);
	}

	// Similarly, copy over function bodies now...
	//
	for (const Function &I : *M) {
	if (I.isDeclaration())
	continue;

	Function *F = cast<Function>(VMap[&I]);
	if (!ShouldCloneDefinition(&I)) {
	// Skip after setting the correct linkage for an external reference.
	F->setLinkage(GlobalValue::ExternalLinkage);
	// Personality function is not valid on a declaration.
	F->setPersonalityFn(nullptr);
	continue;
	}

	// Map each source argument to the matching destination argument, copying
	// its name across.
	Function::arg_iterator DestI = F->arg_begin();
	for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
	++J) {
	DestI->setName(J->getName());
	VMap[&*J] = &*DestI++;
	}

	SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
	CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);

	if (I.hasPersonalityFn())
	F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));

	copyComdat(F, &I);
	}

	// And aliases
	for (Module::const_alias_iterator I = M->alias_begin(), E = M->alias_end();
	I != E; ++I) {
	// We already dealt with undefined aliases above.
	if (!ShouldCloneDefinition(&*I))
	continue;
	GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);
	if (const Constant *C = I->getAliasee())
	GA->setAliasee(MapValue(C, VMap));
	}

	// And named metadata....
	for (Module::const_named_metadata_iterator I = M->named_metadata_begin(),
	E = M->named_metadata_end(); I != E; ++I) {
	const NamedMDNode &NMD = *I;
	NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
	for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
	NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
	}

	return New;
	}

	extern "C" {

	// C API entry point: clone the module behind M and hand ownership of the
	// copy to the caller as an opaque reference.
	LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
	  std::unique_ptr<Module> Copy = CloneModule(unwrap(M));
	  return wrap(Copy.release());
	}

	}
	Index: head/contrib/llvm/tools/clang/include/clang/AST/StmtCXX.h
	===================================================================
	--- head/contrib/llvm/tools/clang/include/clang/AST/StmtCXX.h (revision 322319)
	+++ head/contrib/llvm/tools/clang/include/clang/AST/StmtCXX.h (revision 322320)
	@@ -1,482 +1,487 @@
	//===--- StmtCXX.h - Classes for representing C++ statements ----- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the C++ statement AST node classes.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_CLANG_AST_STMTCXX_H
	#define LLVM_CLANG_AST_STMTCXX_H

	#include "clang/AST/DeclarationName.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/NestedNameSpecifier.h"
	#include "clang/AST/Stmt.h"
	#include "llvm/Support/Compiler.h"

	namespace clang {

	class VarDecl;

	/// CXXCatchStmt - This represents a C++ catch block.
	///
	class CXXCatchStmt : public Stmt {
	  /// Location returned by getCatchLoc() and used as the statement start.
	  SourceLocation CatchLoc;
	  /// The exception-declaration of the type.
	  VarDecl *ExceptionDecl;
	  /// The handler block.
	  Stmt *HandlerBlock;

	public:
	  /// Build a catch statement from its location, exception declaration and
	  /// handler block.
	  CXXCatchStmt(SourceLocation catchLoc, VarDecl *exDecl, Stmt *handlerBlock)
	  : Stmt(CXXCatchStmtClass), CatchLoc(catchLoc), ExceptionDecl(exDecl),
	    HandlerBlock(handlerBlock) {}

	  /// Build an empty catch statement with null declaration and handler.
	  CXXCatchStmt(EmptyShell Empty)
	  : Stmt(CXXCatchStmtClass), ExceptionDecl(nullptr), HandlerBlock(nullptr) {}

	  SourceLocation getLocStart() const LLVM_READONLY { return CatchLoc; }
	  /// The statement ends where the handler block ends.
	  SourceLocation getLocEnd() const LLVM_READONLY {
	    return HandlerBlock->getLocEnd();
	  }

	  SourceLocation getCatchLoc() const { return CatchLoc; }
	  VarDecl *getExceptionDecl() const { return ExceptionDecl; }
	  QualType getCaughtType() const;
	  Stmt *getHandlerBlock() const { return HandlerBlock; }

	  static bool classof(const Stmt *T) {
	    return T->getStmtClass() == CXXCatchStmtClass;
	  }

	  /// The handler block is the only child.
	  child_range children() { return child_range(&HandlerBlock, &HandlerBlock+1); }

	  friend class ASTStmtReader;
	};

	/// CXXTryStmt - A C++ try block, including all handlers.
	///
	class CXXTryStmt : public Stmt {
	  SourceLocation TryLoc;
	  unsigned NumHandlers;

	  CXXTryStmt(SourceLocation tryLoc, Stmt *tryBlock, ArrayRef<Stmt*> handlers);

	  CXXTryStmt(EmptyShell Empty, unsigned numHandlers)
	    : Stmt(CXXTryStmtClass), NumHandlers(numHandlers) { }

	  /// The sub-statements live directly after this object in memory: index 0
	  /// is the try block and indices 1..NumHandlers are the handlers (see
	  /// getTryBlock() and getHandler()).
	  Stmt const * const *getStmts() const {
	    return reinterpret_cast<Stmt const * const*>(this + 1);
	  }
	  Stmt **getStmts() {
	    return reinterpret_cast<Stmt **>(this + 1);
	  }

	public:
	  static CXXTryStmt *Create(const ASTContext &C, SourceLocation tryLoc,
	                            Stmt *tryBlock, ArrayRef<Stmt*> handlers);

	  static CXXTryStmt *Create(const ASTContext &C, EmptyShell Empty,
	                            unsigned numHandlers);

	  SourceLocation getLocStart() const LLVM_READONLY { return getTryLoc(); }
	  SourceLocation getLocEnd() const LLVM_READONLY { return getEndLoc(); }

	  SourceLocation getTryLoc() const { return TryLoc; }
	  /// The statement ends where its last stored sub-statement ends.
	  SourceLocation getEndLoc() const {
	    return getStmts()[NumHandlers]->getLocEnd();
	  }

	  CompoundStmt *getTryBlock() {
	    return cast<CompoundStmt>(getStmts()[0]);
	  }
	  const CompoundStmt *getTryBlock() const {
	    return cast<CompoundStmt>(getStmts()[0]);
	  }

	  unsigned getNumHandlers() const { return NumHandlers; }
	  /// Handler \p i is stored at slot i + 1, after the try block.
	  CXXCatchStmt *getHandler(unsigned i) {
	    return cast<CXXCatchStmt>(getStmts()[i + 1]);
	  }
	  const CXXCatchStmt *getHandler(unsigned i) const {
	    return cast<CXXCatchStmt>(getStmts()[i + 1]);
	  }

	  static bool classof(const Stmt *T) {
	    return T->getStmtClass() == CXXTryStmtClass;
	  }

	  /// Children are the try block followed by all handlers.
	  child_range children() {
	    return child_range(getStmts(), getStmts() + getNumHandlers() + 1);
	  }

	  friend class ASTStmtReader;
	};

	/// CXXForRangeStmt - This represents C++0x [stmt.ranged]'s ranged for
	/// statement, represented as 'for (range-declarator : range-expression)'.
	///
	/// This is stored in a partially-desugared form to allow full semantic
	/// analysis of the constituent components. The original syntactic components
	/// can be extracted using getLoopVariable and getRangeInit.
	class CXXForRangeStmt : public Stmt {
	SourceLocation ForLoc;
	enum { RANGE, BEGINSTMT, ENDSTMT, COND, INC, LOOPVAR, BODY, END };
	// SubExprs[RANGE] is an expression or declstmt.
	// SubExprs[COND] and SubExprs[INC] are expressions.
	Stmt *SubExprs[END];
	SourceLocation CoawaitLoc;
	SourceLocation ColonLoc;
	SourceLocation RParenLoc;

	friend class ASTStmtReader;
	public:
	CXXForRangeStmt(DeclStmt Range, DeclStmt Begin, DeclStmt *End,
	Expr Cond, Expr Inc, DeclStmt LoopVar, Stmt Body,
	SourceLocation FL, SourceLocation CAL, SourceLocation CL,
	SourceLocation RPL);
	CXXForRangeStmt(EmptyShell Empty) : Stmt(CXXForRangeStmtClass, Empty) { }


	VarDecl *getLoopVariable();
	Expr *getRangeInit();

	const VarDecl *getLoopVariable() const;
	const Expr *getRangeInit() const;


	DeclStmt *getRangeStmt() { return cast<DeclStmt>(SubExprs[RANGE]); }
	DeclStmt *getBeginStmt() {
	return cast_or_null<DeclStmt>(SubExprs[BEGINSTMT]);
	}
	DeclStmt *getEndStmt() { return cast_or_null<DeclStmt>(SubExprs[ENDSTMT]); }
	Expr *getCond() { return cast_or_null<Expr>(SubExprs[COND]); }
	Expr *getInc() { return cast_or_null<Expr>(SubExprs[INC]); }
	DeclStmt *getLoopVarStmt() { return cast<DeclStmt>(SubExprs[LOOPVAR]); }
	Stmt *getBody() { return SubExprs[BODY]; }

	const DeclStmt *getRangeStmt() const {
	return cast<DeclStmt>(SubExprs[RANGE]);
	}
	const DeclStmt *getBeginStmt() const {
	return cast_or_null<DeclStmt>(SubExprs[BEGINSTMT]);
	}
	const DeclStmt *getEndStmt() const {
	return cast_or_null<DeclStmt>(SubExprs[ENDSTMT]);
	}
	const Expr *getCond() const {
	return cast_or_null<Expr>(SubExprs[COND]);
	}
	const Expr *getInc() const {
	return cast_or_null<Expr>(SubExprs[INC]);
	}
	const DeclStmt *getLoopVarStmt() const {
	return cast<DeclStmt>(SubExprs[LOOPVAR]);
	}
	const Stmt *getBody() const { return SubExprs[BODY]; }

	void setRangeInit(Expr E) { SubExprs[RANGE] = reinterpret_cast<Stmt>(E); }
	void setRangeStmt(Stmt *S) { SubExprs[RANGE] = S; }
	void setBeginStmt(Stmt *S) { SubExprs[BEGINSTMT] = S; }
	void setEndStmt(Stmt *S) { SubExprs[ENDSTMT] = S; }
	void setCond(Expr E) { SubExprs[COND] = reinterpret_cast<Stmt>(E); }
	void setInc(Expr E) { SubExprs[INC] = reinterpret_cast<Stmt>(E); }
	void setLoopVarStmt(Stmt *S) { SubExprs[LOOPVAR] = S; }
	void setBody(Stmt *S) { SubExprs[BODY] = S; }

	SourceLocation getForLoc() const { return ForLoc; }
	SourceLocation getCoawaitLoc() const { return CoawaitLoc; }
	SourceLocation getColonLoc() const { return ColonLoc; }
	SourceLocation getRParenLoc() const { return RParenLoc; }

	SourceLocation getLocStart() const LLVM_READONLY { return ForLoc; }
	SourceLocation getLocEnd() const LLVM_READONLY {
	return SubExprs[BODY]->getLocEnd();
	}

	static bool classof(const Stmt *T) {
	return T->getStmtClass() == CXXForRangeStmtClass;
	}

	// Iterators
	child_range children() {
	return child_range(&SubExprs[0], &SubExprs[END]);
	}
	};

	/// \brief Representation of a Microsoft __if_exists or __if_not_exists
	/// statement with a dependent name.
	///
	/// The __if_exists statement can be used to include a sequence of statements
	/// in the program only when a particular dependent name exists. For
	/// example:
	///
	/// \code
	/// template<typename T>
	/// void call_foo(T &t) {
	/// __if_exists (T::foo) {
	/// t.foo(); // okay: only called when T::foo exists.
	/// }
	/// }
	/// \endcode
	///
	/// Similarly, the __if_not_exists statement can be used to include the
	/// statements when a particular name does not exist.
	///
	/// Note that this statement only captures __if_exists and __if_not_exists
	/// statements whose name is dependent. All non-dependent cases are handled
	/// directly in the parser, so that they don't introduce a new scope. Clang
	/// introduces scopes in the dependent case to keep names inside the compound
	/// statement from leaking out into the surround statements, which would
	/// compromise the template instantiation model. This behavior differs from
	/// Visual C++ (which never introduces a scope), but is a fairly reasonable
	/// approximation of the VC++ behavior.
	class MSDependentExistsStmt : public Stmt {
	SourceLocation KeywordLoc;
	bool IsIfExists;
	NestedNameSpecifierLoc QualifierLoc;
	DeclarationNameInfo NameInfo;
	// Stored as a Stmt*; the constructor and getSubStmt() convert to and from
	// CompoundStmt* with reinterpret_cast.
	Stmt *SubStmt;

	friend class ASTReader;
	friend class ASTStmtReader;

	public:
	MSDependentExistsStmt(SourceLocation KeywordLoc, bool IsIfExists,
	NestedNameSpecifierLoc QualifierLoc,
	DeclarationNameInfo NameInfo,
	CompoundStmt *SubStmt)
	: Stmt(MSDependentExistsStmtClass),
	KeywordLoc(KeywordLoc), IsIfExists(IsIfExists),
	QualifierLoc(QualifierLoc), NameInfo(NameInfo),
	SubStmt(reinterpret_cast<Stmt *>(SubStmt)) { }

	/// \brief Retrieve the location of the __if_exists or __if_not_exists
	/// keyword.
	SourceLocation getKeywordLoc() const { return KeywordLoc; }

	/// \brief Determine whether this is an __if_exists statement.
	bool isIfExists() const { return IsIfExists; }

	/// \brief Determine whether this is an __if_not_exists statement.
	bool isIfNotExists() const { return !IsIfExists; }

	/// \brief Retrieve the nested-name-specifier that qualifies this name, if
	/// any.
	NestedNameSpecifierLoc getQualifierLoc() const { return QualifierLoc; }

	/// \brief Retrieve the name of the entity we're testing for, along with
	/// location information
	DeclarationNameInfo getNameInfo() const { return NameInfo; }

	/// \brief Retrieve the compound statement that will be included in the
	/// program only if the existence of the symbol matches the initial keyword.
	CompoundStmt *getSubStmt() const {
	return reinterpret_cast<CompoundStmt *>(SubStmt);
	}

	// The statement spans from the keyword to the end of the sub-statement.
	SourceLocation getLocStart() const LLVM_READONLY { return KeywordLoc; }
	SourceLocation getLocEnd() const LLVM_READONLY { return SubStmt->getLocEnd();}

	// The sub-statement is the only child.
	child_range children() {
	return child_range(&SubStmt, &SubStmt+1);
	}

	static bool classof(const Stmt *T) {
	return T->getStmtClass() == MSDependentExistsStmtClass;
	}
	};

	/// \brief Represents the body of a coroutine. This wraps the normal function
	/// body and holds the additional semantic context required to set up and tear
	/// down the coroutine frame.
	class CoroutineBodyStmt final
	: public Stmt,
	private llvm::TrailingObjects<CoroutineBodyStmt, Stmt *> {
	/// Indices into the trailing Stmt* array. The fixed slots come first;
	/// parameter-move statements start at FirstParamMove.
	enum SubStmt {
	Body, ///< The body of the coroutine.
	Promise, ///< The promise statement.
	InitSuspend, ///< The initial suspend statement, run before the body.
	FinalSuspend, ///< The final suspend statement, run after the body.
	OnException, ///< Handler for exceptions thrown in the body.
	OnFallthrough, ///< Handler for control flow falling off the body.
	Allocate, ///< Coroutine frame memory allocation.
	Deallocate, ///< Coroutine frame memory deallocation.
	ReturnValue, ///< Return value for thunk function: p.get_return_object().
	ResultDecl, ///< Declaration holding the result of get_return_object.
	ReturnStmt, ///< Return statement for the thunk function.
	ReturnStmtOnAllocFailure, ///< Return statement if allocation failed.
	FirstParamMove ///< First offset for move construction of parameter copies.
	};
	/// Number of parameter-move statements stored from FirstParamMove onwards.
	unsigned NumParams;

	friend class ASTStmtReader;
	+ friend class ASTReader;
	friend TrailingObjects;

	/// Start of the trailing Stmt* array (mutable and const views).
	Stmt **getStoredStmts() { return getTrailingObjects<Stmt *>(); }

	Stmt *const *getStoredStmts() const { return getTrailingObjects<Stmt *>(); }

	public:

	/// Bundle of constructor arguments; every pointer defaults to null.
	struct CtorArgs {
	Stmt *Body = nullptr;
	Stmt *Promise = nullptr;
	Expr *InitialSuspend = nullptr;
	Expr *FinalSuspend = nullptr;
	Stmt *OnException = nullptr;
	Stmt *OnFallthrough = nullptr;
	Expr *Allocate = nullptr;
	Expr *Deallocate = nullptr;
	Expr *ReturnValue = nullptr;
	Stmt *ResultDecl = nullptr;
	Stmt *ReturnStmt = nullptr;
	Stmt *ReturnStmtOnAllocFailure = nullptr;
	ArrayRef<Stmt *> ParamMoves;
	};

	private:

	CoroutineBodyStmt(CtorArgs const& Args);

	public:
	static CoroutineBodyStmt *Create(const ASTContext &C, CtorArgs const &Args);
	+ static CoroutineBodyStmt *Create(const ASTContext &C, EmptyShell,
	+ unsigned NumParams);

	bool hasDependentPromiseType() const {
	return getPromiseDecl()->getType()->isDependentType();
	}

	/// \brief Retrieve the body of the coroutine as written. This will be either
	/// a CompoundStmt or a TryStmt.
	Stmt *getBody() const {
	return getStoredStmts()[SubStmt::Body];
	}

	Stmt *getPromiseDeclStmt() const {
	return getStoredStmts()[SubStmt::Promise];
	}
	/// The promise slot is expected to hold a DeclStmt with a single VarDecl.
	VarDecl *getPromiseDecl() const {
	return cast<VarDecl>(cast<DeclStmt>(getPromiseDeclStmt())->getSingleDecl());
	}

	Stmt *getInitSuspendStmt() const {
	return getStoredStmts()[SubStmt::InitSuspend];
	}
	Stmt *getFinalSuspendStmt() const {
	return getStoredStmts()[SubStmt::FinalSuspend];
	}

	Stmt *getExceptionHandler() const {
	return getStoredStmts()[SubStmt::OnException];
	}
	Stmt *getFallthroughHandler() const {
	return getStoredStmts()[SubStmt::OnFallthrough];
	}

	// Allocate/Deallocate may be null (cast_or_null); ReturnValue may not (cast).
	Expr *getAllocate() const {
	return cast_or_null<Expr>(getStoredStmts()[SubStmt::Allocate]);
	}
	Expr *getDeallocate() const {
	return cast_or_null<Expr>(getStoredStmts()[SubStmt::Deallocate]);
	}
	Expr *getReturnValueInit() const {
	return cast<Expr>(getStoredStmts()[SubStmt::ReturnValue]);
	}
	Stmt *getResultDecl() const { return getStoredStmts()[SubStmt::ResultDecl]; }
	Stmt *getReturnStmt() const { return getStoredStmts()[SubStmt::ReturnStmt]; }
	Stmt *getReturnStmtOnAllocFailure() const {
	return getStoredStmts()[SubStmt::ReturnStmtOnAllocFailure];
	}
	/// View over the NumParams parameter-move statements.
	ArrayRef<Stmt const *> getParamMoves() const {
	return {getStoredStmts() + SubStmt::FirstParamMove, NumParams};
	}

	// Locations come from the body when present, otherwise from the promise
	// declaration.
	SourceLocation getLocStart() const LLVM_READONLY {
	return getBody() ? getBody()->getLocStart()
	: getPromiseDecl()->getLocStart();
	}
	SourceLocation getLocEnd() const LLVM_READONLY {
	return getBody() ? getBody()->getLocEnd() : getPromiseDecl()->getLocEnd();
	}

	/// Children are every stored slot: the fixed ones plus all parameter moves.
	child_range children() {
	return child_range(getStoredStmts(),
	getStoredStmts() + SubStmt::FirstParamMove + NumParams);
	}

	static bool classof(const Stmt *T) {
	return T->getStmtClass() == CoroutineBodyStmtClass;
	}
	};

	/// \brief Represents a 'co_return' statement in the C++ Coroutines TS.
	///
	/// This statement models the initialization of the coroutine promise
	/// (encapsulating the eventual notional return value) from an expression
	/// (or braced-init-list), followed by termination of the coroutine.
	///
	/// This initialization is modeled by the evaluation of the operand
	/// followed by a call to one of:
	/// <promise>.return_value(<operand>)
	/// <promise>.return_void()
	/// which we name the "promise call".
	class CoreturnStmt : public Stmt {
	SourceLocation CoreturnLoc;

	/// Slot indices into SubStmts; Count is the number of slots.
	enum SubStmt { Operand, PromiseCall, Count };
	Stmt *SubStmts[SubStmt::Count];

	bool IsImplicit : 1;

	friend class ASTStmtReader;
	public:
	CoreturnStmt(SourceLocation CoreturnLoc, Stmt *Operand, Stmt *PromiseCall,
	bool IsImplicit = false)
	: Stmt(CoreturnStmtClass), CoreturnLoc(CoreturnLoc),
	IsImplicit(IsImplicit) {
	SubStmts[SubStmt::Operand] = Operand;
	SubStmts[SubStmt::PromiseCall] = PromiseCall;
	}
	+
	+ CoreturnStmt(EmptyShell) : CoreturnStmt({}, {}, {}) {}

	SourceLocation getKeywordLoc() const { return CoreturnLoc; }

	/// \brief Retrieve the operand of the 'co_return' statement. Will be nullptr
	/// if none was specified.
	Expr *getOperand() const { return static_cast<Expr*>(SubStmts[Operand]); }

	/// \brief Retrieve the promise call that results from this 'co_return'
	/// statement. Will be nullptr if either the coroutine has not yet been
	/// finalized or the coroutine has no eventual return type.
	Expr *getPromiseCall() const {
	return static_cast<Expr*>(SubStmts[PromiseCall]);
	}

	bool isImplicit() const { return IsImplicit; }
	void setIsImplicit(bool value = true) { IsImplicit = value; }

	SourceLocation getLocStart() const LLVM_READONLY { return CoreturnLoc; }
	/// Ends at the operand when there is one, otherwise at the keyword.
	SourceLocation getLocEnd() const LLVM_READONLY {
	return getOperand() ? getOperand()->getLocEnd() : getLocStart();
	}

	/// When there is no operand, the range starts at the promise-call slot so
	/// the null operand slot is not exposed as a child.
	child_range children() {
	if (!getOperand())
	return child_range(SubStmts + SubStmt::PromiseCall,
	SubStmts + SubStmt::Count);
	return child_range(SubStmts, SubStmts + SubStmt::Count);
	}

	static bool classof(const Stmt *T) {
	return T->getStmtClass() == CoreturnStmtClass;
	}
	};

	} // end namespace clang

	#endif
	Index: head/contrib/llvm/tools/clang/include/clang/Basic/BuiltinsX86.def
	===================================================================
	--- head/contrib/llvm/tools/clang/include/clang/Basic/BuiltinsX86.def (revision 322319)
	+++ head/contrib/llvm/tools/clang/include/clang/Basic/BuiltinsX86.def (revision 322320)
	@@ -1,1848 +1,1848 @@
	//===--- BuiltinsX86.def - X86 Builtin function database --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the X86-specific builtin function database. Users of
	// this file must define the BUILTIN macro to make use of this information.
	//
	//===----------------------------------------------------------------------===//

	// The format of this database matches clang/Basic/Builtins.def.

	// FIXME: Ideally we would be able to pull this information from what
	// LLVM already knows about X86 builtins. We need to match the LLVM
	// definition anyway, since code generation will lower to the
	// intrinsic if one exists.

	// Fallback for includers that define BUILTIN but not TARGET_BUILTIN: each
	// TARGET_BUILTIN entry expands to a plain BUILTIN and the FEATURE argument
	// is discarded.
	#if defined(BUILTIN) && !defined(TARGET_BUILTIN)
	# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
	#endif

	// Fallback for includers that define BUILTIN but not TARGET_HEADER_BUILTIN:
	// only ID, TYPE and ATTRS are forwarded to BUILTIN; the HEADER, LANG and
	// FEATURE arguments are discarded.
	#if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
	# define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
	#endif

	// FIXME: Are these nothrow/const?

	// Miscellaneous builtin for checking x86 cpu features.
	// TODO: Make this somewhat generic so that other backends
	// can use it?
	BUILTIN(__builtin_cpu_supports, "bcC*", "nc")

	// Undefined Values
	//
	TARGET_BUILTIN(__builtin_ia32_undef128, "V2d", "nc", "")
	TARGET_BUILTIN(__builtin_ia32_undef256, "V4d", "nc", "")
	TARGET_BUILTIN(__builtin_ia32_undef512, "V8d", "nc", "")

	// FLAGS
	//
	TARGET_BUILTIN(__builtin_ia32_readeflags_u32, "Ui", "n", "")
	TARGET_BUILTIN(__builtin_ia32_writeeflags_u32, "vUi", "n", "")

	// 3DNow!
	//
	TARGET_BUILTIN(__builtin_ia32_femms, "v", "", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pavgusb, "V8cV8cV8c", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pf2id, "V2iV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfacc, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfadd, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfcmpeq, "V2iV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfcmpge, "V2iV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfcmpgt, "V2iV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfmax, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfmin, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfmul, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfrcp, "V2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfrcpit1, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfrcpit2, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfrsqrt, "V2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfrsqit1, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfsub, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pfsubr, "V2fV2fV2f", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pi2fd, "V2fV2i", "nc", "3dnow")
	TARGET_BUILTIN(__builtin_ia32_pmulhrw, "V4sV4sV4s", "nc", "3dnow")
	// 3DNow! Extensions (3dnowa).
	TARGET_BUILTIN(__builtin_ia32_pf2iw, "V2iV2f", "nc", "3dnowa")
	TARGET_BUILTIN(__builtin_ia32_pfnacc, "V2fV2fV2f", "nc", "3dnowa")
	TARGET_BUILTIN(__builtin_ia32_pfpnacc, "V2fV2fV2f", "nc", "3dnowa")
	TARGET_BUILTIN(__builtin_ia32_pi2fw, "V2fV2i", "nc", "3dnowa")
	TARGET_BUILTIN(__builtin_ia32_pswapdsf, "V2fV2f", "nc", "3dnowa")
	TARGET_BUILTIN(__builtin_ia32_pswapdsi, "V2iV2i", "nc", "3dnowa")

	// MMX
	//
	// All MMX instructions will be generated via builtins. Any MMX vector
	// types (<1 x i64>, <2 x i32>, etc.) that aren't used by these builtins will be
	// expanded by the back-end.
	// FIXME: _mm_prefetch must be a built-in because it takes a compile-time constant
	// argument and our prior approach of using a #define to the current built-in
	// doesn't work in the presence of re-declaration of _mm_prefetch for windows.
	TARGET_BUILTIN(_mm_prefetch, "vcC*i", "nc", "mmx")
	TARGET_BUILTIN(__builtin_ia32_emms, "v", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddd, "V2iV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddsb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddsw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddusb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_paddusw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubd, "V2iV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubsb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubsw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubusb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psubusw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pmulhw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pmullw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pmaddwd, "V2iV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pand, "V1LLiV1LLiV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pandn, "V1LLiV1LLiV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_por, "V1LLiV1LLiV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pxor, "V1LLiV1LLiV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psllw, "V4sV4sV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pslld, "V2iV2iV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psllq, "V1LLiV1LLiV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrlw, "V4sV4sV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrld, "V2iV2iV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrlq, "V1LLiV1LLiV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psraw, "V4sV4sV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrad, "V2iV2iV1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psllwi, "V4sV4si", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pslldi, "V2iV2ii", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psllqi, "V1LLiV1LLii", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrlwi, "V4sV4si", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrldi, "V2iV2ii", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrlqi, "V1LLiV1LLii", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psrawi, "V4sV4si", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_psradi, "V2iV2ii", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_packsswb, "V8cV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_packssdw, "V4sV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_packuswb, "V8cV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_punpckhbw, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_punpckhwd, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_punpckhdq, "V2iV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_punpcklbw, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_punpcklwd, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_punpckldq, "V2iV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqd, "V2iV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtb, "V8cV8cV8c", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtw, "V4sV4sV4s", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtd, "V2iV2iV2i", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_maskmovq, "vV8cV8cc*", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_movntq, "vV1LLi*V1LLi", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_vec_init_v2si, "V2iii", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_vec_init_v4hi, "V4sssss", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_vec_init_v8qi, "V8ccccccccc", "", "mmx")
	TARGET_BUILTIN(__builtin_ia32_vec_ext_v2si, "iV2ii", "", "mmx")

	// MMX2 (MMX+SSE) intrinsics
	TARGET_BUILTIN(__builtin_ia32_cvtpi2ps, "V4fV4fV2i", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cvtps2pi, "V2iV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cvttps2pi, "V2iV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pavgb, "V8cV8cV8c", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pavgw, "V4sV4sV4s", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pmaxsw, "V4sV4sV4s", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pmaxub, "V8cV8cV8c", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pminsw, "V4sV4sV4s", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pminub, "V8cV8cV8c", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pmovmskb, "iV8c", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pmulhuw, "V4sV4sV4s", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_psadbw, "V4sV8cV8c", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_pshufw, "V4sV4sIc", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_vec_ext_v4hi, "iV4sIi", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_vec_set_v4hi, "V4sV4siIi", "", "sse")

	// MMX+SSE2
	TARGET_BUILTIN(__builtin_ia32_cvtpd2pi, "V2iV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtpi2pd, "V2dV2i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2pi, "V2iV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_paddq, "V1LLiV1LLiV1LLi", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmuludq, "V1LLiV2iV2i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psubq, "V1LLiV1LLiV1LLi", "", "sse2")

	// MMX+SSSE3
	TARGET_BUILTIN(__builtin_ia32_pabsb, "V8cV8c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pabsd, "V2iV2i", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pabsw, "V4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_palignr, "V8cV8cV8cIc", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phaddd, "V2iV2iV2i", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phaddsw, "V4sV4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phaddw, "V4sV4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phsubd, "V2iV2iV2i", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phsubsw, "V4sV4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phsubw, "V4sV4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pmaddubsw, "V8cV8cV8c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pmulhrsw, "V4sV4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pshufb, "V8cV8cV8c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_psignw, "V4sV4sV4s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_psignb, "V8cV8cV8c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_psignd, "V2iV2iV2i", "", "ssse3")

	// SSE intrinsics.
	TARGET_BUILTIN(__builtin_ia32_comieq, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_comilt, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_comile, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_comigt, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_comige, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_comineq, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_ucomieq, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_ucomilt, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_ucomile, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_ucomigt, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_ucomige, "iV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_ucomineq, "iV4fV4f", "", "sse")

	TARGET_BUILTIN(__builtin_ia32_comisdeq, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_comisdlt, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_comisdle, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_comisdgt, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_comisdge, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_comisdneq, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_ucomisdeq, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_ucomisdlt, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_ucomisdle, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_ucomisdgt, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_ucomisdge, "iV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_ucomisdneq, "iV2dV2d", "", "sse2")

	TARGET_BUILTIN(__builtin_ia32_cmpeqps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpltps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpleps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpunordps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpneqps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpnltps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpnleps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpordps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpeqss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpltss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpless, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpunordss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpneqss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpnltss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpnless, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cmpordss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_minps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_maxps, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_minss, "V4fV4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_maxss, "V4fV4fV4f", "", "sse")

	TARGET_BUILTIN(__builtin_ia32_cmpeqpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpltpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmplepd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpunordpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpneqpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpnltpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpnlepd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpordpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpeqsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpltsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmplesd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpunordsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpneqsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpnltsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpnlesd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cmpordsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_minpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_maxpd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_minsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_maxsd, "V2dV2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_paddsb128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_paddsw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psubsb128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psubsw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_paddusb128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_paddusw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psubusb128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psubusw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmulhw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pavgb128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pavgw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmaxub128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmaxsw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pminub128, "V16cV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pminsw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_packsswb128, "V16cV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_packssdw128, "V8sV4iV4i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_packuswb128, "V16cV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmulhuw128, "V8sV8sV8s", "", "sse2")

	TARGET_BUILTIN(__builtin_ia32_addsubps, "V4fV4fV4f", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_addsubpd, "V2dV2dV2d", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_haddps, "V4fV4fV4f", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_haddpd, "V2dV2dV2d", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_hsubps, "V4fV4fV4f", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_hsubpd, "V2dV2dV2d", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_phaddw128, "V8sV8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phaddd128, "V4iV4iV4i", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phaddsw128, "V8sV8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phsubw128, "V8sV8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phsubd128, "V4iV4iV4i", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_phsubsw128, "V8sV8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pmaddubsw128, "V8sV16cV16c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pmulhrsw128, "V8sV8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pshufb128, "V16cV16cV16c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_psignb128, "V16cV16cV16c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_psignw128, "V8sV8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_psignd128, "V4iV4iV4i", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pabsb128, "V16cV16c", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pabsw128, "V8sV8s", "", "ssse3")
	TARGET_BUILTIN(__builtin_ia32_pabsd128, "V4iV4i", "", "ssse3")

	TARGET_BUILTIN(__builtin_ia32_ldmxcsr, "vUi", "", "sse")
	TARGET_HEADER_BUILTIN(_mm_setcsr, "vUi", "h","xmmintrin.h", ALL_LANGUAGES, "sse")
	TARGET_BUILTIN(__builtin_ia32_stmxcsr, "Ui", "", "sse")
	TARGET_HEADER_BUILTIN(_mm_getcsr, "Ui", "h", "xmmintrin.h", ALL_LANGUAGES, "sse")
	TARGET_BUILTIN(__builtin_ia32_cvtss2si, "iV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_cvttss2si, "iV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_storehps, "vV2i*V4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_storelps, "vV2i*V4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_movmskps, "iV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_sfence, "v", "", "sse")
	TARGET_HEADER_BUILTIN(_mm_sfence, "v", "h", "xmmintrin.h", ALL_LANGUAGES, "sse")
	TARGET_BUILTIN(__builtin_ia32_rcpps, "V4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_rcpss, "V4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_rsqrtps, "V4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_rsqrtss, "V4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_sqrtps, "V4fV4f", "", "sse")
	TARGET_BUILTIN(__builtin_ia32_sqrtss, "V4fV4f", "", "sse")

	TARGET_BUILTIN(__builtin_ia32_maskmovdqu, "vV16cV16cc*", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_movmskpd, "iV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmovmskb128, "iV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_movnti, "vi*i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_movnti64, "vLLi*LLi", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psadbw128, "V2LLiV16cV16c", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_sqrtpd, "V2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_sqrtsd, "V2dV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtdq2ps, "V4fV4i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2dq, "V2LLiV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2ps, "V4fV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2dq, "V4iV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtsd2si, "iV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvttsd2si, "iV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtsd2ss, "V4fV4fV2d", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvtps2dq, "V4iV4f", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_cvttps2dq, "V4iV4f", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_clflush, "vvC*", "", "sse2")
	TARGET_HEADER_BUILTIN(_mm_clflush, "vvC*", "h", "emmintrin.h", ALL_LANGUAGES, "sse2")
	TARGET_BUILTIN(__builtin_ia32_lfence, "v", "", "sse2")
	TARGET_HEADER_BUILTIN(_mm_lfence, "v", "h", "emmintrin.h", ALL_LANGUAGES, "sse2")
	TARGET_BUILTIN(__builtin_ia32_mfence, "v", "", "sse2")
	TARGET_HEADER_BUILTIN(_mm_mfence, "v", "h", "emmintrin.h", ALL_LANGUAGES, "sse2")
	-TARGET_BUILTIN(__builtin_ia32_pause, "v", "", "sse2")
	-TARGET_HEADER_BUILTIN(_mm_pause, "v", "h", "emmintrin.h", ALL_LANGUAGES, "sse2")
	+TARGET_BUILTIN(__builtin_ia32_pause, "v", "", "")
	+TARGET_HEADER_BUILTIN(_mm_pause, "v", "h", "emmintrin.h", ALL_LANGUAGES, "")
	TARGET_BUILTIN(__builtin_ia32_pmuludq128, "V2LLiV4iV4i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psraw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrad128, "V4iV4iV4i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrlw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrld128, "V4iV4iV4i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrlq128, "V2LLiV2LLiV2LLi", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psllw128, "V8sV8sV8s", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pslld128, "V4iV4iV4i", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psllq128, "V2LLiV2LLiV2LLi", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psllwi128, "V8sV8si", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pslldi128, "V4iV4ii", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psllqi128, "V2LLiV2LLii", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrlwi128, "V8sV8si", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrldi128, "V4iV4ii", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrlqi128, "V2LLiV2LLii", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psrawi128, "V8sV8si", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_psradi128, "V4iV4ii", "", "sse2")
	TARGET_BUILTIN(__builtin_ia32_pmaddwd128, "V4iV8sV8s", "", "sse2")

	TARGET_BUILTIN(__builtin_ia32_monitor, "vv*UiUi", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_mwait, "vUiUi", "", "sse3")
	TARGET_BUILTIN(__builtin_ia32_lddqu, "V16ccC*", "", "sse3")

	TARGET_BUILTIN(__builtin_ia32_palignr128, "V16cV16cV16cIi", "", "ssse3")

	TARGET_BUILTIN(__builtin_ia32_insertps128, "V4fV4fV4fIc", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pblendvb128, "V16cV16cV16cV16c", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_blendvpd, "V2dV2dV2dV2d", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_blendvps, "V4fV4fV4fV4f", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_packusdw128, "V8sV4iV4i", "", "sse4.1")

	TARGET_BUILTIN(__builtin_ia32_pmaxsb128, "V16cV16cV16c", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pmaxsd128, "V4iV4iV4i", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pmaxud128, "V4iV4iV4i", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pmaxuw128, "V8sV8sV8s", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pminsb128, "V16cV16cV16c", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pminsd128, "V4iV4iV4i", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pminud128, "V4iV4iV4i", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pminuw128, "V8sV8sV8s", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_pmuldq128, "V2LLiV4iV4i", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_roundps, "V4fV4fIi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_roundss, "V4fV4fV4fIi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_roundsd, "V2dV2dV2dIi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_roundpd, "V2dV2dIi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_dpps, "V4fV4fV4fIc", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_dppd, "V2dV2dV2dIc", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_ptestz128, "iV2LLiV2LLi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_ptestc128, "iV2LLiV2LLi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_ptestnzc128, "iV2LLiV2LLi", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_mpsadbw128, "V16cV16cV16cIc", "", "sse4.1")
	TARGET_BUILTIN(__builtin_ia32_phminposuw128, "V8sV8s", "", "sse4.1")

	// SSE 4.2
	TARGET_BUILTIN(__builtin_ia32_pcmpistrm128, "V16cV16cV16cIc", "", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpistri128, "iV16cV16cIc", "", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestrm128, "V16cV16ciV16ciIc", "", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestri128, "iV16ciV16ciIc","", "sse4.2")

	TARGET_BUILTIN(__builtin_ia32_pcmpistria128, "iV16cV16cIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpistric128, "iV16cV16cIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpistrio128, "iV16cV16cIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpistris128, "iV16cV16cIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpistriz128, "iV16cV16cIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestria128, "iV16ciV16ciIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestric128, "iV16ciV16ciIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestrio128, "iV16ciV16ciIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestris128, "iV16ciV16ciIc","", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_pcmpestriz128, "iV16ciV16ciIc","", "sse4.2")

	TARGET_BUILTIN(__builtin_ia32_crc32qi, "UiUiUc", "", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_crc32hi, "UiUiUs", "", "sse4.2")
	TARGET_BUILTIN(__builtin_ia32_crc32si, "UiUiUi", "", "sse4.2")

	// SSE4a
	TARGET_BUILTIN(__builtin_ia32_extrqi, "V2LLiV2LLiIcIc", "", "sse4a")
	TARGET_BUILTIN(__builtin_ia32_extrq, "V2LLiV2LLiV16c", "", "sse4a")
	TARGET_BUILTIN(__builtin_ia32_insertqi, "V2LLiV2LLiV2LLiIcIc", "", "sse4a")
	TARGET_BUILTIN(__builtin_ia32_insertq, "V2LLiV2LLiV2LLi", "", "sse4a")
	TARGET_BUILTIN(__builtin_ia32_movntsd, "vd*V2d", "", "sse4a")
	TARGET_BUILTIN(__builtin_ia32_movntss, "vf*V4f", "", "sse4a")

	// AES
	TARGET_BUILTIN(__builtin_ia32_aesenc128, "V2LLiV2LLiV2LLi", "", "aes")
	TARGET_BUILTIN(__builtin_ia32_aesenclast128, "V2LLiV2LLiV2LLi", "", "aes")
	TARGET_BUILTIN(__builtin_ia32_aesdec128, "V2LLiV2LLiV2LLi", "", "aes")
	TARGET_BUILTIN(__builtin_ia32_aesdeclast128, "V2LLiV2LLiV2LLi", "", "aes")
	TARGET_BUILTIN(__builtin_ia32_aesimc128, "V2LLiV2LLi", "", "aes")
	TARGET_BUILTIN(__builtin_ia32_aeskeygenassist128, "V2LLiV2LLiIc", "", "aes")

	// CLMUL
	TARGET_BUILTIN(__builtin_ia32_pclmulqdq128, "V2LLiV2LLiV2LLiIc", "", "pclmul")

	// AVX
	TARGET_BUILTIN(__builtin_ia32_addsubpd256, "V4dV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_addsubps256, "V8fV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_haddpd256, "V4dV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_hsubps256, "V8fV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_hsubpd256, "V4dV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_haddps256, "V8fV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maxpd256, "V4dV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maxps256, "V8fV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_minpd256, "V4dV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_minps256, "V8fV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vpermilvarpd, "V2dV2dV2LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vpermilvarps, "V4fV4fV4i", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vpermilvarpd256, "V4dV4dV4LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vpermilvarps256, "V8fV8fV8i", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_blendvpd256, "V4dV4dV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_blendvps256, "V8fV8fV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_dpps256, "V8fV8fV8fIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cmppd, "V2dV2dV2dIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cmppd256, "V4dV4dV4dIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cmpps, "V4fV4fV4fIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cmpps256, "V8fV8fV8fIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cmpsd, "V2dV2dV2dIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cmpss, "V4fV4fV4fIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cvtdq2ps256, "V8fV8i", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256, "V4fV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cvtps2dq256, "V8iV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256, "V4iV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2dq256, "V4iV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_cvttps2dq256, "V8iV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vperm2f128_pd256, "V4dV4dV4dIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vperm2f128_ps256, "V8fV8fV8fIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vperm2f128_si256, "V8iV8iV8iIc", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_sqrtpd256, "V4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_sqrtps256, "V8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_rsqrtps256, "V8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_rcpps256, "V8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_roundpd256, "V4dV4dIi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_roundps256, "V8fV8fIi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestzpd, "iV2dV2d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestcpd, "iV2dV2d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestnzcpd, "iV2dV2d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestzps, "iV4fV4f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestcps, "iV4fV4f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestnzcps, "iV4fV4f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestzpd256, "iV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestcpd256, "iV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestnzcpd256, "iV4dV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestzps256, "iV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestcps256, "iV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vtestnzcps256, "iV8fV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_ptestz256, "iV4LLiV4LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_ptestc256, "iV4LLiV4LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_ptestnzc256, "iV4LLiV4LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_movmskpd256, "iV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_movmskps256, "iV8f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vzeroall, "v", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vzeroupper, "v", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_pd256, "V4dV2dC*", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_vbroadcastf128_ps256, "V8fV4fC*", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_lddqu256, "V32ccC*", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskloadpd, "V2dV2dC*V2LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskloadps, "V4fV4fC*V4i", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskloadpd256, "V4dV4dC*V4LLi", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskloadps256, "V8fV8fC*V8i", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskstorepd, "vV2d*V2LLiV2d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskstoreps, "vV4f*V4iV4f", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskstorepd256, "vV4d*V4LLiV4d", "", "avx")
	TARGET_BUILTIN(__builtin_ia32_maskstoreps256, "vV8f*V8iV8f", "", "avx")

	// AVX2
	TARGET_BUILTIN(__builtin_ia32_mpsadbw256, "V32cV32cV32cIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pabsb256, "V32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pabsw256, "V16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pabsd256, "V8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_packsswb256, "V32cV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_packssdw256, "V16sV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_packuswb256, "V32cV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_packusdw256, "V16sV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_paddsb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_paddsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psubsb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psubsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_paddusb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_phaddd256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_phaddsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_phsubw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_phsubd256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_phsubsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaddubsw256, "V16sV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaddwd256, "V8iV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaxub256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaxuw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaxud256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaxsb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaxsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmaxsd256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pminub256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pminuw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pminud256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pminsb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pminsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pminsd256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmovmskb256, "iV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmuldq256, "V4LLiV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmulhrsw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmulhuw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmulhw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pmuludq256, "V4LLiV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psadbw256, "V4LLiV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pshufb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psignb256, "V32cV32cV32c", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psignw256, "V16sV16sV16s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psignd256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllwi256, "V16sV16si", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllw256, "V16sV16sV8s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pslldi256, "V8iV8ii", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_pslld256, "V8iV8iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllqi256, "V4LLiV4LLii", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllq256, "V4LLiV4LLiV2LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrawi256, "V16sV16si", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psraw256, "V16sV16sV8s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psradi256, "V8iV8ii", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrad256, "V8iV8iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlwi256, "V16sV16si", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlw256, "V16sV16sV8s", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrldi256, "V8iV8ii", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrld256, "V8iV8iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlqi256, "V4LLiV4LLii", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlq256, "V4LLiV4LLiV2LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_permvarsi256, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_permvarsf256, "V8fV8fV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_permti256, "V4LLiV4LLiV4LLiIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskloadd256, "V8iV8iC*V8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskloadq256, "V4LLiV4LLiC*V4LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskloadd, "V4iV4iC*V4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskloadq, "V2LLiV2LLiC*V2LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskstored256, "vV8i*V8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskstoreq256, "vV4LLi*V4LLiV4LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskstored, "vV4i*V4iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_maskstoreq, "vV2LLi*V2LLiV2LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllv8si, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllv4si, "V4iV4iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllv4di, "V4LLiV4LLiV4LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psllv2di, "V2LLiV2LLiV2LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrav8si, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrav4si, "V4iV4iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlv8si, "V8iV8iV8i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlv4si, "V4iV4iV4i", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlv4di, "V4LLiV4LLiV4LLi", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_psrlv2di, "V2LLiV2LLiV2LLi", "", "avx2")

	// GATHER
	// All entries below are gated on "avx2" and end with an "Ic" operand.
	// Floating-point element forms (_pd / _ps).
	TARGET_BUILTIN(__builtin_ia32_gatherd_pd, "V2dV2ddC*V4iV2dIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherd_pd256, "V4dV4ddC*V4iV4dIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_pd, "V2dV2ddC*V2LLiV2dIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_pd256, "V4dV4ddC*V4LLiV4dIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherd_ps, "V4fV4ffC*V4iV4fIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherd_ps256, "V8fV8ffC*V8iV8fIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_ps, "V4fV4ffC*V2LLiV4fIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_ps256, "V4fV4ffC*V4LLiV4fIc", "", "avx2")

	// Integer element forms (_q / _d).
	TARGET_BUILTIN(__builtin_ia32_gatherd_q, "V2LLiV2LLiLLiC*V4iV2LLiIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherd_q256, "V4LLiV4LLiLLiC*V4iV4LLiIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_q, "V2LLiV2LLiLLiC*V2LLiV2LLiIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_q256, "V4LLiV4LLiLLiC*V4LLiV4LLiIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherd_d, "V4iV4iiC*V4iV4iIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherd_d256, "V8iV8iiC*V8iV8iIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_d, "V4iV4iiC*V2LLiV4iIc", "", "avx2")
	TARGET_BUILTIN(__builtin_ia32_gatherq_d256, "V4iV4iiC*V4LLiV4iIc", "", "avx2")

	// F16C
	TARGET_BUILTIN(__builtin_ia32_vcvtps2ph, "V8sV4fIi", "", "f16c")
	TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256, "V8sV8fIi", "", "f16c")
	TARGET_BUILTIN(__builtin_ia32_vcvtph2ps, "V4fV8s", "", "f16c")
	TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256, "V8fV8s", "", "f16c")

	// RDRAND
	TARGET_BUILTIN(__builtin_ia32_rdrand16_step, "UiUs*", "", "rdrnd")
	TARGET_BUILTIN(__builtin_ia32_rdrand32_step, "UiUi*", "", "rdrnd")
	TARGET_BUILTIN(__builtin_ia32_rdrand64_step, "UiULLi*", "", "rdrnd")

	// FSGSBASE
	TARGET_BUILTIN(__builtin_ia32_rdfsbase32, "Ui", "", "fsgsbase")
	TARGET_BUILTIN(__builtin_ia32_rdgsbase32, "Ui", "", "fsgsbase")
	TARGET_BUILTIN(__builtin_ia32_wrfsbase32, "vUi", "", "fsgsbase")
	TARGET_BUILTIN(__builtin_ia32_wrgsbase32, "vUi", "", "fsgsbase")

	// FXSR
	TARGET_BUILTIN(__builtin_ia32_fxrstor, "vv*", "", "fxsr")
	TARGET_BUILTIN(__builtin_ia32_fxsave, "vv*", "", "fxsr")

	// XSAVE
	// Note the per-entry feature strings: xsave/xrstor use "xsave", xsaveopt
	// uses "xsaveopt", xsavec uses "xsavec", and xsaves/xrstors use "xsaves".
	// Every entry shares the signature string "vv*ULLi".
	TARGET_BUILTIN(__builtin_ia32_xsave, "vv*ULLi", "", "xsave")
	TARGET_BUILTIN(__builtin_ia32_xrstor, "vv*ULLi", "", "xsave")
	TARGET_BUILTIN(__builtin_ia32_xsaveopt, "vv*ULLi", "", "xsaveopt")
	TARGET_BUILTIN(__builtin_ia32_xrstors, "vv*ULLi", "", "xsaves")
	TARGET_BUILTIN(__builtin_ia32_xsavec, "vv*ULLi", "", "xsavec")
	TARGET_BUILTIN(__builtin_ia32_xsaves, "vv*ULLi", "", "xsaves")

	// CLFLUSHOPT
	TARGET_BUILTIN(__builtin_ia32_clflushopt, "vc*", "", "clflushopt")

	// ADX
	// Only addcarryx_u32 names a required feature ("adx"); addcarry_u32 and
	// subborrow_u32 are listed with an empty feature string. All three share
	// the signature string "UcUcUiUiUi*".
	TARGET_BUILTIN(__builtin_ia32_addcarryx_u32, "UcUcUiUiUi*", "", "adx")
	TARGET_BUILTIN(__builtin_ia32_addcarry_u32, "UcUcUiUiUi*", "", "")
	TARGET_BUILTIN(__builtin_ia32_subborrow_u32, "UcUcUiUiUi*", "", "")

	// RDSEED
	TARGET_BUILTIN(__builtin_ia32_rdseed16_step, "UiUs*", "", "rdseed")
	TARGET_BUILTIN(__builtin_ia32_rdseed32_step, "UiUi*", "", "rdseed")

	// BMI
	TARGET_BUILTIN(__builtin_ia32_bextr_u32, "UiUiUi", "", "bmi")

	// BMI2
	TARGET_BUILTIN(__builtin_ia32_bzhi_si, "UiUiUi", "", "bmi2")
	TARGET_BUILTIN(__builtin_ia32_pdep_si, "UiUiUi", "", "bmi2")
	TARGET_BUILTIN(__builtin_ia32_pext_si, "UiUiUi", "", "bmi2")

	// TBM
	TARGET_BUILTIN(__builtin_ia32_bextri_u32, "UiUiIUi", "", "tbm")

	// LWP
	TARGET_BUILTIN(__builtin_ia32_llwpcb, "vv*", "", "lwp")
	TARGET_BUILTIN(__builtin_ia32_slwpcb, "v*", "", "lwp")
	TARGET_BUILTIN(__builtin_ia32_lwpins32, "UcUiUiUi", "", "lwp")
	TARGET_BUILTIN(__builtin_ia32_lwpval32, "vUiUiUi", "", "lwp")

	// SHA
	TARGET_BUILTIN(__builtin_ia32_sha1rnds4, "V4iV4iV4iIc", "", "sha")
	TARGET_BUILTIN(__builtin_ia32_sha1nexte, "V4iV4iV4i", "", "sha")
	TARGET_BUILTIN(__builtin_ia32_sha1msg1, "V4iV4iV4i", "", "sha")
	TARGET_BUILTIN(__builtin_ia32_sha1msg2, "V4iV4iV4i", "", "sha")
	TARGET_BUILTIN(__builtin_ia32_sha256rnds2, "V4iV4iV4iV4i", "", "sha")
	TARGET_BUILTIN(__builtin_ia32_sha256msg1, "V4iV4iV4i", "", "sha")
	TARGET_BUILTIN(__builtin_ia32_sha256msg2, "V4iV4iV4i", "", "sha")

	// FMA
	// Every entry in this group uses the feature string "fma|fma4". In clang's
	// required-feature syntax '|' separates alternatives (either feature is
	// sufficient), whereas ',' means all listed features are required.
	// 128-bit forms take V4f / V2d operands; the *256 forms take V8f / V4d.
	TARGET_BUILTIN(__builtin_ia32_vfmaddps, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddss, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubps, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubpd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubss, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubsd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddps, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddpd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddss, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddsd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubps, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubss, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubsd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubaddps, "V4fV4fV4fV4f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd, "V2dV2dV2dV2d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps256, "V8fV8fV8fV8f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd256, "V4dV4dV4dV4d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubps256, "V8fV8fV8fV8f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubpd256, "V4dV4dV4dV4d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddps256, "V8fV8fV8fV8f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddpd256, "V4dV4dV4dV4d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubps256, "V8fV8fV8fV8f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd256, "V4dV4dV4dV4d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256, "V8fV8fV8fV8f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256, "V4dV4dV4dV4d", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256, "V8fV8fV8fV8f", "", "fma|fma4")
	TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256, "V4dV4dV4dV4d", "", "fma|fma4")

	// AVX-512 masked FMA variants (_mask / _mask3 / _maskz suffixes), continuing
	// up to the XOP section. The 128- and 256-bit forms are gated on "avx512vl";
	// the 512-bit forms are gated on "avx512f" and carry an extra trailing "Ii"
	// operand after the mask.
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd128_mask, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd128_mask3, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd128_maskz, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_mask, "V4dV4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_mask3, "V4dV4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd256_maskz, "V4dV4dV4dV4dUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_mask3, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddpd512_maskz, "V8dV8dV8dV8dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmaddps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps128_mask3, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps128_maskz, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps256_mask3, "V8fV8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps256_maskz, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps512_mask3, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddps512_maskz, "V16fV16fV16fV16fUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd128_mask, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd128_mask3, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd128_maskz, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_mask, "V4dV4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_mask3, "V4dV4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd256_maskz, "V4dV4dV4dV4dUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_mask3, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubpd512_maskz, "V8dV8dV8dV8dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps128_mask3, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps128_maskz, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_mask3, "V8fV8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps256_maskz, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_mask3, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsubps512_maskz, "V16fV16fV16fV16fUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmsubpd128_mask3, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmsubpd256_mask3, "V4dV4dV4dV4dUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmsubps128_mask3, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmsubps256_mask3, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmsubps512_mask3, "V16fV16fV16fV16fUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd128_mask3, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd256_mask3, "V4dV4dV4dV4dUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmsubaddpd512_mask3, "V8dV8dV8dV8dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfmsubaddps128_mask3, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfmsubaddps256_mask3, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfmsubaddps512_mask3, "V16fV16fV16fV16fUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfnmaddpd128_mask, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddpd256_mask, "V4dV4dV4dV4dUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfnmaddpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfnmaddps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmaddps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfnmaddps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd128_mask, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd128_mask3, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd256_mask, "V4dV4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd256_mask3, "V4dV4dV4dV4dUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubpd512_mask3, "V8dV8dV8dV8dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_vfnmsubps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubps128_mask3, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubps256_mask3, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vfnmsubps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubps512_mask3, "V16fV16fV16fV16fUsIi", "", "avx512f")

	// XOP
	TARGET_BUILTIN(__builtin_ia32_vpmacssww, "V8sV8sV8sV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacsww, "V8sV8sV8sV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacsswd, "V4iV8sV8sV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacswd, "V4iV8sV8sV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacssdd, "V4iV4iV4iV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacsdd, "V4iV4iV4iV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacssdql, "V2LLiV4iV4iV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacsdql, "V2LLiV4iV4iV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacssdqh, "V2LLiV4iV4iV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmacsdqh, "V2LLiV4iV4iV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmadcsswd, "V4iV8sV8sV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpmadcswd, "V4iV8sV8sV4i", "", "xop")

	TARGET_BUILTIN(__builtin_ia32_vphaddbw, "V8sV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddbd, "V4iV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddbq, "V2LLiV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddwd, "V4iV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddwq, "V2LLiV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphadddq, "V2LLiV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddubw, "V8sV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddubd, "V4iV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddubq, "V2LLiV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphadduwd, "V4iV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphadduwq, "V2LLiV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphaddudq, "V2LLiV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphsubbw, "V8sV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphsubwd, "V4iV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vphsubdq, "V2LLiV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpperm, "V16cV16cV16cV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotb, "V16cV16cV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotw, "V8sV8sV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotd, "V4iV4iV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotq, "V2LLiV2LLiV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotbi, "V16cV16cIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotwi, "V8sV8sIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotdi, "V4iV4iIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vprotqi, "V2LLiV2LLiIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshlb, "V16cV16cV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshlw, "V8sV8sV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshld, "V4iV4iV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshlq, "V2LLiV2LLiV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshab, "V16cV16cV16c", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshaw, "V8sV8sV8s", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshad, "V4iV4iV4i", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpshaq, "V2LLiV2LLiV2LLi", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomub, "V16cV16cV16cIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomuw, "V8sV8sV8sIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomud, "V4iV4iV4iIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomuq, "V2LLiV2LLiV2LLiIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomb, "V16cV16cV16cIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomw, "V8sV8sV8sIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomd, "V4iV4iV4iIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpcomq, "V2LLiV2LLiV2LLiIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpermil2pd, "V2dV2dV2dV2LLiIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpermil2pd256, "V4dV4dV4dV4LLiIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpermil2ps, "V4fV4fV4fV4iIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vpermil2ps256, "V8fV8fV8fV8iIc", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vfrczss, "V4fV4f", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vfrczsd, "V2dV2d", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vfrczps, "V4fV4f", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vfrczpd, "V2dV2d", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vfrczps256, "V8fV8f", "", "xop")
	TARGET_BUILTIN(__builtin_ia32_vfrczpd256, "V4dV4d", "", "xop")

	// RTM
	// xbegin / xend / xabort / xtest, all gated on the "rtm" feature string.
	TARGET_BUILTIN(__builtin_ia32_xbegin, "i", "", "rtm")
	TARGET_BUILTIN(__builtin_ia32_xend, "v", "", "rtm")
	TARGET_BUILTIN(__builtin_ia32_xabort, "vIc", "", "rtm")
	TARGET_BUILTIN(__builtin_ia32_xtest, "i", "", "rtm")

	// rdpmc / rdtsc / rdtscp. These rows use the three-argument BUILTIN macro
	// rather than TARGET_BUILTIN, so no required-feature string is attached.
	// __rdtsc is listed without the __builtin_ia32_ prefix and shares rdtsc's
	// signature string.
	BUILTIN(__builtin_ia32_rdpmc, "ULLii", "")
	BUILTIN(__builtin_ia32_rdtsc, "ULLi", "")
	BUILTIN(__rdtsc, "ULLi", "")
	BUILTIN(__builtin_ia32_rdtscp, "ULLiUi*", "")
	// PKU
	TARGET_BUILTIN(__builtin_ia32_rdpkru, "Ui", "", "pku")
	TARGET_BUILTIN(__builtin_ia32_wrpkru, "vUi", "", "pku")

	// AVX-512
	TARGET_BUILTIN(__builtin_ia32_sqrtpd512_mask, "V8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_sqrtps512_mask, "V16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14sd_mask, "V2dV2dV2dV2dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14ss_mask, "V4fV4fV4fV4fUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14pd512_mask, "V8dV8dV8dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14ps512_mask, "V16fV16fV16fUs", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_rsqrt28sd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_rsqrt28ss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_rsqrt28pd_mask, "V8dV8dV8dUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_rsqrt28ps_mask, "V16fV16fV16fUsIi", "", "avx512er")

	TARGET_BUILTIN(__builtin_ia32_rcp14sd_mask, "V2dV2dV2dV2dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rcp14ss_mask, "V4fV4fV4fV4fUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rcp14pd512_mask, "V8dV8dV8dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rcp14ps512_mask, "V16fV16fV16fUs", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_rcp28sd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_rcp28ss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_rcp28pd_mask, "V8dV8dV8dUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_rcp28ps_mask, "V16fV16fV16fUsIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_exp2pd_mask, "V8dV8dV8dUcIi", "", "avx512er")
	TARGET_BUILTIN(__builtin_ia32_exp2ps_mask, "V16fV16fV16fUsIi", "", "avx512er")

	TARGET_BUILTIN(__builtin_ia32_cvttps2dq512_mask, "V16iV16fV16iUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvttps2udq512_mask, "V16iV16fV16iUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2dq512_mask, "V8iV8dV8iUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2udq512_mask, "V8iV8dV8iUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cmpps512_mask, "UsV16fV16fIiUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_cmpps256_mask, "UcV8fV8fIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cmpps128_mask, "UcV4fV4fIiUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_pcmpeqb512_mask, "LLiV64cV64cLLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqd512_mask, "sV16iV16is", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqq512_mask, "cV8LLiV8LLic", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqw512_mask, "iV32sV32si", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_pcmpeqb256_mask, "iV32cV32ci", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqd256_mask, "cV8iV8ic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqq256_mask, "cV4LLiV4LLic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqw256_mask, "sV16sV16ss", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqb128_mask, "sV16cV16cs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqd128_mask, "cV4iV4ic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqq128_mask, "cV2LLiV2LLic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpeqw128_mask, "cV8sV8sc", "", "avx512vl,avx512bw")

	TARGET_BUILTIN(__builtin_ia32_pcmpgtb512_mask, "LLiV64cV64cLLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtd512_mask, "sV16iV16is", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtq512_mask, "cV8LLiV8LLic", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtw512_mask, "iV32sV32si", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_pcmpgtb256_mask, "iV32cV32ci", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtd256_mask, "cV8iV8ic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtq256_mask, "cV4LLiV4LLic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtw256_mask, "sV16sV16ss", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtb128_mask, "sV16cV16cs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtd128_mask, "cV4iV4ic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtq128_mask, "cV2LLiV2LLic", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pcmpgtw128_mask, "cV8sV8sc", "", "avx512vl,avx512bw")

	TARGET_BUILTIN(__builtin_ia32_cmppd512_mask, "UcV8dV8dIiUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cmppd256_mask, "UcV4dV4dIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cmppd128_mask, "UcV2dV2dIiUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_rndscaleps_mask, "V16fV16fIiV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_rndscalepd_mask, "V8dV8dIiV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtps2dq512_mask, "V16iV16fV16iUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2dq512_mask, "V8iV8dV8iUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtps2udq512_mask, "V16iV16fV16iUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2udq512_mask, "V8iV8dV8iUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_minps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_minpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_maxps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_maxpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtdq2ps512_mask, "V16fV16iV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtudq2ps512_mask, "V16fV16iV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2ps512_mask, "V8fV8dV8fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvtps2ph512_mask, "V16sV16fIiV16sUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvtph2ps512_mask, "V16fV16sV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pabsd512_mask, "V16iV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pabsq512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmaxsd512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmaxsq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmaxud512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmaxuq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pminsd512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pminsq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pminud512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pminuq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmuldq512, "V8LLiV16iV16i", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmuludq512, "V8LLiV16iV16i", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_ptestmd512, "UsV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_ptestmq512, "UcV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastd512_gpr_mask, "V16iiV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastq512_mem_mask, "V8LLiLLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_loaddqusi512_mask, "V16iiC*V16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_loaddqudi512_mask, "V8LLiLLiC*V8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_loadups512_mask, "V16ffC*V16fUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_loadaps512_mask, "V16fV16fC*V16fUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_loadupd512_mask, "V8ddC*V8dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_loadapd512_mask, "V8dV8dC*V8dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_storedqudi512_mask, "vLLi*V8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_storedqusi512_mask, "vi*V16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_storeupd512_mask, "vd*V8dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_storeapd512_mask, "vV8d*V8dUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_storeups512_mask, "vf*V16fUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_storeaps512_mask, "vV16f*V16fUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2vard512_mask, "V16iV16iV16iV16iUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varq512_mask, "V8LLiV8LLiV8LLiV8LLiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varps512_mask, "V16fV16iV16fV16fUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varpd512_mask, "V8dV8LLiV8dV8dUc", "", "avx512f")

	// gather3* builtins, all gated on "avx512vl". The "avx512f" gather and
	// scatter entries follow directly after this group.
	TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2ddC*V2LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div2di, "V2LLiV2LLiLLiC*V2LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div4df, "V4dV4ddC*V4LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div4di, "V4LLiV4LLiLLiC*V4LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div4sf, "V4fV4ffC*V2LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div4si, "V4iV4iiC*V2LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div8sf, "V4fV4ffC*V4LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3div8si, "V4iV4iiC*V4LLiUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv2df, "V2dV2ddC*V4iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv2di, "V2LLiV2LLiLLiC*V4iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv4df, "V4dV4ddC*V4iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv4di, "V4LLiV4LLiLLiC*V4iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv4sf, "V4fV4ffC*V4iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv4si, "V4iV4iiC*V4iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv8sf, "V8fV8ffC*V8iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gather3siv8si, "V8iV8iiC*V8iUcIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_gathersiv8df, "V8dV8ddC*V8iUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gathersiv16sf, "V16fV16ffC*V16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gatherdiv8df, "V8dV8ddC*V8LLiUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gatherdiv16sf, "V8fV8ffC*V8LLiUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gathersiv8di, "V8LLiV8LLiLLiC*V8iUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gathersiv16si, "V16iV16iiC*V16iUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gatherdiv8di, "V8LLiV8LLiLLiC*V8LLiUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_gatherdiv16si, "V8iV8iiC*V8LLiUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scattersiv8df, "vd*UcV8iV8dIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scattersiv16sf, "vf*UsV16iV16fIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv8df, "vd*UcV8LLiV8dIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv16sf, "vf*UcV8LLiV8fIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scattersiv8di, "vLLi*UcV8iV8LLiIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scattersiv16si, "vi*UsV16iV16iIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv8di, "vLLi*UcV8LLiV8LLiIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv16si, "vi*UcV8LLiV8iIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_gatherpfdpd, "vUcV8iLLiC*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_gatherpfdps, "vUsV16iiC*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_gatherpfqpd, "vUcV8LLiLLiC*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_gatherpfqps, "vUcV8LLiiC*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_scatterpfdpd, "vUcV8iLLi*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_scatterpfdps, "vUsV16ii*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_scatterpfqpd, "vUcV8LLiLLi*IiIi", "", "avx512pf")
	TARGET_BUILTIN(__builtin_ia32_scatterpfqps, "vUcV8LLii*IiIi", "", "avx512pf")

	TARGET_BUILTIN(__builtin_ia32_knothi, "UsUs", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_cmpb128_mask, "UsV16cV16cIiUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cmpd128_mask, "UcV4iV4iIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cmpq128_mask, "UcV2LLiV2LLiIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cmpw128_mask, "UcV8sV8sIiUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cmpb256_mask, "UiV32cV32cIiUi", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cmpd256_mask, "UcV8iV8iIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cmpq256_mask, "UcV4LLiV4LLiIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cmpw256_mask, "UsV16sV16sIiUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cmpb512_mask, "ULLiV64cV64cIiULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cmpd512_mask, "UsV16iV16iIiUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cmpq512_mask, "UcV8LLiV8LLiIiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_cmpw512_mask, "UiV32sV32sIiUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ucmpb128_mask, "UsV16cV16cIiUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ucmpd128_mask, "UcV4iV4iIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ucmpq128_mask, "UcV2LLiV2LLiIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ucmpw128_mask, "UcV8sV8sIiUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ucmpb256_mask, "UiV32cV32cIiUi", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ucmpd256_mask, "UcV8iV8iIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ucmpq256_mask, "UcV4LLiV4LLiIiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ucmpw256_mask, "UsV16sV16sIiUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ucmpb512_mask, "ULLiV64cV64cIiULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ucmpd512_mask, "UsV16iV16iIiUs", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_ucmpq512_mask, "UcV8LLiV8LLiIiUc", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_ucmpw512_mask, "UiV32sV32sIiUi", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_pabsb512_mask, "V64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pabsw512_mask, "V32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_packssdw512, "V32sV16iV16i", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_packsswb512, "V64cV32sV32s", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_packusdw512, "V32sV16iV16i", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_packuswb512, "V64cV32sV32s", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_paddsb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_paddsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_paddusb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_paddusw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pavgb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pavgw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmaxsb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmaxsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmaxub512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmaxuw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pminsb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pminsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pminub512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pminuw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pshufb512, "V64cV64cV64c", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psubsb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psubsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psubusb512_mask, "V64cV64cV64cV64cULLi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psubusw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_vpermi2varhi512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varhi512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varhi512_maskz, "V32sV32sV32sV32sUi", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_vpconflictdi_128_mask, "V2LLiV2LLiV2LLiUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpconflictdi_256_mask, "V4LLiV4LLiV4LLiUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpconflictsi_128_mask, "V4iV4iV4iUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpconflictsi_256_mask, "V8iV8iV8iUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpconflictdi_512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512cd")
	TARGET_BUILTIN(__builtin_ia32_vpconflictsi_512_mask, "V16iV16iV16iUs", "", "avx512cd")
	TARGET_BUILTIN(__builtin_ia32_vplzcntd_512_mask, "V16iV16iV16iUs", "", "avx512cd")
	TARGET_BUILTIN(__builtin_ia32_vplzcntq_512_mask, "V8LLiV8LLiV8LLiUc", "", "avx512cd")

	TARGET_BUILTIN(__builtin_ia32_vpopcntd_512, "V16iV16i", "", "avx512vpopcntdq")
	TARGET_BUILTIN(__builtin_ia32_vpopcntq_512, "V8LLiV8LLi", "", "avx512vpopcntdq")

	TARGET_BUILTIN(__builtin_ia32_vpermi2varhi128_mask, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varhi256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varhi128_mask, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varhi128_maskz, "V8sV8sV8sV8sUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varhi256_mask, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varhi256_maskz, "V16sV16sV16sV16sUs", "", "avx512vl,avx512bw")

	TARGET_BUILTIN(__builtin_ia32_pmulhrsw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmulhuw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmulhw512_mask, "V32sV32sV32sV32sUi", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_addpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_addps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_divpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_divps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_mulpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_mulps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_subpd512_mask, "V8dV8dV8dV8dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_subps512_mask, "V16fV16fV16fV16fUsIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_pmaddubsw512_mask, "V32sV64cV64cV32sUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmaddwd512_mask, "V16iV32sV32sV16iUs", "", "avx512bw")

	TARGET_BUILTIN(__builtin_ia32_addss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_divss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_mulss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_subss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_maxss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_minss_round_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_addsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_divsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_mulsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_subsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_maxsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_minsd_round_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")

	TARGET_BUILTIN(__builtin_ia32_compressdf128_mask, "V2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressdf256_mask, "V4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressdi128_mask, "V2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressdi256_mask, "V4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compresssf128_mask, "V4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compresssf256_mask, "V8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compresssi128_mask, "V4iV4iV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compresssi256_mask, "V8iV8iV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoredf128_mask, "vV2d*V2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoredf256_mask, "vV4d*V4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoredi128_mask, "vV2LLi*V2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoredi256_mask, "vV4LLi*V4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoresf128_mask, "vV4f*V4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoresf256_mask, "vV8f*V8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoresi128_mask, "vV4i*V4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_compressstoresi256_mask, "vV8i*V8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtdq2ps128_mask, "V4fV4iV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtdq2ps256_mask, "V8fV8iV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2dq128_mask, "V4iV2dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2dq256_mask, "V4iV4dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2ps_mask, "V4fV2dV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2ps256_mask, "V4fV4dV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2udq128_mask, "V4iV2dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2udq256_mask, "V4iV4dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtps2dq128_mask, "V4iV4fV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtps2dq256_mask, "V8iV8fV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtps2pd128_mask, "V2dV4fV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtps2pd256_mask, "V4dV4fV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtps2udq128_mask, "V4iV4fV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtps2udq256_mask, "V8iV8fV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2dq128_mask, "V4iV2dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2dq256_mask, "V4iV4dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2udq128_mask, "V4iV2dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2udq256_mask, "V4iV4dV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttps2dq128_mask, "V4iV4fV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttps2dq256_mask, "V8iV8fV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttps2udq128_mask, "V4iV4fV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvttps2udq256_mask, "V8iV8fV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtudq2ps128_mask, "V4fV4iV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtudq2ps256_mask, "V8fV8iV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expanddf128_mask, "V2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expanddf256_mask, "V4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expanddi128_mask, "V2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expanddi256_mask, "V4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloaddf128_mask, "V2dV2d*V2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloaddf256_mask, "V4dV4d*V4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloaddi128_mask, "V4iV2LLi*V2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloaddi256_mask, "V4LLiV4LLi*V4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloadsf128_mask, "V4fV4f*V4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloadsf256_mask, "V8fV8f*V8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloadsi128_mask, "V4iV4i*V4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandloadsi256_mask, "V8iV8i*V8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandsf128_mask, "V4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandsf256_mask, "V8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandsi128_mask, "V4iV4iV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_expandsi256_mask, "V8iV8iV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getexppd128_mask, "V2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getexppd256_mask, "V4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getexpps128_mask, "V4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getexpps256_mask, "V8fV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pabsq128_mask, "V2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pabsq256_mask, "V4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmaxsq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmaxsq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmaxuq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmaxuq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pminsq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pminsq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pminuq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pminuq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rndscalepd_128_mask, "V2dV2dIiV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rndscalepd_256_mask, "V4dV4dIiV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rndscaleps_128_mask, "V4fV4fIiV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rndscaleps_256_mask, "V8fV8fIiV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scalefpd128_mask, "V2dV2dV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scalefpd256_mask, "V4dV4dV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scalefps128_mask, "V4fV4fV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scalefps256_mask, "V8fV8fV8fV8fUc", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_scatterdiv2df, "vd*UcV2LLiV2dIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv2di, "vLLi*UcV2LLiV2LLiIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv4df, "vd*UcV4LLiV4dIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv4di, "vLLi*UcV4LLiV4LLiIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv4sf, "vf*UcV2LLiV4fIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv4si, "vi*UcV2LLiV4iIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv8sf, "vf*UcV4LLiV4fIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scatterdiv8si, "vi*UcV4LLiV4iIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv2df, "vd*UcV4iV2dIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv2di, "vLLi*UcV4iV2LLiIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv4df, "vd*UcV4iV4dIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv4di, "vLLi*UcV4iV4LLiIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv4sf, "vf*UcV4iV4fIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv4si, "vi*UcV4iV4iIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv8sf, "vf*UcV8iV8fIi", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_scattersiv8si, "vi*UcV8iV8iIi", "", "avx512vl")

	TARGET_BUILTIN(__builtin_ia32_vpermi2vard128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2vard256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varpd128_mask, "V2dV2dV2LLiV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varpd256_mask, "V4dV4dV4LLiV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varps128_mask, "V4fV4fV4iV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varps256_mask, "V8fV8fV8iV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2vard128_mask, "V4iV4iV4iV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2vard128_maskz, "V4iV4iV4iV4iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2vard256_mask, "V8iV8iV8iV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2vard256_maskz, "V8iV8iV8iV8iUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varpd128_mask, "V2dV2LLiV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varpd128_maskz, "V2dV2LLiV2dV2dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varpd256_mask, "V4dV4LLiV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varpd256_maskz, "V4dV4LLiV4dV4dUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varps128_mask, "V4fV4iV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varps128_maskz, "V4fV4iV4fV4fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varps256_mask, "V8fV8iV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varps256_maskz, "V8fV8iV8fV8fUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varq128_mask, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varq128_maskz, "V2LLiV2LLiV2LLiV2LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varq256_mask, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varq256_maskz, "V4LLiV4LLiV4LLiV4LLiUc", "", "avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovswb512_mask, "V32cV32sV32cUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovuswb512_mask, "V32cV32sV32cUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovwb512_mask, "V32cV32sV32cUi", "", "avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2qq128_mask, "V2LLiV2dV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2qq256_mask, "V4LLiV4dV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq128_mask, "V2LLiV2dV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq256_mask, "V4LLiV4dV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtps2qq128_mask, "V2LLiV4fV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtps2qq256_mask, "V4LLiV4fV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtps2uqq128_mask, "V2LLiV4fV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtps2uqq256_mask, "V4LLiV4fV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtqq2pd128_mask, "V2dV2LLiV2dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtqq2pd256_mask, "V4dV4LLiV4dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtqq2ps128_mask, "V4fV2LLiV4fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtqq2ps256_mask, "V4fV4LLiV4fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2qq128_mask, "V2LLiV2dV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2qq256_mask, "V4LLiV4dV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq128_mask, "V2LLiV2dV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq256_mask, "V4LLiV4dV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttps2qq128_mask, "V2LLiV4fV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttps2qq256_mask, "V4LLiV4fV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttps2uqq128_mask, "V2LLiV4fV2LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttps2uqq256_mask, "V4LLiV4fV4LLiUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtuqq2pd128_mask, "V2dV2LLiV2dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtuqq2pd256_mask, "V4dV4LLiV4dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps128_mask, "V4fV2LLiV4fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps256_mask, "V4fV4LLiV4fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangepd128_mask, "V2dV2dV2dIiV2dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangepd256_mask, "V4dV4dV4dIiV4dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangeps128_mask, "V4fV4fV4fIiV4fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangeps256_mask, "V8fV8fV8fIiV8fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangesd128_round_mask, "V2dV2dV2dV2dUcIiIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangess128_round_mask, "V4fV4fV4fV4fUcIiIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reducepd128_mask, "V2dV2dIiV2dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reducepd256_mask, "V4dV4dIiV4dUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reduceps128_mask, "V4fV4fIiV4fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reduceps256_mask, "V8fV8fIiV8fUc", "", "avx512vl,avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reducesd_mask, "V2dV2dV2dV2dUcIiIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reducess_mask, "V4fV4fV4fV4fUcIiIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_pmovswb128_mask, "V16cV8sV16cUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovswb256_mask, "V16cV16sV16cUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovuswb128_mask, "V16cV8sV16cUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovuswb256_mask, "V16cV16sV16cUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovwb128_mask, "V16cV8sV16cUc", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovwb256_mask, "V16cV16sV16cUs", "", "avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2qq512_mask, "V8LLiV8dV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtpd2uqq512_mask, "V8LLiV8dV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtps2qq512_mask, "V8LLiV8fV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtps2uqq512_mask, "V8LLiV8fV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtqq2pd512_mask, "V8dV8LLiV8dUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtqq2ps512_mask, "V8fV8LLiV8fUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2qq512_mask, "V8LLiV8dV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttpd2uqq512_mask, "V8LLiV8dV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttps2qq512_mask, "V8LLiV8fV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvttps2uqq512_mask, "V8LLiV8fV8LLiUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtuqq2pd512_mask, "V8dV8LLiV8dUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtuqq2ps512_mask, "V8fV8LLiV8fUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangepd512_mask, "V8dV8dV8dIiV8dUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_rangeps512_mask, "V16fV16fV16fIiV16fUsIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reducepd512_mask, "V8dV8dIiV8dUcIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_reduceps512_mask, "V16fV16fIiV16fUsIi", "", "avx512dq")
	TARGET_BUILTIN(__builtin_ia32_prold512_mask, "V16iV16iIiV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prolq512_mask, "V8LLiV8LLiIiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prold128_mask, "V4iV4iIiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prold256_mask, "V8iV8iIiV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prolq128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prolq256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prolvd512_mask, "V16iV16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prolvq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prord512_mask, "V16iV16iiV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prorq512_mask, "V8LLiV8LLiiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prolvd128_mask, "V4iV4iV4iV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prolvd256_mask, "V8iV8iV8iV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prolvq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prolvq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prord128_mask, "V4iV4iIiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prord256_mask, "V8iV8iIiV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prorq128_mask, "V2LLiV2LLiIiV2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prorq256_mask, "V4LLiV4LLiIiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prorvd512_mask, "V16iV16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prorvq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_prorvd128_mask, "V4iV4iV4iV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prorvd256_mask, "V8iV8iV8iV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prorvq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_prorvq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psllv32hi, "V32sV32sV32s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psllw512, "V32sV32sV8s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psllwi512, "V32sV32si","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psllv16hi, "V16sV16sV16s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psllv8hi, "V8sV8sV8s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pslldi512, "V16iV16ii","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psllqi512, "V8LLiV8LLii","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrlv32hi, "V32sV32sV32s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psrlv16hi, "V16sV16sV16s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psrlv8hi, "V8sV8sV8s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psrldi512, "V16iV16ii","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrlqi512, "V8LLiV8LLii","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrav32hi, "V32sV32sV32s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psrav16hi, "V16sV16sV16s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psrav8hi, "V8sV8sV8s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psravq128, "V2LLiV2LLiV2LLi","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psravq256, "V4LLiV4LLiV4LLi","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psraw512, "V32sV32sV8s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psrawi512, "V32sV32si","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psrlw512, "V32sV32sV8s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psrlwi512, "V32sV32si","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_movdqa32load128_mask, "V4iV4i*V4iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa32load256_mask, "V8iV8i*V8iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa32load512_mask, "V16iV16iC*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa32store512_mask, "vV16i*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa64load512_mask, "V8LLiV8LLiC*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa64store512_mask, "vV8LLi*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa32store128_mask, "vV4i*V4iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa32store256_mask, "vV8i*V8iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa64load128_mask, "V2LLiV2LLiC*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_movdqa64load256_mask, "V4LLiV4LLiC*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_movdqa64store128_mask, "vV2LLi*V2LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_movdqa64store256_mask, "vV4LLi*V4LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastb512_gpr_mask, "V64ccV64cULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastb128_gpr_mask, "V16ccV16cUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastb256_gpr_mask, "V32ccV32cUi","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastd128_gpr_mask, "V4iiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastd256_gpr_mask, "V8iiV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52huq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52luq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512ifma")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52huq128_maskz, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52huq256_maskz, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128_mask, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52luq128_maskz, "V2LLiV2LLiV2LLiV2LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmadd52luq256_maskz, "V4LLiV4LLiV4LLiV4LLiUc","","avx512ifma,avx512vl")
	// vpermi2varqi / vpermt2varqi byte-vector builtins. 512-bit entries list
	// avx512vbmi; 128/256-bit entries add avx512vl. The trailing mask type follows
	// the element count: Us for V16c, Ui for V32c, ULLi for V64c. Only the
	// vpermt2 family has _maskz entries in this group.
	TARGET_BUILTIN(__builtin_ia32_vpermi2varqi512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varqi512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varqi512_maskz, "V64cV64cV64cV64cULLi","","avx512vbmi")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varqi128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varqi256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varqi128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varqi128_maskz, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varqi256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varqi256_maskz, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
	// Scalar vcomi{sd,ss} comparisons; both signature strings end in two Ii types.
	TARGET_BUILTIN(__builtin_ia32_vcomisd, "iV2dV2dIiIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcomiss, "iV4fV4fIiIi","","avx512f")
	// Mask-register unpack builtins on ULLi / Ui masks; both list avx512bw.
	TARGET_BUILTIN(__builtin_ia32_kunpckdi, "ULLiULLiULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_kunpcksi, "UiUiUi","","avx512bw")
	// 512-bit masked loads of word / byte elements. Byte- and word-granular
	// masking is gated on avx512bw, matching the corresponding
	// storedquhi512_mask / storedquqi512_mask entries; avx512f alone is not
	// sufficient.
	TARGET_BUILTIN(__builtin_ia32_loaddquhi512_mask, "V32sV32s*V32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_loaddquqi512_mask, "V64cV64c*V64cULLi","","avx512bw")
	// fixupimm (512-bit packed and sd/ss scalar), getexp{sd,ss} and getmant{sd,ss}
	// builtins; all list avx512f. Every signature string in this group ends in Ii.
	// NOTE(review): the trailing Ii is presumably the rounding/SAE immediate --
	// confirm against avx512fintrin.h.
	TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_mask, "V8dV8dV8dV8LLiIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmpd512_maskz, "V8dV8dV8dV8LLiIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmps512_mask, "V16fV16fV16fV16iIiUsIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmps512_maskz, "V16fV16fV16fV16iIiUsIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmsd_mask, "V2dV2dV2dV2LLiIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmsd_maskz, "V2dV2dV2dV2LLiIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmss_mask, "V4fV4fV4fV4iIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_fixupimmss_maskz, "V4fV4fV4fV4iIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getexpsd128_round_mask, "V2dV2dV2dV2dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getexpss128_round_mask, "V4fV4fV4fV4fUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getmantsd_round_mask, "V2dV2dV2dIiV2dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getmantss_round_mask, "V4fV4fV4fIiV4fUcIi","","avx512f")
	// 128/256-bit masked loads of word / byte elements; all list
	// avx512bw,avx512vl.
	TARGET_BUILTIN(__builtin_ia32_loaddquhi128_mask, "V8sV8s*V8sUc","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loaddquhi256_mask, "V16sV16s*V16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loaddquqi128_mask, "V16cV16c*V16cUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loaddquqi256_mask, "V32cV32c*V32cUi","","avx512bw,avx512vl")
	// 128/256-bit fixupimm builtins in _mask and _maskz form; all list avx512vl.
	// Unlike the 512-bit and scalar fixupimm entries, these signature strings end
	// with the mask type (Uc) and carry no trailing Ii.
	TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_mask, "V2dV2dV2dV2LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmpd128_maskz, "V2dV2dV2dV2LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_mask, "V4dV4dV4dV4LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmpd256_maskz, "V4dV4dV4dV4LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmps128_mask, "V4fV4fV4fV4iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmps128_maskz, "V4fV4fV4fV4iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmps256_mask, "V8fV8fV8fV8iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fixupimmps256_maskz, "V8fV8fV8fV8iIiUc","","avx512vl")
	// 128/256-bit masked load builtins; each signature string has the form
	// <vec><vec>*<vec><mask>. All 128/256-bit forms list avx512vl.
	// loadsd128_mask and loadss128_mask are the exception: they are declared with
	// V8d / V16f types and avx512f.
	// NOTE(review): the V8d / V16f typing of the scalar loads looks deliberate
	// (the matching storesd128_mask / storess128_mask use the same types) --
	// confirm against avx512fintrin.h before changing.
	TARGET_BUILTIN(__builtin_ia32_loadapd128_mask, "V2dV2d*V2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadsd128_mask, "V8dV8d*V8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_loadapd256_mask, "V4dV4d*V4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadaps128_mask, "V4fV4f*V4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadss128_mask, "V16fV16f*V16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_loadaps256_mask, "V8fV8f*V8fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loaddqudi128_mask, "V2LLiV2LLi*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loaddqudi256_mask, "V4LLiV4LLi*V4LLiUc","","avx512vl")
	// The dword forms are gated on avx512vl like the qword forms just above and
	// like storedqusi128_mask / storedqusi256_mask; avx512f alone does not
	// provide 128/256-bit masked loads.
	TARGET_BUILTIN(__builtin_ia32_loaddqusi128_mask, "V4iV4i*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loaddqusi256_mask, "V8iV8i*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadupd128_mask, "V2dV2d*V2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadupd256_mask, "V4dV4d*V4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadups128_mask, "V4fV4f*V4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_loadups256_mask, "V8fV8f*V8fUc","","avx512vl")
	// Masked store builtins; every signature string has the form
	// v<vec>*<vec><mask>. Word/byte forms list avx512bw (plus avx512vl for
	// 128/256); the other 128/256-bit forms list avx512vl. storesd128_mask and
	// storess128_mask are declared with V8d / V16f types and avx512f, mirroring
	// loadsd128_mask / loadss128_mask.
	TARGET_BUILTIN(__builtin_ia32_storedquhi512_mask, "vV32s*V32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_storedquqi512_mask, "vV64c*V64cULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_storedquhi128_mask, "vV8s*V8sUc","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_storedquhi256_mask, "vV16s*V16sUs","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_storedquqi128_mask, "vV16c*V16cUs","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_storedquqi256_mask, "vV32c*V32cUi","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_storeapd128_mask, "vV2d*V2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storesd128_mask, "vV8d*V8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_storeapd256_mask, "vV4d*V4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storeaps128_mask, "vV4f*V4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storess128_mask, "vV16f*V16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_storeaps256_mask, "vV8f*V8fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storedqudi128_mask, "vV2LLi*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storedqudi256_mask, "vV4LLi*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storedqusi128_mask, "vV4i*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storedqusi256_mask, "vV8i*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storeupd128_mask, "vV2d*V2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storeupd256_mask, "vV4d*V4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storeups128_mask, "vV4f*V4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_storeups256_mask, "vV8f*V8fUc","","avx512vl")
	// 128/256-bit rcp14 builtins; all list avx512vl.
	TARGET_BUILTIN(__builtin_ia32_rcp14pd128_mask, "V2dV2dV2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rcp14pd256_mask, "V4dV4dV4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rcp14ps128_mask, "V4fV4fV4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rcp14ps256_mask, "V8fV8fV8fUc","","avx512vl")
	// 128/256-bit vplzcnt{d,q} builtins; all list avx512cd,avx512vl.
	TARGET_BUILTIN(__builtin_ia32_vplzcntd_128_mask, "V4iV4iV4iUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vplzcntd_256_mask, "V8iV8iV8iUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vplzcntq_128_mask, "V2LLiV2LLiV2LLiUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vplzcntq_256_mask, "V4LLiV4LLiV4LLiUc","","avx512cd,avx512vl")
	// Scalar vcvt{,t}{sd,ss}2{si,usi}32 builtins; all list avx512f. The "usi"
	// entries have a signature string starting with Ui, the "si" entries with i;
	// every one ends in Ii.
	TARGET_BUILTIN(__builtin_ia32_vcvtsd2si32, "iV2dIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvtsd2usi32, "UiV2dIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvtss2si32, "iV4fIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvtss2usi32, "UiV4fIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvttsd2si32, "iV2dIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvttsd2usi32, "UiV2dIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvttss2si32, "iV4fIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vcvttss2usi32, "UiV4fIi","","avx512f")
	// 512-bit vpermi2var (_mask), vpermilvar (unmasked) and vpermt2var (_maskz)
	// builtins; all list avx512f. Note the differing position of the integer
	// vector in the floating-point signature strings: vpermi2varpd/ps place it
	// third (V8dV8dV8LLi...), vpermt2varpd/ps place it second (V8dV8LLiV8d...).
	TARGET_BUILTIN(__builtin_ia32_vpermi2vard512_mask, "V16iV16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varpd512_mask, "V8dV8dV8LLiV8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varps512_mask, "V16fV16fV16iV16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermi2varq512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermilvarpd512, "V8dV8dV8LLi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermilvarps512, "V16fV16fV16i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2vard512_maskz, "V16iV16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varpd512_maskz, "V8dV8LLiV8dV8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varps512_maskz, "V16fV16iV16fV16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_vpermt2varq512_maskz, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
	// ptestm / ptestnm builtins. Each signature string is
	// <mask><vec><vec><mask>, with the same mask type at both ends. Byte/word
	// forms list avx512bw (plus avx512vl for 128/256); dword/qword 128/256 forms
	// list avx512vl; the dword/qword 512-bit ptestnm forms list avx512f.
	TARGET_BUILTIN(__builtin_ia32_ptestmb512, "ULLiV64cV64cULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ptestmw512, "UiV32sV32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ptestnmb512, "ULLiV64cV64cULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ptestnmw512, "UiV32sV32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_ptestmb128, "UsV16cV16cUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmb256, "UiV32cV32cUi","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmw128, "UcV8sV8sUc","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmw256, "UsV16sV16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmb128, "UsV16cV16cUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmb256, "UiV32cV32cUi","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmw128, "UcV8sV8sUc","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmw256, "UsV16sV16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmd128, "UcV4iV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmd256, "UcV8iV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmq128, "UcV2LLiV2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestmq256, "UcV4LLiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmd128, "UcV4iV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmd256, "UcV8iV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmq128, "UcV2LLiV2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmq256, "UcV4LLiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_ptestnmd512, "UsV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_ptestnmq512, "UcV8LLiV8LLiUc","","avx512f")
	// rndscale{sd,ss} and scalef (512-bit packed and sd/ss scalar) builtins; all
	// list avx512f. The rndscale signature strings end in two Ii types, the
	// scalef ones in a single Ii.
	TARGET_BUILTIN(__builtin_ia32_rndscalesd_round_mask, "V2dV2dV2dV2dUcIiIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_rndscaless_round_mask, "V4fV4fV4fV4fUcIiIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_scalefpd512_mask, "V8dV8dV8dV8dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_scalefps512_mask, "V16fV16fV16fV16fUsIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_scalefsd_round_mask, "V2dV2dV2dV2dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_scalefss_round_mask, "V4fV4fV4fV4fUcIi","","avx512f")
	// Unmasked shift builtins (psra / psll / psrl). Three signature shapes appear:
	//   - "...i" suffix names (psradi512, psraqi*): second operand is a plain i;
	//   - psll{d,q}512 / psra{d,q}512 / psrl{d,q}512 and psraq128/256: second
	//     operand is V4i or V2LLi regardless of the first operand's width;
	//   - ps{ll,ra,rl}v*: second operand has the same vector type as the first.
	// 512-bit entries list avx512f; the 128/256-bit psraq entries list avx512vl.
	TARGET_BUILTIN(__builtin_ia32_psradi512, "V16iV16ii","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psraqi512, "V8LLiV8LLii","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psraq128, "V2LLiV2LLiV2LLi","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psraq256, "V4LLiV4LLiV2LLi","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psraqi128, "V2LLiV2LLii","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_psraqi256, "V4LLiV4LLii","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pslld512, "V16iV16iV4i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psllq512, "V8LLiV8LLiV2LLi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psllv16si, "V16iV16iV16i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psllv8di, "V8LLiV8LLiV8LLi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrad512, "V16iV16iV4i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psraq512, "V8LLiV8LLiV2LLi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrav16si, "V16iV16iV16i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrav8di, "V8LLiV8LLiV8LLi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrld512, "V16iV16iV4i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrlq512, "V8LLiV8LLiV2LLi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrlv16si, "V16iV16iV16i","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_psrlv8di, "V8LLiV8LLiV8LLi","","avx512f")
	// pternlog{d,q} builtins in _mask and _maskz form. Each signature string is
	// four identical vector types, then Ii, then the mask type. 512-bit entries
	// list avx512f; 128/256-bit entries list avx512vl.
	TARGET_BUILTIN(__builtin_ia32_pternlogd512_mask, "V16iV16iV16iV16iIiUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pternlogd512_maskz, "V16iV16iV16iV16iIiUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pternlogq512_mask, "V8LLiV8LLiV8LLiV8LLiIiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pternlogq512_maskz, "V8LLiV8LLiV8LLiV8LLiIiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pternlogd128_mask, "V4iV4iV4iV4iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogd128_maskz, "V4iV4iV4iV4iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogd256_mask, "V8iV8iV8iV8iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogd256_maskz, "V8iV8iV8iV8iIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogq128_mask, "V2LLiV2LLiV2LLiV2LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogq128_maskz, "V2LLiV2LLiV2LLiV2LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogq256_mask, "V4LLiV4LLiV4LLiV4LLiIiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pternlogq256_maskz, "V4LLiV4LLiV4LLiV4LLiIiUc","","avx512vl")
	// shuf_{f32x4,f64x2,i32x4,i64x2} builtins: 512-bit entries list avx512f, the
	// _256 entries list avx512vl. The Ii sits between the second and third
	// vector types in every signature string.
	TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_mask, "V16fV16fV16fIiV16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_mask, "V8dV8dV8dIiV8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_mask, "V16iV16iV16iIiV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_mask, "V8LLiV8LLiV8LLiIiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_256_mask, "V8fV8fV8fIiV8fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_256_mask, "V4dV4dV4dIiV4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_256_mask, "V8iV8iV8iIiV8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_256_mask, "V4LLiV4LLiV4LLiIiV4LLiUc","","avx512vl")
	// Scalar sqrt{sd,ss} builtins with a trailing Ii (avx512f), followed by the
	// 128/256-bit rsqrt14 builtins (avx512vl).
	TARGET_BUILTIN(__builtin_ia32_sqrtsd_round_mask, "V2dV2dV2dV2dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_sqrtss_round_mask, "V4fV4fV4fV4fUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14pd128_mask, "V2dV2dV2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14pd256_mask, "V4dV4dV4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14ps128_mask, "V4fV4fV4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_rsqrt14ps256_mask, "V8fV8fV8fUc","","avx512vl")
	// Vector <-> mask conversion builtins (cvt{b,d,q}2mask* and
	// cvtmask2{b,w,d,q}*). Each signature string has exactly two types: a vector
	// and a mask integer, in the order implied by the name. Byte/word forms list
	// avx512bw, dword/qword forms list avx512dq; 128/256-bit forms add avx512vl.
	TARGET_BUILTIN(__builtin_ia32_cvtb2mask512, "ULLiV64c","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2b512, "V64cULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2w512, "V32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cvtd2mask512, "UsV16i","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2d512, "V16iUs","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2q512, "V8LLiUc","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtq2mask512, "UcV8LLi","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_cvtb2mask128, "UsV16c","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtb2mask256, "UiV32c","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2b128, "V16cUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2b256, "V32cUi","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2w128, "V8sUc","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2w256, "V16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtd2mask128, "UcV4i","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtd2mask256, "UcV8i","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2d128, "V4iUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2d256, "V8iUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2q128, "V2LLiUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtmask2q256, "V4LLiUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtq2mask128, "UcV2LLi","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtq2mask256, "UcV4LLi","","avx512dq,avx512vl")
	// broadcastm{b,w} builtins: signature is <vec><mask-int>. They list avx512cd
	// (plus avx512vl for 128/256).
	TARGET_BUILTIN(__builtin_ia32_broadcastmb512, "V8LLiUc","","avx512cd")
	TARGET_BUILTIN(__builtin_ia32_broadcastmw512, "V16iUs","","avx512cd")
	TARGET_BUILTIN(__builtin_ia32_broadcastmb128, "V2LLiUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_broadcastmb256, "V4LLiUc","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_broadcastmw128, "V4iUs","","avx512cd,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_broadcastmw256, "V8iUs","","avx512cd,avx512vl")
	// broadcast{f,i}32x2 builtins: the second type in each signature string is
	// always a 4-element vector (V4f / V4i). They list avx512dq (plus avx512vl for
	// 128/256). No f32x2 128-bit entry appears in this group.
	TARGET_BUILTIN(__builtin_ia32_broadcastf32x2_512_mask, "V16fV4fV16fUs","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_broadcasti32x2_512_mask, "V16iV4iV16iUs","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_broadcastf32x2_256_mask, "V8fV4fV8fUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_broadcasti32x2_128_mask, "V4iV4iV4iUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_broadcasti32x2_256_mask, "V8iV4iV8iUc","","avx512dq,avx512vl")
	// Masked broadcast of a 16-bit scalar into a word vector. The scalar operand
	// is encoded as 's' (short) in all three widths, matching the element type of
	// the V*s vectors and the way pbroadcastb*/pbroadcastd*_gpr_mask repeat their
	// element letter ('c' / 'i'). In Clang's builtin type encoding 'h' denotes a
	// half-precision float, which is not the intended operand type here.
	TARGET_BUILTIN(__builtin_ia32_pbroadcastw512_gpr_mask, "V32ssV32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastw256_gpr_mask, "V16ssV16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pbroadcastw128_gpr_mask, "V8ssV8sUc","","avx512bw,avx512vl")
	// pmovs* builtins. Each conversion comes as a register form (*_mask) and a
	// *mem_mask form whose signature string starts with v and a <vec>* type.
	// 512-bit entries list avx512f (avx512bw for the word->byte form); 128/256-bit
	// entries list avx512vl (plus avx512bw for word->byte). Only the *mem_mask
	// variant of pmovswb appears in this group.
	TARGET_BUILTIN(__builtin_ia32_pmovsdb512_mask, "V16cV16iV16cUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsdb512mem_mask, "vV16c*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovswb512mem_mask, "vV32c*V32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovsdw512_mask, "V16sV16iV16sUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsdw512mem_mask, "vV16s*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsqb512_mask, "V16cV8LLiV16cUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsqb512mem_mask, "vV16c*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsqd512_mask, "V8iV8LLiV8iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsqd512mem_mask, "vV8i*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsqw512_mask, "V8sV8LLiV8sUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsqw512mem_mask, "vV8s*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovsdb128_mask, "V16cV4iV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsdb128mem_mask, "vV16c*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovswb128mem_mask, "vV16c*V8sUc","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovsdb256_mask, "V16cV8iV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsdb256mem_mask, "vV16c*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovswb256mem_mask, "vV16c*V16sUs","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovsdw128_mask, "V8sV4iV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsdw128mem_mask, "vV8s*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsdw256_mask, "V8sV8iV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsdw256mem_mask, "vV8s*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqb128_mask, "V16cV2LLiV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqb128mem_mask, "vV16c*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqb256_mask, "V16cV4LLiV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqb256mem_mask, "vV16c*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqd128_mask, "V4iV2LLiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqd128mem_mask, "vV4i*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqd256_mask, "V4iV4LLiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqd256mem_mask, "vV4i*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqw128_mask, "V8sV2LLiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqw128mem_mask, "vV8s*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqw256_mask, "V8sV4LLiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovsqw256mem_mask, "vV8s*V4LLiUc","","avx512vl")
	// pmovus* builtins. The entries parallel the pmovs* group one-for-one: same
	// signature strings and feature lists, register (*_mask) and *mem_mask forms,
	// and only the *mem_mask variant of pmovuswb.
	TARGET_BUILTIN(__builtin_ia32_pmovusdb512_mask, "V16cV16iV16cUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusdb512mem_mask, "vV16c*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovuswb512mem_mask, "vV32c*V32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovusdw512_mask, "V16sV16iV16sUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusdw512mem_mask, "vV16s*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusqb512_mask, "V16cV8LLiV16cUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusqb512mem_mask, "vV16c*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusqd512_mask, "V8iV8LLiV8iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusqd512mem_mask, "vV8i*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusqw512_mask, "V8sV8LLiV8sUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusqw512mem_mask, "vV8s*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovusdb128_mask, "V16cV4iV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusdb128mem_mask, "vV16c*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovuswb128mem_mask, "vV16c*V8sUc","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovusdb256_mask, "V16cV8iV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusdb256mem_mask, "vV16c*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovuswb256mem_mask, "vV16c*V16sUs","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovusdw128_mask, "V8sV4iV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusdw128mem_mask, "vV8s*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusdw256_mask, "V8sV8iV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusdw256mem_mask, "vV8s*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqb128_mask, "V16cV2LLiV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqb128mem_mask, "vV16c*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqb256_mask, "V16cV4LLiV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqb256mem_mask, "vV16c*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqd128_mask, "V4iV2LLiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqd128mem_mask, "vV4i*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqd256_mask, "V4iV4LLiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqd256mem_mask, "vV4i*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqw128_mask, "V8sV2LLiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqw128mem_mask, "vV8s*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqw256_mask, "V8sV4LLiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovusqw256mem_mask, "vV8s*V4LLiUc","","avx512vl")
	// pmov* builtins (no s/us infix). Signature strings and feature lists match
	// the pmovs* / pmovus* groups; only the row order differs slightly
	// (pmovwb128mem_mask precedes pmovdb128mem_mask here).
	TARGET_BUILTIN(__builtin_ia32_pmovdb512_mask, "V16cV16iV16cUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovdb512mem_mask, "vV16c*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovwb512mem_mask, "vV32c*V32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovdw512_mask, "V16sV16iV16sUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovdw512mem_mask, "vV16s*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovqb512_mask, "V16cV8LLiV16cUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovqb512mem_mask, "vV16c*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovqd512_mask, "V8iV8LLiV8iUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovqd512mem_mask, "vV8i*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovqw512_mask, "V8sV8LLiV8sUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovqw512mem_mask, "vV8s*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_pmovdb128_mask, "V16cV4iV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovwb128mem_mask, "vV16c*V8sUc","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovdb128mem_mask, "vV16c*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovdb256_mask, "V16cV8iV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovdb256mem_mask, "vV16c*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovwb256mem_mask, "vV16c*V16sUs","","avx512vl,avx512bw")
	TARGET_BUILTIN(__builtin_ia32_pmovdw128_mask, "V8sV4iV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovdw128mem_mask, "vV8s*V4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovdw256_mask, "V8sV8iV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovdw256mem_mask, "vV8s*V8iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqb128_mask, "V16cV2LLiV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqb128mem_mask, "vV16c*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqb256_mask, "V16cV4LLiV16cUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqb256mem_mask, "vV16c*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqd128_mask, "V4iV2LLiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqd128mem_mask, "vV4i*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqd256_mask, "V4iV4LLiV4iUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqd256mem_mask, "vV4i*V4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqw128_mask, "V8sV2LLiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqw128mem_mask, "vV8s*V2LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqw256_mask, "V8sV4LLiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_pmovqw256mem_mask, "vV8s*V4LLiUc","","avx512vl")
	// Packed getmant / getexp builtins. 128/256-bit getmant entries list avx512vl;
	// the 512-bit getmant and getexp entries list avx512f and carry a trailing Ii.
	// The second operand of the packed getmant entries is encoded as a plain i,
	// whereas the scalar getmant{sd,ss}_round_mask entries use Ii in that role.
	// NOTE(review): confirm whether the plain i is intentional.
	TARGET_BUILTIN(__builtin_ia32_getmantpd128_mask, "V2dV2diV2dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getmantpd256_mask, "V4dV4diV4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getmantps128_mask, "V4fV4fiV4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getmantps256_mask, "V8fV8fiV8fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_getmantpd512_mask, "V8dV8diV8dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getmantps512_mask, "V16fV16fiV16fUsIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getexppd512_mask, "V8dV8dV8dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_getexpps512_mask, "V16fV16fV16fUsIi","","avx512f")
	// Scalar fused multiply-add family (vfmadd / vfmsub / vfnmsub, ss and sd) in
	// _mask, _maskz and _mask3 forms; all list avx512f. Every signature string is
	// four identical vector types, then Uc, then Ii. vfmsub and vfnmsub appear
	// only in _mask3 form in this group.
	TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddss3_maskz, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddss3_mask3, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_maskz, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmaddsd3_mask3, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmsubsd3_mask3, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfmsubss3_mask3, "V4fV4fV4fV4fUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubsd3_mask3, "V2dV2dV2dV2dUcIi", "", "avx512f")
	TARGET_BUILTIN(__builtin_ia32_vfnmsubss3_mask3, "V4fV4fV4fV4fUcIi", "", "avx512f")
	// permvar{qi,hi,si,di,sf,df} builtins. Feature lists by element kind:
	// qi -> avx512vbmi, hi -> avx512bw, si/di/sf/df -> avx512f (512-bit) or
	// avx512vl (256-bit); 128/256-bit qi/hi entries add avx512vl. For the
	// floating-point forms the second operand is the integer vector (V8LLi, V16i,
	// V4LLi, V8i).
	TARGET_BUILTIN(__builtin_ia32_permvarhi512_mask, "V32sV32sV32sV32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_permvardf512_mask, "V8dV8dV8LLiV8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_permvardi512_mask, "V8LLiV8LLiV8LLiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_permvarsf512_mask, "V16fV16fV16iV16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_permvarsi512_mask, "V16iV16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_permvarqi512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
	TARGET_BUILTIN(__builtin_ia32_permvarqi128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvarqi256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvarhi128_mask, "V8sV8sV8sV8sUc","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvarhi256_mask, "V16sV16sV16sV16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvardf256_mask, "V4dV4dV4LLiV4dUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvardi256_mask, "V4LLiV4LLiV4LLiV4LLiUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvarsf256_mask, "V8fV8fV8iV8fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_permvarsi256_mask, "V8iV8iV8iV8iUc","","avx512vl")
	// fpclass builtins: signature is <mask><vec>Ii<mask>. All list avx512dq;
	// the 128/256-bit packed forms add avx512vl, while the scalar sd/ss forms
	// list avx512dq alone.
	TARGET_BUILTIN(__builtin_ia32_fpclasspd128_mask, "UcV2dIiUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fpclasspd256_mask, "UcV4dIiUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fpclassps128_mask, "UcV4fIiUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fpclassps256_mask, "UcV8fIiUc","","avx512dq,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_fpclassps512_mask, "UsV16fIiUs","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_fpclasspd512_mask, "UcV8dIiUc","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_fpclasssd_mask, "UcV2dIiUc","","avx512dq")
	TARGET_BUILTIN(__builtin_ia32_fpclassss_mask, "UcV4fIiUc","","avx512dq")
	// k*hi builtins operating on Us values; all list avx512f. The two kortest
	// entries have a signature string starting with i; the others are Us
	// throughout.
	TARGET_BUILTIN(__builtin_ia32_kandhi, "UsUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_kandnhi, "UsUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_korhi, "UsUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_kortestchi, "iUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_kortestzhi, "iUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_kunpckhi, "UsUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_kxnorhi, "UsUsUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_kxorhi, "UsUsUs","","avx512f")
	// palignr512, dbpsadbw (128/256/512) and psadbw512 builtins; all list avx512bw
	// (plus avx512vl for 128/256). dbpsadbw takes byte vectors and yields a word
	// vector type (e.g. V8s from V16c); psadbw512 is unmasked and pairs V64c
	// operands with a V8LLi first type.
	TARGET_BUILTIN(__builtin_ia32_palignr512_mask, "V64cV64cV64cIiV64cULLi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_dbpsadbw128_mask, "V8sV16cV16cIiV8sUc","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_dbpsadbw256_mask, "V16sV32cV32cIiV16sUs","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_dbpsadbw512_mask, "V32sV64cV64cIiV32sUi","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_psadbw512, "V8LLiV64cV64c","","avx512bw")
	// 512-bit compress / expand / expandload / compressstore builtins, with the
	// scalar cmp{sd,ss}_mask and cvtps2pd512_mask entries interleaved; all list
	// avx512f. expandload signature strings use a C*-qualified vector type as the
	// second type; compressstore signature strings start with v and <vec>*.
	TARGET_BUILTIN(__builtin_ia32_compressdf512_mask, "V8dV8dV8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compressdi512_mask, "V8LLiV8LLiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compresssf512_mask, "V16fV16fV16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compresssi512_mask, "V16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cmpsd_mask, "UcV2dV2dIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cmpss_mask, "UcV4fV4fIiUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expanddf512_mask, "V8dV8dV8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expanddi512_mask, "V8LLiV8LLiV8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expandloaddf512_mask, "V8dV8dC*V8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expandloaddi512_mask, "V8LLiV8LLiC*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expandloadsf512_mask, "V16fV16fC*V16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expandloadsi512_mask, "V16iV16iC*V16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expandsf512_mask, "V16fV16fV16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_expandsi512_mask, "V16iV16iV16iUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtps2pd512_mask, "V8dV8fV8dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compressstoredf512_mask, "vV8d*V8dUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compressstoredi512_mask, "vV8LLi*V8LLiUc","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compressstoresf512_mask, "vV16f*V16fUs","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_compressstoresi512_mask, "vV16i*V16iUs","","avx512f")
	// 128/256-bit vcvtph2ps / vcvtps2ph builtins; all list avx512vl. The
	// half-precision side is carried in a V8s type in every entry.
	TARGET_BUILTIN(__builtin_ia32_vcvtph2ps_mask, "V4fV8sV4fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vcvtph2ps256_mask, "V8fV8sV8fUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vcvtps2ph_mask, "V8sV4fIiV8sUc","","avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vcvtps2ph256_mask, "V8sV8fIiV8sUc","","avx512vl")
	// cvtw2mask builtins: avx512bw (plus avx512vl for 128/256).
	TARGET_BUILTIN(__builtin_ia32_cvtw2mask512, "UiV32s","","avx512bw")
	TARGET_BUILTIN(__builtin_ia32_cvtw2mask128, "UcV8s","","avx512bw,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_cvtw2mask256, "UsV16s","","avx512bw,avx512vl")
	// Scalar conversion builtins; all list avx512f. cvtusi2sd32 is the only entry
	// here whose signature string has no trailing Ii.
	TARGET_BUILTIN(__builtin_ia32_cvtsd2ss_round_mask, "V4fV4fV2dV4fUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtsi2ss32, "V4fV4fiIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtss2sd_round_mask, "V2dV2dV4fV2dUcIi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtusi2sd32, "V2dV2dUi","","avx512f")
	TARGET_BUILTIN(__builtin_ia32_cvtusi2ss32, "V4fV4fUiIi","","avx512f")
	// vpmultishiftqb builtins: avx512vbmi (plus avx512vl for 128/256).
	TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb512_mask, "V64cV64cV64cV64cULLi","","avx512vbmi")
	TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb128_mask, "V16cV16cV16cV16cUs","","avx512vbmi,avx512vl")
	TARGET_BUILTIN(__builtin_ia32_vpmultishiftqb256_mask, "V32cV32cV32cV32cUi","","avx512vbmi,avx512vl")

	// generic select intrinsics
	// Signature is <vec><mask><vec><vec>; the mask type tracks the element count
	// (Uc for up to 8 elements, Us for 16, Ui for 32, ULLi for 64). The feature
	// string is empty for every entry, so no target feature is named here.
	// NOTE(review): presumably the callers' own target attributes gate these --
	// confirm against the avx512 intrinsic headers.
	TARGET_BUILTIN(__builtin_ia32_selectb_128, "V16cUsV16cV16c", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectb_256, "V32cUiV32cV32c", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectb_512, "V64cULLiV64cV64c", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectw_128, "V8sUcV8sV8s", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectw_256, "V16sUsV16sV16s", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectw_512, "V32sUiV32sV32s", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectd_128, "V4iUcV4iV4i", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectd_256, "V8iUcV8iV8i", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectd_512, "V16iUsV16iV16i", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectq_128, "V2LLiUcV2LLiV2LLi", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectq_256, "V4LLiUcV4LLiV4LLi", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectq_512, "V8LLiUcV8LLiV8LLi", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectps_128, "V4fUcV4fV4f", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectps_256, "V8fUcV8fV8f", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectps_512, "V16fUsV16fV16f", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectpd_128, "V2dUcV2dV2d", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectpd_256, "V4dUcV4dV4d", "", "")
	TARGET_BUILTIN(__builtin_ia32_selectpd_512, "V8dUcV8dV8d", "", "")

	// MONITORX/MWAITX
	TARGET_BUILTIN(__builtin_ia32_monitorx, "vv*UiUi", "", "mwaitx")
	TARGET_BUILTIN(__builtin_ia32_mwaitx, "vUiUiUi", "", "mwaitx")

	// CLZERO
	TARGET_BUILTIN(__builtin_ia32_clzero, "vv*", "", "clzero")

	// MSVC
	//
	// These intrinsics are registered through TARGET_HEADER_BUILTIN rather than
	// TARGET_BUILTIN. Each entry supplies six arguments: the intrinsic name, a
	// signature string, an attribute string ("nh" or "nr" below), the header
	// name "intrin.h", the language set ALL_MS_LANGUAGES, and an empty final
	// string.
	// NOTE(review): the final string presumably plays the same role as the
	// feature argument of TARGET_BUILTIN (empty = no feature required) --
	// confirm against the macro's definition at the include site.
	TARGET_HEADER_BUILTIN(_BitScanForward, "UcUNi*UNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(_BitScanReverse, "UcUNi*UNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")

	TARGET_HEADER_BUILTIN(_ReadWriteBarrier, "v", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(_ReadBarrier, "v", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(_WriteBarrier, "v", "nh", "intrin.h", ALL_MS_LANGUAGES, "")

	TARGET_HEADER_BUILTIN(__emul, "LLiii", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__emulu, "ULLiUiUi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")

	TARGET_HEADER_BUILTIN(_AddressOfReturnAddress, "v*", "nh", "intrin.h", ALL_MS_LANGUAGES, "")

	TARGET_HEADER_BUILTIN(__stosb, "vUc*Ucz", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	// Only __int2c and __ud2 use the "nr" attribute string instead of "nh".
	TARGET_HEADER_BUILTIN(__int2c, "v", "nr", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__ud2, "v", "nr", "intrin.h", ALL_MS_LANGUAGES, "")

	TARGET_HEADER_BUILTIN(__readfsbyte, "UcUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__readfsword, "UsUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__readfsdword, "UNiUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__readfsqword, "ULLiUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")

	TARGET_HEADER_BUILTIN(__readgsbyte, "UcUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__readgsword, "UsUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__readgsdword, "UNiUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
	TARGET_HEADER_BUILTIN(__readgsqword, "ULLiUNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")

	// Drop the three entry macros now that every entry above has been expanded.
	// NOTE(review): presumably this lets an includer define them afresh before
	// including this list again -- confirm against the include sites.
	#undef BUILTIN
	#undef TARGET_BUILTIN
	#undef TARGET_HEADER_BUILTIN
	Index: head/contrib/llvm/tools/clang/include/clang/Driver/Options.td
	===================================================================
	--- head/contrib/llvm/tools/clang/include/clang/Driver/Options.td (revision 322319)
	+++ head/contrib/llvm/tools/clang/include/clang/Driver/Options.td (revision 322320)
	@@ -1,2626 +1,2622 @@
	//===--- Options.td - Options for clang -----------------------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file defines the options accepted by clang.
	//
	//===----------------------------------------------------------------------===//

	// Include the common option parsing interfaces.
	include "llvm/Option/OptParser.td"

	/////////
	// Flags

	// DriverOption - The option is a "driver" option, and should not be forwarded
	// to other tools.
	def DriverOption : OptionFlag;

	// LinkerInput - The option is a linker input.
	def LinkerInput : OptionFlag;

	// NoArgumentUnused - Don't report argument unused warnings for this option; this
	// is useful for options like -static or -dynamic which a user may always end up
	// passing, even if the platform defaults to (or only supports) that option.
	def NoArgumentUnused : OptionFlag;

	// Unsupported - The option is unsupported, and the driver will reject command
	// lines that use it.
	def Unsupported : OptionFlag;

	// Ignored - The option is unsupported, and the driver will silently ignore it.
	def Ignored : OptionFlag;

	// CoreOption - This is considered a "core" Clang option, available in both
	// clang and clang-cl modes.
	def CoreOption : OptionFlag;

	// CLOption - This is a cl.exe compatibility option. Options with this flag
	// are made available when the driver is running in CL compatibility mode.
	def CLOption : OptionFlag;

	// CC1Option - This option should be accepted by clang -cc1.
	def CC1Option : OptionFlag;

	// CC1AsOption - This option should be accepted by clang -cc1as.
	def CC1AsOption : OptionFlag;

	// NoDriverOption - This option should not be accepted by the driver.
	def NoDriverOption : OptionFlag;

	// A short name to show in documentation. The name will be interpreted as rST.
	class DocName<string name> { string DocName = name; }

	// A brief description to show in documentation, interpreted as rST.
	class DocBrief<code descr> { code DocBrief = descr; }

	// Indicates that this group should be flattened into its parent when generating
	// documentation.
	class DocFlatten { bit DocFlatten = 1; }

	// Indicates that this option is ignored, but accepted with a warning for
	// GCC compatibility.
	class IgnoredGCCCompat : Flags<[HelpHidden]> {}

	/////////
	// Groups

	def Action_Group : OptionGroup<"<action group>">, DocName<"Actions">,
	DocBrief<[{The action to perform on the input.}]>;

	// Meta-group for options which are only used for compilation,
	// and not linking etc.
	def CompileOnly_Group : OptionGroup<"<CompileOnly group>">,
	DocName<"Compilation flags">, DocBrief<[{
	Flags controlling the behavior of Clang during compilation. These flags have
	no effect during actions that do not perform compilation.}]>;

	def Preprocessor_Group : OptionGroup<"<Preprocessor group>">,
	Group<CompileOnly_Group>,
	DocName<"Preprocessor flags">, DocBrief<[{
	Flags controlling the behavior of the Clang preprocessor.}]>;

	def IncludePath_Group : OptionGroup<"<I/i group>">, Group<Preprocessor_Group>,
	DocName<"Include path management">,
	DocBrief<[{
	Flags controlling how ``#include``\s are resolved to files.}]>;

	def I_Group : OptionGroup<"<I group>">, Group<IncludePath_Group>, DocFlatten;
	def i_Group : OptionGroup<"<i group>">, Group<IncludePath_Group>, DocFlatten;
	def clang_i_Group : OptionGroup<"<clang i group>">, Group<i_Group>, DocFlatten;

	def M_Group : OptionGroup<"<M group>">, Group<Preprocessor_Group>,
	DocName<"Dependency file generation">, DocBrief<[{
	Flags controlling generation of a dependency file for ``make``-like build
	systems.}]>;

	def d_Group : OptionGroup<"<d group>">, Group<Preprocessor_Group>,
	DocName<"Dumping preprocessor state">, DocBrief<[{
	Flags allowing the state of the preprocessor to be dumped in various ways.}]>;

	def Diag_Group : OptionGroup<"<W/R group>">, Group<CompileOnly_Group>,
	DocName<"Diagnostic flags">, DocBrief<[{
	Flags controlling which warnings, errors, and remarks Clang will generate.
	See the :doc:`full list of warning and remark flags <DiagnosticsReference>`.}]>;

	def R_Group : OptionGroup<"<R group>">, Group<Diag_Group>, DocFlatten;
	def R_value_Group : OptionGroup<"<R (with value) group>">, Group<R_Group>,
	DocFlatten;
	def W_Group : OptionGroup<"<W group>">, Group<Diag_Group>, DocFlatten;
	def W_value_Group : OptionGroup<"<W (with value) group>">, Group<W_Group>,
	DocFlatten;

	def f_Group : OptionGroup<"<f group>">, Group<CompileOnly_Group>,
	DocName<"Target-independent compilation options">;

	def f_clang_Group : OptionGroup<"<f (clang-only) group>">,
	Group<CompileOnly_Group>, DocFlatten;
	def pedantic_Group : OptionGroup<"<pedantic group>">, Group<f_Group>,
	DocFlatten;
	def opencl_Group : OptionGroup<"<opencl group>">, Group<f_Group>,
	DocName<"OpenCL flags">;

	def m_Group : OptionGroup<"<m group>">, Group<CompileOnly_Group>,
	DocName<"Target-dependent compilation options">;

	// Feature groups - these take command line options that correspond directly to
	// target specific features and can be translated directly from command line
	// options.
	def m_aarch64_Features_Group : OptionGroup<"<aarch64 features group>">,
	Group<m_Group>, DocName<"AARCH64">;
	def m_amdgpu_Features_Group : OptionGroup<"<amdgpu features group>">,
	Group<m_Group>, DocName<"AMDGPU">;
	def m_arm_Features_Group : OptionGroup<"<arm features group>">,
	Group<m_Group>, DocName<"ARM">;
	def m_hexagon_Features_Group : OptionGroup<"<hexagon features group>">,
	Group<m_Group>, DocName<"Hexagon">;
	def m_ppc_Features_Group : OptionGroup<"<ppc features group>">,
	Group<m_Group>, DocName<"PowerPC">;
	def m_wasm_Features_Group : OptionGroup<"<wasm features group>">,
	Group<m_Group>, DocName<"WebAssembly">;
	def m_x86_Features_Group : OptionGroup<"<x86 features group>">,
	Group<m_Group>, Flags<[CoreOption]>, DocName<"X86">;

	def m_libc_Group : OptionGroup<"<m libc group>">, Group<m_Group>,
	Flags<[HelpHidden]>;

	def O_Group : OptionGroup<"<O group>">, Group<CompileOnly_Group>,
	DocName<"Optimization level">, DocBrief<[{
	Flags controlling how much optimization should be performed.}]>;

	// Top-level documentation group for debug-information options; it is a
	// child of CompileOnly_Group and the parent of g_Group and g_flags_Group.
	// NOTE(review): g_Group is declared with the same "<g group>" name string
	// as this group -- confirm the duplicate name is intentional.
	def DebugInfo_Group : OptionGroup<"<g group>">, Group<CompileOnly_Group>,
	DocName<"Debug information generation">, DocBrief<[{
	Flags controlling how much and what kind of debug information should be
	generated.}]>;

	def g_Group : OptionGroup<"<g group>">, Group<DebugInfo_Group>,
	DocName<"Kind and level of debug information">;
	def gN_Group : OptionGroup<"<gN group>">, Group<g_Group>,
	DocName<"Debug level">;
	def ggdbN_Group : OptionGroup<"<ggdbN group>">, Group<gN_Group>, DocFlatten;
	def gTune_Group : OptionGroup<"<gTune group>">, Group<g_Group>,
	DocName<"Debugger to tune debug information for">;
	def g_flags_Group : OptionGroup<"<g flags group>">, Group<DebugInfo_Group>,
	DocName<"Debug information flags">;

	def StaticAnalyzer_Group : OptionGroup<"<Static analyzer group>">,
	DocName<"Static analyzer flags">, DocBrief<[{
	Flags controlling the behavior of the Clang Static Analyzer.}]>;

	// gfortran options that we recognize in the driver and pass along when
	// invoking GCC to compile Fortran code.
	def gfortran_Group : OptionGroup<"<gfortran group>">,
	DocName<"Fortran compilation flags">, DocBrief<[{
	Flags that will be passed onto the ``gfortran`` compiler when Clang is given
	a Fortran input.}]>;

	def Link_Group : OptionGroup<"<T/e/s/t/u group>">, DocName<"Linker flags">,
	DocBrief<[{Flags that are passed on to the linker}]>;
	def T_Group : OptionGroup<"<T group>">, Group<Link_Group>, DocFlatten;
	def u_Group : OptionGroup<"<u group>">, Group<Link_Group>, DocFlatten;

	def reserved_lib_Group : OptionGroup<"<reserved libs group>">,
	Flags<[Unsupported]>;

	// Temporary groups for clang options which we know we don't support,
	// but don't want to verbosely warn the user about.
	def clang_ignored_f_Group : OptionGroup<"<clang ignored f group>">,
	Group<f_Group>, Flags<[Ignored]>;
	def clang_ignored_m_Group : OptionGroup<"<clang ignored m group>">,
	Group<m_Group>, Flags<[Ignored]>;

	// Group for clang options in the process of deprecation.
	// Please include the version that deprecated the flag as comment to allow
	// easier garbage collection.
	def clang_ignored_legacy_options_Group : OptionGroup<"<clang legacy flags>">,
	Group<f_Group>, Flags<[Ignored]>;

	// Retired with clang-5.0
	// Anonymous defs: both spellings are still declared, but only as members of
	// clang_ignored_legacy_options_Group, which carries Flags<[Ignored]>.
	// NOTE(review): presumably the group's Ignored flag applies to its members,
	// so the driver silently drops these -- confirm in the driver.
	def : Flag<["-"], "fslp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;
	def : Flag<["-"], "fno-slp-vectorize-aggressive">, Group<clang_ignored_legacy_options_Group>;

	// Group that ignores all gcc optimizations that won't be implemented
	def clang_ignored_gcc_optimization_f_Group : OptionGroup<
	"<clang_ignored_gcc_optimization_f_Group>">, Group<f_Group>, Flags<[Ignored]>;

	/////////
	// Options

	// The internal option ID must be a valid C++ identifier and results in a
	// clang::driver::options::OPT_XX enum constant for XX.
	//
	// We want to unambiguously be able to refer to options from the driver source
	// code, for this reason the option name is mangled into an ID. This mangling
	// isn't guaranteed to have an inverse, but for practical purposes it does.
	//
	// The mangling scheme is to ignore the leading '-', and perform the following
	// substitutions:
	// _ => __
	// - => _
	// / => _SLASH
	// # => _HASH
	// ? => _QUESTION
	// , => _COMMA
	// = => _EQ
	// C++ => CXX
	// . => _

	// Developer Driver Options

	def internal_Group : OptionGroup<"<clang internal options>">, Flags<[HelpHidden]>;
	def internal_driver_Group : OptionGroup<"<clang driver internal options>">,
	Group<internal_Group>, HelpText<"DRIVER OPTIONS">;
	def internal_debug_Group :
	OptionGroup<"<clang debug/development internal options>">,
	Group<internal_Group>, HelpText<"DEBUG/DEVELOPMENT OPTIONS">;

	// Mixin for internal driver options: places the option in
	// internal_driver_Group and marks it with the DriverOption and HelpHidden
	// flags.
	class InternalDriverOpt : Group<internal_driver_Group>,
	Flags<[DriverOption, HelpHidden]>;
	def driver_mode : Joined<["--"], "driver-mode=">, Group<internal_driver_Group>,
	Flags<[CoreOption, DriverOption, HelpHidden]>,
	HelpText<"Set the driver mode to either 'gcc', 'g++', 'cpp', or 'cl'">;
	def rsp_quoting : Joined<["--"], "rsp-quoting=">, Group<internal_driver_Group>,
	Flags<[CoreOption, DriverOption, HelpHidden]>,
	HelpText<"Set the rsp quoting to either 'posix', or 'windows'">;
	def ccc_gcc_name : Separate<["-"], "ccc-gcc-name">, InternalDriverOpt,
	HelpText<"Name for native GCC compiler">,
	MetaVarName<"<gcc-path>">;
	def ccc_pch_is_pch : Flag<["-"], "ccc-pch-is-pch">, InternalDriverOpt,
	HelpText<"Use lazy PCH for precompiled headers">;
	def ccc_pch_is_pth : Flag<["-"], "ccc-pch-is-pth">, InternalDriverOpt,
	HelpText<"Use pretokenized headers for precompiled headers">;

	// Mixin for internal debug/development options: places the option in
	// internal_debug_Group and marks it with the DriverOption, HelpHidden and
	// CoreOption flags (InternalDriverOpt, by contrast, does not set
	// CoreOption).
	class InternalDebugOpt : Group<internal_debug_Group>,
	Flags<[DriverOption, HelpHidden, CoreOption]>;
	def ccc_install_dir : Separate<["-"], "ccc-install-dir">, InternalDebugOpt,
	HelpText<"Simulate installation in the given directory">;
	def ccc_print_phases : Flag<["-"], "ccc-print-phases">, InternalDebugOpt,
	HelpText<"Dump list of actions to perform">;
	def ccc_print_bindings : Flag<["-"], "ccc-print-bindings">, InternalDebugOpt,
	HelpText<"Show bindings of tools to actions">;

	def ccc_arcmt_check : Flag<["-"], "ccc-arcmt-check">, InternalDriverOpt,
	HelpText<"Check for ARC migration issues that need manual handling">;
	def ccc_arcmt_modify : Flag<["-"], "ccc-arcmt-modify">, InternalDriverOpt,
	HelpText<"Apply modifications to files to conform to ARC">;
	def ccc_arcmt_migrate : Separate<["-"], "ccc-arcmt-migrate">, InternalDriverOpt,
	HelpText<"Apply modifications and produces temporary files that conform to ARC">;
	def arcmt_migrate_report_output : Separate<["-"], "arcmt-migrate-report-output">,
	HelpText<"Output path for the plist report">, Flags<[CC1Option]>;
	def arcmt_migrate_emit_arc_errors : Flag<["-"], "arcmt-migrate-emit-errors">,
	HelpText<"Emit ARC errors even if the migrator can fix them">,
	Flags<[CC1Option]>;
	def gen_reproducer: Flag<["-"], "gen-reproducer">, InternalDebugOpt,
	HelpText<"Auto-generates preprocessed source files and a reproduction script">;

	def _migrate : Flag<["--"], "migrate">, Flags<[DriverOption]>,
	HelpText<"Run the migrator">;
	def ccc_objcmt_migrate : Separate<["-"], "ccc-objcmt-migrate">,
	InternalDriverOpt,
	HelpText<"Apply modifications and produces temporary files to migrate to "
	"modern ObjC syntax">;
	def objcmt_migrate_literals : Flag<["-"], "objcmt-migrate-literals">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC literals">;
	def objcmt_migrate_subscripting : Flag<["-"], "objcmt-migrate-subscripting">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC subscripting">;
	def objcmt_migrate_property : Flag<["-"], "objcmt-migrate-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC property">;
	def objcmt_migrate_all : Flag<["-"], "objcmt-migrate-all">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC">;
	def objcmt_migrate_readonly_property : Flag<["-"], "objcmt-migrate-readonly-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC readonly property">;
	def objcmt_migrate_readwrite_property : Flag<["-"], "objcmt-migrate-readwrite-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to modern ObjC readwrite property">;
	def objcmt_migrate_property_dot_syntax : Flag<["-"], "objcmt-migrate-property-dot-syntax">, Flags<[CC1Option]>,
	HelpText<"Enable migration of setter/getter messages to property-dot syntax">;
	def objcmt_migrate_annotation : Flag<["-"], "objcmt-migrate-annotation">, Flags<[CC1Option]>,
	HelpText<"Enable migration to property and method annotations">;
	def objcmt_migrate_instancetype : Flag<["-"], "objcmt-migrate-instancetype">, Flags<[CC1Option]>,
	HelpText<"Enable migration to infer instancetype for method result type">;
	def objcmt_migrate_nsmacros : Flag<["-"], "objcmt-migrate-ns-macros">, Flags<[CC1Option]>,
	HelpText<"Enable migration to NS_ENUM/NS_OPTIONS macros">;
	def objcmt_migrate_protocol_conformance : Flag<["-"], "objcmt-migrate-protocol-conformance">, Flags<[CC1Option]>,
	HelpText<"Enable migration to add protocol conformance on classes">;
	def objcmt_atomic_property : Flag<["-"], "objcmt-atomic-property">, Flags<[CC1Option]>,
	HelpText<"Make migration to 'atomic' properties">;
	def objcmt_returns_innerpointer_property : Flag<["-"], "objcmt-returns-innerpointer-property">, Flags<[CC1Option]>,
	HelpText<"Enable migration to annotate property with NS_RETURNS_INNER_POINTER">;
	def objcmt_ns_nonatomic_iosonly: Flag<["-"], "objcmt-ns-nonatomic-iosonly">, Flags<[CC1Option]>,
	HelpText<"Enable migration to use NS_NONATOMIC_IOSONLY macro for setting property's 'atomic' attribute">;
	def objcmt_migrate_designated_init : Flag<["-"], "objcmt-migrate-designated-init">, Flags<[CC1Option]>,
	HelpText<"Enable migration to infer NS_DESIGNATED_INITIALIZER for initializer methods">;
	def objcmt_whitelist_dir_path: Joined<["-"], "objcmt-whitelist-dir-path=">, Flags<[CC1Option]>,
	HelpText<"Only modify files with a filename contained in the provided directory path">;
	// The misspelt "white-list" [sic] alias is due for removal.
	def : Joined<["-"], "objcmt-white-list-dir-path=">, Flags<[CC1Option]>,
	Alias<objcmt_whitelist_dir_path>;

	// Make sure all other -ccc- options are rejected.
	def ccc_ : Joined<["-"], "ccc-">, Group<internal_Group>, Flags<[Unsupported]>;

	// Standard Options

	def _HASH_HASH_HASH : Flag<["-"], "###">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Print (but do not run) the commands to run for this compilation">;
	def _DASH_DASH : Option<["--"], "", KIND_REMAINING_ARGS>,
	Flags<[DriverOption, CoreOption]>;
	def A : JoinedOrSeparate<["-"], "A">, Flags<[RenderJoined]>, Group<gfortran_Group>;
	def B : JoinedOrSeparate<["-"], "B">, MetaVarName<"<dir>">,
	HelpText<"Add <dir> to search path for binaries and object files used implicitly">;
	def CC : Flag<["-"], "CC">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Include comments from within macros in preprocessed output">;
	def C : Flag<["-"], "C">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Include comments in preprocessed output">;
	def D : JoinedOrSeparate<["-"], "D">, Group<Preprocessor_Group>,
	Flags<[CC1Option]>, MetaVarName<"<macro>=<value>">,
	HelpText<"Define <macro> to <value> (or 1 if <value> omitted)">;
	def E : Flag<["-"], "E">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Only run the preprocessor">;
	def F : JoinedOrSeparate<["-"], "F">, Flags<[RenderJoined,CC1Option]>,
	HelpText<"Add directory to framework include search path">;
	def G : JoinedOrSeparate<["-"], "G">, Flags<[DriverOption]>, Group<m_Group>,
	MetaVarName<"<size>">, HelpText<"Put objects of at most <size> bytes "
	"into small data section (MIPS / Hexagon)">;
	def G_EQ : Joined<["-"], "G=">, Flags<[DriverOption]>, Group<m_Group>, Alias<G>;
	def H : Flag<["-"], "H">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Show header includes and nesting depth">;
	def I_ : Flag<["-"], "I-">, Group<I_Group>,
	HelpText<"Restrict all prior -I flags to double-quoted inclusion and "
	"remove current directory from include path">;
	def I : JoinedOrSeparate<["-"], "I">, Group<I_Group>,
	Flags<[CC1Option,CC1AsOption]>, MetaVarName<"<dir>">,
	HelpText<"Add directory to include search path">;
	def L : JoinedOrSeparate<["-"], "L">, Flags<[RenderJoined]>, Group<Link_Group>,
	MetaVarName<"<dir>">, HelpText<"Add directory to library search path">;
	def MD : Flag<["-"], "MD">, Group<M_Group>,
	HelpText<"Write a depfile containing user and system headers">;
	def MMD : Flag<["-"], "MMD">, Group<M_Group>,
	HelpText<"Write a depfile containing user headers">;
	def M : Flag<["-"], "M">, Group<M_Group>,
	HelpText<"Like -MD, but also implies -E and writes to stdout by default">;
	def MM : Flag<["-"], "MM">, Group<M_Group>,
	HelpText<"Like -MMD, but also implies -E and writes to stdout by default">;
	def MF : JoinedOrSeparate<["-"], "MF">, Group<M_Group>,
	HelpText<"Write depfile output from -MMD, -MD, -MM, or -M to <file>">,
	MetaVarName<"<file>">;
	def MG : Flag<["-"], "MG">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Add missing headers to depfile">;
	def MJ : JoinedOrSeparate<["-"], "MJ">, Group<M_Group>,
	HelpText<"Write a compilation database entry per input">;
	def MP : Flag<["-"], "MP">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Create phony target for each dependency (other than main file)">;
	def MQ : JoinedOrSeparate<["-"], "MQ">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Specify name of main file output to quote in depfile">;
	def MT : JoinedOrSeparate<["-"], "MT">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Specify name of main file output in depfile">;
	def MV : Flag<["-"], "MV">, Group<M_Group>, Flags<[CC1Option]>,
	HelpText<"Use NMake/Jom format for the depfile">;
	def Mach : Flag<["-"], "Mach">, Group<Link_Group>;
	def O0 : Flag<["-"], "O0">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
	def O4 : Flag<["-"], "O4">, Group<O_Group>, Flags<[CC1Option, HelpHidden]>;
	def ObjCXX : Flag<["-"], "ObjC++">, Flags<[DriverOption]>,
	HelpText<"Treat source input files as Objective-C++ inputs">;
	def ObjC : Flag<["-"], "ObjC">, Flags<[DriverOption]>,
	HelpText<"Treat source input files as Objective-C inputs">;
	def O : Joined<["-"], "O">, Group<O_Group>, Flags<[CC1Option]>;
	// Bare "-O" with no level: an alias for the joined -O option with the
	// argument "2".
	def O_flag : Flag<["-"], "O">, Flags<[CC1Option]>, Alias<O>, AliasArgs<["2"]>;
	def Ofast : Joined<["-"], "Ofast">, Group<O_Group>, Flags<[CC1Option]>;
	def P : Flag<["-"], "P">, Flags<[CC1Option]>, Group<Preprocessor_Group>,
	HelpText<"Disable linemarker output in -E mode">;
	def Qn : Flag<["-"], "Qn">, IgnoredGCCCompat;
	def Qunused_arguments : Flag<["-"], "Qunused-arguments">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Don't emit warning for unused driver arguments">;
	def Q : Flag<["-"], "Q">, IgnoredGCCCompat;
	def Rpass_EQ : Joined<["-"], "Rpass=">, Group<R_value_Group>, Flags<[CC1Option]>,
	HelpText<"Report transformations performed by optimization passes whose "
	"name matches the given POSIX regular expression">;
	def Rpass_missed_EQ : Joined<["-"], "Rpass-missed=">, Group<R_value_Group>,
	Flags<[CC1Option]>,
	HelpText<"Report missed transformations by optimization passes whose "
	"name matches the given POSIX regular expression">;
	def Rpass_analysis_EQ : Joined<["-"], "Rpass-analysis=">, Group<R_value_Group>,
	Flags<[CC1Option]>,
	HelpText<"Report transformation analysis from optimization passes whose "
	"name matches the given POSIX regular expression">;
	def R_Joined : Joined<["-"], "R">, Group<R_Group>, Flags<[CC1Option, CoreOption]>,
	MetaVarName<"<remark>">, HelpText<"Enable the specified remark">;
	def S : Flag<["-"], "S">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Only run preprocess and compilation steps">;
	def Tbss : JoinedOrSeparate<["-"], "Tbss">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of BSS to <addr>">;
	// -Tdata <addr> (joined or separate), member of T_Group: start address of
	// the data section. The help text previously said "BSS", copied from the
	// -Tbss entry.
	def Tdata : JoinedOrSeparate<["-"], "Tdata">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of DATA to <addr>">;
	// -Ttext <addr> (joined or separate), member of T_Group: start address of
	// the text section. The help text previously said "BSS", copied from the
	// -Tbss entry.
	def Ttext : JoinedOrSeparate<["-"], "Ttext">, Group<T_Group>,
	MetaVarName<"<addr>">, HelpText<"Set starting address of TEXT to <addr>">;
	def T : JoinedOrSeparate<["-"], "T">, Group<T_Group>,
	MetaVarName<"<script>">, HelpText<"Specify <script> as linker script">;
	def U : JoinedOrSeparate<["-"], "U">, Group<Preprocessor_Group>,
	Flags<[CC1Option]>, MetaVarName<"<macro>">, HelpText<"Undefine macro <macro>">;
	def V : JoinedOrSeparate<["-"], "V">, Flags<[DriverOption, Unsupported]>;
	def Wa_COMMA : CommaJoined<["-"], "Wa,">,
	HelpText<"Pass the comma separated arguments in <arg> to the assembler">,
	MetaVarName<"<arg>">;
	def Wall : Flag<["-"], "Wall">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def WCL4 : Flag<["-"], "WCL4">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def Wdeprecated : Flag<["-"], "Wdeprecated">, Group<W_Group>, Flags<[CC1Option]>,
	HelpText<"Enable warnings for deprecated constructs and define __DEPRECATED">;
	def Wno_deprecated : Flag<["-"], "Wno-deprecated">, Group<W_Group>, Flags<[CC1Option]>;
	def Wl_COMMA : CommaJoined<["-"], "Wl,">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass the comma separated arguments in <arg> to the linker">,
	MetaVarName<"<arg>">, Group<Link_Group>;
	// FIXME: This is broken; these should not be Joined arguments.
	def Wno_nonportable_cfstrings : Joined<["-"], "Wno-nonportable-cfstrings">, Group<W_Group>,
	Flags<[CC1Option]>;
	def Wnonportable_cfstrings : Joined<["-"], "Wnonportable-cfstrings">, Group<W_Group>,
	Flags<[CC1Option]>;
	def Wp_COMMA : CommaJoined<["-"], "Wp,">,
	HelpText<"Pass the comma separated arguments in <arg> to the preprocessor">,
	MetaVarName<"<arg>">, Group<Preprocessor_Group>;
	def Wwrite_strings : Flag<["-"], "Wwrite-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def Wno_write_strings : Flag<["-"], "Wno-write-strings">, Group<W_Group>, Flags<[CC1Option, HelpHidden]>;
	def W_Joined : Joined<["-"], "W">, Group<W_Group>, Flags<[CC1Option, CoreOption]>,
	MetaVarName<"<warning>">, HelpText<"Enable the specified warning">;
	def Xanalyzer : Separate<["-"], "Xanalyzer">,
	HelpText<"Pass <arg> to the static analyzer">, MetaVarName<"<arg>">,
	Group<StaticAnalyzer_Group>;
	def Xarch__ : JoinedAndSeparate<["-"], "Xarch_">, Flags<[DriverOption]>;
	def Xassembler : Separate<["-"], "Xassembler">,
	HelpText<"Pass <arg> to the assembler">, MetaVarName<"<arg>">,
	Group<CompileOnly_Group>;
	def Xclang : Separate<["-"], "Xclang">,
	HelpText<"Pass <arg> to the clang compiler">, MetaVarName<"<arg>">,
	Flags<[DriverOption, CoreOption]>, Group<CompileOnly_Group>;
	def Xcuda_fatbinary : Separate<["-"], "Xcuda-fatbinary">,
	HelpText<"Pass <arg> to fatbinary invocation">, MetaVarName<"<arg>">;
	def Xcuda_ptxas : Separate<["-"], "Xcuda-ptxas">,
	HelpText<"Pass <arg> to the ptxas assembler">, MetaVarName<"<arg>">;
	def z : Separate<["-"], "z">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass -z <arg> to the linker">, MetaVarName<"<arg>">,
	Group<Link_Group>;
	def Xlinker : Separate<["-"], "Xlinker">, Flags<[LinkerInput, RenderAsInput]>,
	HelpText<"Pass <arg> to the linker">, MetaVarName<"<arg>">,
	Group<Link_Group>;
	def Xpreprocessor : Separate<["-"], "Xpreprocessor">, Group<Preprocessor_Group>,
	HelpText<"Pass <arg> to the preprocessor">, MetaVarName<"<arg>">;
	def X_Flag : Flag<["-"], "X">, Group<Link_Group>;
	def X_Joined : Joined<["-"], "X">, IgnoredGCCCompat;
	def Z_Flag : Flag<["-"], "Z">, Group<Link_Group>;
	// FIXME: All we do with this is reject it. Remove.
	def Z_Joined : Joined<["-"], "Z">;
	def all__load : Flag<["-"], "all_load">;
	def allowable__client : Separate<["-"], "allowable_client">;
	def ansi : Flag<["-", "--"], "ansi">;
	def arch__errors__fatal : Flag<["-"], "arch_errors_fatal">;
	def arch : Separate<["-"], "arch">, Flags<[DriverOption]>;
	def arch__only : Separate<["-"], "arch_only">;
	def a : Joined<["-"], "a">;
	def autocomplete : Joined<["--"], "autocomplete=">;
	def bind__at__load : Flag<["-"], "bind_at_load">;
	def bundle__loader : Separate<["-"], "bundle_loader">;
	def bundle : Flag<["-"], "bundle">;
	def b : JoinedOrSeparate<["-"], "b">, Flags<[Unsupported]>;
	def cl_opt_disable : Flag<["-"], "cl-opt-disable">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. This option disables all optimizations. By default optimizations are enabled.">;
	def cl_strict_aliasing : Flag<["-"], "cl-strict-aliasing">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. This option is added for compatibility with OpenCL 1.0.">;
	def cl_single_precision_constant : Flag<["-"], "cl-single-precision-constant">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Treat double precision floating-point constant as single precision constant.">;
	def cl_finite_math_only : Flag<["-"], "cl-finite-math-only">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow floating-point optimizations that assume arguments and results are not NaNs or +-Inf.">;
	def cl_kernel_arg_info : Flag<["-"], "cl-kernel-arg-info">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Generate kernel argument metadata.">;
	def cl_unsafe_math_optimizations : Flag<["-"], "cl-unsafe-math-optimizations">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow unsafe floating-point optimizations. Also implies -cl-no-signed-zeros and -cl-mad-enable.">;
	def cl_fast_relaxed_math : Flag<["-"], "cl-fast-relaxed-math">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Sets -cl-finite-math-only and -cl-unsafe-math-optimizations, and defines __FAST_RELAXED_MATH__.">;
	def cl_mad_enable : Flag<["-"], "cl-mad-enable">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow use of less precise MAD computations in the generated binary.">;
	def cl_no_signed_zeros : Flag<["-"], "cl-no-signed-zeros">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow use of less precise no signed zeros computations in the generated binary.">;
	def cl_std_EQ : Joined<["-"], "cl-std=">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL language standard to compile for.">, Values<"cl,CL,cl1.1,CL1.1,cl1.2,CL1.2,cl2.0,CL2.0">;
	def cl_denorms_are_zero : Flag<["-"], "cl-denorms-are-zero">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Allow denormals to be flushed to zero.">;
	def cl_fp32_correctly_rounded_divide_sqrt : Flag<["-"], "cl-fp32-correctly-rounded-divide-sqrt">, Group<opencl_Group>, Flags<[CC1Option]>,
	HelpText<"OpenCL only. Specify that single precision floating-point divide and sqrt used in the program source are correctly rounded.">;
	def client__name : JoinedOrSeparate<["-"], "client_name">;
	def combine : Flag<["-", "--"], "combine">, Flags<[DriverOption, Unsupported]>;
	def compatibility__version : JoinedOrSeparate<["-"], "compatibility_version">;
	def coverage : Flag<["-", "--"], "coverage">;
	def cpp_precomp : Flag<["-"], "cpp-precomp">, Group<clang_ignored_f_Group>;
	def current__version : JoinedOrSeparate<["-"], "current_version">;
	def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group<clang_i_Group>,
	HelpText<"Add directory to the C++ SYSTEM include search path">, Flags<[CC1Option]>,
	MetaVarName<"<directory>">;
	def c : Flag<["-"], "c">, Flags<[DriverOption]>, Group<Action_Group>,
	HelpText<"Only run preprocess, compile, and assemble steps">;
	// CUDA options. The "--cuda-*" spellings use a double-dash prefix; the
	// "-fcuda-*" pairs at the end use a single dash.
	def cuda_device_only : Flag<["--"], "cuda-device-only">,
	  HelpText<"Compile CUDA code for device only">;
	def cuda_host_only : Flag<["--"], "cuda-host-only">,
	  HelpText<"Compile CUDA code for host only. Has no effect on non-CUDA "
	           "compilations.">;
	def cuda_compile_host_device : Flag<["--"], "cuda-compile-host-device">,
	  HelpText<"Compile CUDA code for both host and device (default). Has no "
	           "effect on non-CUDA compilations.">;
	def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">,
	  Flags<[DriverOption]>,
	  HelpText<"CUDA GPU architecture (e.g. sm_35). May be specified more than once.">;
	def no_cuda_gpu_arch_EQ : Joined<["--"], "no-cuda-gpu-arch=">,
	  Flags<[DriverOption]>,
	  HelpText<"Remove GPU architecture (e.g. sm_35) from the list of GPUs to compile for. "
	           "'all' resets the list to its default value.">;
	def cuda_noopt_device_debug : Flag<["--"], "cuda-noopt-device-debug">,
	  HelpText<"Enable device-side debug info generation. Disables ptxas optimizations.">;
	def no_cuda_version_check : Flag<["--"], "no-cuda-version-check">,
	  HelpText<"Don't error out if the detected version of the CUDA install is "
	           "too low for the requested CUDA gpu architecture.">;
	def no_cuda_noopt_device_debug : Flag<["--"], "no-cuda-noopt-device-debug">;
	def cuda_path_EQ : Joined<["--"], "cuda-path=">, Group<i_Group>,
	  HelpText<"CUDA installation path">;
	def ptxas_path_EQ : Joined<["--"], "ptxas-path=">, Group<i_Group>,
	  HelpText<"Path to ptxas (used for compiling CUDA code)">;
	def fcuda_flush_denormals_to_zero : Flag<["-"], "fcuda-flush-denormals-to-zero">,
	  Flags<[CC1Option]>,
	  HelpText<"Flush denormal floating point values to zero in CUDA device mode.">;
	def fno_cuda_flush_denormals_to_zero : Flag<["-"], "fno-cuda-flush-denormals-to-zero">;
	def fcuda_approx_transcendentals : Flag<["-"], "fcuda-approx-transcendentals">,
	  Flags<[CC1Option]>,
	  HelpText<"Use approximate transcendental functions">;
	def fno_cuda_approx_transcendentals : Flag<["-"], "fno-cuda-approx-transcendentals">;
	// Options whose spelling begins with 'd'. The -dA/-dD/-dI/-dM flags and the
	// generic -d / -d<value> catch-alls are in d_Group.
	def dA : Flag<["-"], "dA">, Group<d_Group>;
	def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
	  HelpText<"Print macro definitions in -E mode in addition to normal output">;
	def dI : Flag<["-"], "dI">, Group<d_Group>, Flags<[CC1Option]>,
	  HelpText<"Print include directives in -E mode in addition to normal output">;
	def dM : Flag<["-"], "dM">, Group<d_Group>, Flags<[CC1Option]>,
	  HelpText<"Print macro definitions in -E mode instead of normal output">;
	def dead__strip : Flag<["-"], "dead_strip">;
	def dependency_file : Separate<["-"], "dependency-file">,
	  Flags<[CC1Option]>,
	  HelpText<"Filename (or -) to write dependency output to">;
	def dependency_dot : Separate<["-"], "dependency-dot">,
	  Flags<[CC1Option]>,
	  HelpText<"Filename to write DOT-formatted header dependencies to">;
	def module_dependency_dir : Separate<["-"], "module-dependency-dir">,
	  Flags<[CC1Option]>,
	  HelpText<"Directory to dump module dependencies to">;
	def dumpmachine : Flag<["-"], "dumpmachine">;
	def dumpspecs : Flag<["-"], "dumpspecs">, Flags<[Unsupported]>;
	def dumpversion : Flag<["-"], "dumpversion">;
	def dylib__file : Separate<["-"], "dylib_file">;
	def dylinker__install__name : JoinedOrSeparate<["-"], "dylinker_install_name">;
	def dylinker : Flag<["-"], "dylinker">;
	def dynamiclib : Flag<["-"], "dynamiclib">;
	def dynamic : Flag<["-"], "dynamic">, Flags<[NoArgumentUnused]>;
	def d_Flag : Flag<["-"], "d">, Group<d_Group>;
	def d_Joined : Joined<["-"], "d">, Group<d_Group>;
	// -emit-*, -e, the PIC/PIE spellings and the first few -f options.
	def emit_ast : Flag<["-"], "emit-ast">,
	  HelpText<"Emit Clang AST files for source inputs">;
	def emit_llvm : Flag<["-"], "emit-llvm">,
	  Flags<[CC1Option]>, Group<Action_Group>,
	  HelpText<"Use the LLVM representation for assembler and object files">;
	def exported__symbols__list : Separate<["-"], "exported_symbols_list">;
	def e : JoinedOrSeparate<["-"], "e">, Group<Link_Group>;
	def fPIC : Flag<["-"], "fPIC">, Group<f_Group>;
	def fno_PIC : Flag<["-"], "fno-PIC">, Group<f_Group>;
	def fPIE : Flag<["-"], "fPIE">, Group<f_Group>;
	def fno_PIE : Flag<["-"], "fno-PIE">, Group<f_Group>;
	def faccess_control : Flag<["-"], "faccess-control">, Group<f_Group>;
	def fallow_unsupported : Flag<["-"], "fallow-unsupported">, Group<f_Group>;
	def fapple_kext : Flag<["-"], "fapple-kext">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use Apple's kernel extensions ABI">;
	def fapple_pragma_pack : Flag<["-"], "fapple-pragma-pack">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable Apple gcc-compatible #pragma pack handling">;
	def shared_libasan : Flag<["-"], "shared-libasan">;
	def fasm : Flag<["-"], "fasm">, Group<f_Group>;

	// -fasm-blocks pair; only the positive form carries CC1Option.
	def fasm_blocks : Flag<["-"], "fasm-blocks">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_asm_blocks : Flag<["-"], "fno-asm-blocks">, Group<f_Group>;

	def fassume_sane_operator_new : Flag<["-"], "fassume-sane-operator-new">,
	  Group<f_Group>;
	def fastcp : Flag<["-"], "fastcp">, Group<f_Group>;
	def fastf : Flag<["-"], "fastf">, Group<f_Group>;
	def fast : Flag<["-"], "fast">, Group<f_Group>;
	def fasynchronous_unwind_tables : Flag<["-"], "fasynchronous-unwind-tables">,
	  Group<f_Group>;

	// -fautolink pair; only the negative form carries flags and help text.
	def fautolink : Flag<["-"], "fautolink">, Group<f_Group>;
	def fno_autolink : Flag<["-"], "fno-autolink">, Group<f_Group>,
	  Flags<[DriverOption, CC1Option]>,
	  HelpText<"Disable generation of linker directives for automatic library linking">;

	// C++ Coroutines TS
	def fcoroutines_ts : Flag<["-"], "fcoroutines-ts">, Group<f_Group>,
	  Flags<[DriverOption, CC1Option]>,
	  HelpText<"Enable support for the C++ Coroutines TS">;
	def fno_coroutines_ts : Flag<["-"], "fno-coroutines-ts">, Group<f_Group>,
	  Flags<[DriverOption]>;

	// Bitcode embedding: the two Flag spellings are aliases of
	// -fembed-bitcode=<option> with a fixed argument ("all" / "marker").
	def fembed_bitcode_EQ : Joined<["-"], "fembed-bitcode=">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<option>">,
	  HelpText<"Embed LLVM bitcode (option: off, all, bitcode, marker)">;
	def fembed_bitcode : Flag<["-"], "fembed-bitcode">, Group<f_Group>,
	  Alias<fembed_bitcode_EQ>, AliasArgs<["all"]>,
	  HelpText<"Embed LLVM IR bitcode as data">;
	def fembed_bitcode_marker : Flag<["-"], "fembed-bitcode-marker">,
	  Alias<fembed_bitcode_EQ>, AliasArgs<["marker"]>,
	  HelpText<"Embed placeholder LLVM IR data as a marker">;

	// GNU inline asm pair.
	def fgnu_inline_asm : Flag<["-"], "fgnu-inline-asm">, Group<f_Group>,
	  Flags<[DriverOption]>;
	def fno_gnu_inline_asm : Flag<["-"], "fno-gnu-inline-asm">, Group<f_Group>,
	  Flags<[DriverOption, CC1Option]>,
	  HelpText<"Disable GNU style inline asm">;

	// Sample-based profiling. The -fauto-profile spellings are aliases of the
	// corresponding -fprofile-sample-use options.
	def fprofile_sample_use : Flag<["-"], "fprofile-sample-use">,
	  Group<f_Group>, Flags<[CoreOption]>;
	def fno_profile_sample_use : Flag<["-"], "fno-profile-sample-use">,
	  Group<f_Group>, Flags<[CoreOption]>;
	def fprofile_sample_use_EQ : Joined<["-"], "fprofile-sample-use=">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  HelpText<"Enable sample-based profile guided optimizations">;
	def fauto_profile : Flag<["-"], "fauto-profile">, Group<f_Group>,
	  Alias<fprofile_sample_use>;
	def fno_auto_profile : Flag<["-"], "fno-auto-profile">, Group<f_Group>,
	  Alias<fno_profile_sample_use>;
	def fauto_profile_EQ : Joined<["-"], "fauto-profile=">,
	  Alias<fprofile_sample_use_EQ>;
	def fdebug_info_for_profiling : Flag<["-"], "fdebug-info-for-profiling">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Emit extra debug info to make sample profile more accurate.">;
	def fno_debug_info_for_profiling : Flag<["-"], "fno-debug-info-for-profiling">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Do not emit extra debug info for sample profiler.">;

	// Instrumentation-based profiling and coverage mapping.
	def fprofile_instr_generate : Flag<["-"], "fprofile-instr-generate">,
	  Group<f_Group>, Flags<[CoreOption]>,
	  HelpText<"Generate instrumented code to collect execution counts into default.profraw file (overridden by '=' form of option or LLVM_PROFILE_FILE env var)">;
	def fprofile_instr_generate_EQ : Joined<["-"], "fprofile-instr-generate=">,
	  Group<f_Group>, Flags<[CoreOption]>, MetaVarName<"<file>">,
	  HelpText<"Generate instrumented code to collect execution counts into <file> (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_instr_use : Flag<["-"], "fprofile-instr-use">,
	  Group<f_Group>, Flags<[CoreOption]>;
	def fprofile_instr_use_EQ : Joined<["-"], "fprofile-instr-use=">,
	  Group<f_Group>, Flags<[CoreOption]>,
	  HelpText<"Use instrumentation data for profile-guided optimization">;
	def fcoverage_mapping : Flag<["-"], "fcoverage-mapping">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Generate coverage mapping to enable code coverage analysis">;
	def fno_coverage_mapping : Flag<["-"], "fno-coverage-mapping">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Disable code coverage analysis">;

	// -fprofile-generate / -fprofile-use spellings. -fprofile-use and
	// -fno-profile-use are aliases of the -f[no-]profile-instr-use flags.
	def fprofile_generate : Flag<["-"], "fprofile-generate">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">,
	  Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
	  HelpText<"Generate instrumented code to collect execution counts into <directory>/default.profraw (overridden by LLVM_PROFILE_FILE env var)">;
	def fprofile_use : Flag<["-"], "fprofile-use">, Group<f_Group>,
	  Alias<fprofile_instr_use>;
	def fprofile_use_EQ : Joined<["-"], "fprofile-use=">,
	  Group<f_Group>, Flags<[DriverOption]>, MetaVarName<"<pathname>">,
	  HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from <pathname>/default.profdata. Otherwise, it reads from file <pathname>.">;
	def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Disable generation of profile instrumentation.">;
	def fno_profile_generate : Flag<["-"], "fno-profile-generate">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Disable generation of profile instrumentation.">;
	def fno_profile_instr_use : Flag<["-"], "fno-profile-instr-use">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Disable using instrumentation data for profile-guided optimization">;
	def fno_profile_use : Flag<["-"], "fno-profile-use">,
	  Alias<fno_profile_instr_use>;

	// -fb* and -fc* options, plus the first -fd* entries.
	def fblocks : Flag<["-"], "fblocks">, Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable the 'blocks' language feature">;
	def fbootclasspath_EQ : Joined<["-"], "fbootclasspath=">, Group<f_Group>;
	def fborland_extensions : Flag<["-"], "fborland-extensions">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Accept non-standard constructs supported by the Borland compiler">;
	def fbuiltin : Flag<["-"], "fbuiltin">, Group<f_Group>;
	def fbuiltin_module_map : Flag<["-"], "fbuiltin-module-map">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  HelpText<"Load the clang builtins module map file.">;
	def fcaret_diagnostics : Flag<["-"], "fcaret-diagnostics">, Group<f_Group>;
	def fclasspath_EQ : Joined<["-"], "fclasspath=">, Group<f_Group>;
	def fcolor_diagnostics : Flag<["-"], "fcolor-diagnostics">, Group<f_Group>,
	  Flags<[CoreOption, CC1Option]>,
	  HelpText<"Use colors in diagnostics">;
	def fdiagnostics_color : Flag<["-"], "fdiagnostics-color">, Group<f_Group>,
	  Flags<[CoreOption, DriverOption]>;
	def fdiagnostics_color_EQ : Joined<["-"], "fdiagnostics-color=">,
	  Group<f_Group>;
	def fansi_escape_codes : Flag<["-"], "fansi-escape-codes">, Group<f_Group>,
	  Flags<[CoreOption, CC1Option]>,
	  HelpText<"Use ANSI escape codes for diagnostics">;
	def fcomment_block_commands : CommaJoined<["-"], "fcomment-block-commands=">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Treat each comma separated argument in <arg> as a documentation comment block command">,
	  MetaVarName<"<arg>">;
	def fparse_all_comments : Flag<["-"], "fparse-all-comments">,
	  Group<f_clang_Group>, Flags<[CC1Option]>;
	def fcommon : Flag<["-"], "fcommon">, Group<f_Group>;
	def fcompile_resource_EQ : Joined<["-"], "fcompile-resource=">,
	  Group<f_Group>;
	def fconstant_cfstrings : Flag<["-"], "fconstant-cfstrings">, Group<f_Group>;
	def fconstant_string_class_EQ : Joined<["-"], "fconstant-string-class=">,
	  Group<f_Group>;
	def fconstexpr_depth_EQ : Joined<["-"], "fconstexpr-depth=">, Group<f_Group>;
	def fconstexpr_steps_EQ : Joined<["-"], "fconstexpr-steps=">, Group<f_Group>;
	def fconstexpr_backtrace_limit_EQ : Joined<["-"], "fconstexpr-backtrace-limit=">,
	  Group<f_Group>;
	def fno_crash_diagnostics : Flag<["-"], "fno-crash-diagnostics">,
	  Group<f_clang_Group>, Flags<[NoArgumentUnused]>,
	  HelpText<"Disable auto-generation of preprocessed source files and a script for reproduction during a clang crash">;
	def fcreate_profile : Flag<["-"], "fcreate-profile">, Group<f_Group>;
	def fcxx_exceptions : Flag<["-"], "fcxx-exceptions">, Group<f_Group>,
	  HelpText<"Enable C++ exceptions">, Flags<[CC1Option]>;
	def fcxx_modules : Flag<["-"], "fcxx-modules">, Group<f_Group>,
	  Flags<[DriverOption]>;
	def fdebug_pass_arguments : Flag<["-"], "fdebug-pass-arguments">,
	  Group<f_Group>;
	def fdebug_pass_structure : Flag<["-"], "fdebug-pass-structure">,
	  Group<f_Group>;
	def fdepfile_entry : Joined<["-"], "fdepfile-entry=">,
	  Group<f_clang_Group>, Flags<[CC1Option]>;
	// -fdiagnostics-* options controlling how diagnostics are printed.
	def fdiagnostics_fixit_info : Flag<["-"], "fdiagnostics-fixit-info">,
	  Group<f_clang_Group>;
	def fdiagnostics_parseable_fixits : Flag<["-"], "fdiagnostics-parseable-fixits">,
	  Group<f_clang_Group>, Flags<[CoreOption, CC1Option]>,
	  HelpText<"Print fix-its in machine parseable form">;
	def fdiagnostics_print_source_range_info : Flag<["-"], "fdiagnostics-print-source-range-info">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Print source range spans in numeric form">;
	def fdiagnostics_show_hotness : Flag<["-"], "fdiagnostics-show-hotness">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable profile hotness information in diagnostic line">;
	def fdiagnostics_hotness_threshold_EQ : Joined<["-"], "fdiagnostics-hotness-threshold=">,
	  Group<f_Group>, Flags<[CC1Option]>, MetaVarName<"<number>">,
	  HelpText<"Prevent optimization remarks from being output if they do not have at least this profile count">;
	def fdiagnostics_show_option : Flag<["-"], "fdiagnostics-show-option">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Print option name with mappable diagnostics">;
	def fdiagnostics_show_note_include_stack : Flag<["-"], "fdiagnostics-show-note-include-stack">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Display include stacks for diagnostic notes">;
	def fdiagnostics_format_EQ : Joined<["-"], "fdiagnostics-format=">,
	  Group<f_clang_Group>;
	def fdiagnostics_show_category_EQ : Joined<["-"], "fdiagnostics-show-category=">,
	  Group<f_clang_Group>;
	def fdiagnostics_show_template_tree : Flag<["-"], "fdiagnostics-show-template-tree">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Print a template comparison tree for differing templates">;
	// -fd* through -fj* options. Anonymous "def :" records declare spellings
	// that have no def name of their own.
	def fdeclspec : Flag<["-"], "fdeclspec">, Group<f_clang_Group>,
	  HelpText<"Allow __declspec as a keyword">, Flags<[CC1Option]>;
	def fdollars_in_identifiers : Flag<["-"], "fdollars-in-identifiers">,
	  Group<f_Group>,
	  HelpText<"Allow '$' in identifiers">, Flags<[CC1Option]>;
	def fdwarf2_cfi_asm : Flag<["-"], "fdwarf2-cfi-asm">,
	  Group<clang_ignored_f_Group>;
	def fno_dwarf2_cfi_asm : Flag<["-"], "fno-dwarf2-cfi-asm">,
	  Group<clang_ignored_f_Group>;
	def fdwarf_directory_asm : Flag<["-"], "fdwarf-directory-asm">,
	  Group<f_Group>;
	def fno_dwarf_directory_asm : Flag<["-"], "fno-dwarf-directory-asm">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def felide_constructors : Flag<["-"], "felide-constructors">, Group<f_Group>;
	def fno_elide_type : Flag<["-"], "fno-elide-type">, Group<f_Group>,
	  Flags<[CC1Option]>,
	  HelpText<"Do not elide types when printing diagnostics">;
	def feliminate_unused_debug_symbols : Flag<["-"], "feliminate-unused-debug-symbols">,
	  Group<f_Group>;
	def femit_all_decls : Flag<["-"], "femit-all-decls">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Emit all declarations, even if unused">;
	def femulated_tls : Flag<["-"], "femulated-tls">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use emutls functions to access thread_local variables">;
	def fno_emulated_tls : Flag<["-"], "fno-emulated-tls">, Group<f_Group>;
	def fencoding_EQ : Joined<["-"], "fencoding=">, Group<f_Group>;
	def ferror_limit_EQ : Joined<["-"], "ferror-limit=">, Group<f_Group>,
	  Flags<[CoreOption]>;
	def fexceptions : Flag<["-"], "fexceptions">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable support for exception handling">;
	def fsjlj_exceptions : Flag<["-"], "fsjlj-exceptions">, Group<f_Group>,
	  Flags<[CC1Option]>,
	  HelpText<"Use SjLj style exceptions">;
	def fexcess_precision_EQ : Joined<["-"], "fexcess-precision=">,
	  Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fexpensive-optimizations">,
	  Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fno-expensive-optimizations">,
	  Group<clang_ignored_gcc_optimization_f_Group>;
	def fextdirs_EQ : Joined<["-"], "fextdirs=">, Group<f_Group>;
	def : Flag<["-"], "fdefer-pop">,
	  Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fno-defer-pop">,
	  Group<clang_ignored_gcc_optimization_f_Group>;
	def : Flag<["-"], "fextended-identifiers">, Group<clang_ignored_f_Group>;
	def : Flag<["-"], "fno-extended-identifiers">, Group<f_Group>,
	  Flags<[Unsupported]>;
	def fhosted : Flag<["-"], "fhosted">, Group<f_Group>;
	def fdenormal_fp_math_EQ : Joined<["-"], "fdenormal-fp-math=">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def ffast_math : Flag<["-"], "ffast-math">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Allow aggressive, lossy floating-point optimizations">;
	def fno_fast_math : Flag<["-"], "fno-fast-math">, Group<f_Group>;
	def fmath_errno : Flag<["-"], "fmath-errno">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Require math functions to indicate errors by setting errno">;
	def fno_math_errno : Flag<["-"], "fno-math-errno">, Group<f_Group>;
	def fbracket_depth_EQ : Joined<["-"], "fbracket-depth=">, Group<f_Group>;
	def fsignaling_math : Flag<["-"], "fsignaling-math">, Group<f_Group>;
	def fno_signaling_math : Flag<["-"], "fno-signaling-math">, Group<f_Group>;
	def fjump_tables : Flag<["-"], "fjump-tables">, Group<f_Group>;
	def fno_jump_tables : Flag<["-"], "fno-jump-tables">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not use jump tables for lowering switches">;

	// Begin sanitizer flags. These should all be core options exposed in all driver
	// modes.
	let Flags = [CC1Option, CoreOption] in {

	// Enabling / disabling individual checks, and the blacklist file.
	def fsanitize_EQ : CommaJoined<["-"], "fsanitize=">,
	  Group<f_clang_Group>,
	  MetaVarName<"<check>">,
	  HelpText<"Turn on runtime checks for various forms of undefined "
	           "or suspicious behavior. See user manual for available checks">;
	def fno_sanitize_EQ : CommaJoined<["-"], "fno-sanitize=">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>;
	def fsanitize_blacklist : Joined<["-"], "fsanitize-blacklist=">,
	  Group<f_clang_Group>,
	  HelpText<"Path to blacklist file for sanitizers">;
	def fno_sanitize_blacklist : Flag<["-"], "fno-sanitize-blacklist">,
	  Group<f_clang_Group>,
	  HelpText<"Don't use blacklist file for sanitizers">;

	// Coverage instrumentation.
	def fsanitize_coverage
	  : CommaJoined<["-"], "fsanitize-coverage=">,
	    Group<f_clang_Group>,
	    HelpText<"Specify the type of coverage instrumentation for Sanitizers">;
	def fno_sanitize_coverage
	  : CommaJoined<["-"], "fno-sanitize-coverage=">,
	    Group<f_clang_Group>, Flags<[CoreOption, DriverOption]>,
	    HelpText<"Disable specified features of coverage instrumentation for "
	             "Sanitizers">,
	    Values<"func,bb,edge,indirect-calls,trace-bb,trace-cmp,trace-div,trace-gep,8bit-counters,trace-pc,trace-pc-guard,no-prune,inline-8bit-counters">;

	// MemorySanitizer.
	def fsanitize_memory_track_origins_EQ : Joined<["-"], "fsanitize-memory-track-origins=">,
	  Group<f_clang_Group>,
	  HelpText<"Enable origins tracking in MemorySanitizer">;
	def fsanitize_memory_track_origins : Flag<["-"], "fsanitize-memory-track-origins">,
	  Group<f_clang_Group>,
	  HelpText<"Enable origins tracking in MemorySanitizer">;
	def fno_sanitize_memory_track_origins : Flag<["-"], "fno-sanitize-memory-track-origins">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable origins tracking in MemorySanitizer">;
	def fsanitize_memory_use_after_dtor : Flag<["-"], "fsanitize-memory-use-after-dtor">,
	  Group<f_clang_Group>,
	  HelpText<"Enable use-after-destroy detection in MemorySanitizer">;

	// AddressSanitizer.
	def fsanitize_address_field_padding : Joined<["-"], "fsanitize-address-field-padding=">,
	  Group<f_clang_Group>,
	  HelpText<"Level of field padding for AddressSanitizer">;
	def fsanitize_address_use_after_scope : Flag<["-"], "fsanitize-address-use-after-scope">,
	  Group<f_clang_Group>,
	  HelpText<"Enable use-after-scope detection in AddressSanitizer">;
	def fno_sanitize_address_use_after_scope : Flag<["-"], "fno-sanitize-address-use-after-scope">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable use-after-scope detection in AddressSanitizer">;
	def fsanitize_address_globals_dead_stripping : Flag<["-"], "fsanitize-address-globals-dead-stripping">,
	  Group<f_clang_Group>,
	  HelpText<"Enable linker dead stripping of globals in AddressSanitizer">;

	// Recovery and trapping.
	def fsanitize_recover : Flag<["-"], "fsanitize-recover">,
	  Group<f_clang_Group>;
	def fno_sanitize_recover : Flag<["-"], "fno-sanitize-recover">,
	  Flags<[CoreOption, DriverOption]>,
	  Group<f_clang_Group>;
	def fsanitize_recover_EQ : CommaJoined<["-"], "fsanitize-recover=">,
	  Group<f_clang_Group>,
	  HelpText<"Enable recovery for specified sanitizers">;
	def fno_sanitize_recover_EQ
	  : CommaJoined<["-"], "fno-sanitize-recover=">,
	    Group<f_clang_Group>,
	    Flags<[CoreOption, DriverOption]>,
	    HelpText<"Disable recovery for specified sanitizers">;
	def fsanitize_trap_EQ : CommaJoined<["-"], "fsanitize-trap=">,
	  Group<f_clang_Group>,
	  HelpText<"Enable trapping for specified sanitizers">;
	def fno_sanitize_trap_EQ : CommaJoined<["-"], "fno-sanitize-trap=">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable trapping for specified sanitizers">;
	def fsanitize_undefined_trap_on_error : Flag<["-"], "fsanitize-undefined-trap-on-error">,
	  Group<f_clang_Group>;
	def fno_sanitize_undefined_trap_on_error : Flag<["-"], "fno-sanitize-undefined-trap-on-error">,
	  Group<f_clang_Group>;
	def fsanitize_link_cxx_runtime : Flag<["-"], "fsanitize-link-c++-runtime">,
	  Group<f_clang_Group>;

	// Cross-DSO CFI and statistics.
	def fsanitize_cfi_cross_dso : Flag<["-"], "fsanitize-cfi-cross-dso">,
	  Group<f_clang_Group>,
	  HelpText<"Enable control flow integrity (CFI) checks for cross-DSO calls.">;
	def fno_sanitize_cfi_cross_dso : Flag<["-"], "fno-sanitize-cfi-cross-dso">,
	  Flags<[CoreOption, DriverOption]>,
	  Group<f_clang_Group>,
	  HelpText<"Disable control flow integrity (CFI) checks for cross-DSO calls.">;
	def fsanitize_stats : Flag<["-"], "fsanitize-stats">,
	  Group<f_clang_Group>,
	  HelpText<"Enable sanitizer statistics gathering.">;
	def fno_sanitize_stats : Flag<["-"], "fno-sanitize-stats">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable sanitizer statistics gathering.">;

	// ThreadSanitizer instrumentation toggles.
	def fsanitize_thread_memory_access : Flag<["-"], "fsanitize-thread-memory-access">,
	  Group<f_clang_Group>,
	  HelpText<"Enable memory access instrumentation in ThreadSanitizer (default)">;
	def fno_sanitize_thread_memory_access : Flag<["-"], "fno-sanitize-thread-memory-access">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable memory access instrumentation in ThreadSanitizer">;
	def fsanitize_thread_func_entry_exit : Flag<["-"], "fsanitize-thread-func-entry-exit">,
	  Group<f_clang_Group>,
	  HelpText<"Enable function entry/exit instrumentation in ThreadSanitizer (default)">;
	def fno_sanitize_thread_func_entry_exit : Flag<["-"], "fno-sanitize-thread-func-entry-exit">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable function entry/exit instrumentation in ThreadSanitizer">;
	def fsanitize_thread_atomics : Flag<["-"], "fsanitize-thread-atomics">,
	  Group<f_clang_Group>,
	  HelpText<"Enable atomic operations instrumentation in ThreadSanitizer (default)">;
	def fno_sanitize_thread_atomics : Flag<["-"], "fno-sanitize-thread-atomics">,
	  Group<f_clang_Group>,
	  Flags<[CoreOption, DriverOption]>,
	  HelpText<"Disable atomic operations instrumentation in ThreadSanitizer">;
	def fsanitize_undefined_strip_path_components_EQ : Joined<["-"], "fsanitize-undefined-strip-path-components=">,
	  Group<f_clang_Group>, MetaVarName<"<number>">,
	  HelpText<"Strip (or keep only, if negative) a given number of path components "
	           "when emitting check metadata.">;

	} // end -f[no-]sanitize* flags

	// Floating-point math model flags, declared as positive/negative pairs.
	def funsafe_math_optimizations : Flag<["-"], "funsafe-math-optimizations">,
	  Group<f_Group>;
	def fno_unsafe_math_optimizations : Flag<["-"], "fno-unsafe-math-optimizations">,
	  Group<f_Group>;
	def fassociative_math : Flag<["-"], "fassociative-math">, Group<f_Group>;
	def fno_associative_math : Flag<["-"], "fno-associative-math">,
	  Group<f_Group>;
	def freciprocal_math : Flag<["-"], "freciprocal-math">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Allow division operations to be reassociated">;
	def fno_reciprocal_math : Flag<["-"], "fno-reciprocal-math">, Group<f_Group>;
	def ffinite_math_only : Flag<["-"], "ffinite-math-only">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_finite_math_only : Flag<["-"], "fno-finite-math-only">,
	  Group<f_Group>;
	def fsigned_zeros : Flag<["-"], "fsigned-zeros">, Group<f_Group>;
	def fno_signed_zeros : Flag<["-"], "fno-signed-zeros">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Allow optimizations that ignore the sign of floating point zeros">;
	def fhonor_nans : Flag<["-"], "fhonor-nans">, Group<f_Group>;
	def fno_honor_nans : Flag<["-"], "fno-honor-nans">, Group<f_Group>;
	def fhonor_infinities : Flag<["-"], "fhonor-infinities">, Group<f_Group>;
	def fno_honor_infinities : Flag<["-"], "fno-honor-infinities">,
	  Group<f_Group>;
	// This option was originally misspelt "infinites" [sic].
	def : Flag<["-"], "fhonor-infinites">, Alias<fhonor_infinities>;
	def : Flag<["-"], "fno-honor-infinites">, Alias<fno_honor_infinities>;
	def ftrapping_math : Flag<["-"], "ftrapping-math">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_trapping_math : Flag<["-"], "fno-trapping-math">,
	  Group<f_Group>, Flags<[CC1Option]>;
	// -ffp-contract=<mode>: the accepted modes are listed in Values<> and
	// described in the help text. The '|' separators in the help text are
	// written literally; TableGen string literals have no "\|" escape.
	def ffp_contract : Joined<["-"], "ffp-contract=">, Group<f_Group>,
	  Flags<[CC1Option]>,
	  HelpText<"Form fused FP ops (e.g. FMAs): fast (everywhere)"
	           " | on (according to FP_CONTRACT pragma, default) | off (never fuse)">,
	  Values<"fast,on,off">;

	// -ffor-scope pair.
	def ffor_scope : Flag<["-"], "ffor-scope">, Group<f_Group>;
	def fno_for_scope : Flag<["-"], "fno-for-scope">, Group<f_Group>;

	// -frewrite-includes pair.
	def frewrite_includes : Flag<["-"], "frewrite-includes">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_rewrite_includes : Flag<["-"], "fno-rewrite-includes">,
	  Group<f_Group>;

	// -frewrite-imports pair.
	def frewrite_imports : Flag<["-"], "frewrite-imports">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_rewrite_imports : Flag<["-"], "fno-rewrite-imports">,
	  Group<f_Group>;

	// -frewrite-map-file in its separate-argument and '=' spellings.
	def frewrite_map_file : Separate<["-"], "frewrite-map-file">,
	  Group<f_Group>,
	  Flags<[ DriverOption, CC1Option ]>;
	def frewrite_map_file_EQ : Joined<["-"], "frewrite-map-file=">,
	  Group<f_Group>,
	  Flags<[DriverOption]>;

	// -fuse-line-directives pair.
	def fuse_line_directives : Flag<["-"], "fuse-line-directives">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_use_line_directives : Flag<["-"], "fno-use-line-directives">,
	  Group<f_Group>;

	// -ffreestanding through -finstrument-functions.
	def ffreestanding : Flag<["-"], "ffreestanding">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Assert that the compilation takes place in a freestanding environment">;
	def fgnu_keywords : Flag<["-"], "fgnu-keywords">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Allow GNU-extension keywords regardless of language standard">;
	def fgnu89_inline : Flag<["-"], "fgnu89-inline">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use the gnu89 inline semantics">;
	def fno_gnu89_inline : Flag<["-"], "fno-gnu89-inline">, Group<f_Group>;
	def fgnu_runtime : Flag<["-"], "fgnu-runtime">, Group<f_Group>,
	  HelpText<"Generate output compatible with the standard GNU Objective-C runtime">;
	def fheinous_gnu_extensions : Flag<["-"], "fheinous-gnu-extensions">,
	  Flags<[CC1Option]>;
	def filelist : Separate<["-"], "filelist">, Flags<[LinkerInput]>,
	  Group<Link_Group>;
	// -findirect-virtual-calls is an alias of -fapple-kext.
	def : Flag<["-"], "findirect-virtual-calls">, Alias<fapple_kext>;
	def finline_functions : Flag<["-"], "finline-functions">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Inline suitable functions">;
	def finline_hint_functions : Flag<["-"], "finline-hint-functions">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Inline functions which are (explicitly or implicitly) marked inline">;
	def finline : Flag<["-"], "finline">, Group<clang_ignored_f_Group>;
	def fexperimental_new_pass_manager : Flag<["-"], "fexperimental-new-pass-manager">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Enables an experimental new pass manager in LLVM.">;
	def finput_charset_EQ : Joined<["-"], "finput-charset=">, Group<f_Group>;
	def fexec_charset_EQ : Joined<["-"], "fexec-charset=">, Group<f_Group>;
	def finstrument_functions : Flag<["-"], "finstrument-functions">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Generate calls to instrument function entry and exit">;

	// XRay instrumentation. All XRay defs are in f_Group and carry CC1Option.
	def fxray_instrument : Flag<["-"], "fxray-instrument">, Group<f_Group>,
	  Flags<[CC1Option]>,
	  HelpText<"Generate XRay instrumentation sleds on function entry and exit">;
	def fnoxray_instrument : Flag<["-"], "fno-xray-instrument">, Group<f_Group>,
	  Flags<[CC1Option]>;

	// The threshold is declared twice: with and without the trailing '='.
	def fxray_instruction_threshold_EQ
	  : JoinedOrSeparate<["-"], "fxray-instruction-threshold=">,
	    Group<f_Group>, Flags<[CC1Option]>,
	    HelpText<"Sets the minimum function size to instrument with XRay">;
	def fxray_instruction_threshold_
	  : JoinedOrSeparate<["-"], "fxray-instruction-threshold">,
	    Group<f_Group>, Flags<[CC1Option]>;

	def fxray_always_instrument
	  : JoinedOrSeparate<["-"], "fxray-always-instrument=">,
	    Group<f_Group>, Flags<[CC1Option]>,
	    HelpText<"Filename defining the whitelist for imbuing the 'always instrument' XRay attribute.">;
	def fxray_never_instrument
	  : JoinedOrSeparate<["-"], "fxray-never-instrument=">,
	    Group<f_Group>, Flags<[CC1Option]>,
	    HelpText<"Filename defining the whitelist for imbuing the 'never instrument' XRay attribute.">;

	// -fl* and -fm* options: LTO, message/macro limits and the first
	// Microsoft-compatibility flags.
	def flat__namespace : Flag<["-"], "flat_namespace">;
	def flax_vector_conversions : Flag<["-"], "flax-vector-conversions">,
	  Group<f_Group>;
	def flimited_precision_EQ : Joined<["-"], "flimited-precision=">,
	  Group<f_Group>;
	def flto_EQ : Joined<["-"], "flto=">,
	  Flags<[CoreOption, CC1Option]>, Group<f_Group>,
	  HelpText<"Set LTO mode to either 'full' or 'thin'">,
	  Values<"thin,full">;
	def flto : Flag<["-"], "flto">,
	  Flags<[CoreOption, CC1Option]>, Group<f_Group>,
	  HelpText<"Enable LTO in 'full' mode">;
	def fno_lto : Flag<["-"], "fno-lto">, Group<f_Group>,
	  HelpText<"Disable LTO mode (default)">;
	def flto_jobs_EQ : Joined<["-"], "flto-jobs=">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Controls the backend parallelism of -flto=thin (default "
	           "of 0 means the number of threads will be derived from "
	           "the number of CPUs detected)">;
	def fthinlto_index_EQ : Joined<["-"], "fthinlto-index=">,
	  Flags<[CC1Option]>, Group<f_Group>,
	  HelpText<"Perform ThinLTO importing using provided function summary index">;
	def fmacro_backtrace_limit_EQ : Joined<["-"], "fmacro-backtrace-limit=">,
	  Group<f_Group>, Flags<[DriverOption, CoreOption]>;
	def fmerge_all_constants : Flag<["-"], "fmerge-all-constants">,
	  Group<f_Group>;
	def fmessage_length_EQ : Joined<["-"], "fmessage-length=">, Group<f_Group>;
	def fms_extensions : Flag<["-"], "fms-extensions">,
	  Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	  HelpText<"Accept some non-standard constructs supported by the Microsoft compiler">;
	def fms_compatibility : Flag<["-"], "fms-compatibility">,
	  Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	  HelpText<"Enable full Microsoft Visual C++ compatibility">;
	// -fms-volatile takes no value, so it is declared as a Flag. The Joined kind
	// would also match any argument that merely starts with "-fms-volatile".
	def fms_volatile : Flag<["-"], "fms-volatile">, Group<f_Group>, Flags<[CC1Option]>;
	// Microsoft compiler version reporting, delayed template parsing and the
	// member-pointer representation option.
	def fmsc_version : Joined<["-"], "fmsc-version=">, Group<f_Group>,
	  Flags<[DriverOption, CoreOption]>,
	  HelpText<"Microsoft compiler version number to report in _MSC_VER (0 = don't define it (default))">;
	def fms_compatibility_version : Joined<["-"], "fms-compatibility-version=">,
	  Group<f_Group>,
	  Flags<[ CC1Option, CoreOption ]>,
	  HelpText<"Dot-separated value representing the Microsoft compiler "
	           "version number to report in _MSC_VER (0 = don't define it "
	           "(default))">;
	def fdelayed_template_parsing : Flag<["-"], "fdelayed-template-parsing">,
	  Group<f_Group>,
	  HelpText<"Parse templated function definitions at the end of the "
	           "translation unit">,
	  Flags<[CC1Option, CoreOption]>;
	def fms_memptr_rep_EQ : Joined<["-"], "fms-memptr-rep=">,
	  Group<f_Group>, Flags<[CC1Option]>;
	// Module cache location, pruning and build-session validation options.
	def fmodules_cache_path : Joined<["-"], "fmodules-cache-path=">,
	  Group<i_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<directory>">,
	  HelpText<"Specify the module cache path">;
	def fmodules_user_build_path : Separate<["-"], "fmodules-user-build-path">,
	  Group<i_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<directory>">,
	  HelpText<"Specify the module user build path">;
	def fprebuilt_module_path : Joined<["-"], "fprebuilt-module-path=">,
	  Group<i_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<directory>">,
	  HelpText<"Specify the prebuilt module path">;
	def fmodules_prune_interval : Joined<["-"], "fmodules-prune-interval=">,
	  Group<i_Group>, Flags<[CC1Option]>,
	  MetaVarName<"<seconds>">,
	  HelpText<"Specify the interval (in seconds) between attempts to prune the module cache">;
	def fmodules_prune_after : Joined<["-"], "fmodules-prune-after=">,
	  Group<i_Group>, Flags<[CC1Option]>,
	  MetaVarName<"<seconds>">,
	  HelpText<"Specify the interval (in seconds) after which a module file will be considered unused">;
	def fmodules_search_all : Flag<["-"], "fmodules-search-all">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  HelpText<"Search even non-imported modules to resolve references">;
	def fbuild_session_timestamp : Joined<["-"], "fbuild-session-timestamp=">,
	  Group<i_Group>, Flags<[CC1Option]>,
	  MetaVarName<"<time since Epoch in seconds>">,
	  HelpText<"Time when the current build session started">;
	def fbuild_session_file : Joined<["-"], "fbuild-session-file=">,
	  Group<i_Group>,
	  MetaVarName<"<file>">,
	  HelpText<"Use the last modification time of <file> as the build session timestamp">;
	def fmodules_validate_once_per_build_session
	  : Flag<["-"], "fmodules-validate-once-per-build-session">,
	  Group<i_Group>, Flags<[CC1Option]>,
	  HelpText<"Don't verify input files for the modules if the module has been "
	           "successfully validated or loaded during this build session">;
	def fmodules_disable_diagnostic_validation
	  : Flag<["-"], "fmodules-disable-diagnostic-validation">,
	  Group<i_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable validation of the diagnostic options when loading the module">;
	def fmodules_validate_system_headers
	  : Flag<["-"], "fmodules-validate-system-headers">,
	  Group<i_Group>, Flags<[CC1Option]>,
	  HelpText<"Validate the system headers that a module depends on when loading the module">;
	// Modules language feature, module maps and module file options.
	def fmodules : Flag<["-"], "fmodules">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  HelpText<"Enable the 'modules' language feature">;
	def fimplicit_module_maps : Flag<["-"], "fimplicit-module-maps">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  HelpText<"Implicitly search the file system for module map files.">;
	def fmodules_ts : Flag<["-"], "fmodules-ts">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable support for the C++ Modules TS">;
	def fmodule_maps : Flag<["-"], "fmodule-maps">,
	  Alias<fimplicit_module_maps>;
	def fmodule_name_EQ : Joined<["-"], "fmodule-name=">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<name>">,
	  HelpText<"Specify the name of the module to build">;
	def fmodule_name : Separate<["-"], "fmodule-name">,
	  Alias<fmodule_name_EQ>;
	def fmodule_implementation_of : Separate<["-"], "fmodule-implementation-of">,
	  Flags<[CC1Option]>,
	  Alias<fmodule_name_EQ>;
	def fmodule_map_file : Joined<["-"], "fmodule-map-file=">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<file>">,
	  HelpText<"Load this module map file">;
	def fmodule_file : Joined<["-"], "fmodule-file=">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  MetaVarName<"<file>">,
	  HelpText<"Load this precompiled module file">;
	def fmodules_ignore_macro : Joined<["-"], "fmodules-ignore-macro=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Ignore the definition of the given macro when building and loading modules">;
	def fmodules_decluse : Flag<["-"], "fmodules-decluse">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  HelpText<"Require declaration of modules used within a module">;
	def fmodules_strict_decluse : Flag<["-"], "fmodules-strict-decluse">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>,
	  HelpText<"Like -fmodules-decluse but requires all headers to be in modules">;
	def fno_modules_search_all : Flag<["-"], "fno-modules-search-all">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>;
	def fno_implicit_modules : Flag<["-"], "fno-implicit-modules">,
	  Group<f_Group>, Flags<[DriverOption, CC1Option]>;
	def fretain_comments_from_system_headers
	  : Flag<["-"], "fretain-comments-from-system-headers">,
	  Group<f_Group>, Flags<[CC1Option]>;

	// Miscellaneous -f flags, mostly negative (-fno-*) spellings.
	def fmudflapth : Flag<["-"], "fmudflapth">, Group<f_Group>;
	def fmudflap : Flag<["-"], "fmudflap">, Group<f_Group>;
	def fnested_functions : Flag<["-"], "fnested-functions">, Group<f_Group>;
	def fnext_runtime : Flag<["-"], "fnext-runtime">, Group<f_Group>;
	def fno_access_control : Flag<["-"], "fno-access-control">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable C++ access control">;
	def fno_apple_pragma_pack : Flag<["-"], "fno-apple-pragma-pack">,
	  Group<f_Group>;
	def fno_asm : Flag<["-"], "fno-asm">, Group<f_Group>;
	def fno_asynchronous_unwind_tables
	  : Flag<["-"], "fno-asynchronous-unwind-tables">,
	  Group<f_Group>;
	def fno_assume_sane_operator_new : Flag<["-"], "fno-assume-sane-operator-new">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Don't assume that C++'s global operator new can't alias any pointer">;
	def fno_blocks : Flag<["-"], "fno-blocks">, Group<f_Group>;
	def fno_borland_extensions : Flag<["-"], "fno-borland-extensions">,
	  Group<f_Group>;
	def fno_builtin : Flag<["-"], "fno-builtin">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable implicit builtin knowledge of functions">;
	def fno_builtin_ : Joined<["-"], "fno-builtin-">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable implicit builtin knowledge of a specific function">;
	def fno_caret_diagnostics : Flag<["-"], "fno-caret-diagnostics">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_color_diagnostics : Flag<["-"], "fno-color-diagnostics">,
	  Group<f_Group>, Flags<[CoreOption, CC1Option]>;
	def fno_diagnostics_color : Flag<["-"], "fno-diagnostics-color">,
	  Group<f_Group>, Flags<[CoreOption, DriverOption]>;
	def fno_common : Flag<["-"], "fno-common">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Compile common globals like normal definitions">;
	// The constant strings this flag refers to are CoreFoundation (CFString)
	// objects, so the help text names that framework.
	def fno_constant_cfstrings : Flag<["-"], "fno-constant-cfstrings">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable creation of CoreFoundation-type constant strings">;
	// More negative (-fno-*) flags, plus -fveclib=.
	def fno_cxx_exceptions : Flag<["-"], "fno-cxx-exceptions">, Group<f_Group>;
	def fno_cxx_modules : Flag<["-"], "fno-cxx-modules">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fno_diagnostics_fixit_info : Flag<["-"], "fno-diagnostics-fixit-info">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not include fixit information in diagnostics">;
	def fno_diagnostics_show_hotness : Flag<["-"], "fno-diagnostics-show-hotness">,
	  Group<f_Group>;
	def fno_diagnostics_show_option : Flag<["-"], "fno-diagnostics-show-option">,
	  Group<f_Group>;
	def fno_diagnostics_show_note_include_stack
	  : Flag<["-"], "fno-diagnostics-show-note-include-stack">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_declspec : Flag<["-"], "fno-declspec">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Disallow __declspec as a keyword">;
	def fno_dollars_in_identifiers : Flag<["-"], "fno-dollars-in-identifiers">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disallow '$' in identifiers">;
	def fno_elide_constructors : Flag<["-"], "fno-elide-constructors">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable C++ copy constructor elision">;
	def fno_eliminate_unused_debug_symbols
	  : Flag<["-"], "fno-eliminate-unused-debug-symbols">,
	  Group<f_Group>;
	def fno_exceptions : Flag<["-"], "fno-exceptions">, Group<f_Group>;
	def fno_gnu_keywords : Flag<["-"], "fno-gnu-keywords">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_inline_functions : Flag<["-"], "fno-inline-functions">,
	  Group<f_clang_Group>, Flags<[CC1Option]>;
	def fno_inline : Flag<["-"], "fno-inline">,
	  Group<f_clang_Group>, Flags<[CC1Option]>;
	def fno_experimental_new_pass_manager
	  : Flag<["-"], "fno-experimental-new-pass-manager">,
	  Group<f_clang_Group>, Flags<[CC1Option]>,
	  HelpText<"Disables an experimental new pass manager in LLVM.">;
	def fveclib : Joined<["-"], "fveclib=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use the given vector functions library">,
	  Values<"Accelerate,SVML,none">;
	def fno_lax_vector_conversions : Flag<["-"], "fno-lax-vector-conversions">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disallow implicit conversions between vectors with a different number of elements or different element types">;
	def fno_merge_all_constants : Flag<["-"], "fno-merge-all-constants">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disallow merging of constants">;
	def fno_modules : Flag<["-"], "fno-modules">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fno_implicit_module_maps : Flag<["-"], "fno-implicit-module-maps">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fno_module_maps : Flag<["-"], "fno-module-maps">,
	  Alias<fno_implicit_module_maps>;
	def fno_modules_decluse : Flag<["-"], "fno-modules-decluse">,
	  Group<f_Group>, Flags<[DriverOption]>;
	// The established negative spelling is -fno-strict-modules-decluse, which
	// does not mirror the positive -fmodules-strict-decluse. Keep it for
	// compatibility and additionally accept the conventional -fno-<positive>
	// spelling as an alias of the same option.
	def fno_modules_strict_decluse : Flag<["-"], "fno-strict-modules-decluse">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def : Flag<["-"], "fno-modules-strict-decluse">,
	  Alias<fno_modules_strict_decluse>;
	// Implicit modules / module file deps, and negative MS, template-parsing
	// and Objective-C flags.
	def fimplicit_modules : Flag<["-"], "fimplicit-modules">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fmodule_file_deps : Flag<["-"], "fmodule-file-deps">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fno_module_file_deps : Flag<["-"], "fno-module-file-deps">,
	  Group<f_Group>, Flags<[DriverOption]>;
	def fno_ms_extensions : Flag<["-"], "fno-ms-extensions">,
	  Group<f_Group>, Flags<[CoreOption]>;
	def fno_ms_compatibility : Flag<["-"], "fno-ms-compatibility">,
	  Group<f_Group>, Flags<[CoreOption]>;
	def fno_delayed_template_parsing : Flag<["-"], "fno-delayed-template-parsing">,
	  Group<f_Group>, Flags<[DriverOption, CoreOption]>,
	  HelpText<"Disable delayed template parsing">;
	def fno_objc_exceptions : Flag<["-"], "fno-objc-exceptions">, Group<f_Group>;
	def fno_objc_legacy_dispatch : Flag<["-"], "fno-objc-legacy-dispatch">,
	  Group<f_Group>;
	def fno_objc_weak : Flag<["-"], "fno-objc-weak">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fno_omit_frame_pointer : Flag<["-"], "fno-omit-frame-pointer">,
	  Group<f_Group>;
	def fno_operator_names : Flag<["-"], "fno-operator-names">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not treat C++ operator name keywords as synonyms for operators">;
	// Negative forms for language, diagnostic and code-generation defaults.
	def fno_pascal_strings : Flag<["-"], "fno-pascal-strings">, Group<f_Group>;
	def fno_rtti : Flag<["-"], "fno-rtti">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable generation of rtti information">;
	def fno_short_enums : Flag<["-"], "fno-short-enums">, Group<f_Group>;
	def fno_show_column : Flag<["-"], "fno-show-column">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not include column number on diagnostics">;
	def fno_show_source_location : Flag<["-"], "fno-show-source-location">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not include source location information with diagnostics">;
	def fdiagnostics_absolute_paths : Flag<["-"], "fdiagnostics-absolute-paths">,
	  Group<f_Group>, Flags<[CC1Option, CoreOption]>,
	  HelpText<"Print absolute paths in diagnostics">;
	def fno_spell_checking : Flag<["-"], "fno-spell-checking">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Disable spell-checking">;
	def fno_stack_protector : Flag<["-"], "fno-stack-protector">,
	  Group<f_Group>,
	  HelpText<"Disable the use of stack protectors">;
	def fno_strict_aliasing : Flag<["-"], "fno-strict-aliasing">,
	  Group<f_Group>, Flags<[DriverOption, CoreOption]>;
	def fstruct_path_tbaa : Flag<["-"], "fstruct-path-tbaa">, Group<f_Group>;
	def fno_struct_path_tbaa : Flag<["-"], "fno-struct-path-tbaa">,
	  Group<f_Group>;
	def fno_strict_enums : Flag<["-"], "fno-strict-enums">, Group<f_Group>;
	def fno_strict_vtable_pointers : Flag<["-"], "fno-strict-vtable-pointers">,
	  Group<f_Group>;
	def fno_strict_overflow : Flag<["-"], "fno-strict-overflow">, Group<f_Group>;
	def fno_threadsafe_statics : Flag<["-"], "fno-threadsafe-statics">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not emit code to make initialization of local statics thread safe">;
	def fno_use_cxa_atexit : Flag<["-"], "fno-use-cxa-atexit">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Don't use __cxa_atexit for calling destructors">;
	def fno_use_init_array : Flag<["-"], "fno-use-init-array">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Don't use .init_array instead of .ctors">;
	def fno_unit_at_a_time : Flag<["-"], "fno-unit-at-a-time">, Group<f_Group>;
	def fno_unwind_tables : Flag<["-"], "fno-unwind-tables">, Group<f_Group>;
	def fno_verbose_asm : Flag<["-"], "fno-verbose-asm">, Group<f_Group>;
	def fno_working_directory : Flag<["-"], "fno-working-directory">,
	  Group<f_Group>;
	def fno_wrapv : Flag<["-"], "fno-wrapv">, Group<f_Group>;
	def fno_zero_initialized_in_bss : Flag<["-"], "fno-zero-initialized-in-bss">,
	  Group<f_Group>;
	// Objective-C ARC / exceptions and application-extension options.
	def fobjc_arc : Flag<["-"], "fobjc-arc">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Synthesize retain and release calls for Objective-C pointers">;
	def fno_objc_arc : Flag<["-"], "fno-objc-arc">, Group<f_Group>;
	def fobjc_arc_exceptions : Flag<["-"], "fobjc-arc-exceptions">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use EH-safe code when synthesizing retains and releases in -fobjc-arc">;
	def fno_objc_arc_exceptions : Flag<["-"], "fno-objc-arc-exceptions">,
	  Group<f_Group>;
	def fobjc_atdefs : Flag<["-"], "fobjc-atdefs">,
	  Group<clang_ignored_f_Group>;
	def fobjc_call_cxx_cdtors : Flag<["-"], "fobjc-call-cxx-cdtors">,
	  Group<clang_ignored_f_Group>;
	def fobjc_exceptions : Flag<["-"], "fobjc-exceptions">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable Objective-C exceptions">;
	def fapplication_extension : Flag<["-"], "fapplication-extension">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Restrict code to those available for App Extensions">;
	def fno_application_extension : Flag<["-"], "fno-application-extension">,
	  Group<f_Group>;
	// C++ template-template matching, sized deallocation and aligned allocation.
	def frelaxed_template_template_args
	  : Flag<["-"], "frelaxed-template-template-args">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable C++17 relaxed template template argument matching">;
	def fno_relaxed_template_template_args
	  : Flag<["-"], "fno-relaxed-template-template-args">,
	  Group<f_Group>;
	def fsized_deallocation : Flag<["-"], "fsized-deallocation">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable C++14 sized global deallocation functions">;
	def fno_sized_deallocation : Flag<["-"], "fno-sized-deallocation">,
	  Group<f_Group>;
	def faligned_allocation : Flag<["-"], "faligned-allocation">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable C++17 aligned allocation functions">;
	def fno_aligned_allocation : Flag<["-"], "fno-aligned-allocation">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fnew_alignment_EQ : Joined<["-"], "fnew-alignment=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  MetaVarName<"<align>">,
	  HelpText<"Specifies the largest alignment guaranteed by '::operator new(size_t)'">;
	// Alternate spellings that alias the options defined above.
	def : Separate<["-"], "fnew-alignment">, Alias<fnew_alignment_EQ>;
	def : Flag<["-"], "faligned-new">, Alias<faligned_allocation>;
	def : Flag<["-"], "fno-aligned-new">, Alias<fno_aligned_allocation>;
	def faligned_new_EQ : Joined<["-"], "faligned-new=">;

	// Objective-C dispatch, related-result-type inference and weak references.
	def fobjc_legacy_dispatch : Flag<["-"], "fobjc-legacy-dispatch">,
	  Group<f_Group>;
	def fobjc_new_property : Flag<["-"], "fobjc-new-property">,
	  Group<clang_ignored_f_Group>;
	def fobjc_infer_related_result_type
	  : Flag<["-"], "fobjc-infer-related-result-type">,
	  Group<f_Group>;
	def fno_objc_infer_related_result_type
	  : Flag<["-"], "fno-objc-infer-related-result-type">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"do not infer Objective-C related result type based on method family">;
	def fobjc_link_runtime : Flag<["-"], "fobjc-link-runtime">, Group<f_Group>;
	def fobjc_weak : Flag<["-"], "fobjc-weak">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable ARC-style weak references in Objective-C">;

	// Objective-C ABI options: runtime selection and ABI version spellings.
	def fobjc_runtime_EQ : Joined<["-"], "fobjc-runtime=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Specify the target Objective-C runtime kind and version">;
	def fobjc_abi_version_EQ : Joined<["-"], "fobjc-abi-version=">,
	  Group<f_Group>;
	def fobjc_nonfragile_abi_version_EQ
	  : Joined<["-"], "fobjc-nonfragile-abi-version=">,
	  Group<f_Group>;
	def fobjc_nonfragile_abi : Flag<["-"], "fobjc-nonfragile-abi">,
	  Group<f_Group>;
	def fno_objc_nonfragile_abi : Flag<["-"], "fno-objc-nonfragile-abi">,
	  Group<f_Group>;

	// Frame pointer, OpenMP and sibling-call options.
	def fobjc_sender_dependent_dispatch
	  : Flag<["-"], "fobjc-sender-dependent-dispatch">,
	  Group<f_Group>;
	def fomit_frame_pointer : Flag<["-"], "fomit-frame-pointer">, Group<f_Group>;
	def fopenmp : Flag<["-"], "fopenmp">,
	  Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fno_openmp : Flag<["-"], "fno-openmp">,
	  Group<f_Group>, Flags<[NoArgumentUnused]>;
	def fopenmp_version_EQ : Joined<["-"], "fopenmp-version=">,
	  Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fopenmp_EQ : Joined<["-"], "fopenmp=">, Group<f_Group>;
	def fopenmp_use_tls : Flag<["-"], "fopenmp-use-tls">,
	  Group<f_Group>, Flags<[NoArgumentUnused]>;
	def fnoopenmp_use_tls : Flag<["-"], "fnoopenmp-use-tls">,
	  Group<f_Group>, Flags<[CC1Option, NoArgumentUnused]>;
	def fopenmp_targets_EQ : CommaJoined<["-"], "fopenmp-targets=">,
	  Flags<[DriverOption, CC1Option]>,
	  HelpText<"Specify comma-separated list of triples OpenMP offloading targets to be supported">;
	def fopenmp_dump_offload_linker_script
	  : Flag<["-"], "fopenmp-dump-offload-linker-script">,
	  Group<f_Group>, Flags<[NoArgumentUnused]>;
	def fno_optimize_sibling_calls : Flag<["-"], "fno-optimize-sibling-calls">,
	  Group<f_Group>;
	def foptimize_sibling_calls : Flag<["-"], "foptimize-sibling-calls">,
	  Group<f_Group>;
	// force_* flags, struct packing / alignment, and position-independence flags.
	def force__cpusubtype__ALL : Flag<["-"], "force_cpusubtype_ALL">;
	def force__flat__namespace : Flag<["-"], "force_flat_namespace">;
	def force__load : Separate<["-"], "force_load">;
	def force_addr : Joined<["-"], "fforce-addr">,
	  Group<clang_ignored_f_Group>;
	def foutput_class_dir_EQ : Joined<["-"], "foutput-class-dir=">,
	  Group<f_Group>;
	def fpack_struct : Flag<["-"], "fpack-struct">, Group<f_Group>;
	def fno_pack_struct : Flag<["-"], "fno-pack-struct">, Group<f_Group>;
	def fpack_struct_EQ : Joined<["-"], "fpack-struct=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Specify the default maximum struct packing alignment">;
	def fmax_type_align_EQ : Joined<["-"], "fmax-type-align=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Specify the maximum alignment to enforce on pointers lacking an explicit alignment">;
	def fno_max_type_align : Flag<["-"], "fno-max-type-align">, Group<f_Group>;
	def fpascal_strings : Flag<["-"], "fpascal-strings">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Recognize and construct Pascal-style string literals">;
	def fpcc_struct_return : Flag<["-"], "fpcc-struct-return">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Override the default ABI to return all structs on the stack">;
	def fpch_preprocess : Flag<["-"], "fpch-preprocess">, Group<f_Group>;
	def fpic : Flag<["-"], "fpic">, Group<f_Group>;
	def fno_pic : Flag<["-"], "fno-pic">, Group<f_Group>;
	def fpie : Flag<["-"], "fpie">, Group<f_Group>;
	def fno_pie : Flag<["-"], "fno-pie">, Group<f_Group>;
	def fropi : Flag<["-"], "fropi">, Group<f_Group>;
	def fno_ropi : Flag<["-"], "fno-ropi">, Group<f_Group>;
	def frwpi : Flag<["-"], "frwpi">, Group<f_Group>;
	def fno_rwpi : Flag<["-"], "fno-rwpi">, Group<f_Group>;
	// Plugin loading, profiling arcs, struct-return ABI and enum/wchar_t sizing.
	def fplugin_EQ : Joined<["-"], "fplugin=">,
	  Group<f_Group>, Flags<[DriverOption]>,
	  MetaVarName<"<dsopath>">,
	  HelpText<"Load the named plugin (dynamic shared object)">;
	def fpreserve_as_comments : Flag<["-"], "fpreserve-as-comments">,
	  Group<f_Group>;
	def fno_preserve_as_comments : Flag<["-"], "fno-preserve-as-comments">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not preserve comments in inline assembly">;
	def fprofile_arcs : Flag<["-"], "fprofile-arcs">, Group<f_Group>;
	def fno_profile_arcs : Flag<["-"], "fno-profile-arcs">, Group<f_Group>;
	def framework : Separate<["-"], "framework">, Flags<[LinkerInput]>;
	def frandom_seed_EQ : Joined<["-"], "frandom-seed=">,
	  Group<clang_ignored_f_Group>;
	def freg_struct_return : Flag<["-"], "freg-struct-return">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Override the default ABI to return small structs in registers">;
	def frtti : Flag<["-"], "frtti">, Group<f_Group>;
	def : Flag<["-"], "fsched-interblock">, Group<clang_ignored_f_Group>;
	def fshort_enums : Flag<["-"], "fshort-enums">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Allocate to an enum type only as many bytes as it needs for the declared range of possible values">;
	def fshort_wchar : Flag<["-"], "fshort-wchar">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Force wchar_t to be a short unsigned int">;
	def fno_short_wchar : Flag<["-"], "fno-short-wchar">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Force wchar_t to be an unsigned int">;
	// Accepted values are listed both in the help text ("best|all") and in
	// Values<>. The '|' is written literally: backslash-pipe is not a TableGen
	// string escape.
	def fshow_overloads_EQ : Joined<["-"], "fshow-overloads=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Which overload candidates to show when overload resolution fails: "
	           "best|all; defaults to all">,
	  Values<"best,all">;
	// Diagnostic display, char/bitfield signedness, stack protector and
	// debug-info size options.
	def fshow_column : Flag<["-"], "fshow-column">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fshow_source_location : Flag<["-"], "fshow-source-location">,
	  Group<f_Group>;
	def fspell_checking : Flag<["-"], "fspell-checking">, Group<f_Group>;
	def fspell_checking_limit_EQ : Joined<["-"], "fspell-checking-limit=">,
	  Group<f_Group>;
	def fsigned_bitfields : Flag<["-"], "fsigned-bitfields">, Group<f_Group>;
	def fsigned_char : Flag<["-"], "fsigned-char">, Group<f_Group>;
	def fno_signed_char : Flag<["-"], "fno-signed-char">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Char is unsigned">;
	def fsplit_stack : Flag<["-"], "fsplit-stack">, Group<f_Group>;
	def fstack_protector_all : Flag<["-"], "fstack-protector-all">,
	  Group<f_Group>,
	  HelpText<"Force the usage of stack protectors for all functions">;
	def fstack_protector_strong : Flag<["-"], "fstack-protector-strong">,
	  Group<f_Group>,
	  HelpText<"Use a strong heuristic to apply stack protectors to functions">;
	def fstack_protector : Flag<["-"], "fstack-protector">,
	  Group<f_Group>,
	  HelpText<"Enable stack protectors for functions potentially vulnerable to stack smashing">;
	def fstandalone_debug : Flag<["-"], "fstandalone-debug">,
	  Group<f_Group>, Flags<[CoreOption]>,
	  HelpText<"Emit full debug info for all types used by the program">;
	def fno_standalone_debug : Flag<["-"], "fno-standalone-debug">,
	  Group<f_Group>, Flags<[CoreOption]>,
	  HelpText<"Limit debug information produced to reduce size of debug binary">;
	// -f[no-]limit-debug-info are aliases with the opposite polarity of
	// -f[no-]standalone-debug.
	def flimit_debug_info : Flag<["-"], "flimit-debug-info">,
	  Flags<[CoreOption]>,
	  Alias<fno_standalone_debug>;
	def fno_limit_debug_info : Flag<["-"], "fno-limit-debug-info">,
	  Flags<[CoreOption]>,
	  Alias<fstandalone_debug>;
	def fdebug_macro : Flag<["-"], "fdebug-macro">,
	  Group<f_Group>, Flags<[CoreOption]>,
	  HelpText<"Emit macro debug information">;
	def fno_debug_macro : Flag<["-"], "fno-debug-macro">,
	  Group<f_Group>, Flags<[CoreOption]>,
	  HelpText<"Do not emit macro debug information">;
	// Strictness-based optimizations, -fsyntax-only, and template / tab limits.
	def fstrict_aliasing : Flag<["-"], "fstrict-aliasing">,
	  Group<f_Group>, Flags<[DriverOption, CoreOption]>;
	def fstrict_enums : Flag<["-"], "fstrict-enums">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable optimizations based on the strict definition of an enum's "
	           "value range">;
	def fstrict_vtable_pointers : Flag<["-"], "fstrict-vtable-pointers">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enable optimizations based on the strict rules for overwriting "
	           "polymorphic C++ objects">;
	def fstrict_overflow : Flag<["-"], "fstrict-overflow">, Group<f_Group>;
	def fsyntax_only : Flag<["-"], "fsyntax-only">,
	  Group<Action_Group>, Flags<[DriverOption, CoreOption, CC1Option]>;
	def ftabstop_EQ : Joined<["-"], "ftabstop=">, Group<f_Group>;
	def ftemplate_depth_EQ : Joined<["-"], "ftemplate-depth=">, Group<f_Group>;
	def ftemplate_depth_ : Joined<["-"], "ftemplate-depth-">, Group<f_Group>;
	def ftemplate_backtrace_limit_EQ
	  : Joined<["-"], "ftemplate-backtrace-limit=">,
	  Group<f_Group>;
	def foperator_arrow_depth_EQ : Joined<["-"], "foperator-arrow-depth=">,
	  Group<f_Group>;

	// YAML optimization record output.
	def fsave_optimization_record : Flag<["-"], "fsave-optimization-record">,
	  Group<f_Group>,
	  HelpText<"Generate a YAML optimization record file">;
	def fno_save_optimization_record
	  : Flag<["-"], "fno-save-optimization-record">,
	  Group<f_Group>, Flags<[NoArgumentUnused]>;
	def foptimization_record_file_EQ
	  : Joined<["-"], "foptimization-record-file=">,
	  Group<f_Group>,
	  HelpText<"Specify the file name of any generated YAML optimization record">;

	// Coverage, loop / SLP vectorization (with -ftree-* aliases) and
	// -Wlarge-by-value-copy.
	def ftest_coverage : Flag<["-"], "ftest-coverage">, Group<f_Group>;
	def fvectorize : Flag<["-"], "fvectorize">,
	  Group<f_Group>,
	  HelpText<"Enable the loop vectorization passes">;
	def fno_vectorize : Flag<["-"], "fno-vectorize">, Group<f_Group>;
	def : Flag<["-"], "ftree-vectorize">, Alias<fvectorize>;
	def : Flag<["-"], "fno-tree-vectorize">, Alias<fno_vectorize>;
	def fslp_vectorize : Flag<["-"], "fslp-vectorize">,
	  Group<f_Group>,
	  HelpText<"Enable the superword-level parallelism vectorization passes">;
	def fno_slp_vectorize : Flag<["-"], "fno-slp-vectorize">, Group<f_Group>;
	def : Flag<["-"], "ftree-slp-vectorize">, Alias<fslp_vectorize>;
	def : Flag<["-"], "fno-tree-slp-vectorize">, Alias<fno_slp_vectorize>;
	def Wlarge_by_value_copy_def : Flag<["-"], "Wlarge-by-value-copy">,
	  Flags<[HelpHidden]>,
	  HelpText<"Warn if a function definition returns or accepts an object larger "
	           "in bytes than a given value">;
	def Wlarge_by_value_copy_EQ : Joined<["-"], "Wlarge-by-value-copy=">,
	  Flags<[CC1Option]>;

	// These "special" warning flags are effectively processed as f_Group flags
	// by the driver. For now, -Wlarger-than is only placed in the ignored group
	// so that it does not produce warnings.
	def Wlarger_than_EQ : Joined<["-"], "Wlarger-than=">,
	  Group<clang_ignored_f_Group>;
	def Wlarger_than_ : Joined<["-"], "Wlarger-than-">,
	  Alias<Wlarger_than_EQ>;
	def Wframe_larger_than_EQ : Joined<["-"], "Wframe-larger-than=">,
	  Group<f_Group>, Flags<[DriverOption]>;

	// Thread-safe statics, TLS model, trapping, loop transforms, trigraphs,
	// signedness and default symbol visibility.
	def : Flag<["-"], "fterminated-vtables">, Alias<fapple_kext>;
	def fthreadsafe_statics : Flag<["-"], "fthreadsafe-statics">, Group<f_Group>;
	def ftime_report : Flag<["-"], "ftime-report">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def ftlsmodel_EQ : Joined<["-"], "ftls-model=">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def ftrapv : Flag<["-"], "ftrapv">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Trap on integer overflow">;
	def ftrapv_handler_EQ : Joined<["-"], "ftrapv-handler=">,
	  Group<f_Group>,
	  MetaVarName<"<function name>">,
	  HelpText<"Specify the function to be called on overflow">;
	def ftrapv_handler : Separate<["-"], "ftrapv-handler">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def ftrap_function_EQ : Joined<["-"], "ftrap-function=">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Issue call to specified function rather than a trap instruction">;
	def funit_at_a_time : Flag<["-"], "funit-at-a-time">, Group<f_Group>;
	def funroll_loops : Flag<["-"], "funroll-loops">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Turn on loop unroller">;
	def fno_unroll_loops : Flag<["-"], "fno-unroll-loops">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Turn off loop unroller">;
	def freroll_loops : Flag<["-"], "freroll-loops">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Turn on loop reroller">;
	def fno_reroll_loops : Flag<["-"], "fno-reroll-loops">,
	  Group<f_Group>,
	  HelpText<"Turn off loop reroller">;
	def ftrigraphs : Flag<["-"], "ftrigraphs">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Process trigraph sequences">;
	def fno_trigraphs : Flag<["-"], "fno-trigraphs">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Do not process trigraph sequences">;
	def funsigned_bitfields : Flag<["-"], "funsigned-bitfields">, Group<f_Group>;
	def funsigned_char : Flag<["-"], "funsigned-char">, Group<f_Group>;
	def fno_unsigned_char : Flag<["-"], "fno-unsigned-char">;
	def funwind_tables : Flag<["-"], "funwind-tables">, Group<f_Group>;
	def fuse_cxa_atexit : Flag<["-"], "fuse-cxa-atexit">, Group<f_Group>;
	def fuse_init_array : Flag<["-"], "fuse-init-array">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use .init_array instead of .ctors">;
	def fno_var_tracking : Flag<["-"], "fno-var-tracking">,
	  Group<clang_ignored_f_Group>;
	def fverbose_asm : Flag<["-"], "fverbose-asm">, Group<f_Group>;
	def fvisibility_EQ : Joined<["-"], "fvisibility=">,
	  Group<f_Group>,
	  HelpText<"Set the default symbol visibility for all global declarations">,
	  Values<"hidden,default">;
	// As the flag name says, inline C++ member functions get *hidden*
	// visibility; the help text states that explicitly.
	def fvisibility_inlines_hidden : Flag<["-"], "fvisibility-inlines-hidden">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Give inline C++ member functions hidden visibility by default">;
	// MS-compatible visibility, whole-program vtables, overflow wrapping,
	// writable strings and per-function / per-data sections.
	def fvisibility_ms_compat : Flag<["-"], "fvisibility-ms-compat">,
	  Group<f_Group>,
	  HelpText<"Give global types 'default' visibility and global functions and "
	           "variables 'hidden' visibility by default">;
	def fwhole_program_vtables : Flag<["-"], "fwhole-program-vtables">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Enables whole-program vtable optimization. Requires -flto">;
	def fno_whole_program_vtables : Flag<["-"], "fno-whole-program-vtables">,
	  Group<f_Group>;
	def fwrapv : Flag<["-"], "fwrapv">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Treat signed integer overflow as two's complement">;
	def fwritable_strings : Flag<["-"], "fwritable-strings">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Store string literals as writable data">;
	def fzero_initialized_in_bss : Flag<["-"], "fzero-initialized-in-bss">,
	  Group<f_Group>;
	def ffunction_sections : Flag<["-"], "ffunction-sections">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Place each function in its own section (ELF Only)">;
	def fno_function_sections : Flag<["-"], "fno-function-sections">,
	  Group<f_Group>, Flags<[CC1Option]>;
	def fdata_sections : Flag<["-"], "fdata-sections">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Place each data in its own section (ELF Only)">;
	def fno_data_sections : Flag<["-"], "fno-data-sections">,
	  Group<f_Group>, Flags<[CC1Option]>;

	// Unique section naming.
	def funique_section_names : Flag<["-"], "funique-section-names">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Use unique names for text and data sections (ELF Only)">;
	def fno_unique_section_names : Flag<["-"], "fno-unique-section-names">,
	  Group<f_Group>, Flags<[CC1Option]>;

	// Treatment of falling off the end of a non-void function.
	def fstrict_return : Flag<["-"], "fstrict-return">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Always treat control flow paths that fall off the end of a "
	           "non-void function as unreachable">;
	def fno_strict_return : Flag<["-"], "fno-strict-return">,
	  Group<f_Group>, Flags<[CC1Option]>;

	// Editor placeholders in source code.
	def fallow_editor_placeholders : Flag<["-"], "fallow-editor-placeholders">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Treat editor placeholders as valid source code">;
	def fno_allow_editor_placeholders
	  : Flag<["-"], "fno-allow-editor-placeholders">,
	  Group<f_Group>;

	// Separate section for debug types.
	def fdebug_types_section : Flag<["-"], "fdebug-types-section">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Place debug types in their own section (ELF Only)">;
	def fno_debug_types_section : Flag<["-"], "fno-debug-types-section">,
	  Group<f_Group>, Flags<[CC1Option]>;
	// Help text describes split-DWARF inlining itself rather than repeating the
	// -fdebug-types-section description.
	def fsplit_dwarf_inlining : Flag<["-"], "fsplit-dwarf-inlining">,
	  Group<f_Group>, Flags<[CC1Option]>,
	  HelpText<"Provide minimal debug info in the object/executable to facilitate "
	           "online symbolication/stack traces in the absence of .dwo/.dwp "
	           "files when using Split DWARF">;
	def fno_split_dwarf_inlining: Flag<["-"], "fno-split-dwarf-inlining">, Group<f_Group>,
	Flags<[CC1Option]>;
	// -fdebug-prefix-map=<value>: a Joined option (the value follows the '='
	// directly), in f_Group and exposed to cc1.
	def fdebug_prefix_map_EQ
	: Joined<["-"], "fdebug-prefix-map=">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"remap file source paths in debug info">;
	// Debug-information options: -g, the -gN / -ggdbN levels, debugger tuning
	// (-ggdb, -glldb, -gsce in gTune_Group), explicit DWARF versions and CodeView.
	// -gmlt and -g1 are aliases of -gline-tables-only.
	def g_Flag : Flag<["-"], "g">, Group<g_Group>,
	HelpText<"Generate source-level debug information">;
	def gline_tables_only : Flag<["-"], "gline-tables-only">, Group<gN_Group>,
	Flags<[CoreOption]>, HelpText<"Emit debug line number tables only">;
	def gmlt : Flag<["-"], "gmlt">, Alias<gline_tables_only>;
	def g0 : Flag<["-"], "g0">, Group<gN_Group>;
	def g1 : Flag<["-"], "g1">, Group<gN_Group>, Alias<gline_tables_only>;
	def g2 : Flag<["-"], "g2">, Group<gN_Group>;
	def g3 : Flag<["-"], "g3">, Group<gN_Group>;
	def ggdb : Flag<["-"], "ggdb">, Group<gTune_Group>;
	def ggdb0 : Flag<["-"], "ggdb0">, Group<ggdbN_Group>;
	def ggdb1 : Flag<["-"], "ggdb1">, Group<ggdbN_Group>;
	def ggdb2 : Flag<["-"], "ggdb2">, Group<ggdbN_Group>;
	def ggdb3 : Flag<["-"], "ggdb3">, Group<ggdbN_Group>;
	def glldb : Flag<["-"], "glldb">, Group<gTune_Group>;
	def gsce : Flag<["-"], "gsce">, Group<gTune_Group>;
	def gdwarf_2 : Flag<["-"], "gdwarf-2">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 2">;
	def gdwarf_3 : Flag<["-"], "gdwarf-3">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 3">;
	def gdwarf_4 : Flag<["-"], "gdwarf-4">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 4">;
	def gdwarf_5 : Flag<["-"], "gdwarf-5">, Group<g_Group>,
	HelpText<"Generate source-level debug information with dwarf version 5">;
	// -gcodeview has no Group; it is visible to cc1, cc1as and as a core option.
	def gcodeview : Flag<["-"], "gcodeview">,
	HelpText<"Generate CodeView debug information">,
	Flags<[CC1Option, CC1AsOption, CoreOption]>;
	// Equivalent to our default dwarf version. Forces usual dwarf emission when
	// CodeView is enabled.
	def gdwarf : Flag<["-"], "gdwarf">, Alias<gdwarf_4>, Flags<[CoreOption]>;

	// Remaining -g options. -gstabs, -gcoff, -gxcoff, -gvms (Joined) and
	// -gtoggle are flagged Unsupported. Most of the rest are modifier flags in
	// g_flags_Group; -gmodules is in gN_Group.
	def gfull : Flag<["-"], "gfull">, Group<g_Group>;
	def gused : Flag<["-"], "gused">, Group<g_Group>;
	def gstabs : Joined<["-"], "gstabs">, Group<g_Group>, Flags<[Unsupported]>;
	def gcoff : Joined<["-"], "gcoff">, Group<g_Group>, Flags<[Unsupported]>;
	def gxcoff : Joined<["-"], "gxcoff">, Group<g_Group>, Flags<[Unsupported]>;
	def gvms : Joined<["-"], "gvms">, Group<g_Group>, Flags<[Unsupported]>;
	def gtoggle : Flag<["-"], "gtoggle">, Group<g_flags_Group>, Flags<[Unsupported]>;
	def grecord_gcc_switches : Flag<["-"], "grecord-gcc-switches">, Group<g_flags_Group>;
	def gno_record_gcc_switches : Flag<["-"], "gno-record-gcc-switches">,
	Group<g_flags_Group>;
	def gstrict_dwarf : Flag<["-"], "gstrict-dwarf">, Group<g_flags_Group>;
	def gno_strict_dwarf : Flag<["-"], "gno-strict-dwarf">, Group<g_flags_Group>;
	def gcolumn_info : Flag<["-"], "gcolumn-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
	def gno_column_info : Flag<["-"], "gno-column-info">, Group<g_flags_Group>, Flags<[CoreOption]>;
	def gsplit_dwarf : Flag<["-"], "gsplit-dwarf">, Group<g_flags_Group>;
	def ggnu_pubnames : Flag<["-"], "ggnu-pubnames">, Group<g_flags_Group>;
	def gdwarf_aranges : Flag<["-"], "gdwarf-aranges">, Group<g_flags_Group>;
	def gmodules : Flag <["-"], "gmodules">, Group<gN_Group>,
	HelpText<"Generate debug info with external references to clang modules"
	" or precompiled headers">;
	// -gz (bare flag) and -gz=<value> (Joined) share the same help text.
	def gz : Flag<["-"], "gz">, Group<g_flags_Group>,
	HelpText<"DWARF debug sections compression type">;
	def gz_EQ : Joined<["-"], "gz=">, Group<g_flags_Group>,
	HelpText<"DWARF debug sections compression type">;
	// Include-search-path and precompiled-header options (mostly clang_i_Group),
	// interleaved alphabetically with a few ungrouped options and the -l /
	// lazy_* linker inputs. In def names a double underscore stands for a single
	// underscore in the spelled option (e.g. install__name -> "install_name").
	def headerpad__max__install__names : Joined<["-"], "headerpad_max_install_names">;
	def help : Flag<["-", "--"], "help">, Flags<[CC1Option,CC1AsOption]>,
	HelpText<"Display available options">;
	def index_header_map : Flag<["-"], "index-header-map">, Flags<[CC1Option]>,
	HelpText<"Make the next included directory (-I or -F) an indexer header map">;
	def idirafter : JoinedOrSeparate<["-"], "idirafter">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to AFTER include search path">;
	def iframework : JoinedOrSeparate<["-"], "iframework">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to SYSTEM framework search path">;
	def iframeworkwithsysroot : JoinedOrSeparate<["-"], "iframeworkwithsysroot">,
	Group<clang_i_Group>,
	HelpText<"Add directory to SYSTEM framework search path, "
	"absolute paths are relative to -isysroot">,
	MetaVarName<"<directory>">, Flags<[CC1Option]>;
	def imacros : JoinedOrSeparate<["-", "--"], "imacros">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Include macros from file before parsing">, MetaVarName<"<file>">;
	def image__base : Separate<["-"], "image_base">;
	// The def is named include_ (trailing underscore) with an explicit
	// EnumName<"include">; the spelled option is -include / --include.
	def include_ : JoinedOrSeparate<["-", "--"], "include">, Group<clang_i_Group>, EnumName<"include">,
	MetaVarName<"<file>">, HelpText<"Include file before parsing">, Flags<[CC1Option]>;
	def include_pch : Separate<["-"], "include-pch">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Include precompiled header file">, MetaVarName<"<file>">;
	def relocatable_pch : Flag<["-", "--"], "relocatable-pch">, Flags<[CC1Option]>,
	HelpText<"Whether to build a relocatable precompiled header">;
	def verify_pch : Flag<["-"], "verify-pch">, Group<Action_Group>, Flags<[CC1Option]>,
	HelpText<"Load and verify that a pre-compiled header file is not stale">;
	def init : Separate<["-"], "init">;
	def install__name : Separate<["-"], "install_name">;
	def iprefix : JoinedOrSeparate<["-"], "iprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set the -iwithprefix/-iwithprefixbefore prefix">, MetaVarName<"<dir>">;
	def iquote : JoinedOrSeparate<["-"], "iquote">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Add directory to QUOTE include search path">, MetaVarName<"<directory>">;
	def isysroot : JoinedOrSeparate<["-"], "isysroot">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set the system root directory (usually /)">, MetaVarName<"<dir>">;
	def isystem : JoinedOrSeparate<["-"], "isystem">, Group<clang_i_Group>,
	Flags<[CC1Option]>,
	HelpText<"Add directory to SYSTEM include search path">, MetaVarName<"<directory>">;
	// Unlike its neighbours, -isystem-after is a DriverOption, not a CC1Option.
	def isystem_after : JoinedOrSeparate<["-"], "isystem-after">,
	Group<clang_i_Group>, Flags<[DriverOption]>, MetaVarName<"<directory>">,
	HelpText<"Add directory to end of the SYSTEM include search path">;
	def iwithprefixbefore : JoinedOrSeparate<["-"], "iwithprefixbefore">, Group<clang_i_Group>,
	HelpText<"Set directory to include search path with prefix">, MetaVarName<"<dir>">,
	Flags<[CC1Option]>;
	def iwithprefix : JoinedOrSeparate<["-"], "iwithprefix">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Set directory to SYSTEM include search path with prefix">, MetaVarName<"<dir>">;
	def iwithsysroot : JoinedOrSeparate<["-"], "iwithsysroot">, Group<clang_i_Group>,
	HelpText<"Add directory to SYSTEM include search path, "
	"absolute paths are relative to -isysroot">, MetaVarName<"<directory>">,
	Flags<[CC1Option]>;
	def ivfsoverlay : JoinedOrSeparate<["-"], "ivfsoverlay">, Group<clang_i_Group>, Flags<[CC1Option]>,
	HelpText<"Overlay the virtual filesystem described by file over the real file system">;
	// Generic Joined "-i" entry in i_Group (distinct from clang_i_Group above).
	def i : Joined<["-"], "i">, Group<i_Group>;
	def keep__private__externs : Flag<["-"], "keep_private_externs">;
	// Library / framework inputs: all three are flagged LinkerInput.
	def l : JoinedOrSeparate<["-"], "l">, Flags<[LinkerInput, RenderJoined]>,
	Group<Link_Group>;
	def lazy__framework : Separate<["-"], "lazy_framework">, Flags<[LinkerInput]>;
	def lazy__library : Separate<["-"], "lazy_library">, Flags<[LinkerInput]>;
	// General machine (-m) options: endianness, word-size selection, ABI/CPU/FPU
	// selection, OS deployment-target versions, and assorted code-generation
	// switches. Entries in clang_ignored_m_Group have no help text here.
	// NOTE(review): the group name suggests those are accepted but ignored —
	// confirm against the group definition.
	// -EL and -EB are aliases of -mlittle-endian and -mbig-endian.
	def mlittle_endian : Flag<["-"], "mlittle-endian">, Flags<[DriverOption]>;
	def EL : Flag<["-"], "EL">, Alias<mlittle_endian>;
	def mbig_endian : Flag<["-"], "mbig-endian">, Flags<[DriverOption]>;
	def EB : Flag<["-"], "EB">, Alias<mbig_endian>;
	def m16 : Flag<["-"], "m16">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def m32 : Flag<["-"], "m32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mqdsp6_compat : Flag<["-"], "mqdsp6-compat">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
	HelpText<"Enable hexagon-qdsp6 backward compatibility">;
	def m3dnowa : Flag<["-"], "m3dnowa">, Group<m_x86_Features_Group>;
	def m3dnow : Flag<["-"], "m3dnow">, Group<m_x86_Features_Group>;
	def m64 : Flag<["-"], "m64">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mx32 : Flag<["-"], "mx32">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def mabi_EQ : Joined<["-"], "mabi=">, Group<m_Group>;
	def miamcu : Flag<["-"], "miamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>,
	HelpText<"Use Intel MCU ABI">;
	def mno_iamcu : Flag<["-"], "mno-iamcu">, Group<m_Group>, Flags<[DriverOption, CoreOption]>;
	def malign_functions_EQ : Joined<["-"], "malign-functions=">, Group<clang_ignored_m_Group>;
	def malign_loops_EQ : Joined<["-"], "malign-loops=">, Group<clang_ignored_m_Group>;
	def malign_jumps_EQ : Joined<["-"], "malign-jumps=">, Group<clang_ignored_m_Group>;
	def mfancy_math_387 : Flag<["-"], "mfancy-math-387">, Group<clang_ignored_m_Group>;
	def mlong_calls : Flag<["-"], "mlong-calls">, Group<m_Group>,
	HelpText<"Generate branches with extended addressability, usually via indirect jumps.">;
	def mno_long_calls : Flag<["-"], "mno-long-calls">, Group<m_Group>,
	HelpText<"Restore the default behaviour of not generating long calls">;
	def mexecute_only : Flag<["-"], "mexecute-only">, Group<m_arm_Features_Group>,
	HelpText<"Disallow generation of data access to code sections (ARM only)">;
	def mno_execute_only : Flag<["-"], "mno-execute-only">, Group<m_arm_Features_Group>,
	HelpText<"Allow generation of data access to code sections (ARM only)">;
	def mpure_code : Flag<["-"], "mpure-code">, Alias<mexecute_only>; // Alias for GCC compatibility
	def mno_pure_code : Flag<["-"], "mno-pure-code">, Alias<mno_execute_only>;
	// tvOS / watchOS deployment-target options. The "appletv*" and
	// "watchsimulator" spellings are aliases of the tvos / watchos-simulator ones.
	def mtvos_version_min_EQ : Joined<["-"], "mtvos-version-min=">, Group<m_Group>;
	def mappletvos_version_min_EQ : Joined<["-"], "mappletvos-version-min=">, Alias<mtvos_version_min_EQ>;
	def mtvos_simulator_version_min_EQ : Joined<["-"], "mtvos-simulator-version-min=">;
	def mappletvsimulator_version_min_EQ : Joined<["-"], "mappletvsimulator-version-min=">, Alias<mtvos_simulator_version_min_EQ>;
	def mwatchos_version_min_EQ : Joined<["-"], "mwatchos-version-min=">, Group<m_Group>;
	def mwatchos_simulator_version_min_EQ : Joined<["-"], "mwatchos-simulator-version-min=">;
	def mwatchsimulator_version_min_EQ : Joined<["-"], "mwatchsimulator-version-min=">, Alias<mwatchos_simulator_version_min_EQ>;
	def march_EQ : Joined<["-"], "march=">, Group<m_Group>;
	def masm_EQ : Joined<["-"], "masm=">, Group<m_Group>, Flags<[DriverOption]>;
	def mcmodel_EQ : Joined<["-"], "mcmodel=">, Group<m_Group>;
	def mimplicit_it_EQ : Joined<["-"], "mimplicit-it=">, Group<m_Group>;
	// NOTE(review): the following value-less options are declared Joined rather
	// than Flag — confirm this is intentional before changing.
	def mdefault_build_attributes : Joined<["-"], "mdefault-build-attributes">, Group<m_Group>;
	def mno_default_build_attributes : Joined<["-"], "mno-default-build-attributes">, Group<m_Group>;
	def mconstant_cfstrings : Flag<["-"], "mconstant-cfstrings">, Group<clang_ignored_m_Group>;
	def mconsole : Joined<["-"], "mconsole">, Group<m_Group>, Flags<[DriverOption]>;
	def mwindows : Joined<["-"], "mwindows">, Group<m_Group>, Flags<[DriverOption]>;
	def mdll : Joined<["-"], "mdll">, Group<m_Group>, Flags<[DriverOption]>;
	def municode : Joined<["-"], "municode">, Group<m_Group>, Flags<[DriverOption]>;
	def mthreads : Joined<["-"], "mthreads">, Group<m_Group>, Flags<[DriverOption]>;
	def mcpu_EQ : Joined<["-"], "mcpu=">, Group<m_Group>;
	def mmcu_EQ : Joined<["-"], "mmcu=">, Group<m_Group>;
	def mdynamic_no_pic : Joined<["-"], "mdynamic-no-pic">, Group<m_Group>;
	def mfix_and_continue : Flag<["-"], "mfix-and-continue">, Group<clang_ignored_m_Group>;
	def mieee_fp : Flag<["-"], "mieee-fp">, Group<clang_ignored_m_Group>;
	def minline_all_stringops : Flag<["-"], "minline-all-stringops">, Group<clang_ignored_m_Group>;
	def mno_inline_all_stringops : Flag<["-"], "mno-inline-all-stringops">, Group<clang_ignored_m_Group>;
	def malign_double : Flag<["-"], "malign-double">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Align doubles to two words in structs (x86 only)">;
	// -mfloat-abi= lists its accepted values explicitly via Values<>.
	def mfloat_abi_EQ : Joined<["-"], "mfloat-abi=">, Group<m_Group>, Values<"soft,softfp,hard">;
	def mfpmath_EQ : Joined<["-"], "mfpmath=">, Group<m_Group>;
	def mfpu_EQ : Joined<["-"], "mfpu=">, Group<m_Group>;
	def mhwdiv_EQ : Joined<["-"], "mhwdiv=">, Group<m_Group>;
	def mglobal_merge : Flag<["-"], "mglobal-merge">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Enable merging of globals">;
	def mhard_float : Flag<["-"], "mhard-float">, Group<m_Group>;
	// iOS deployment target: -mios-version-min= is an alias of
	// -miphoneos-version-min=, and the iphonesimulator spelling is an alias of
	// -mios-simulator-version-min=.
	def miphoneos_version_min_EQ : Joined<["-"], "miphoneos-version-min=">, Group<m_Group>;
	def mios_version_min_EQ : Joined<["-"], "mios-version-min=">,
	Alias<miphoneos_version_min_EQ>, HelpText<"Set iOS deployment target">;
	def mios_simulator_version_min_EQ : Joined<["-"], "mios-simulator-version-min=">;
	def miphonesimulator_version_min_EQ : Joined<["-"], "miphonesimulator-version-min=">, Alias<mios_simulator_version_min_EQ>;
	def mkernel : Flag<["-"], "mkernel">, Group<m_Group>;
	def mlinker_version_EQ : Joined<["-"], "mlinker-version=">,
	Flags<[DriverOption]>;
	def mllvm : Separate<["-"], "mllvm">, Flags<[CC1Option,CC1AsOption,CoreOption]>,
	HelpText<"Additional arguments to forward to LLVM's option processing">;
	// -mmacos-version-min= is an alias of -mmacosx-version-min=.
	def mmacosx_version_min_EQ : Joined<["-"], "mmacosx-version-min=">,
	Group<m_Group>, HelpText<"Set Mac OS X deployment target">;
	def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">,
	Group<m_Group>, Alias<mmacosx_version_min_EQ>;
	def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the default structure layout to be compatible with the Microsoft compiler standard">;
	def mno_ms_bitfields : Flag<["-"], "mno-ms-bitfields">, Group<m_Group>,
	HelpText<"Do not set the default structure layout to be compatible with the Microsoft compiler standard">;
	def mstackrealign : Flag<["-"], "mstackrealign">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Force realign the stack at entry to every function">;
	def mstack_alignment : Joined<["-"], "mstack-alignment=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the stack alignment">;
	def mstack_probe_size : Joined<["-"], "mstack-probe-size=">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set the stack probe size">;
	// -mthread-model and -meabi take their value as a separate argument and
	// enumerate the accepted values via Values<>.
	def mthread_model : Separate<["-"], "mthread-model">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"The thread model to use, e.g. posix, single (posix by default)">, Values<"posix,single">;
	def meabi : Separate<["-"], "meabi">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Set EABI type, e.g. 4, 5 or gnu (default depends on triple)">, Values<"default,4,5,gnu">;

	// -mmmx followed by the negative (-mno-*) forms of machine options. Most
	// are x86 feature toggles in m_x86_Features_Group; a handful of generic
	// -mno-* flags in m_Group are interleaved. -mno-pascal-strings,
	// -mno-80387 and -mno-sse4 are aliases.
	def mmmx : Flag<["-"], "mmmx">, Group<m_x86_Features_Group>;
	def mno_3dnowa : Flag<["-"], "mno-3dnowa">, Group<m_x86_Features_Group>;
	def mno_3dnow : Flag<["-"], "mno-3dnow">, Group<m_x86_Features_Group>;
	def mno_constant_cfstrings : Flag<["-"], "mno-constant-cfstrings">, Group<m_Group>;
	def mno_global_merge : Flag<["-"], "mno-global-merge">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Disable merging of globals">;
	def mno_mmx : Flag<["-"], "mno-mmx">, Group<m_x86_Features_Group>;
	def mno_pascal_strings : Flag<["-"], "mno-pascal-strings">,
	Alias<fno_pascal_strings>;
	def mno_red_zone : Flag<["-"], "mno-red-zone">, Group<m_Group>;
	def mno_relax_all : Flag<["-"], "mno-relax-all">, Group<m_Group>;
	def mno_rtd: Flag<["-"], "mno-rtd">, Group<m_Group>;
	def mno_soft_float : Flag<["-"], "mno-soft-float">, Group<m_Group>;
	def mno_stackrealign : Flag<["-"], "mno-stackrealign">, Group<m_Group>;
	def mno_x87 : Flag<["-"], "mno-x87">, Group<m_x86_Features_Group>;
	def mno_80387 : Flag<["-"], "mno-80387">, Alias<mno_x87>;
	def mno_sse2 : Flag<["-"], "mno-sse2">, Group<m_x86_Features_Group>;
	def mno_sse3 : Flag<["-"], "mno-sse3">, Group<m_x86_Features_Group>;
	def mno_sse4a : Flag<["-"], "mno-sse4a">, Group<m_x86_Features_Group>;
	def mno_sse4_1 : Flag<["-"], "mno-sse4.1">, Group<m_x86_Features_Group>;
	def mno_sse4_2 : Flag<["-"], "mno-sse4.2">, Group<m_x86_Features_Group>;
	// -mno-sse4 turns off sse4.1 which has the effect of turning off everything
	// later than 4.1. -msse4 turns on 4.2 which has the effect of turning on
	// everything earlier than 4.2.
	def mno_sse4 : Flag<["-"], "mno-sse4">, Alias<mno_sse4_1>;
	def mno_sse : Flag<["-"], "mno-sse">, Group<m_x86_Features_Group>;
	def mno_ssse3 : Flag<["-"], "mno-ssse3">, Group<m_x86_Features_Group>;
	def mno_aes : Flag<["-"], "mno-aes">, Group<m_x86_Features_Group>;
	def mno_avx : Flag<["-"], "mno-avx">, Group<m_x86_Features_Group>;
	def mno_avx2 : Flag<["-"], "mno-avx2">, Group<m_x86_Features_Group>;
	def mno_avx512f : Flag<["-"], "mno-avx512f">, Group<m_x86_Features_Group>;
	def mno_avx512cd : Flag<["-"], "mno-avx512cd">, Group<m_x86_Features_Group>;
	def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
	def mno_avx512er : Flag<["-"], "mno-avx512er">, Group<m_x86_Features_Group>;
	def mno_avx512pf : Flag<["-"], "mno-avx512pf">, Group<m_x86_Features_Group>;
	def mno_avx512dq : Flag<["-"], "mno-avx512dq">, Group<m_x86_Features_Group>;
	def mno_avx512bw : Flag<["-"], "mno-avx512bw">, Group<m_x86_Features_Group>;
	def mno_avx512vl : Flag<["-"], "mno-avx512vl">, Group<m_x86_Features_Group>;
	def mno_avx512vbmi : Flag<["-"], "mno-avx512vbmi">, Group<m_x86_Features_Group>;
	def mno_avx512ifma : Flag<["-"], "mno-avx512ifma">, Group<m_x86_Features_Group>;
	def mno_pclmul : Flag<["-"], "mno-pclmul">, Group<m_x86_Features_Group>;
	def mno_lzcnt : Flag<["-"], "mno-lzcnt">, Group<m_x86_Features_Group>;
	def mno_rdrnd : Flag<["-"], "mno-rdrnd">, Group<m_x86_Features_Group>;
	def mno_fsgsbase : Flag<["-"], "mno-fsgsbase">, Group<m_x86_Features_Group>;
	def mno_bmi : Flag<["-"], "mno-bmi">, Group<m_x86_Features_Group>;
	def mno_bmi2 : Flag<["-"], "mno-bmi2">, Group<m_x86_Features_Group>;
	def mno_popcnt : Flag<["-"], "mno-popcnt">, Group<m_x86_Features_Group>;
	def mno_tbm : Flag<["-"], "mno-tbm">, Group<m_x86_Features_Group>;
	def mno_lwp : Flag<["-"], "mno-lwp">, Group<m_x86_Features_Group>;
	def mno_fma4 : Flag<["-"], "mno-fma4">, Group<m_x86_Features_Group>;
	def mno_fma : Flag<["-"], "mno-fma">, Group<m_x86_Features_Group>;
	def mno_xop : Flag<["-"], "mno-xop">, Group<m_x86_Features_Group>;
	def mno_f16c : Flag<["-"], "mno-f16c">, Group<m_x86_Features_Group>;
	def mno_rtm : Flag<["-"], "mno-rtm">, Group<m_x86_Features_Group>;
	def mno_prfchw : Flag<["-"], "mno-prfchw">, Group<m_x86_Features_Group>;
	def mno_rdseed : Flag<["-"], "mno-rdseed">, Group<m_x86_Features_Group>;
	def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
	def mno_sha : Flag<["-"], "mno-sha">, Group<m_x86_Features_Group>;
	def mno_cx16 : Flag<["-"], "mno-cx16">, Group<m_x86_Features_Group>;
	def mno_fxsr : Flag<["-"], "mno-fxsr">, Group<m_x86_Features_Group>;
	def mno_xsave : Flag<["-"], "mno-xsave">, Group<m_x86_Features_Group>;
	def mno_xsaveopt : Flag<["-"], "mno-xsaveopt">, Group<m_x86_Features_Group>;
	def mno_xsavec : Flag<["-"], "mno-xsavec">, Group<m_x86_Features_Group>;
	def mno_xsaves : Flag<["-"], "mno-xsaves">, Group<m_x86_Features_Group>;
	def mno_mwaitx : Flag<["-"], "mno-mwaitx">, Group<m_x86_Features_Group>;
	def mno_clzero : Flag<["-"], "mno-clzero">, Group<m_x86_Features_Group>;
	def mno_pku : Flag<["-"], "mno-pku">, Group<m_x86_Features_Group>;
	def mno_clflushopt : Flag<["-"], "mno-clflushopt">, Group<m_x86_Features_Group>;
	def mno_clwb : Flag<["-"], "mno-clwb">, Group<m_x86_Features_Group>;
	def mno_movbe : Flag<["-"], "mno-movbe">, Group<m_x86_Features_Group>;
	def mno_mpx : Flag<["-"], "mno-mpx">, Group<m_x86_Features_Group>;
	def mno_sgx : Flag<["-"], "mno-sgx">, Group<m_x86_Features_Group>;
	def mno_prefetchwt1 : Flag<["-"], "mno-prefetchwt1">, Group<m_x86_Features_Group>;

	// ARM feature options (m_arm_Features_Group). -mstrict-align is a hidden cc1
	// alias of -mno-unaligned-access, and -marm is an alias of -mno-thumb.
	// -ffixed-r9 lives in this group despite its -f prefix.
	def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_arm_Features_Group>,
	HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64 only)">;
	def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_arm_Features_Group>,
	HelpText<"Force all memory accesses to be aligned (AArch32/AArch64 only)">;
	def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>, Flags<[CC1Option,HelpHidden]>,
	HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
	def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
	def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
	HelpText<"Disallow generation of deprecated IT blocks for ARMv8. It is on by default for ARMv8 Thumb mode.">;
	def mno_restrict_it: Flag<["-"], "mno-restrict-it">, Group<m_arm_Features_Group>,
	HelpText<"Allow generation of deprecated IT blocks for ARMv8. It is off by default for ARMv8 Thumb mode">;
	def marm : Flag<["-"], "marm">, Alias<mno_thumb>;
	def ffixed_r9 : Flag<["-"], "ffixed-r9">, Group<m_arm_Features_Group>,
	HelpText<"Reserve the r9 register (ARM only)">;
	def mno_movt : Flag<["-"], "mno-movt">, Group<m_arm_Features_Group>,
	HelpText<"Disallow use of movt/movw pairs (ARM only)">;
	// Note the spelling: the negative CRC flag is "-mnocrc", without a hyphen
	// after "no".
	def mcrc : Flag<["-"], "mcrc">, Group<m_arm_Features_Group>,
	HelpText<"Allow use of CRC instructions (ARM only)">;
	def mnocrc : Flag<["-"], "mnocrc">, Group<m_arm_Features_Group>,
	HelpText<"Disallow use of CRC instructions (ARM only)">;
	def mno_neg_immediates: Flag<["-"], "mno-neg-immediates">, Group<m_arm_Features_Group>,
	HelpText<"Disallow converting instructions with negative immediates to their negation or inversion.">;

	// AArch64 feature options (m_aarch64_Features_Group): general-register-only
	// code generation, the Cortex-A53 erratum 835769 workaround toggle, and
	// reserving x18.
	def mgeneral_regs_only : Flag<["-"], "mgeneral-regs-only">, Group<m_aarch64_Features_Group>,
	HelpText<"Generate code which only uses the general purpose registers (AArch64 only)">;

	def mfix_cortex_a53_835769 : Flag<["-"], "mfix-cortex-a53-835769">,
	Group<m_aarch64_Features_Group>,
	HelpText<"Workaround Cortex-A53 erratum 835769 (AArch64 only)">;
	def mno_fix_cortex_a53_835769 : Flag<["-"], "mno-fix-cortex-a53-835769">,
	Group<m_aarch64_Features_Group>,
	HelpText<"Don't workaround Cortex-A53 erratum 835769 (AArch64 only)">;
	def ffixed_x18 : Flag<["-"], "ffixed-x18">, Group<m_aarch64_Features_Group>,
	HelpText<"Reserve the x18 register (AArch64 only)">;

	// -msimd128 / -mno-simd128: the only entries here in m_wasm_Features_Group.
	def msimd128 : Flag<["-"], "msimd128">, Group<m_wasm_Features_Group>;
	def mno_simd128 : Flag<["-"], "mno-simd128">, Group<m_wasm_Features_Group>;

	// -mamdgpu-debugger-abi=<version>: Joined option in m_Group. It is flagged
	// HelpHidden, although it still carries help text and a meta-variable name.
	def mamdgpu_debugger_abi : Joined<["-"], "mamdgpu-debugger-abi=">,
	Flags<[HelpHidden]>,
	Group<m_Group>,
	HelpText<"Generate additional code for specified <version> of debugger ABI (AMDGPU only)">,
	MetaVarName<"<version>">;

	// PowerPC feature options. -faltivec / -fno-altivec are driver-only f_Group
	// flags; everything else is an on/off pair in m_ppc_Features_Group.
	// Some def names differ from the spelled option: mpower8_crypto is
	// "-mcrypto", mnopower8_crypto is "-mno-crypto", mnodirect_move is
	// "-mno-direct-move". -mmfcrf / -mno-mfcrf alias the "mfocrf" spellings.
	def faltivec : Flag<["-"], "faltivec">, Group<f_Group>, Flags<[DriverOption]>;
	def fno_altivec : Flag<["-"], "fno-altivec">, Group<f_Group>, Flags<[DriverOption]>;
	def maltivec : Flag<["-"], "maltivec">, Group<m_ppc_Features_Group>;
	def mno_altivec : Flag<["-"], "mno-altivec">, Group<m_ppc_Features_Group>;
	def mvsx : Flag<["-"], "mvsx">, Group<m_ppc_Features_Group>;
	def mno_vsx : Flag<["-"], "mno-vsx">, Group<m_ppc_Features_Group>;
	def mpower8_vector : Flag<["-"], "mpower8-vector">,
	Group<m_ppc_Features_Group>;
	def mno_power8_vector : Flag<["-"], "mno-power8-vector">,
	Group<m_ppc_Features_Group>;
	def mpower9_vector : Flag<["-"], "mpower9-vector">,
	Group<m_ppc_Features_Group>;
	def mno_power9_vector : Flag<["-"], "mno-power9-vector">,
	Group<m_ppc_Features_Group>;
	def mpower8_crypto : Flag<["-"], "mcrypto">,
	Group<m_ppc_Features_Group>;
	def mnopower8_crypto : Flag<["-"], "mno-crypto">,
	Group<m_ppc_Features_Group>;
	def mdirect_move : Flag<["-"], "mdirect-move">,
	Group<m_ppc_Features_Group>;
	def mnodirect_move : Flag<["-"], "mno-direct-move">,
	Group<m_ppc_Features_Group>;
	def mhtm : Flag<["-"], "mhtm">, Group<m_ppc_Features_Group>;
	def mno_htm : Flag<["-"], "mno-htm">, Group<m_ppc_Features_Group>;
	def mfprnd : Flag<["-"], "mfprnd">, Group<m_ppc_Features_Group>;
	def mno_fprnd : Flag<["-"], "mno-fprnd">, Group<m_ppc_Features_Group>;
	def mcmpb : Flag<["-"], "mcmpb">, Group<m_ppc_Features_Group>;
	def mno_cmpb : Flag<["-"], "mno-cmpb">, Group<m_ppc_Features_Group>;
	def misel : Flag<["-"], "misel">, Group<m_ppc_Features_Group>;
	def mno_isel : Flag<["-"], "mno-isel">, Group<m_ppc_Features_Group>;
	def mmfocrf : Flag<["-"], "mmfocrf">, Group<m_ppc_Features_Group>;
	def mmfcrf : Flag<["-"], "mmfcrf">, Alias<mmfocrf>;
	def mno_mfocrf : Flag<["-"], "mno-mfocrf">, Group<m_ppc_Features_Group>;
	def mno_mfcrf : Flag<["-"], "mno-mfcrf">, Alias<mno_mfocrf>;
	def mpopcntd : Flag<["-"], "mpopcntd">, Group<m_ppc_Features_Group>;
	def mno_popcntd : Flag<["-"], "mno-popcntd">, Group<m_ppc_Features_Group>;
	def mqpx : Flag<["-"], "mqpx">, Group<m_ppc_Features_Group>;
	def mno_qpx : Flag<["-"], "mno-qpx">, Group<m_ppc_Features_Group>;
	def mcrbits : Flag<["-"], "mcrbits">, Group<m_ppc_Features_Group>;
	def mno_crbits : Flag<["-"], "mno-crbits">, Group<m_ppc_Features_Group>;
	def minvariant_function_descriptors :
	Flag<["-"], "minvariant-function-descriptors">, Group<m_ppc_Features_Group>;
	def mno_invariant_function_descriptors :
	Flag<["-"], "mno-invariant-function-descriptors">,
	Group<m_ppc_Features_Group>;
	def mfloat128: Flag<["-"], "mfloat128">,
	Group<m_ppc_Features_Group>;
	def mno_float128 : Flag<["-"], "mno-float128">,
	Group<m_ppc_Features_Group>;
	def mlongcall: Flag<["-"], "mlongcall">,
	Group<m_ppc_Features_Group>;
	def mno_longcall : Flag<["-"], "mno-longcall">,
	Group<m_ppc_Features_Group>;

	// -mvx / -mno-vx (plain m_Group, no help text), the System z vector language
	// extension flags, and the System Z backchain flags.
	// NOTE(review): -mvx is presumably the System z vector facility toggle —
	// nothing in this definition states the target; confirm.
	def mvx : Flag<["-"], "mvx">, Group<m_Group>;
	def mno_vx : Flag<["-"], "mno-vx">, Group<m_Group>;

	// -mzvector / -mno-zvector are aliases of the -f spellings.
	def fzvector : Flag<["-"], "fzvector">, Group<f_Group>, Flags<[CC1Option]>,
	HelpText<"Enable System z vector language extension">;
	def fno_zvector : Flag<["-"], "fno-zvector">, Group<f_Group>,
	Flags<[CC1Option]>;
	def mzvector : Flag<["-"], "mzvector">, Alias<fzvector>;
	def mno_zvector : Flag<["-"], "mno-zvector">, Alias<fno_zvector>;

	def mbackchain : Flag<["-"], "mbackchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>,
	HelpText<"Link stack frames through backchain on System Z">;
	def mno_backchain : Flag<["-"], "mno-backchain">, Group<m_Group>, Flags<[DriverOption,CC1Option]>;

	// Miscellaneous target-independent -m options in m_Group: frame-pointer,
	// red-zone, calling-convention, floating-point and integrated-assembler
	// switches. -mpascal-strings aliases -fpascal-strings and
	// -msmall-data-threshold= aliases G.
	def mno_warn_nonportable_cfstrings : Flag<["-"], "mno-warn-nonportable-cfstrings">, Group<m_Group>;
	def mno_omit_leaf_frame_pointer : Flag<["-"], "mno-omit-leaf-frame-pointer">, Group<m_Group>;
	def momit_leaf_frame_pointer : Flag<["-"], "momit-leaf-frame-pointer">, Group<m_Group>,
	HelpText<"Omit frame pointer setup for leaf functions">, Flags<[CC1Option]>;
	def moslib_EQ : Joined<["-"], "moslib=">, Group<m_Group>;
	def mpascal_strings : Flag<["-"], "mpascal-strings">, Alias<fpascal_strings>;
	def mred_zone : Flag<["-"], "mred-zone">, Group<m_Group>;
	def mregparm_EQ : Joined<["-"], "mregparm=">, Group<m_Group>;
	def mrelax_all : Flag<["-"], "mrelax-all">, Group<m_Group>, Flags<[CC1Option,CC1AsOption]>,
	HelpText<"(integrated-as) Relax all machine instructions">;
	// Only the positive incremental-linker flag is exposed to cc1 / cc1as; the
	// negative form has no Flags<>.
	def mincremental_linker_compatible : Flag<["-"], "mincremental-linker-compatible">, Group<m_Group>,
	Flags<[CC1Option,CC1AsOption]>,
	HelpText<"(integrated-as) Emit an object file which can be used with an incremental linker">;
	def mno_incremental_linker_compatible : Flag<["-"], "mno-incremental-linker-compatible">, Group<m_Group>,
	HelpText<"(integrated-as) Emit an object file which cannot be used with an incremental linker">;
	def mrtd : Flag<["-"], "mrtd">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Make StdCall calling convention the default">;
	def msmall_data_threshold_EQ : Joined <["-"], "msmall-data-threshold=">,
	Group<m_Group>, Alias<G>;
	def msoft_float : Flag<["-"], "msoft-float">, Group<m_Group>, Flags<[CC1Option]>,
	HelpText<"Use software floating point">;
	def mno_implicit_float : Flag<["-"], "mno-implicit-float">, Group<m_Group>,
	HelpText<"Don't generate implicit floating point instructions">;
	def mimplicit_float : Flag<["-"], "mimplicit-float">, Group<m_Group>;
	// -mrecip is a bare flag; -mrecip= takes a comma-separated value list.
	def mrecip : Flag<["-"], "mrecip">, Group<m_Group>;
	def mrecip_EQ : CommaJoined<["-"], "mrecip=">, Group<m_Group>, Flags<[CC1Option]>;
	def mpie_copy_relocations : Flag<["-"], "mpie-copy-relocations">, Group<m_Group>,
	Flags<[CC1Option]>,
	HelpText<"Use copy relocations support for PIE builds">;
	def mno_pie_copy_relocations : Flag<["-"], "mno-pie-copy-relocations">, Group<m_Group>;
	def mfentry : Flag<["-"], "mfentry">, HelpText<"Insert calls to fentry at function entry (x86 only)">,
	Flags<[CC1Option]>, Group<m_Group>;
	// Positive x86 feature flags, all in m_x86_Features_Group. -m80387 is an
	// alias of -mx87 and -msse4 is an alias of -msse4.2.
	def mx87 : Flag<["-"], "mx87">, Group<m_x86_Features_Group>;
	def m80387 : Flag<["-"], "m80387">, Alias<mx87>;
	def msse2 : Flag<["-"], "msse2">, Group<m_x86_Features_Group>;
	def msse3 : Flag<["-"], "msse3">, Group<m_x86_Features_Group>;
	def msse4a : Flag<["-"], "msse4a">, Group<m_x86_Features_Group>;
	def msse4_1 : Flag<["-"], "msse4.1">, Group<m_x86_Features_Group>;
	def msse4_2 : Flag<["-"], "msse4.2">, Group<m_x86_Features_Group>;
	def msse4 : Flag<["-"], "msse4">, Alias<msse4_2>;
	def msse : Flag<["-"], "msse">, Group<m_x86_Features_Group>;
	def mssse3 : Flag<["-"], "mssse3">, Group<m_x86_Features_Group>;
	def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;
	def mavx : Flag<["-"], "mavx">, Group<m_x86_Features_Group>;
	def mavx2 : Flag<["-"], "mavx2">, Group<m_x86_Features_Group>;
	def mavx512f : Flag<["-"], "mavx512f">, Group<m_x86_Features_Group>;
	def mavx512cd : Flag<["-"], "mavx512cd">, Group<m_x86_Features_Group>;
	def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Group>;
	def mavx512er : Flag<["-"], "mavx512er">, Group<m_x86_Features_Group>;
	def mavx512pf : Flag<["-"], "mavx512pf">, Group<m_x86_Features_Group>;
	def mavx512dq : Flag<["-"], "mavx512dq">, Group<m_x86_Features_Group>;
	def mavx512bw : Flag<["-"], "mavx512bw">, Group<m_x86_Features_Group>;
	def mavx512vl : Flag<["-"], "mavx512vl">, Group<m_x86_Features_Group>;
	def mavx512vbmi : Flag<["-"], "mavx512vbmi">, Group<m_x86_Features_Group>;
	def mavx512ifma : Flag<["-"], "mavx512ifma">, Group<m_x86_Features_Group>;
	def mpclmul : Flag<["-"], "mpclmul">, Group<m_x86_Features_Group>;
	def mlzcnt : Flag<["-"], "mlzcnt">, Group<m_x86_Features_Group>;
	def mrdrnd : Flag<["-"], "mrdrnd">, Group<m_x86_Features_Group>;
	def mfsgsbase : Flag<["-"], "mfsgsbase">, Group<m_x86_Features_Group>;
	def mbmi : Flag<["-"], "mbmi">, Group<m_x86_Features_Group>;
	def mbmi2 : Flag<["-"], "mbmi2">, Group<m_x86_Features_Group>;
	def mpopcnt : Flag<["-"], "mpopcnt">, Group<m_x86_Features_Group>;
	def mtbm : Flag<["-"], "mtbm">, Group<m_x86_Features_Group>;
	def mlwp : Flag<["-"], "mlwp">, Group<m_x86_Features_Group>;
	def mfma4 : Flag<["-"], "mfma4">, Group<m_x86_Features_Group>;
	def mfma : Flag<["-"], "mfma">, Group<m_x86_Features_Group>;
	def mxop : Flag<["-"], "mxop">, Group<m_x86_Features_Group>;
	def mf16c : Flag<["-"], "mf16c">, Group<m_x86_Features_Group>;
	def mrtm : Flag<["-"], "mrtm">, Group<m_x86_Features_Group>;
	def mprfchw : Flag<["-"], "mprfchw">, Group<m_x86_Features_Group>;
	def mrdseed : Flag<["-"], "mrdseed">, Group<m_x86_Features_Group>;
	def mpku : Flag<["-"], "mpku">, Group<m_x86_Features_Group>;
	def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
	def msha : Flag<["-"], "msha">, Group<m_x86_Features_Group>;
	def mcx16 : Flag<["-"], "mcx16">, Group<m_x86_Features_Group>;
	def mfxsr : Flag<["-"], "mfxsr">, Group<m_x86_Features_Group>;
	def mxsave : Flag<["-"], "mxsave">, Group<m_x86_Features_Group>;
	def mxsaveopt : Flag<["-"], "mxsaveopt">, Group<m_x86_Features_Group>;
	def mxsavec : Flag<["-"], "mxsavec">, Group<m_x86_Features_Group>;
	def mxsaves : Flag<["-"], "mxsaves">, Group<m_x86_Features_Group>;
	def mmwaitx : Flag<["-"], "mmwaitx">, Group<m_x86_Features_Group>;
	def mclzero : Flag<["-"], "mclzero">, Group<m_x86_Features_Group>;
	def mclflushopt : Flag<["-"], "mclflushopt">, Group<m_x86_Features_Group>;
	def mclwb : Flag<["-"], "mclwb">, Group<m_x86_Features_Group>;
	def mmovbe : Flag<["-"], "mmovbe">, Group<m_x86_Features_Group>;
	def mmpx : Flag<["-"], "mmpx">, Group<m_x86_Features_Group>;
	def msgx : Flag<["-"], "msgx">, Group<m_x86_Features_Group>;
	def mprefetchwt1 : Flag<["-"], "mprefetchwt1">, Group<m_x86_Features_Group>;
	// Target switches kept in the generic m_Group rather than a per-target
	// feature group. Most come as an -m<x> / -mno-<x> pair (mips16, micromips,
	// xgot, ldc1-sdc1, check-zero-division, dsp, dspr2); -mcompact-branches=
	// takes a joined value, and -msingle-float / -mdouble-float form a pair of
	// their own. None of them carries help text.
	// NOTE(review): judging by the names these are MIPS-specific — confirm.
	def mips16 : Flag<["-"], "mips16">, Group<m_Group>;
	def mno_mips16 : Flag<["-"], "mno-mips16">, Group<m_Group>;
	def mmicromips : Flag<["-"], "mmicromips">, Group<m_Group>;
	def mno_micromips : Flag<["-"], "mno-micromips">, Group<m_Group>;
	def mxgot : Flag<["-"], "mxgot">, Group<m_Group>;
	def mno_xgot : Flag<["-"], "mno-xgot">, Group<m_Group>;
	def mldc1_sdc1 : Flag<["-"], "mldc1-sdc1">, Group<m_Group>;
	def mno_ldc1_sdc1 : Flag<["-"], "mno-ldc1-sdc1">, Group<m_Group>;
	def mcheck_zero_division : Flag<["-"], "mcheck-zero-division">, Group<m_Group>;
	def mno_check_zero_division : Flag<["-"], "mno-check-zero-division">,
	Group<m_Group>;
	def mcompact_branches_EQ : Joined<["-"], "mcompact-branches=">, Group<m_Group>;
	def mdsp : Flag<["-"], "mdsp">, Group<m_Group>;
	def mno_dsp : Flag<["-"], "mno-dsp">, Group<m_Group>;
	def mdspr2 : Flag<["-"], "mdspr2">, Group<m_Group>;
	def mno_dspr2 : Flag<["-"], "mno-dspr2">, Group<m_Group>;
	def msingle_float : Flag<["-"], "msingle-float">, Group<m_Group>;
	def mdouble_float : Flag<["-"], "mdouble-float">, Group<m_Group>;
	-def mmadd4 : Flag<["-"], "mmadd4">, Group<m_Group>,
	- HelpText<"Enable the generation of 4-operand madd.s, madd.d and related instructions.">;
	-def mno_madd4 : Flag<["-"], "mno-madd4">, Group<m_Group>,
	- HelpText<"Disable the generation of 4-operand madd.s, madd.d and related instructions.">;
	// m_Group switches that do carry help text: the MSA and MT ASE toggles,
	// the 64-/32-bit floating point register selectors, -mnan=<value> (no help
	// text), and the -mabicalls / -mno-abicalls pair.
	// NOTE(review): the abicalls help strings say "(Mips only)" while the other
	// entries in this run say "(MIPS only)" — the capitalization is inconsistent.
	def mmsa : Flag<["-"], "mmsa">, Group<m_Group>,
	HelpText<"Enable MSA ASE (MIPS only)">;
	def mno_msa : Flag<["-"], "mno-msa">, Group<m_Group>,
	HelpText<"Disable MSA ASE (MIPS only)">;
	def mmt : Flag<["-"], "mmt">, Group<m_Group>,
	HelpText<"Enable MT ASE (MIPS only)">;
	def mno_mt : Flag<["-"], "mno-mt">, Group<m_Group>,
	HelpText<"Disable MT ASE (MIPS only)">;
	def mfp64 : Flag<["-"], "mfp64">, Group<m_Group>,
	HelpText<"Use 64-bit floating point registers (MIPS only)">;
	def mfp32 : Flag<["-"], "mfp32">, Group<m_Group>,
	HelpText<"Use 32-bit floating point registers (MIPS only)">;
	def mnan_EQ : Joined<["-"], "mnan=">, Group<m_Group>;
	def mabicalls : Flag<["-"], "mabicalls">, Group<m_Group>,
	HelpText<"Enable SVR4-style position-independent code (Mips only)">;
	def mno_abicalls : Flag<["-"], "mno-abicalls">, Group<m_Group>,
	HelpText<"Disable SVR4-style position-independent code (Mips only)">;
	// Shorthand architecture selectors. Each -mips<N> flag is an alias of
	// march_EQ with the matching architecture name supplied through AliasArgs
	// (so -mips32r2 stands for -march=mips32r2), and each is flagged HelpHidden.
	def mips1 : Flag<["-"], "mips1">,
	Alias<march_EQ>, AliasArgs<["mips1"]>,
	HelpText<"Equivalent to -march=mips1">, Flags<[HelpHidden]>;
	def mips2 : Flag<["-"], "mips2">,
	Alias<march_EQ>, AliasArgs<["mips2"]>,
	HelpText<"Equivalent to -march=mips2">, Flags<[HelpHidden]>;
	def mips3 : Flag<["-"], "mips3">,
	Alias<march_EQ>, AliasArgs<["mips3"]>,
	HelpText<"Equivalent to -march=mips3">, Flags<[HelpHidden]>;
	def mips4 : Flag<["-"], "mips4">,
	Alias<march_EQ>, AliasArgs<["mips4"]>,
	HelpText<"Equivalent to -march=mips4">, Flags<[HelpHidden]>;
	def mips5 : Flag<["-"], "mips5">,
	Alias<march_EQ>, AliasArgs<["mips5"]>,
	HelpText<"Equivalent to -march=mips5">, Flags<[HelpHidden]>;
	def mips32 : Flag<["-"], "mips32">,
	Alias<march_EQ>, AliasArgs<["mips32"]>,
	HelpText<"Equivalent to -march=mips32">, Flags<[HelpHidden]>;
	def mips32r2 : Flag<["-"], "mips32r2">,
	Alias<march_EQ>, AliasArgs<["mips32r2"]>,
	HelpText<"Equivalent to -march=mips32r2">, Flags<[HelpHidden]>;
	def mips32r3 : Flag<["-"], "mips32r3">,
	Alias<march_EQ>, AliasArgs<["mips32r3"]>,
	HelpText<"Equivalent to -march=mips32r3">, Flags<[HelpHidden]>;
	def mips32r5 : Flag<["-"], "mips32r5">,
	Alias<march_EQ>, AliasArgs<["mips32r5"]>,
	HelpText<"Equivalent to -march=mips32r5">, Flags<[HelpHidden]>;
	def mips32r6 : Flag<["-"], "mips32r6">,
	Alias<march_EQ>, AliasArgs<["mips32r6"]>,
	HelpText<"Equivalent to -march=mips32r6">, Flags<[HelpHidden]>;
	def mips64 : Flag<["-"], "mips64">,
	Alias<march_EQ>, AliasArgs<["mips64"]>,
	HelpText<"Equivalent to -march=mips64">, Flags<[HelpHidden]>;
	def mips64r2 : Flag<["-"], "mips64r2">,
	Alias<march_EQ>, AliasArgs<["mips64r2"]>,
	HelpText<"Equivalent to -march=mips64r2">, Flags<[HelpHidden]>;
	def mips64r3 : Flag<["-"], "mips64r3">,
	Alias<march_EQ>, AliasArgs<["mips64r3"]>,
	HelpText<"Equivalent to -march=mips64r3">, Flags<[HelpHidden]>;
	def mips64r5 : Flag<["-"], "mips64r5">,
	Alias<march_EQ>, AliasArgs<["mips64r5"]>,
	HelpText<"Equivalent to -march=mips64r5">, Flags<[HelpHidden]>;
	def mips64r6 : Flag<["-"], "mips64r6">,
	Alias<march_EQ>, AliasArgs<["mips64r6"]>,
	HelpText<"Equivalent to -march=mips64r6">, Flags<[HelpHidden]>;
	// Floating point register-model switches in m_Group. All three have help
	// text but are flagged HelpHidden.
	def mfpxx : Flag<["-"], "mfpxx">, Group<m_Group>,
	HelpText<"Avoid FPU mode dependent operations when used with the O32 ABI">,
	Flags<[HelpHidden]>;
	def modd_spreg : Flag<["-"], "modd-spreg">, Group<m_Group>,
	HelpText<"Enable odd single-precision floating point registers">,
	Flags<[HelpHidden]>;
	def mno_odd_spreg : Flag<["-"], "mno-odd-spreg">, Group<m_Group>,
	HelpText<"Disable odd single-precision floating point registers">,
	Flags<[HelpHidden]>;
	// General driver options, listed roughly alphabetically from -mglibc to -y.
	// Records without a Group or Flags<> are bare spellings; the option kind
	// (Flag, Joined, Separate, JoinedOrSeparate, MultiArg) determines how the
	// argument, if any, is attached.
	def mglibc : Flag<["-"], "mglibc">, Group<m_libc_Group>, Flags<[HelpHidden]>;
	def muclibc : Flag<["-"], "muclibc">, Group<m_libc_Group>, Flags<[HelpHidden]>;
	def module_file_info : Flag<["-"], "module-file-info">, Flags<[DriverOption,CC1Option]>, Group<Action_Group>,
	HelpText<"Provide information about a particular module file">;
	def mthumb : Flag<["-"], "mthumb">, Group<m_Group>;
	def mtune_EQ : Joined<["-"], "mtune=">, Group<m_Group>;
	def multi__module : Flag<["-"], "multi_module">;
	def multiply__defined__unused : Separate<["-"], "multiply_defined_unused">;
	def multiply__defined : Separate<["-"], "multiply_defined">;
	def mwarn_nonportable_cfstrings : Flag<["-"], "mwarn-nonportable-cfstrings">, Group<m_Group>;
	def no_canonical_prefixes : Flag<["-"], "no-canonical-prefixes">, Flags<[HelpHidden]>,
	HelpText<"Use relative instead of canonical paths">;
	def no_cpp_precomp : Flag<["-"], "no-cpp-precomp">, Group<clang_ignored_f_Group>;
	def no_integrated_cpp : Flag<["-", "--"], "no-integrated-cpp">, Flags<[DriverOption]>;
	def no_pedantic : Flag<["-", "--"], "no-pedantic">, Group<pedantic_Group>;
	def no__dead__strip__inits__and__terms : Flag<["-"], "no_dead_strip_inits_and_terms">;
	def nobuiltininc : Flag<["-"], "nobuiltininc">, Flags<[CC1Option, CoreOption]>,
	HelpText<"Disable builtin #include directories">;
	def nocudainc : Flag<["-"], "nocudainc">;
	def nocudalib : Flag<["-"], "nocudalib">;
	def nodefaultlibs : Flag<["-"], "nodefaultlibs">;
	def nofixprebinding : Flag<["-"], "nofixprebinding">;
	def nolibc : Flag<["-"], "nolibc">;
	def nomultidefs : Flag<["-"], "nomultidefs">;
	def nopie : Flag<["-"], "nopie">;
	// -no-pie is an alternative spelling that aliases -nopie.
	def no_pie : Flag<["-"], "no-pie">, Alias<nopie>;
	def noprebind : Flag<["-"], "noprebind">;
	def noseglinkedit : Flag<["-"], "noseglinkedit">;
	def nostartfiles : Flag<["-"], "nostartfiles">;
	def nostdinc : Flag<["-"], "nostdinc">, Flags<[CoreOption]>;
	def nostdlibinc : Flag<["-"], "nostdlibinc">;
	def nostdincxx : Flag<["-"], "nostdinc++">, Flags<[CC1Option]>,
	HelpText<"Disable standard #include directories for the C++ standard library">;
	def nostdlib : Flag<["-"], "nostdlib">;
	def object : Flag<["-"], "object">;
	def o : JoinedOrSeparate<["-"], "o">, Flags<[DriverOption, RenderAsInput, CC1Option, CC1AsOption]>,
	HelpText<"Write output to <file>">, MetaVarName<"<file>">;
	def pagezero__size : JoinedOrSeparate<["-"], "pagezero_size">;
	def pass_exit_codes : Flag<["-", "--"], "pass-exit-codes">, Flags<[Unsupported]>;
	def pedantic_errors : Flag<["-", "--"], "pedantic-errors">, Group<pedantic_Group>, Flags<[CC1Option]>;
	def pedantic : Flag<["-", "--"], "pedantic">, Group<pedantic_Group>, Flags<[CC1Option]>;
	def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, Flags<[CC1Option]>;
	def pipe : Flag<["-", "--"], "pipe">,
	HelpText<"Use pipes between commands, when possible">;
	def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
	def prebind : Flag<["-"], "prebind">;
	def preload : Flag<["-"], "preload">;
	def print_file_name_EQ : Joined<["-", "--"], "print-file-name=">,
	HelpText<"Print the full library path of <file>">, MetaVarName<"<file>">;
	def print_ivar_layout : Flag<["-"], "print-ivar-layout">, Flags<[CC1Option]>,
	HelpText<"Enable Objective-C Ivar layout bitmap print trace">;
	def print_libgcc_file_name : Flag<["-", "--"], "print-libgcc-file-name">,
	HelpText<"Print the library path for the currently used compiler runtime "
	"library (\"libgcc.a\" or \"libclang_rt.builtins.*.a\")">;
	def print_multi_directory : Flag<["-", "--"], "print-multi-directory">;
	def print_multi_lib : Flag<["-", "--"], "print-multi-lib">;
	def print_multi_os_directory : Flag<["-", "--"], "print-multi-os-directory">,
	Flags<[Unsupported]>;
	def print_prog_name_EQ : Joined<["-", "--"], "print-prog-name=">,
	HelpText<"Print the full program path of <name>">, MetaVarName<"<name>">;
	def print_resource_dir : Flag<["-", "--"], "print-resource-dir">,
	HelpText<"Print the resource directory pathname">;
	def print_search_dirs : Flag<["-", "--"], "print-search-dirs">,
	HelpText<"Print the paths used for finding libraries and programs">;
	def private__bundle : Flag<["-"], "private_bundle">;
	def pthreads : Flag<["-"], "pthreads">;
	def pthread : Flag<["-"], "pthread">, Flags<[CC1Option]>,
	HelpText<"Support POSIX threads in generated code">;
	def no_pthread : Flag<["-"], "no-pthread">, Flags<[CC1Option]>;
	def p : Flag<["-"], "p">;
	def pie : Flag<["-"], "pie">;
	def read__only__relocs : Separate<["-"], "read_only_relocs">;
	def remap : Flag<["-"], "remap">;
	def rewrite_objc : Flag<["-"], "rewrite-objc">, Flags<[DriverOption,CC1Option]>,
	HelpText<"Rewrite Objective-C source to C++">, Group<Action_Group>;
	def rewrite_legacy_objc : Flag<["-"], "rewrite-legacy-objc">, Flags<[DriverOption]>,
	HelpText<"Rewrite Legacy Objective-C source to C++">;
	def rdynamic : Flag<["-"], "rdynamic">;
	// -resource-dir <dir> is the primary record; the joined -resource-dir=<dir>
	// spelling below aliases it.
	def resource_dir : Separate<["-"], "resource-dir">,
	Flags<[DriverOption, CC1Option, CoreOption, HelpHidden]>,
	HelpText<"The directory which holds the compiler resource files">;
	def resource_dir_EQ : Joined<["-"], "resource-dir=">, Flags<[DriverOption, CoreOption]>,
	Alias<resource_dir>;
	def rpath : Separate<["-"], "rpath">, Flags<[LinkerInput]>, Group<Link_Group>;
	def rtlib_EQ : Joined<["-", "--"], "rtlib=">,
	HelpText<"Compiler runtime library to use">;
	def frtlib_add_rpath: Flag<["-"], "frtlib-add-rpath">, Flags<[NoArgumentUnused]>,
	HelpText<"Add -rpath with architecture-specific resource directory to the linker flags">;
	def fno_rtlib_add_rpath: Flag<["-"], "fno-rtlib-add-rpath">, Flags<[NoArgumentUnused]>,
	HelpText<"Do not add -rpath with architecture-specific resource directory to the linker flags">;
	def r : Flag<["-"], "r">, Flags<[LinkerInput,NoArgumentUnused]>,
	Group<Link_Group>;
	// The bare -save-temps and -save-stats flags are aliases of their "=" forms
	// with the argument fixed to "cwd".
	def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[DriverOption]>,
	HelpText<"Save intermediate compilation results.">;
	def save_temps : Flag<["-", "--"], "save-temps">, Flags<[DriverOption]>,
	Alias<save_temps_EQ>, AliasArgs<["cwd"]>,
	HelpText<"Save intermediate compilation results">;
	def save_stats_EQ : Joined<["-", "--"], "save-stats=">, Flags<[DriverOption]>,
	HelpText<"Save llvm statistics.">;
	def save_stats : Flag<["-", "--"], "save-stats">, Flags<[DriverOption]>,
	Alias<save_stats_EQ>, AliasArgs<["cwd"]>,
	HelpText<"Save llvm statistics.">;
	def via_file_asm : Flag<["-", "--"], "via-file-asm">, InternalDebugOpt,
	HelpText<"Write assembly to file for input to assemble jobs">;
	// MultiArg options consume a fixed number of following arguments (the
	// trailing integer).
	def sectalign : MultiArg<["-"], "sectalign", 3>;
	def sectcreate : MultiArg<["-"], "sectcreate", 3>;
	def sectobjectsymbols : MultiArg<["-"], "sectobjectsymbols", 2>;
	def sectorder : MultiArg<["-"], "sectorder", 3>;
	def seg1addr : JoinedOrSeparate<["-"], "seg1addr">;
	def seg__addr__table__filename : Separate<["-"], "seg_addr_table_filename">;
	def seg__addr__table : Separate<["-"], "seg_addr_table">;
	def segaddr : MultiArg<["-"], "segaddr", 2>;
	def segcreate : MultiArg<["-"], "segcreate", 3>;
	def seglinkedit : Flag<["-"], "seglinkedit">;
	def segprot : MultiArg<["-"], "segprot", 3>;
	def segs__read__only__addr : Separate<["-"], "segs_read_only_addr">;
	def segs__read__write__addr : Separate<["-"], "segs_read_write_addr">;
	def segs__read__ : Joined<["-"], "segs_read_">;
	def shared_libgcc : Flag<["-"], "shared-libgcc">;
	def shared : Flag<["-", "--"], "shared">;
	def single__module : Flag<["-"], "single_module">;
	def specs_EQ : Joined<["-", "--"], "specs=">;
	def specs : Separate<["-", "--"], "specs">, Flags<[Unsupported]>;
	def static_libgcc : Flag<["-"], "static-libgcc">;
	def static_libstdcxx : Flag<["-"], "static-libstdc++">;
	def static : Flag<["-", "--"], "static">, Flags<[NoArgumentUnused]>;
	def std_default_EQ : Joined<["-"], "std-default=">;
	def std_EQ : Joined<["-", "--"], "std=">, Flags<[CC1Option]>,
	Group<CompileOnly_Group>, HelpText<"Language standard to compile for">;
	def stdlib_EQ : Joined<["-", "--"], "stdlib=">, Flags<[CC1Option]>,
	HelpText<"C++ standard library to use">, Values<"libc++,libstdc++,platform">;
	def sub__library : JoinedOrSeparate<["-"], "sub_library">;
	def sub__umbrella : JoinedOrSeparate<["-"], "sub_umbrella">;
	def system_header_prefix : Joined<["--"], "system-header-prefix=">,
	Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
	HelpText<"Treat all #include paths starting with <prefix> as including a "
	"system header.">;
	// Anonymous record: the space-separated spelling of --system-header-prefix=.
	def : Separate<["--"], "system-header-prefix">, Alias<system_header_prefix>;
	def no_system_header_prefix : Joined<["--"], "no-system-header-prefix=">,
	Group<clang_i_Group>, Flags<[CC1Option]>, MetaVarName<"<prefix>">,
	HelpText<"Treat all #include paths starting with <prefix> as not including a "
	"system header.">;
	// Anonymous record: the space-separated spelling of --no-system-header-prefix=.
	def : Separate<["--"], "no-system-header-prefix">, Alias<no_system_header_prefix>;
	def s : Flag<["-"], "s">, Group<Link_Group>;
	def target : Joined<["--"], "target=">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Generate code for the given target">;
	def gcc_toolchain : Joined<["--"], "gcc-toolchain=">, Flags<[DriverOption]>,
	HelpText<"Use the gcc toolchain at the given directory">;
	def time : Flag<["-"], "time">,
	HelpText<"Time individual commands">;
	def traditional_cpp : Flag<["-", "--"], "traditional-cpp">, Flags<[CC1Option]>,
	HelpText<"Enable some traditional CPP emulation">;
	def traditional : Flag<["-", "--"], "traditional">;
	def trigraphs : Flag<["-", "--"], "trigraphs">, Alias<ftrigraphs>,
	HelpText<"Process trigraph sequences">;
	def twolevel__namespace__hints : Flag<["-"], "twolevel_namespace_hints">;
	def twolevel__namespace : Flag<["-"], "twolevel_namespace">;
	def t : Flag<["-"], "t">, Group<Link_Group>;
	def umbrella : Separate<["-"], "umbrella">;
	def undefined : JoinedOrSeparate<["-"], "undefined">, Group<u_Group>;
	def undef : Flag<["-"], "undef">, Group<u_Group>, Flags<[CC1Option]>,
	HelpText<"undef all system defines">;
	def unexported__symbols__list : Separate<["-"], "unexported_symbols_list">;
	def u : JoinedOrSeparate<["-"], "u">, Group<u_Group>;
	def v : Flag<["-"], "v">, Flags<[CC1Option, CoreOption]>,
	HelpText<"Show commands to run and use verbose output">;
	def verify_debug_info : Flag<["--"], "verify-debug-info">, Flags<[DriverOption]>,
	HelpText<"Verify the binary representation of debug output">;
	def weak_l : Joined<["-"], "weak-l">, Flags<[LinkerInput]>;
	def weak__framework : Separate<["-"], "weak_framework">, Flags<[LinkerInput]>;
	def weak__library : Separate<["-"], "weak_library">, Flags<[LinkerInput]>;
	def weak__reference__mismatches : Separate<["-"], "weak_reference_mismatches">;
	def whatsloaded : Flag<["-"], "whatsloaded">;
	def whyload : Flag<["-"], "whyload">;
	def w : Flag<["-"], "w">, HelpText<"Suppress all warnings">, Flags<[CC1Option]>;
	def x : JoinedOrSeparate<["-"], "x">, Flags<[DriverOption,CC1Option]>,
	HelpText<"Treat subsequent input files as having type <language>">,
	MetaVarName<"<language>">;
	def y : Joined<["-"], "y">;

	// Integrated assembler toggle. -fintegrated-as / -fno-integrated-as are the
	// named records in f_Group; the two anonymous records add the un-prefixed
	// spellings -integrated-as / -no-integrated-as as aliases. Only the
	// disabling forms carry CC1Option.
	def fintegrated_as : Flag<["-"], "fintegrated-as">, Flags<[DriverOption]>,
	Group<f_Group>, HelpText<"Enable the integrated assembler">;
	def fno_integrated_as : Flag<["-"], "fno-integrated-as">,
	Flags<[CC1Option, DriverOption]>, Group<f_Group>,
	HelpText<"Disable the integrated assembler">;
	def : Flag<["-"], "integrated-as">, Alias<fintegrated_as>, Flags<[DriverOption]>;
	def : Flag<["-"], "no-integrated-as">, Alias<fno_integrated_as>,
	Flags<[CC1Option, DriverOption]>;

	// -working-directory accepts its value joined or as a separate argument;
	// the explicit -working-directory=<dir> spelling is an alias of it.
	def working_directory : JoinedOrSeparate<["-"], "working-directory">, Flags<[CC1Option]>,
	HelpText<"Resolve file paths relative to the specified directory">;
	def working_directory_EQ : Joined<["-"], "working-directory=">, Flags<[CC1Option]>,
	Alias<working_directory>;

	// Double dash options, which are usually an alias for one of the previous
	// options.

	// Where an option takes a value, both the joined ("--name=value", *_EQ
	// record) and separate ("--name value") spellings are declared and point at
	// the same alias target.
	def _mhwdiv_EQ : Joined<["--"], "mhwdiv=">, Alias<mhwdiv_EQ>;
	def _mhwdiv : Separate<["--"], "mhwdiv">, Alias<mhwdiv_EQ>;
	def _CLASSPATH_EQ : Joined<["--"], "CLASSPATH=">, Alias<fclasspath_EQ>;
	def _CLASSPATH : Separate<["--"], "CLASSPATH">, Alias<fclasspath_EQ>;
	def _all_warnings : Flag<["--"], "all-warnings">, Alias<Wall>;
	// The analyzer options are standalone driver options, not aliases.
	def _analyze_auto : Flag<["--"], "analyze-auto">, Flags<[DriverOption]>;
	def _analyzer_no_default_checks : Flag<["--"], "analyzer-no-default-checks">, Flags<[DriverOption]>;
	// --analyzer-output: driver-only option whose value may be joined or given
	// as a separate argument. The help string lists the accepted report formats
	// separated by plain '|' characters; '|' needs no escaping in a TableGen
	// string literal, and "\|" is not a recognized escape sequence.
	def _analyzer_output : JoinedOrSeparate<["--"], "analyzer-output">, Flags<[DriverOption]>,
	HelpText<"Static analyzer report output format (html|plist|plist-multi-file|plist-html|text).">;
	// Long-form ("--") spellings. Unless a record carries its own Flags<> or
	// HelpText<>, it only forwards to the option named in Alias<>.
	def _analyze : Flag<["--"], "analyze">, Flags<[DriverOption, CoreOption]>,
	HelpText<"Run the static analyzer">;
	def _assemble : Flag<["--"], "assemble">, Alias<S>;
	def _assert_EQ : Joined<["--"], "assert=">, Alias<A>;
	def _assert : Separate<["--"], "assert">, Alias<A>;
	def _bootclasspath_EQ : Joined<["--"], "bootclasspath=">, Alias<fbootclasspath_EQ>;
	def _bootclasspath : Separate<["--"], "bootclasspath">, Alias<fbootclasspath_EQ>;
	def _classpath_EQ : Joined<["--"], "classpath=">, Alias<fclasspath_EQ>;
	def _classpath : Separate<["--"], "classpath">, Alias<fclasspath_EQ>;
	def _comments_in_macros : Flag<["--"], "comments-in-macros">, Alias<CC>;
	def _comments : Flag<["--"], "comments">, Alias<C>;
	def _compile : Flag<["--"], "compile">, Alias<c>;
	def _constant_cfstrings : Flag<["--"], "constant-cfstrings">;
	def _debug_EQ : Joined<["--"], "debug=">, Alias<g_Flag>;
	def _debug : Flag<["--"], "debug">, Alias<g_Flag>;
	def _define_macro_EQ : Joined<["--"], "define-macro=">, Alias<D>;
	def _define_macro : Separate<["--"], "define-macro">, Alias<D>;
	def _dependencies : Flag<["--"], "dependencies">, Alias<M>;
	// --dyld-prefix= is itself the primary record; the separate form aliases it.
	def _dyld_prefix_EQ : Joined<["--"], "dyld-prefix=">;
	def _dyld_prefix : Separate<["--"], "dyld-prefix">, Alias<_dyld_prefix_EQ>;
	def _encoding_EQ : Joined<["--"], "encoding=">, Alias<fencoding_EQ>;
	def _encoding : Separate<["--"], "encoding">, Alias<fencoding_EQ>;
	def _entry : Flag<["--"], "entry">, Alias<e>;
	def _extdirs_EQ : Joined<["--"], "extdirs=">, Alias<fextdirs_EQ>;
	def _extdirs : Separate<["--"], "extdirs">, Alias<fextdirs_EQ>;
	def _extra_warnings : Flag<["--"], "extra-warnings">, Alias<W_Joined>;
	def _for_linker_EQ : Joined<["--"], "for-linker=">, Alias<Xlinker>;
	def _for_linker : Separate<["--"], "for-linker">, Alias<Xlinker>;
	def _force_link_EQ : Joined<["--"], "force-link=">, Alias<u>;
	def _force_link : Separate<["--"], "force-link">, Alias<u>;
	def _help_hidden : Flag<["--"], "help-hidden">;
	def _imacros_EQ : Joined<["--"], "imacros=">, Alias<imacros>;
	def _include_barrier : Flag<["--"], "include-barrier">, Alias<I_>;
	def _include_directory_after_EQ : Joined<["--"], "include-directory-after=">, Alias<idirafter>;
	def _include_directory_after : Separate<["--"], "include-directory-after">, Alias<idirafter>;
	def _include_directory_EQ : Joined<["--"], "include-directory=">, Alias<I>;
	def _include_directory : Separate<["--"], "include-directory">, Alias<I>;
	def _include_prefix_EQ : Joined<["--"], "include-prefix=">, Alias<iprefix>;
	def _include_prefix : Separate<["--"], "include-prefix">, Alias<iprefix>;
	// Both --include-with-prefix and --include-with-prefix-after map to
	// iwithprefix; only the -before variant has a distinct target.
	def _include_with_prefix_after_EQ : Joined<["--"], "include-with-prefix-after=">, Alias<iwithprefix>;
	def _include_with_prefix_after : Separate<["--"], "include-with-prefix-after">, Alias<iwithprefix>;
	def _include_with_prefix_before_EQ : Joined<["--"], "include-with-prefix-before=">, Alias<iwithprefixbefore>;
	def _include_with_prefix_before : Separate<["--"], "include-with-prefix-before">, Alias<iwithprefixbefore>;
	def _include_with_prefix_EQ : Joined<["--"], "include-with-prefix=">, Alias<iwithprefix>;
	def _include_with_prefix : Separate<["--"], "include-with-prefix">, Alias<iwithprefix>;
	def _include_EQ : Joined<["--"], "include=">, Alias<include_>;
	def _language_EQ : Joined<["--"], "language=">, Alias<x>;
	def _language : Separate<["--"], "language">, Alias<x>;
	def _library_directory_EQ : Joined<["--"], "library-directory=">, Alias<L>;
	def _library_directory : Separate<["--"], "library-directory">, Alias<L>;
	def _no_line_commands : Flag<["--"], "no-line-commands">, Alias<P>;
	def _no_standard_includes : Flag<["--"], "no-standard-includes">, Alias<nostdinc>;
	def _no_standard_libraries : Flag<["--"], "no-standard-libraries">, Alias<nostdlib>;
	def _no_undefined : Flag<["--"], "no-undefined">, Flags<[LinkerInput]>;
	def _no_warnings : Flag<["--"], "no-warnings">, Alias<w>;
	def _optimize_EQ : Joined<["--"], "optimize=">, Alias<O>;
	def _optimize : Flag<["--"], "optimize">, Alias<O>;
	def _output_class_directory_EQ : Joined<["--"], "output-class-directory=">, Alias<foutput_class_dir_EQ>;
	def _output_class_directory : Separate<["--"], "output-class-directory">, Alias<foutput_class_dir_EQ>;
	def _output_EQ : Joined<["--"], "output=">, Alias<o>;
	def _output : Separate<["--"], "output">, Alias<o>;
	// For --param the separate form is the primary record (in
	// CompileOnly_Group) and the joined "--param=" form aliases it.
	def _param : Separate<["--"], "param">, Group<CompileOnly_Group>;
	def _param_EQ : Joined<["--"], "param=">, Alias<_param>;
	def _precompile : Flag<["--"], "precompile">, Flags<[DriverOption]>,
	Group<Action_Group>, HelpText<"Only precompile the input">;
	def _prefix_EQ : Joined<["--"], "prefix=">, Alias<B>;
	def _prefix : Separate<["--"], "prefix">, Alias<B>;
	def _preprocess : Flag<["--"], "preprocess">, Alias<E>;
	def _print_diagnostic_categories : Flag<["--"], "print-diagnostic-categories">;
	def _print_file_name : Separate<["--"], "print-file-name">, Alias<print_file_name_EQ>;
	def _print_missing_file_dependencies : Flag<["--"], "print-missing-file-dependencies">, Alias<MG>;
	def _print_prog_name : Separate<["--"], "print-prog-name">, Alias<print_prog_name_EQ>;
	def _profile_blocks : Flag<["--"], "profile-blocks">, Alias<a>;
	def _profile : Flag<["--"], "profile">, Alias<p>;
	def _resource_EQ : Joined<["--"], "resource=">, Alias<fcompile_resource_EQ>;
	def _resource : Separate<["--"], "resource">, Alias<fcompile_resource_EQ>;
	def _rtlib : Separate<["--"], "rtlib">, Alias<rtlib_EQ>;
	def _serialize_diags : Separate<["-", "--"], "serialize-diagnostics">, Flags<[DriverOption]>,
	HelpText<"Serialize compiler diagnostics to a file">;
	// We give --version different semantics from -version.
	def _version : Flag<["--"], "version">, Flags<[CC1Option]>;
	def _signed_char : Flag<["--"], "signed-char">, Alias<fsigned_char>;
	def _std : Separate<["--"], "std">, Alias<std_EQ>;
	def _stdlib : Separate<["--"], "stdlib">, Alias<stdlib_EQ>;
	def _sysroot_EQ : Joined<["--"], "sysroot=">;
	def _sysroot : Separate<["--"], "sysroot">, Alias<_sysroot_EQ>;
	def _target_help : Flag<["--"], "target-help">;
	def _trace_includes : Flag<["--"], "trace-includes">, Alias<H>;
	def _undefine_macro_EQ : Joined<["--"], "undefine-macro=">, Alias<U>;
	def _undefine_macro : Separate<["--"], "undefine-macro">, Alias<U>;
	def _unsigned_char : Flag<["--"], "unsigned-char">, Alias<funsigned_char>;
	def _user_dependencies : Flag<["--"], "user-dependencies">, Alias<MM>;
	def _verbose : Flag<["--"], "verbose">, Alias<v>;
	def _warn__EQ : Joined<["--"], "warn-=">, Alias<W_Joined>;
	def _warn_ : Joined<["--"], "warn-">, Alias<W_Joined>;
	def _write_dependencies : Flag<["--"], "write-dependencies">, Alias<MD>;
	def _write_user_dependencies : Flag<["--"], "write-user-dependencies">, Alias<MMD>;
	// Catch-all: a Joined option with an empty name under the "--" prefix,
	// flagged Unsupported. NOTE(review): presumably this makes any otherwise
	// unmatched "--<text>" argument report as unsupported — confirm.
	def _ : Joined<["--"], "">, Flags<[Unsupported]>;

	// Options in m_hexagon_Features_Group. The -mv<N> flags are aliases of
	// mcpu_EQ with the argument "hexagonv<N>"; the HVX enable/disable flags are
	// also CC1Options and carry help text.
	def mieee_rnd_near : Flag<["-"], "mieee-rnd-near">, Group<m_hexagon_Features_Group>;
	def mv4 : Flag<["-"], "mv4">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv4"]>;
	def mv5 : Flag<["-"], "mv5">, Group<m_hexagon_Features_Group>, Alias<mcpu_EQ>,
	AliasArgs<["hexagonv5"]>;
	def mv55 : Flag<["-"], "mv55">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv55"]>;
	def mv60 : Flag<["-"], "mv60">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv60"]>;
	def mv62 : Flag<["-"], "mv62">, Group<m_hexagon_Features_Group>,
	Alias<mcpu_EQ>, AliasArgs<["hexagonv62"]>;
	// Record names use the mhexagon_hvx prefix, but the command-line spellings
	// are the shorter -mhvx / -mno-hvx (and -mhvx-double / -mno-hvx-double).
	def mhexagon_hvx : Flag<["-"], "mhvx">, Group<m_hexagon_Features_Group>,
	Flags<[CC1Option]>, HelpText<"Enable Hexagon Vector eXtensions">;
	def mno_hexagon_hvx : Flag<["-"], "mno-hvx">, Group<m_hexagon_Features_Group>,
	Flags<[CC1Option]>, HelpText<"Disable Hexagon Vector eXtensions">;
	def mhexagon_hvx_double : Flag<["-"], "mhvx-double">, Group<m_hexagon_Features_Group>,
	Flags<[CC1Option]>, HelpText<"Enable Hexagon Double Vector eXtensions">;
	def mno_hexagon_hvx_double : Flag<["-"], "mno-hvx-double">, Group<m_hexagon_Features_Group>,
	Flags<[CC1Option]>, HelpText<"Disable Hexagon Double Vector eXtensions">;

	// These are legacy user-facing driver-level option spellings. They are always
	// aliases for options that are spelled using the more common Unix / GNU flag
	// style of double-dash and equals-joined flags.
	// Here: "-gcc-toolchain <value>" forwards to gcc_toolchain and
	// "-target <value>" forwards to target, both single-dash Separate forms.
	def gcc_toolchain_legacy_spelling : Separate<["-"], "gcc-toolchain">, Alias<gcc_toolchain>;
	def target_legacy_spelling : Separate<["-"], "target">, Alias<target>;

	// Special internal option to handle -Xlinker --no-demangle.
	// Flagged Unsupported and NoArgumentUnused.
	def Z_Xlinker__no_demangle : Flag<["-"], "Z-Xlinker-no-demangle">,
	Flags<[Unsupported, NoArgumentUnused]>;

	// Special internal option to allow forwarding arbitrary arguments to linker.
	// Takes its value as a separate argument; same flags as the option above.
	def Zlinker_input : Separate<["-"], "Zlinker-input">,
	Flags<[Unsupported, NoArgumentUnused]>;

	// Reserved library options.
	// Both are LinkerInput flags in reserved_lib_Group and are additionally
	// flagged NoArgumentUnused and Unsupported.
	def Z_reserved_lib_stdcxx : Flag<["-"], "Z-reserved-lib-stdc++">,
	Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;
	def Z_reserved_lib_cckext : Flag<["-"], "Z-reserved-lib-cckext">,
	Flags<[LinkerInput, NoArgumentUnused, Unsupported]>, Group<reserved_lib_Group>;

	// Ignored options
	// FIXME: multiclasses produce suffixes, not prefixes. This is fine for now
	// since it is only used in ignored options.
	//
	// BooleanFFlag<name> declares a pair of plain flags: "_f" is spelled
	// "-f<name>" and "_fno" is spelled "-fno-<name>". When instantiated with
	// `defm foo : BooleanFFlag<...>` the "_f" / "_fno" parts end up as suffixes
	// of the record names, which is what the FIXME above refers to.
	multiclass BooleanFFlag<string name> {
	def _f : Flag<["-"], "f"#name>;
	def _fno : Flag<["-"], "fno-"#name>;
	}

	// Anonymous instantiation: -fkeep-inline-functions / -fno-keep-inline-functions
	// land in the ignored GCC optimization group.
	defm : BooleanFFlag<"keep-inline-functions">, Group<clang_ignored_gcc_optimization_f_Group>;

	// Unlike its neighbours, -fprofile-dir= sits in the regular f_Group.
	def fprofile_dir : Joined<["-"], "fprofile-dir=">, Group<f_Group>;

	// -fuse-ld= is likewise a regular f_Group option, also flagged CoreOption.
	def fuse_ld_EQ : Joined<["-"], "fuse-ld=">, Group<f_Group>, Flags<[CoreOption]>;

	// GCC alignment knobs: each has a boolean -f/-fno- pair plus a separate
	// "=<value>" joined form, all in the ignored GCC optimization group.
	defm align_functions : BooleanFFlag<"align-functions">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_functions_EQ : Joined<["-"], "falign-functions=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_labels : BooleanFFlag<"align-labels">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_labels_EQ : Joined<["-"], "falign-labels=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_loops : BooleanFFlag<"align-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_loops_EQ : Joined<["-"], "falign-loops=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm align_jumps : BooleanFFlag<"align-jumps">, Group<clang_ignored_gcc_optimization_f_Group>;
	def falign_jumps_EQ : Joined<["-"], "falign-jumps=">, Group<clang_ignored_gcc_optimization_f_Group>;

	// FIXME: This option should be supported and wired up to our diagnostics, but
	// ignore it for now to avoid breaking builds that use it.
	def fdiagnostics_show_location_EQ : Joined<["-"], "fdiagnostics-show-location=">, Group<clang_ignored_f_Group>;

	defm fcheck_new : BooleanFFlag<"check-new">, Group<clang_ignored_f_Group>;
	defm caller_saves : BooleanFFlag<"caller-saves">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm reorder_blocks : BooleanFFlag<"reorder-blocks">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm eliminate_unused_debug_types : BooleanFFlag<"eliminate-unused-debug-types">, Group<clang_ignored_f_Group>;
	defm branch_count_reg : BooleanFFlag<"branch-count-reg">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm default_inline : BooleanFFlag<"default-inline">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm delete_null_pointer_checks : BooleanFFlag<"delete-null-pointer-checks">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm fat_lto_objects : BooleanFFlag<"fat-lto-objects">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm float_store : BooleanFFlag<"float-store">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm friend_injection : BooleanFFlag<"friend-injection">, Group<clang_ignored_f_Group>;
	defm function_attribute_list : BooleanFFlag<"function-attribute-list">, Group<clang_ignored_f_Group>;
	defm gcse : BooleanFFlag<"gcse">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_after_reload: BooleanFFlag<"gcse-after-reload">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_las: BooleanFFlag<"gcse-las">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gcse_sm: BooleanFFlag<"gcse-sm">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm gnu : BooleanFFlag<"gnu">, Group<clang_ignored_f_Group>;
	defm ident : BooleanFFlag<"ident">, Group<clang_ignored_f_Group>;
	defm implicit_templates : BooleanFFlag<"implicit-templates">, Group<clang_ignored_f_Group>;
	defm implement_inlines : BooleanFFlag<"implement-inlines">, Group<clang_ignored_f_Group>;
	defm merge_constants : BooleanFFlag<"merge-constants">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm modulo_sched : BooleanFFlag<"modulo-sched">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm modulo_sched_allow_regmoves : BooleanFFlag<"modulo-sched-allow-regmoves">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm inline_functions_called_once : BooleanFFlag<"inline-functions-called-once">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	def finline_limit_EQ : Joined<["-"], "finline-limit=">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm finline_limit : BooleanFFlag<"inline-limit">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm inline_small_functions : BooleanFFlag<"inline-small-functions">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm ipa_cp : BooleanFFlag<"ipa-cp">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm ivopts : BooleanFFlag<"ivopts">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm non_call_exceptions : BooleanFFlag<"non-call-exceptions">, Group<clang_ignored_f_Group>;
	defm peel_loops : BooleanFFlag<"peel-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm permissive : BooleanFFlag<"permissive">, Group<clang_ignored_f_Group>;
	defm prefetch_loop_arrays : BooleanFFlag<"prefetch-loop-arrays">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm printf : BooleanFFlag<"printf">, Group<clang_ignored_f_Group>;
	defm profile : BooleanFFlag<"profile">, Group<clang_ignored_f_Group>;
	defm profile_correction : BooleanFFlag<"profile-correction">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm profile_generate_sampling : BooleanFFlag<"profile-generate-sampling">, Group<clang_ignored_f_Group>;
	defm profile_reusedist : BooleanFFlag<"profile-reusedist">, Group<clang_ignored_f_Group>;
	defm profile_values : BooleanFFlag<"profile-values">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm regs_graph : BooleanFFlag<"regs-graph">, Group<clang_ignored_f_Group>;
	defm rename_registers : BooleanFFlag<"rename-registers">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm ripa : BooleanFFlag<"ripa">, Group<clang_ignored_f_Group>;
	defm rounding_math : BooleanFFlag<"rounding-math">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm schedule_insns : BooleanFFlag<"schedule-insns">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm schedule_insns2 : BooleanFFlag<"schedule-insns2">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm see : BooleanFFlag<"see">, Group<clang_ignored_f_Group>;
	defm signaling_nans : BooleanFFlag<"signaling-nans">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm single_precision_constant : BooleanFFlag<"single-precision-constant">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm spec_constr_count : BooleanFFlag<"spec-constr-count">, Group<clang_ignored_f_Group>;
	defm stack_check : BooleanFFlag<"stack-check">, Group<clang_ignored_f_Group>;
	defm strength_reduce :
	BooleanFFlag<"strength-reduce">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tls_model : BooleanFFlag<"tls-model">, Group<clang_ignored_f_Group>;
	defm tracer : BooleanFFlag<"tracer">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_dce : BooleanFFlag<"tree-dce">, Group<clang_ignored_gcc_optimization_f_Group>;
	// GCC spells these options with hyphens (-ftree-loop-im, -ftree-loop-ivcanon,
	// -ftree-loop-linear), like every other ignored flag in this list; the flag
	// strings must match that spelling for the options to be recognised.
	defm tree_loop_im : BooleanFFlag<"tree-loop-im">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_loop_ivcanon : BooleanFFlag<"tree-loop-ivcanon">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_loop_linear : BooleanFFlag<"tree-loop-linear">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_salias : BooleanFFlag<"tree-salias">, Group<clang_ignored_f_Group>;
	defm tree_ter : BooleanFFlag<"tree-ter">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm tree_vectorizer_verbose : BooleanFFlag<"tree-vectorizer-verbose">, Group<clang_ignored_f_Group>;
	defm tree_vrp : BooleanFFlag<"tree-vrp">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm unroll_all_loops : BooleanFFlag<"unroll-all-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm unsafe_loop_optimizations : BooleanFFlag<"unsafe-loop-optimizations">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm unswitch_loops : BooleanFFlag<"unswitch-loops">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm use_linker_plugin : BooleanFFlag<"use-linker-plugin">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm vect_cost_model : BooleanFFlag<"vect-cost-model">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm variable_expansion_in_unroller : BooleanFFlag<"variable-expansion-in-unroller">,
	Group<clang_ignored_gcc_optimization_f_Group>;
	defm web : BooleanFFlag<"web">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm whole_program : BooleanFFlag<"whole-program">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm devirtualize : BooleanFFlag<"devirtualize">, Group<clang_ignored_gcc_optimization_f_Group>;
	defm devirtualize_speculatively : BooleanFFlag<"devirtualize-speculatively">,
	Group<clang_ignored_gcc_optimization_f_Group>;

	// Generic gfortran options.
	def A_DASH : Joined<["-"], "A-">, Group<gfortran_Group>;
	def J : JoinedOrSeparate<["-"], "J">, Flags<[RenderJoined]>, Group<gfortran_Group>;
	def cpp : Flag<["-"], "cpp">, Group<gfortran_Group>;
	def nocpp : Flag<["-"], "nocpp">, Group<gfortran_Group>;
	def static_libgfortran : Flag<["-"], "static-libgfortran">, Group<gfortran_Group>;

	// "f" options with values for gfortran.
	def fblas_matmul_limit_EQ : Joined<["-"], "fblas-matmul-limit=">, Group<gfortran_Group>;
	def fcheck_EQ : Joined<["-"], "fcheck=">, Group<gfortran_Group>;
	def fcoarray_EQ : Joined<["-"], "fcoarray=">, Group<gfortran_Group>;
	def fconvert_EQ : Joined<["-"], "fconvert=">, Group<gfortran_Group>;
	def ffixed_line_length_VALUE : Joined<["-"], "ffixed-line-length-">, Group<gfortran_Group>;
	def ffpe_trap_EQ : Joined<["-"], "ffpe-trap=">, Group<gfortran_Group>;
	def ffree_line_length_VALUE : Joined<["-"], "ffree-line-length-">, Group<gfortran_Group>;
	def finit_character_EQ : Joined<["-"], "finit-character=">, Group<gfortran_Group>;
	def finit_integer_EQ : Joined<["-"], "finit-integer=">, Group<gfortran_Group>;
	def finit_logical_EQ : Joined<["-"], "finit-logical=">, Group<gfortran_Group>;
	def finit_real_EQ : Joined<["-"], "finit-real=">, Group<gfortran_Group>;
	def fmax_array_constructor_EQ : Joined<["-"], "fmax-array-constructor=">, Group<gfortran_Group>;
	def fmax_errors_EQ : Joined<["-"], "fmax-errors=">, Group<gfortran_Group>;
	def fmax_stack_var_size_EQ : Joined<["-"], "fmax-stack-var-size=">, Group<gfortran_Group>;
	def fmax_subrecord_length_EQ : Joined<["-"], "fmax-subrecord-length=">, Group<gfortran_Group>;
	def frecord_marker_EQ : Joined<["-"], "frecord-marker=">, Group<gfortran_Group>;

	// "f" flags for gfortran.
	defm aggressive_function_elimination : BooleanFFlag<"aggressive-function-elimination">, Group<gfortran_Group>;
	defm align_commons : BooleanFFlag<"align-commons">, Group<gfortran_Group>;
	defm all_intrinsics : BooleanFFlag<"all-intrinsics">, Group<gfortran_Group>;
	defm automatic : BooleanFFlag<"automatic">, Group<gfortran_Group>;
	defm backslash : BooleanFFlag<"backslash">, Group<gfortran_Group>;
	defm backtrace : BooleanFFlag<"backtrace">, Group<gfortran_Group>;
	defm bounds_check : BooleanFFlag<"bounds-check">, Group<gfortran_Group>;
	defm check_array_temporaries : BooleanFFlag<"check-array-temporaries">, Group<gfortran_Group>;
	defm cray_pointer : BooleanFFlag<"cray-pointer">, Group<gfortran_Group>;
	defm d_lines_as_code : BooleanFFlag<"d-lines-as-code">, Group<gfortran_Group>;
	defm d_lines_as_comments : BooleanFFlag<"d-lines-as-comments">, Group<gfortran_Group>;
	defm default_double_8 : BooleanFFlag<"default-double-8">, Group<gfortran_Group>;
	defm default_integer_8 : BooleanFFlag<"default-integer-8">, Group<gfortran_Group>;
	defm default_real_8 : BooleanFFlag<"default-real-8">, Group<gfortran_Group>;
	defm dollar_ok : BooleanFFlag<"dollar-ok">, Group<gfortran_Group>;
	defm dump_fortran_optimized : BooleanFFlag<"dump-fortran-optimized">, Group<gfortran_Group>;
	defm dump_fortran_original : BooleanFFlag<"dump-fortran-original">, Group<gfortran_Group>;
	defm dump_parse_tree : BooleanFFlag<"dump-parse-tree">, Group<gfortran_Group>;
	defm external_blas : BooleanFFlag<"external-blas">, Group<gfortran_Group>;
	defm f2c : BooleanFFlag<"f2c">, Group<gfortran_Group>;
	defm fixed_form : BooleanFFlag<"fixed-form">, Group<gfortran_Group>;
	defm free_form : BooleanFFlag<"free-form">, Group<gfortran_Group>;
	defm frontend_optimize : BooleanFFlag<"frontend-optimize">, Group<gfortran_Group>;
	defm implicit_none : BooleanFFlag<"implicit-none">, Group<gfortran_Group>;
	defm init_local_zero : BooleanFFlag<"init-local-zero">, Group<gfortran_Group>;
	defm integer_4_integer_8 : BooleanFFlag<"integer-4-integer-8">, Group<gfortran_Group>;
	defm intrinsic_modules_path : BooleanFFlag<"intrinsic-modules-path">, Group<gfortran_Group>;
	defm max_identifier_length : BooleanFFlag<"max-identifier-length">, Group<gfortran_Group>;
	defm module_private : BooleanFFlag<"module-private">, Group<gfortran_Group>;
	defm pack_derived : BooleanFFlag<"pack-derived">, Group<gfortran_Group>;
	defm protect_parens : BooleanFFlag<"protect-parens">, Group<gfortran_Group>;
	defm range_check : BooleanFFlag<"range-check">, Group<gfortran_Group>;
	defm real_4_real_10 : BooleanFFlag<"real-4-real-10">, Group<gfortran_Group>;
	defm real_4_real_16 : BooleanFFlag<"real-4-real-16">, Group<gfortran_Group>;
	defm real_4_real_8 : BooleanFFlag<"real-4-real-8">, Group<gfortran_Group>;
	defm real_8_real_10 : BooleanFFlag<"real-8-real-10">, Group<gfortran_Group>;
	defm real_8_real_16 : BooleanFFlag<"real-8-real-16">, Group<gfortran_Group>;
	defm real_8_real_4 : BooleanFFlag<"real-8-real-4">, Group<gfortran_Group>;
	defm realloc_lhs : BooleanFFlag<"realloc-lhs">, Group<gfortran_Group>;
	defm recursive : BooleanFFlag<"recursive">, Group<gfortran_Group>;
	defm repack_arrays : BooleanFFlag<"repack-arrays">, Group<gfortran_Group>;
	defm second_underscore : BooleanFFlag<"second-underscore">, Group<gfortran_Group>;
	defm sign_zero : BooleanFFlag<"sign-zero">, Group<gfortran_Group>;
	defm stack_arrays : BooleanFFlag<"stack-arrays">, Group<gfortran_Group>;
	defm underscoring : BooleanFFlag<"underscoring">, Group<gfortran_Group>;
	defm whole_file : BooleanFFlag<"whole-file">, Group<gfortran_Group>;


	include "CC1Options.td"

	include "CLCompatOptions.td"
	Index: head/contrib/llvm/tools/clang/include/clang/Serialization/ASTBitCodes.h
	===================================================================
	--- head/contrib/llvm/tools/clang/include/clang/Serialization/ASTBitCodes.h (revision 322319)
	+++ head/contrib/llvm/tools/clang/include/clang/Serialization/ASTBitCodes.h (revision 322320)
	@@ -1,1697 +1,1702 @@
	//===- ASTBitCodes.h - Enum values for the PCH bitcode format ---- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This header defines Bitcode enum values for Clang serialized AST files.
	//
	// The enum values defined in this file should be considered permanent. If
	// new features are added, they should have values added at the end of the
	// respective lists.
	//
	//===----------------------------------------------------------------------===//
	#ifndef LLVM_CLANG_SERIALIZATION_ASTBITCODES_H
	#define LLVM_CLANG_SERIALIZATION_ASTBITCODES_H

	#include "clang/AST/DeclarationName.h"
	#include "clang/AST/Type.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/Bitcode/BitCodes.h"
	#include "llvm/Support/DataTypes.h"

	namespace clang {
	namespace serialization {
	/// \brief AST file major version number supported by this version of
	/// Clang.
	///
	/// Whenever the AST file format changes in a way that makes it
	/// incompatible with previous versions (such that a reader
	/// designed for the previous version could not support reading
	/// the new version), this number should be increased.
	///
	/// Version 4 of AST files also requires that the version control branch and
	/// revision match exactly, since there is no backward compatibility of
	/// AST files at this time.
	const unsigned VERSION_MAJOR = 6;

	/// \brief AST file minor version number supported by this version of
	/// Clang.
	///
	/// Whenever the AST format changes in a way that is still
	/// compatible with previous versions (such that a reader designed
	/// for the previous version could still support reading the new
	/// version by ignoring new kinds of subblocks), this number
	/// should be increased.
	const unsigned VERSION_MINOR = 0;

	/// \brief An ID number that refers to an identifier in an AST file.
	///
	/// The ID numbers of identifiers are consecutive (in order of discovery)
	/// and start at 1. 0 is reserved for NULL.
	typedef uint32_t IdentifierID;

	/// \brief An ID number that refers to a declaration in an AST file.
	///
	/// The ID numbers of declarations are consecutive (in order of
	/// discovery), with values below NUM_PREDEF_DECL_IDS being reserved.
	/// At the start of a chain of precompiled headers, declaration ID 1 is
	/// used for the translation unit declaration.
	typedef uint32_t DeclID;

	// FIXME: Turn these into classes so we can have some type safety when
	// we go from local ID to global and vice-versa.
	typedef DeclID LocalDeclID;
	typedef DeclID GlobalDeclID;

	/// \brief An ID number that refers to a type in an AST file.
	///
	/// The ID of a type is partitioned into two parts: the lower
	/// three bits are used to store the const/volatile/restrict
	/// qualifiers (as with QualType) and the upper bits provide a
	/// type index. The type index values are partitioned into two
	/// sets. The values below NUM_PREDEF_TYPE_IDs are predefined type
	/// IDs (based on the PREDEF_TYPE_*_ID constants), with 0 as a
	/// placeholder for "no type". Values from NUM_PREDEF_TYPE_IDs are
	/// other types that have serialized representations.
	typedef uint32_t TypeID;

	/// \brief A type index; the type ID with the qualifier bits removed.
	///
	/// The all-ones value is passed through unchanged in both directions:
	/// an index of uint32_t(-1) converts to TypeID(-1) and back, instead of
	/// being shifted.
	class TypeIdx {
	  uint32_t Idx;

	public:
	  TypeIdx() : Idx(0) { }
	  explicit TypeIdx(uint32_t index) : Idx(index) { }

	  /// \brief Returns the stored index, without any qualifier bits.
	  uint32_t getIndex() const { return Idx; }

	  /// \brief Builds a TypeID by shifting the index left by
	  /// Qualifiers::FastWidth bits and OR'ing \p FastQuals into the low bits.
	  TypeID asTypeID(unsigned FastQuals) const {
	    if (Idx == uint32_t(-1))
	      return TypeID(-1);

	    return (Idx << Qualifiers::FastWidth) | FastQuals;
	  }

	  /// \brief Recovers the index from \p ID by shifting the qualifier bits
	  /// out.
	  static TypeIdx fromTypeID(TypeID ID) {
	    if (ID == TypeID(-1))
	      return TypeIdx(-1);

	    return TypeIdx(ID >> Qualifiers::FastWidth);
	  }
	};

	/// DenseMap key traits for QualTypes that carry no "fast" qualifiers.
	///
	/// Keys are hashed from their opaque pointer value. The opaque pointer
	/// values 1 and 2 are used for the empty and tombstone keys.
	struct UnsafeQualTypeDenseMapInfo {
	  static inline QualType getEmptyKey() {
	    return QualType::getFromOpaquePtr((void*) 1);
	  }

	  static inline QualType getTombstoneKey() {
	    return QualType::getFromOpaquePtr((void*) 2);
	  }

	  /// Hashes \p T by its opaque pointer; asserts that \p T has no local
	  /// fast qualifiers.
	  static inline unsigned getHashValue(QualType T) {
	    assert(!T.getLocalFastQualifiers() &&
	           "hash invalid for types with fast quals");
	    const uintptr_t Raw = reinterpret_cast<uintptr_t>(T.getAsOpaquePtr());
	    const unsigned Truncated = unsigned(Raw);
	    return (Truncated >> 4) ^ (Truncated >> 9);
	  }

	  static inline bool isEqual(QualType A, QualType B) { return A == B; }
	};

	/// \brief An ID number that refers to an identifier in an AST file.
	typedef uint32_t IdentID;

	/// \brief The number of predefined identifier IDs.
	const unsigned int NUM_PREDEF_IDENT_IDS = 1;

	/// \brief An ID number that refers to a macro in an AST file.
	typedef uint32_t MacroID;

	/// \brief A global ID number that refers to a macro in an AST file.
	typedef uint32_t GlobalMacroID;

	/// \brief A local to a module ID number that refers to a macro in an
	/// AST file.
	typedef uint32_t LocalMacroID;

	/// \brief The number of predefined macro IDs.
	const unsigned int NUM_PREDEF_MACRO_IDS = 1;

	/// \brief An ID number that refers to an ObjC selector in an AST file.
	typedef uint32_t SelectorID;

	/// \brief The number of predefined selector IDs.
	const unsigned int NUM_PREDEF_SELECTOR_IDS = 1;

	/// \brief An ID number that refers to a set of CXXBaseSpecifiers in an
	/// AST file.
	typedef uint32_t CXXBaseSpecifiersID;

	/// \brief An ID number that refers to a list of CXXCtorInitializers in an
	/// AST file.
	typedef uint32_t CXXCtorInitializersID;

	/// \brief An ID number that refers to an entity in the detailed
	/// preprocessing record.
	typedef uint32_t PreprocessedEntityID;

	/// \brief An ID number that refers to a submodule in a module file.
	typedef uint32_t SubmoduleID;

	/// \brief The number of predefined submodule IDs.
	const unsigned int NUM_PREDEF_SUBMODULE_IDS = 1;

	/// \brief Source range and AST-file offset of a preprocessed entity.
	///
	/// Both ends of the range are stored as raw source-location encodings.
	struct PPEntityOffset {
	  /// \brief Raw encoding of the location where the range begins.
	  unsigned Begin;
	  /// \brief Raw encoding of the location where the range ends.
	  unsigned End;
	  /// \brief Offset in the AST file.
	  uint32_t BitOffset;

	  /// \brief Captures the raw encodings of both ends of \p R together
	  /// with \p BitOffset.
	  PPEntityOffset(SourceRange R, uint32_t BitOffset)
	      : Begin(R.getBegin().getRawEncoding()),
	        End(R.getEnd().getRawEncoding()),
	        BitOffset(BitOffset) {}

	  /// \brief Decodes the stored start of the range.
	  SourceLocation getBegin() const {
	    return SourceLocation::getFromRawEncoding(Begin);
	  }

	  /// \brief Decodes the stored end of the range.
	  SourceLocation getEnd() const {
	    return SourceLocation::getFromRawEncoding(End);
	  }
	};

	/// \brief Source location and AST-file offset of a declaration.
	///
	/// The location is stored as its raw encoding.
	struct DeclOffset {
	  /// \brief Raw source location.
	  unsigned Loc;
	  /// \brief Offset in the AST file.
	  uint32_t BitOffset;

	  /// \brief Creates an entry with both fields set to zero.
	  DeclOffset() : Loc(0), BitOffset(0) {}

	  /// \brief Creates an entry from the raw encoding of \p Loc and from
	  /// \p BitOffset.
	  DeclOffset(SourceLocation Loc, uint32_t BitOffset)
	      : Loc(Loc.getRawEncoding()), BitOffset(BitOffset) {}

	  /// \brief Replaces the stored location with the raw encoding of \p L.
	  void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); }

	  /// \brief Decodes the stored raw location.
	  SourceLocation getLocation() const {
	    return SourceLocation::getFromRawEncoding(Loc);
	  }
	};

	/// \brief The number of predefined preprocessed entity IDs.
	const unsigned int NUM_PREDEF_PP_ENTITY_IDS = 1;

	/// \brief Describes the various kinds of blocks that occur within
	/// an AST file.
	enum BlockIDs {
	/// \brief The AST block, which acts as a container around the
	/// full AST block.
	AST_BLOCK_ID = llvm::bitc::FIRST_APPLICATION_BLOCKID,

	/// \brief The block containing information about the source
	/// manager.
	SOURCE_MANAGER_BLOCK_ID,

	/// \brief The block containing information about the
	/// preprocessor.
	PREPROCESSOR_BLOCK_ID,

	/// \brief The block containing the definitions of all of the
	/// types and decls used within the AST file.
	DECLTYPES_BLOCK_ID,

	/// \brief The block containing the detailed preprocessing record.
	PREPROCESSOR_DETAIL_BLOCK_ID,

	/// \brief The block containing the submodule structure.
	SUBMODULE_BLOCK_ID,

	/// \brief The block containing comments.
	COMMENTS_BLOCK_ID,

	/// \brief The control block, which contains all of the
	/// information that needs to be validated prior to committing
	/// to loading the AST file.
	CONTROL_BLOCK_ID,

	/// \brief The block of input files, which were used as inputs
	/// to create this AST file.
	///
	/// This block is part of the control block.
	INPUT_FILES_BLOCK_ID,

	/// \brief The block of configuration options, used to check that
	/// a module is being used in a configuration compatible with the
	/// configuration in which it was built.
	///
	/// This block is part of the control block.
	OPTIONS_BLOCK_ID,

	/// \brief A block containing a module file extension.
	EXTENSION_BLOCK_ID,

	/// A block with unhashed content.
	///
	/// These records should not change the \a ASTFileSignature. See \a
	/// UnhashedControlBlockRecordTypes for the list of records.
	UNHASHED_CONTROL_BLOCK_ID,
	};

	/// \brief Record types that occur within the control block.
	///
	/// Codes are numbered consecutively, starting at 1.
	enum ControlRecordTypes {
	  /// \brief AST file metadata, including the AST file version number
	  /// and information about the compiler used to build this AST file.
	  METADATA = 1,

	  /// \brief The list of other AST files imported by this AST file.
	  IMPORTS = 2,

	  /// \brief The original file that was used to generate the AST file,
	  /// including both its file ID and its name.
	  ORIGINAL_FILE = 3,

	  /// \brief The directory that the PCH was originally created in.
	  ORIGINAL_PCH_DIR = 4,

	  /// \brief The file ID of the file or buffer that was used to generate
	  /// the AST file.
	  ORIGINAL_FILE_ID = 5,

	  /// \brief Offsets into the input-files block where input files reside.
	  INPUT_FILE_OFFSETS = 6,

	  /// \brief The module name.
	  MODULE_NAME = 7,

	  /// \brief The module map file that was used to build this AST file.
	  MODULE_MAP_FILE = 8,

	  /// \brief The module build directory.
	  MODULE_DIRECTORY = 9,
	};

	/// \brief Record types that occur within the options block inside
	/// the control block.
	///
	/// Codes are numbered consecutively, starting at 1.
	enum OptionsRecordTypes {
	  /// \brief The language options table.
	  ///
	  /// The record with this code contains the contents of the
	  /// LangOptions structure. The entire structure is serialized, and
	  /// the reader decides which options are actually important to check.
	  LANGUAGE_OPTIONS = 1,

	  /// \brief The target options table.
	  TARGET_OPTIONS = 2,

	  /// \brief The filesystem options table.
	  FILE_SYSTEM_OPTIONS = 3,

	  /// \brief The header search options table.
	  HEADER_SEARCH_OPTIONS = 4,

	  /// \brief The preprocessor options table.
	  PREPROCESSOR_OPTIONS = 5,
	};

	/// Record codes for the unhashed control block.
	///
	/// Codes are numbered consecutively, starting at 1.
	enum UnhashedControlBlockRecordTypes {
	  /// The signature that identifies this AST file.
	  SIGNATURE = 1,

	  /// The diagnostic options table.
	  DIAGNOSTIC_OPTIONS = 2,

	  /// The \#pragma diagnostic mappings.
	  DIAG_PRAGMA_MAPPINGS = 3,
	};

	/// \brief Record code for extension blocks.
	enum ExtensionBlockRecordTypes {
	/// Metadata describing this particular extension.
	EXTENSION_METADATA = 1,

	/// The first record ID allocated to the extensions themselves.
	FIRST_EXTENSION_RECORD_ID = 4
	};

	/// \brief Record types that occur within the input-files block
	/// inside the control block.
	enum InputFileRecordTypes {
	/// \brief An input file.
	INPUT_FILE = 1
	};

	/// \brief Record types that occur within the AST block itself.
	enum ASTRecordTypes {
	/// \brief Record code for the offsets of each type.
	///
	/// The TYPE_OFFSET constant describes the record that occurs
	/// within the AST block. The record itself is an array of offsets that
	/// point into the declarations and types block (identified by
	/// DECLTYPES_BLOCK_ID). The index into the array is based on the ID
	/// of a type. For a given type ID @c T, the lower three bits of
	/// @c T are its qualifiers (const, volatile, restrict), as in
	/// the QualType class. The upper bits, after being shifted and
	/// subtracting NUM_PREDEF_TYPE_IDS, are used to index into the
	/// TYPE_OFFSET block to determine the offset of that type's
	/// corresponding record within the DECLTYPES_BLOCK_ID block.
	TYPE_OFFSET = 1,

	/// \brief Record code for the offsets of each decl.
	///
	/// The DECL_OFFSET constant describes the record that occurs
	/// within the block identified by DECL_OFFSETS_BLOCK_ID within
	/// the AST block. The record itself is an array of offsets that
	/// point into the declarations and types block (identified by
	/// DECLTYPES_BLOCK_ID). The declaration ID is an index into this
	/// record, after subtracting one to account for the use of
	/// declaration ID 0 for a NULL declaration pointer. Index 0 is
	/// reserved for the translation unit declaration.
	DECL_OFFSET = 2,

	/// \brief Record code for the table of offsets of each
	/// identifier ID.
	///
	/// The offset table contains offsets into the blob stored in
	/// the IDENTIFIER_TABLE record. Each offset points to the
	/// NULL-terminated string that corresponds to that identifier.
	IDENTIFIER_OFFSET = 3,

	/// \brief This is so that older clang versions, before the introduction
	/// of the control block, can read and reject the newer PCH format.
	/// DON'T CHANGE THIS NUMBER.
	METADATA_OLD_FORMAT = 4,

	/// \brief Record code for the identifier table.
	///
	/// The identifier table is a simple blob that contains
	/// NULL-terminated strings for all of the identifiers
	/// referenced by the AST file. The IDENTIFIER_OFFSET table
	/// contains the mapping from identifier IDs to the characters
	/// in this blob. Note that the starting offsets of all of the
	/// identifiers are odd, so that, when the identifier offset
	/// table is loaded in, we can use the low bit to distinguish
	/// between offsets (for unresolved identifier IDs) and
	/// IdentifierInfo pointers (for already-resolved identifier
	/// IDs).
	IDENTIFIER_TABLE = 5,

	/// \brief Record code for the array of eagerly deserialized decls.
	///
	/// The AST file contains a list of all of the declarations that should be
	/// eagerly deserialized present within the parsed headers, stored as an
	/// array of declaration IDs. These declarations will be
	/// reported to the AST consumer after the AST file has been
	/// read, since their presence can affect the semantics of the
	/// program (e.g., for code generation).
	EAGERLY_DESERIALIZED_DECLS = 6,

	/// \brief Record code for the set of non-builtin, special
	/// types.
	///
	/// This record contains the type IDs for the various type nodes
	/// that are constructed during semantic analysis (e.g.,
	/// __builtin_va_list). The SPECIAL_TYPE_* constants provide
	/// offsets into this record.
	SPECIAL_TYPES = 7,

	/// \brief Record code for the extra statistics we gather while
	/// generating an AST file.
	STATISTICS = 8,

	/// \brief Record code for the array of tentative definitions.
	TENTATIVE_DEFINITIONS = 9,

	// ID 10 used to be for a list of extern "C" declarations.

	/// \brief Record code for the table of offsets into the
	/// Objective-C method pool.
	SELECTOR_OFFSETS = 11,

	/// \brief Record code for the Objective-C method pool.
	METHOD_POOL = 12,

	/// \brief The value of the next __COUNTER__ to dispense.
	/// [PP_COUNTER_VALUE, Val]
	PP_COUNTER_VALUE = 13,

	/// \brief Record code for the table of offsets into the block
	/// of source-location information.
	SOURCE_LOCATION_OFFSETS = 14,

	/// \brief Record code for the set of source location entries
	/// that need to be preloaded by the AST reader.
	///
	/// This set contains the source location entry for the
	/// predefines buffer and for any file entries that need to be
	/// preloaded.
	SOURCE_LOCATION_PRELOADS = 15,

	/// \brief Record code for the set of ext_vector type names.
	EXT_VECTOR_DECLS = 16,

	/// \brief Record code for the array of unused file scoped decls.
	UNUSED_FILESCOPED_DECLS = 17,

	/// \brief Record code for the table of offsets to entries in the
	/// preprocessing record.
	PPD_ENTITIES_OFFSETS = 18,

	/// \brief Record code for the array of VTable uses.
	VTABLE_USES = 19,

	// ID 20 used to be for a list of dynamic classes.

	/// \brief Record code for referenced selector pool.
	REFERENCED_SELECTOR_POOL = 21,

	/// \brief Record code for an update to the TU's lexically contained
	/// declarations.
	TU_UPDATE_LEXICAL = 22,

	// ID 23 used to be for a list of local redeclarations.

	/// \brief Record code for declarations that Sema keeps references of.
	SEMA_DECL_REFS = 24,

	/// \brief Record code for weak undeclared identifiers.
	WEAK_UNDECLARED_IDENTIFIERS = 25,

	/// \brief Record code for pending implicit instantiations.
	PENDING_IMPLICIT_INSTANTIATIONS = 26,

	// ID 27 used to be for a list of replacement decls.

	/// \brief Record code for an update to a decl context's lookup table.
	///
	/// In practice, this should only be used for the TU and namespaces.
	UPDATE_VISIBLE = 28,

	/// \brief Record for offsets of DECL_UPDATES records for declarations
	/// that were modified after being deserialized and need updates.
	DECL_UPDATE_OFFSETS = 29,

	// ID 30 used to be a decl update record. These are now in the DECLTYPES
	// block.

	// ID 31 used to be a list of offsets to DECL_CXX_BASE_SPECIFIERS records.

	// ID 32 used to be the code for \#pragma diagnostic mappings.

	/// \brief Record code for special CUDA declarations.
	CUDA_SPECIAL_DECL_REFS = 33,

	/// \brief Record code for header search information.
	HEADER_SEARCH_TABLE = 34,

	/// \brief Record code for floating point \#pragma options.
	FP_PRAGMA_OPTIONS = 35,

	/// \brief Record code for enabled OpenCL extensions.
	OPENCL_EXTENSIONS = 36,

	/// \brief The list of delegating constructor declarations.
	DELEGATING_CTORS = 37,

	/// \brief Record code for the set of known namespaces, which are used
	/// for typo correction.
	KNOWN_NAMESPACES = 38,

	/// \brief Record code for the remapping information used to relate
	/// loaded modules to the various offsets and IDs (e.g., source location
	/// offsets, declaration and type IDs) that are used in that module to
	/// refer to other modules.
	MODULE_OFFSET_MAP = 39,

	/// \brief Record code for the source manager line table information,
	/// which stores information about \#line directives.
	SOURCE_MANAGER_LINE_TABLE = 40,

	/// \brief Record code for map of Objective-C class definition IDs to the
	/// ObjC categories in a module that are attached to that class.
	OBJC_CATEGORIES_MAP = 41,

	/// \brief Record code for a file sorted array of DeclIDs in a module.
	FILE_SORTED_DECLS = 42,

	/// \brief Record code for an array of all of the (sub)modules that were
	/// imported by the AST file.
	IMPORTED_MODULES = 43,

	// ID 44 used to be a table of merged canonical declarations.
	// ID 45 used to be a list of declaration IDs of local redeclarations.

	/// \brief Record code for the array of Objective-C categories (including
	/// extensions).
	///
	/// This array can only be interpreted properly using the Objective-C
	/// categories map.
	OBJC_CATEGORIES = 46,

	/// \brief Record code for the table of offsets of each macro ID.
	///
	/// The offset table contains offsets into the blob stored in
	/// the preprocessor block. Each offset points to the corresponding
	/// macro definition.
	MACRO_OFFSET = 47,

	/// \brief A list of "interesting" identifiers. Only used in C++ (where we
	/// don't normally do lookups into the serialized identifier table). These
	/// are eagerly deserialized.
	INTERESTING_IDENTIFIERS = 48,

	/// \brief Record code for undefined but used functions and variables that
	/// need a definition in this TU.
	UNDEFINED_BUT_USED = 49,

	/// \brief Record code for late parsed template functions.
	LATE_PARSED_TEMPLATE = 50,

	/// \brief Record code for \#pragma optimize options.
	OPTIMIZE_PRAGMA_OPTIONS = 51,

	/// \brief Record code for potentially unused local typedef names.
	UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES = 52,

	// ID 53 used to be a table of constructor initializer records.

	/// \brief Delete expressions that will be analyzed later.
	DELETE_EXPRS_TO_ANALYZE = 54,

	/// \brief Record code for \#pragma ms_struct options.
	MSSTRUCT_PRAGMA_OPTIONS = 55,

	/// \brief Record code for \#pragma pointers_to_members options.
	POINTERS_TO_MEMBERS_PRAGMA_OPTIONS = 56,

	/// \brief Number of unmatched #pragma clang cuda_force_host_device begin
	/// directives we've seen.
	CUDA_PRAGMA_FORCE_HOST_DEVICE_DEPTH = 57,

	/// \brief Record code for types associated with OpenCL extensions.
	OPENCL_EXTENSION_TYPES = 58,

	/// \brief Record code for declarations associated with OpenCL extensions.
	OPENCL_EXTENSION_DECLS = 59,

	/// \brief Record code for modular-codegen declarations.
	/// NOTE(review): the record's contents are not described in this header;
	/// presumably the declarations that take part in modular code generation.
	/// Confirm against the AST reader/writer.
	MODULAR_CODEGEN_DECLS = 60,

	/// \brief Record code for \#pragma pack options.
	PACK_PRAGMA_OPTIONS = 61,

	/// \brief The stack of open #ifs/#ifdefs recorded in a preamble.
	PP_CONDITIONAL_STACK = 62,
	};

	/// \brief Record types used within a source manager block.
	enum SourceManagerRecordTypes {
	/// \brief Describes a source location entry (SLocEntry) for a
	/// file.
	SM_SLOC_FILE_ENTRY = 1,
	/// \brief Describes a source location entry (SLocEntry) for a
	/// buffer.
	SM_SLOC_BUFFER_ENTRY = 2,
	/// \brief Describes a blob that contains the data for a buffer
	/// entry. This kind of record always directly follows a
	/// SM_SLOC_BUFFER_ENTRY record or a SM_SLOC_FILE_ENTRY with an
	/// overridden buffer.
	SM_SLOC_BUFFER_BLOB = 3,
	/// \brief Describes a zlib-compressed blob that contains the data for
	/// a buffer entry.
	SM_SLOC_BUFFER_BLOB_COMPRESSED = 4,
	/// \brief Describes a source location entry (SLocEntry) for a
	/// macro expansion.
	SM_SLOC_EXPANSION_ENTRY = 5
	};

	/// \brief Record types used within a preprocessor block.
	enum PreprocessorRecordTypes {
	// The macros in the PP section are a PP_MACRO_* instance followed by a
	// list of PP_TOKEN instances for each token in the definition.

	/// \brief An object-like macro definition.
	/// [PP_MACRO_OBJECT_LIKE, IdentInfoID, SLoc, IsUsed]
	PP_MACRO_OBJECT_LIKE = 1,

	/// \brief A function-like macro definition.
	/// [PP_MACRO_FUNCTION_LIKE, \<ObjectLikeStuff>, IsC99Varargs,
	/// IsGNUVarargs, NumArgs, ArgIdentInfoID* ]
	PP_MACRO_FUNCTION_LIKE = 2,

	/// \brief Describes one token.
	/// [PP_TOKEN, SLoc, Length, IdentInfoID, Kind, Flags]
	PP_TOKEN = 3,

	/// \brief The macro directives history for a particular identifier.
	PP_MACRO_DIRECTIVE_HISTORY = 4,

	/// \brief A macro directive exported by a module.
	/// [PP_MODULE_MACRO, SubmoduleID, MacroID, (Overridden SubmoduleID)*]
	PP_MODULE_MACRO = 5,
	};

	/// \brief Record types used within a preprocessor detail block.
	enum PreprocessorDetailRecordTypes {
	/// \brief Describes a macro expansion within the preprocessing record.
	PPD_MACRO_EXPANSION = 0,

	/// \brief Describes a macro definition within the preprocessing record.
	PPD_MACRO_DEFINITION = 1,

	/// \brief Describes an inclusion directive within the preprocessing
	/// record.
	PPD_INCLUSION_DIRECTIVE = 2
	};

	/// \brief Record types used within a submodule description block.
	///
	/// Values are assigned explicitly and run contiguously from 0
	/// (SUBMODULE_METADATA) to 16 (SUBMODULE_INITIALIZERS).
	enum SubmoduleRecordTypes {
	/// \brief Metadata for submodules as a whole.
	SUBMODULE_METADATA = 0,
	/// \brief Defines the major attributes of a submodule, including its
	/// name and parent.
	SUBMODULE_DEFINITION = 1,
	/// \brief Specifies the umbrella header used to create this module,
	/// if any.
	SUBMODULE_UMBRELLA_HEADER = 2,
	/// \brief Specifies a header that falls into this (sub)module.
	SUBMODULE_HEADER = 3,
	/// \brief Specifies a top-level header that falls into this (sub)module.
	SUBMODULE_TOPHEADER = 4,
	/// \brief Specifies an umbrella directory.
	SUBMODULE_UMBRELLA_DIR = 5,
	/// \brief Specifies the submodules that are imported by this
	/// submodule.
	SUBMODULE_IMPORTS = 6,
	/// \brief Specifies the submodules that are re-exported from this
	/// submodule.
	SUBMODULE_EXPORTS = 7,
	/// \brief Specifies a required feature.
	SUBMODULE_REQUIRES = 8,
	/// \brief Specifies a header that has been explicitly excluded
	/// from this submodule.
	SUBMODULE_EXCLUDED_HEADER = 9,
	/// \brief Specifies a library or framework to link against.
	SUBMODULE_LINK_LIBRARY = 10,
	/// \brief Specifies a configuration macro for this module.
	SUBMODULE_CONFIG_MACRO = 11,
	/// \brief Specifies a conflict with another module.
	SUBMODULE_CONFLICT = 12,
	/// \brief Specifies a header that is private to this submodule.
	SUBMODULE_PRIVATE_HEADER = 13,
	/// \brief Specifies a header that is part of the module but must be
	/// textually included.
	SUBMODULE_TEXTUAL_HEADER = 14,
	/// \brief Specifies a header that is private to this submodule but
	/// must be textually included.
	SUBMODULE_PRIVATE_TEXTUAL_HEADER = 15,
	/// \brief Specifies some declarations with initializers that must be
	/// emitted to initialize the module.
	SUBMODULE_INITIALIZERS = 16,
	};

	/// \brief Record types used within a comments block.
	enum CommentRecordTypes {
	/// \brief The only record type defined for a comments block.
	COMMENTS_RAW_COMMENT = 0
	};

	/// \defgroup ASTAST AST file AST constants
	///
	/// The constants in this group describe various components of the
	/// abstract syntax tree within an AST file.
	///
	/// @{

	/// \brief Predefined type IDs.
	///
	/// These type IDs correspond to predefined types in the AST
	/// context, such as built-in types (int) and special place-holder
	/// types (the \<overload> and \<dependent> type markers). Such
	/// types are never actually serialized, since they will be built
	/// by the AST context when it is created.
	///
	/// IDs 0 through 43 are assigned explicitly below. The OpenCL image type
	/// IDs at the end of the enumeration carry no initializer and therefore
	/// continue numbering from 44.
	enum PredefinedTypeIDs {
	/// \brief The NULL type.
	PREDEF_TYPE_NULL_ID = 0,
	/// \brief The void type.
	PREDEF_TYPE_VOID_ID = 1,
	/// \brief The 'bool' or '_Bool' type.
	PREDEF_TYPE_BOOL_ID = 2,
	/// \brief The 'char' type, when it is unsigned.
	PREDEF_TYPE_CHAR_U_ID = 3,
	/// \brief The 'unsigned char' type.
	PREDEF_TYPE_UCHAR_ID = 4,
	/// \brief The 'unsigned short' type.
	PREDEF_TYPE_USHORT_ID = 5,
	/// \brief The 'unsigned int' type.
	PREDEF_TYPE_UINT_ID = 6,
	/// \brief The 'unsigned long' type.
	PREDEF_TYPE_ULONG_ID = 7,
	/// \brief The 'unsigned long long' type.
	PREDEF_TYPE_ULONGLONG_ID = 8,
	/// \brief The 'char' type, when it is signed.
	PREDEF_TYPE_CHAR_S_ID = 9,
	/// \brief The 'signed char' type.
	PREDEF_TYPE_SCHAR_ID = 10,
	/// \brief The C++ 'wchar_t' type.
	PREDEF_TYPE_WCHAR_ID = 11,
	/// \brief The (signed) 'short' type.
	PREDEF_TYPE_SHORT_ID = 12,
	/// \brief The (signed) 'int' type.
	PREDEF_TYPE_INT_ID = 13,
	/// \brief The (signed) 'long' type.
	PREDEF_TYPE_LONG_ID = 14,
	/// \brief The (signed) 'long long' type.
	PREDEF_TYPE_LONGLONG_ID = 15,
	/// \brief The 'float' type.
	PREDEF_TYPE_FLOAT_ID = 16,
	/// \brief The 'double' type.
	PREDEF_TYPE_DOUBLE_ID = 17,
	/// \brief The 'long double' type.
	PREDEF_TYPE_LONGDOUBLE_ID = 18,
	/// \brief The placeholder type for overloaded function sets.
	PREDEF_TYPE_OVERLOAD_ID = 19,
	/// \brief The placeholder type for dependent types.
	PREDEF_TYPE_DEPENDENT_ID = 20,
	/// \brief The '__uint128_t' type.
	PREDEF_TYPE_UINT128_ID = 21,
	/// \brief The '__int128_t' type.
	PREDEF_TYPE_INT128_ID = 22,
	/// \brief The type of 'nullptr'.
	PREDEF_TYPE_NULLPTR_ID = 23,
	/// \brief The C++ 'char16_t' type.
	PREDEF_TYPE_CHAR16_ID = 24,
	/// \brief The C++ 'char32_t' type.
	PREDEF_TYPE_CHAR32_ID = 25,
	/// \brief The ObjC 'id' type.
	PREDEF_TYPE_OBJC_ID = 26,
	/// \brief The ObjC 'Class' type.
	PREDEF_TYPE_OBJC_CLASS = 27,
	/// \brief The ObjC 'SEL' type.
	PREDEF_TYPE_OBJC_SEL = 28,
	/// \brief The 'unknown any' placeholder type.
	PREDEF_TYPE_UNKNOWN_ANY = 29,
	/// \brief The placeholder type for bound member functions.
	PREDEF_TYPE_BOUND_MEMBER = 30,
	/// \brief The "auto" deduction type.
	PREDEF_TYPE_AUTO_DEDUCT = 31,
	/// \brief The "auto &&" deduction type.
	PREDEF_TYPE_AUTO_RREF_DEDUCT = 32,
	/// \brief The OpenCL 'half' / ARM NEON __fp16 type.
	PREDEF_TYPE_HALF_ID = 33,
	/// \brief ARC's unbridged-cast placeholder type.
	PREDEF_TYPE_ARC_UNBRIDGED_CAST = 34,
	/// \brief The pseudo-object placeholder type.
	PREDEF_TYPE_PSEUDO_OBJECT = 35,
	/// \brief The placeholder type for builtin functions.
	PREDEF_TYPE_BUILTIN_FN = 36,
	/// \brief OpenCL event type.
	PREDEF_TYPE_EVENT_ID = 37,
	/// \brief OpenCL clk event type.
	PREDEF_TYPE_CLK_EVENT_ID = 38,
	/// \brief OpenCL sampler type.
	PREDEF_TYPE_SAMPLER_ID = 39,
	/// \brief OpenCL queue type.
	PREDEF_TYPE_QUEUE_ID = 40,
	/// \brief OpenCL reserve_id type.
	PREDEF_TYPE_RESERVE_ID_ID = 41,
	/// \brief The placeholder type for OpenMP array section.
	PREDEF_TYPE_OMP_ARRAY_SECTION = 42,
	/// \brief The '__float128' type.
	PREDEF_TYPE_FLOAT128_ID = 43,
	/// \brief OpenCL image types with auto numeration
	// Each IMAGE_TYPE(...) use in the included .def file expands to one
	// PREDEF_TYPE_<Id>_ID enumerator; only the Id argument is used here.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	PREDEF_TYPE_##Id##_ID,
	#include "clang/Basic/OpenCLImageTypes.def"
	};

	/// \brief The number of predefined type IDs that are reserved for
	/// the PREDEF_TYPE_* constants.
	///
	/// Type IDs for non-predefined types will start at
	/// NUM_PREDEF_TYPE_IDS.
	const unsigned NUM_PREDEF_TYPE_IDS = 100;

	/// \brief Record codes for each kind of type.
	///
	/// These constants describe the type records that can occur within a
	/// block identified by DECLTYPES_BLOCK_ID in the AST file. Each
	/// constant describes a record for a specific type class in the
	/// AST. Note that DeclCode values share this code space.
	///
	/// All values are assigned explicitly. Code 2 is not assigned to any
	/// enumerator in this list.
	enum TypeCode {
	/// \brief An ExtQualType record.
	TYPE_EXT_QUAL = 1,
	/// \brief A ComplexType record.
	TYPE_COMPLEX = 3,
	/// \brief A PointerType record.
	TYPE_POINTER = 4,
	/// \brief A BlockPointerType record.
	TYPE_BLOCK_POINTER = 5,
	/// \brief An LValueReferenceType record.
	TYPE_LVALUE_REFERENCE = 6,
	/// \brief An RValueReferenceType record.
	TYPE_RVALUE_REFERENCE = 7,
	/// \brief A MemberPointerType record.
	TYPE_MEMBER_POINTER = 8,
	/// \brief A ConstantArrayType record.
	TYPE_CONSTANT_ARRAY = 9,
	/// \brief An IncompleteArrayType record.
	TYPE_INCOMPLETE_ARRAY = 10,
	/// \brief A VariableArrayType record.
	TYPE_VARIABLE_ARRAY = 11,
	/// \brief A VectorType record.
	TYPE_VECTOR = 12,
	/// \brief An ExtVectorType record.
	TYPE_EXT_VECTOR = 13,
	/// \brief A FunctionNoProtoType record.
	TYPE_FUNCTION_NO_PROTO = 14,
	/// \brief A FunctionProtoType record.
	TYPE_FUNCTION_PROTO = 15,
	/// \brief A TypedefType record.
	TYPE_TYPEDEF = 16,
	/// \brief A TypeOfExprType record.
	TYPE_TYPEOF_EXPR = 17,
	/// \brief A TypeOfType record.
	TYPE_TYPEOF = 18,
	/// \brief A RecordType record.
	TYPE_RECORD = 19,
	/// \brief An EnumType record.
	TYPE_ENUM = 20,
	/// \brief An ObjCInterfaceType record.
	TYPE_OBJC_INTERFACE = 21,
	/// \brief An ObjCObjectPointerType record.
	TYPE_OBJC_OBJECT_POINTER = 22,
	/// \brief A DecltypeType record.
	TYPE_DECLTYPE = 23,
	/// \brief An ElaboratedType record.
	TYPE_ELABORATED = 24,
	/// \brief A SubstTemplateTypeParmType record.
	TYPE_SUBST_TEMPLATE_TYPE_PARM = 25,
	/// \brief An UnresolvedUsingType record.
	TYPE_UNRESOLVED_USING = 26,
	/// \brief An InjectedClassNameType record.
	TYPE_INJECTED_CLASS_NAME = 27,
	/// \brief An ObjCObjectType record.
	TYPE_OBJC_OBJECT = 28,
	/// \brief A TemplateTypeParmType record.
	TYPE_TEMPLATE_TYPE_PARM = 29,
	/// \brief A TemplateSpecializationType record.
	TYPE_TEMPLATE_SPECIALIZATION = 30,
	/// \brief A DependentNameType record.
	TYPE_DEPENDENT_NAME = 31,
	/// \brief A DependentTemplateSpecializationType record.
	TYPE_DEPENDENT_TEMPLATE_SPECIALIZATION = 32,
	/// \brief A DependentSizedArrayType record.
	TYPE_DEPENDENT_SIZED_ARRAY = 33,
	/// \brief A ParenType record.
	TYPE_PAREN = 34,
	/// \brief A PackExpansionType record.
	TYPE_PACK_EXPANSION = 35,
	/// \brief An AttributedType record.
	TYPE_ATTRIBUTED = 36,
	/// \brief A SubstTemplateTypeParmPackType record.
	TYPE_SUBST_TEMPLATE_TYPE_PARM_PACK = 37,
	/// \brief An AutoType record.
	TYPE_AUTO = 38,
	/// \brief A UnaryTransformType record.
	TYPE_UNARY_TRANSFORM = 39,
	/// \brief An AtomicType record.
	TYPE_ATOMIC = 40,
	/// \brief A DecayedType record.
	TYPE_DECAYED = 41,
	/// \brief An AdjustedType record.
	TYPE_ADJUSTED = 42,
	/// \brief A PipeType record.
	TYPE_PIPE = 43,
	/// \brief An ObjCTypeParamType record.
	TYPE_OBJC_TYPE_PARAM = 44,
	/// \brief A DeducedTemplateSpecializationType record.
	TYPE_DEDUCED_TEMPLATE_SPECIALIZATION = 45,
	/// \brief A DependentSizedExtVectorType record.
	TYPE_DEPENDENT_SIZED_EXT_VECTOR = 46
	};

	/// \brief The type IDs for special types constructed by semantic
	/// analysis.
	///
	/// The constants in this enumeration are indices into the
	/// SPECIAL_TYPES record. Values are explicit and contiguous, 0 through 7.
	enum SpecialTypeIDs {
	/// \brief CFConstantString type.
	SPECIAL_TYPE_CF_CONSTANT_STRING = 0,
	/// \brief C FILE typedef type.
	SPECIAL_TYPE_FILE = 1,
	/// \brief C jmp_buf typedef type.
	SPECIAL_TYPE_JMP_BUF = 2,
	/// \brief C sigjmp_buf typedef type.
	SPECIAL_TYPE_SIGJMP_BUF = 3,
	/// \brief Objective-C "id" redefinition type.
	SPECIAL_TYPE_OBJC_ID_REDEFINITION = 4,
	/// \brief Objective-C "Class" redefinition type.
	SPECIAL_TYPE_OBJC_CLASS_REDEFINITION = 5,
	/// \brief Objective-C "SEL" redefinition type.
	SPECIAL_TYPE_OBJC_SEL_REDEFINITION = 6,
	/// \brief C ucontext_t typedef type.
	SPECIAL_TYPE_UCONTEXT_T = 7
	};

	/// \brief The number of special type IDs.
	///
	/// This is one more than SPECIAL_TYPE_UCONTEXT_T (7), the largest
	/// SpecialTypeIDs value, so it counts enumerators 0 through 7.
	const unsigned NumSpecialTypeIDs = 8;

	/// \brief Predefined declaration IDs.
	///
	/// These declaration IDs correspond to predefined declarations in the AST
	/// context, such as the NULL declaration ID. Such declarations are never
	/// actually serialized, since they will be built by the AST context when
	/// it is created.
	///
	/// Values are explicit and contiguous, 0 through 16.
	enum PredefinedDeclIDs {
	/// \brief The NULL declaration.
	PREDEF_DECL_NULL_ID = 0,

	/// \brief The translation unit.
	PREDEF_DECL_TRANSLATION_UNIT_ID = 1,

	/// \brief The Objective-C 'id' type.
	PREDEF_DECL_OBJC_ID_ID = 2,

	/// \brief The Objective-C 'SEL' type.
	PREDEF_DECL_OBJC_SEL_ID = 3,

	/// \brief The Objective-C 'Class' type.
	PREDEF_DECL_OBJC_CLASS_ID = 4,

	/// \brief The Objective-C 'Protocol' type.
	PREDEF_DECL_OBJC_PROTOCOL_ID = 5,

	/// \brief The signed 128-bit integer type.
	PREDEF_DECL_INT_128_ID = 6,

	/// \brief The unsigned 128-bit integer type.
	PREDEF_DECL_UNSIGNED_INT_128_ID = 7,

	/// \brief The internal 'instancetype' typedef.
	PREDEF_DECL_OBJC_INSTANCETYPE_ID = 8,

	/// \brief The internal '__builtin_va_list' typedef.
	PREDEF_DECL_BUILTIN_VA_LIST_ID = 9,

	/// \brief The internal '__va_list_tag' struct, if any.
	PREDEF_DECL_VA_LIST_TAG = 10,

	/// \brief The internal '__builtin_ms_va_list' typedef.
	PREDEF_DECL_BUILTIN_MS_VA_LIST_ID = 11,

	/// \brief The extern "C" context.
	PREDEF_DECL_EXTERN_C_CONTEXT_ID = 12,

	/// \brief The internal '__make_integer_seq' template.
	PREDEF_DECL_MAKE_INTEGER_SEQ_ID = 13,

	/// \brief The internal '__NSConstantString' typedef.
	PREDEF_DECL_CF_CONSTANT_STRING_ID = 14,

	/// \brief The internal '__NSConstantString' tag type.
	PREDEF_DECL_CF_CONSTANT_STRING_TAG_ID = 15,

	/// \brief The internal '__type_pack_element' template.
	PREDEF_DECL_TYPE_PACK_ELEMENT_ID = 16,
	};

	/// \brief The number of declaration IDs that are predefined.
	///
	/// For more information about predefined declarations, see the
	/// \c PredefinedDeclIDs type and the PREDEF_DECL_*_ID constants.
	///
	/// This is one more than PREDEF_DECL_TYPE_PACK_ELEMENT_ID (16), the
	/// largest \c PredefinedDeclIDs value.
	const unsigned int NUM_PREDEF_DECL_IDS = 17;

	/// \brief Record of updates for a declaration that was modified after
	/// being deserialized. This can occur within DECLTYPES_BLOCK_ID.
	///
	/// Code 49 sits above the largest TypeCode value (46) and below the
	/// first DeclCode value (51).
	const unsigned int DECL_UPDATES = 49;

	/// \brief Record code for a list of local redeclarations of a declaration.
	/// This can occur within DECLTYPES_BLOCK_ID.
	///
	/// Code 50 immediately precedes the first DeclCode value (51).
	const unsigned int LOCAL_REDECLARATIONS = 50;

	/// \brief Record codes for each kind of declaration.
	///
	/// These constants describe the declaration records that can occur within
	/// a declarations block (identified by DECLTYPES_BLOCK_ID). Each
	/// constant describes a record for a specific declaration class
	/// in the AST. Note that TypeCode values share this code space.
	///
	/// Only the first enumerator has an explicit value (51); every later
	/// enumerator takes the next consecutive value, so inserting or
	/// reordering entries renumbers everything that follows.
	enum DeclCode {
	/// \brief A TypedefDecl record.
	DECL_TYPEDEF = 51,
	/// \brief A TypeAliasDecl record.
	DECL_TYPEALIAS,
	/// \brief An EnumDecl record.
	DECL_ENUM,
	/// \brief A RecordDecl record.
	DECL_RECORD,
	/// \brief An EnumConstantDecl record.
	DECL_ENUM_CONSTANT,
	/// \brief A FunctionDecl record.
	DECL_FUNCTION,
	/// \brief An ObjCMethodDecl record.
	DECL_OBJC_METHOD,
	/// \brief An ObjCInterfaceDecl record.
	DECL_OBJC_INTERFACE,
	/// \brief An ObjCProtocolDecl record.
	DECL_OBJC_PROTOCOL,
	/// \brief An ObjCIvarDecl record.
	DECL_OBJC_IVAR,
	/// \brief An ObjCAtDefsFieldDecl record.
	DECL_OBJC_AT_DEFS_FIELD,
	/// \brief An ObjCCategoryDecl record.
	DECL_OBJC_CATEGORY,
	/// \brief An ObjCCategoryImplDecl record.
	DECL_OBJC_CATEGORY_IMPL,
	/// \brief An ObjCImplementationDecl record.
	DECL_OBJC_IMPLEMENTATION,
	/// \brief An ObjCCompatibleAliasDecl record.
	DECL_OBJC_COMPATIBLE_ALIAS,
	/// \brief An ObjCPropertyDecl record.
	DECL_OBJC_PROPERTY,
	/// \brief An ObjCPropertyImplDecl record.
	DECL_OBJC_PROPERTY_IMPL,
	/// \brief A FieldDecl record.
	DECL_FIELD,
	/// \brief An MSPropertyDecl record.
	DECL_MS_PROPERTY,
	/// \brief A VarDecl record.
	DECL_VAR,
	/// \brief An ImplicitParamDecl record.
	DECL_IMPLICIT_PARAM,
	/// \brief A ParmVarDecl record.
	DECL_PARM_VAR,
	/// \brief A DecompositionDecl record.
	DECL_DECOMPOSITION,
	/// \brief A BindingDecl record.
	DECL_BINDING,
	/// \brief A FileScopeAsmDecl record.
	DECL_FILE_SCOPE_ASM,
	/// \brief A BlockDecl record.
	DECL_BLOCK,
	/// \brief A CapturedDecl record.
	DECL_CAPTURED,
	/// \brief A record that stores the set of declarations that are
	/// lexically stored within a given DeclContext.
	///
	/// The record itself is a blob that is an array of declaration IDs,
	/// in the order in which those declarations were added to the
	/// declaration context. This data is used when iterating over
	/// the contents of a DeclContext, e.g., via
	/// DeclContext::decls_begin() and DeclContext::decls_end().
	DECL_CONTEXT_LEXICAL,
	/// \brief A record that stores the set of declarations that are
	/// visible from a given DeclContext.
	///
	/// The record itself stores a set of mappings, each of which
	/// associates a declaration name with one or more declaration
	/// IDs. This data is used when performing qualified name lookup
	/// into a DeclContext via DeclContext::lookup.
	DECL_CONTEXT_VISIBLE,
	/// \brief A LabelDecl record.
	DECL_LABEL,
	/// \brief A NamespaceDecl record.
	DECL_NAMESPACE,
	/// \brief A NamespaceAliasDecl record.
	DECL_NAMESPACE_ALIAS,
	/// \brief A UsingDecl record.
	DECL_USING,
	/// \brief A UsingPackDecl record.
	DECL_USING_PACK,
	/// \brief A UsingShadowDecl record.
	DECL_USING_SHADOW,
	/// \brief A ConstructorUsingShadowDecl record.
	DECL_CONSTRUCTOR_USING_SHADOW,
	/// \brief A UsingDirectiveDecl record.
	DECL_USING_DIRECTIVE,
	/// \brief An UnresolvedUsingValueDecl record.
	DECL_UNRESOLVED_USING_VALUE,
	/// \brief An UnresolvedUsingTypenameDecl record.
	DECL_UNRESOLVED_USING_TYPENAME,
	/// \brief A LinkageSpecDecl record.
	DECL_LINKAGE_SPEC,
	/// \brief An ExportDecl record.
	DECL_EXPORT,
	/// \brief A CXXRecordDecl record.
	DECL_CXX_RECORD,
	/// \brief A CXXDeductionGuideDecl record.
	DECL_CXX_DEDUCTION_GUIDE,
	/// \brief A CXXMethodDecl record.
	DECL_CXX_METHOD,
	/// \brief A CXXConstructorDecl record.
	DECL_CXX_CONSTRUCTOR,
	/// \brief A CXXConstructorDecl record for an inherited constructor.
	DECL_CXX_INHERITED_CONSTRUCTOR,
	/// \brief A CXXDestructorDecl record.
	DECL_CXX_DESTRUCTOR,
	/// \brief A CXXConversionDecl record.
	DECL_CXX_CONVERSION,
	/// \brief An AccessSpecDecl record.
	DECL_ACCESS_SPEC,

	/// \brief A FriendDecl record.
	DECL_FRIEND,
	/// \brief A FriendTemplateDecl record.
	DECL_FRIEND_TEMPLATE,
	/// \brief A ClassTemplateDecl record.
	DECL_CLASS_TEMPLATE,
	/// \brief A ClassTemplateSpecializationDecl record.
	DECL_CLASS_TEMPLATE_SPECIALIZATION,
	/// \brief A ClassTemplatePartialSpecializationDecl record.
	DECL_CLASS_TEMPLATE_PARTIAL_SPECIALIZATION,
	/// \brief A VarTemplateDecl record.
	DECL_VAR_TEMPLATE,
	/// \brief A VarTemplateSpecializationDecl record.
	DECL_VAR_TEMPLATE_SPECIALIZATION,
	/// \brief A VarTemplatePartialSpecializationDecl record.
	DECL_VAR_TEMPLATE_PARTIAL_SPECIALIZATION,
	/// \brief A FunctionTemplateDecl record.
	DECL_FUNCTION_TEMPLATE,
	/// \brief A TemplateTypeParmDecl record.
	DECL_TEMPLATE_TYPE_PARM,
	/// \brief A NonTypeTemplateParmDecl record.
	DECL_NON_TYPE_TEMPLATE_PARM,
	/// \brief A TemplateTemplateParmDecl record.
	DECL_TEMPLATE_TEMPLATE_PARM,
	/// \brief A TypeAliasTemplateDecl record.
	DECL_TYPE_ALIAS_TEMPLATE,
	/// \brief A StaticAssertDecl record.
	DECL_STATIC_ASSERT,
	/// \brief A record containing CXXBaseSpecifiers.
	DECL_CXX_BASE_SPECIFIERS,
	/// \brief A record containing CXXCtorInitializers.
	DECL_CXX_CTOR_INITIALIZERS,
	/// \brief An IndirectFieldDecl record.
	DECL_INDIRECTFIELD,
	/// \brief A NonTypeTemplateParmDecl record that stores an expanded
	/// non-type template parameter pack.
	DECL_EXPANDED_NON_TYPE_TEMPLATE_PARM_PACK,
	/// \brief A TemplateTemplateParmDecl record that stores an expanded
	/// template template parameter pack.
	DECL_EXPANDED_TEMPLATE_TEMPLATE_PARM_PACK,
	/// \brief A ClassScopeFunctionSpecializationDecl record for a class scope
	/// function specialization. (Microsoft extension).
	DECL_CLASS_SCOPE_FUNCTION_SPECIALIZATION,
	/// \brief An ImportDecl recording a module import.
	DECL_IMPORT,
	/// \brief An OMPThreadPrivateDecl record.
	DECL_OMP_THREADPRIVATE,
	/// \brief An EmptyDecl record.
	DECL_EMPTY,
	/// \brief An ObjCTypeParamDecl record.
	DECL_OBJC_TYPE_PARAM,
	/// \brief An OMPCapturedExprDecl record.
	DECL_OMP_CAPTUREDEXPR,
	/// \brief A PragmaCommentDecl record.
	DECL_PRAGMA_COMMENT,
	/// \brief A PragmaDetectMismatchDecl record.
	DECL_PRAGMA_DETECT_MISMATCH,
	/// \brief An OMPDeclareReductionDecl record.
	DECL_OMP_DECLARE_REDUCTION,
	};

	/// \brief Record codes for each kind of statement or expression.
	///
	/// These constants describe the records that describe statements
	/// or expressions. These records occur within type and declarations
	/// block, so they begin with record values of 128. Each constant
	/// describes a record for a specific statement or expression class in the
	/// AST.
	///
	/// Only STMT_STOP has an explicit value; every later enumerator takes the
	/// next consecutive value, so position in this list determines the code.
	enum StmtCode {
	/// \brief A marker record that indicates that we are at the end
	/// of an expression.
	STMT_STOP = 128,
	/// \brief A NULL expression.
	STMT_NULL_PTR,
	/// \brief A reference to a previously [de]serialized Stmt record.
	STMT_REF_PTR,
	/// \brief A NullStmt record.
	STMT_NULL,
	/// \brief A CompoundStmt record.
	STMT_COMPOUND,
	/// \brief A CaseStmt record.
	STMT_CASE,
	/// \brief A DefaultStmt record.
	STMT_DEFAULT,
	/// \brief A LabelStmt record.
	STMT_LABEL,
	/// \brief An AttributedStmt record.
	STMT_ATTRIBUTED,
	/// \brief An IfStmt record.
	STMT_IF,
	/// \brief A SwitchStmt record.
	STMT_SWITCH,
	/// \brief A WhileStmt record.
	STMT_WHILE,
	/// \brief A DoStmt record.
	STMT_DO,
	/// \brief A ForStmt record.
	STMT_FOR,
	/// \brief A GotoStmt record.
	STMT_GOTO,
	/// \brief An IndirectGotoStmt record.
	STMT_INDIRECT_GOTO,
	/// \brief A ContinueStmt record.
	STMT_CONTINUE,
	/// \brief A BreakStmt record.
	STMT_BREAK,
	/// \brief A ReturnStmt record.
	STMT_RETURN,
	/// \brief A DeclStmt record.
	STMT_DECL,
	/// \brief A CapturedStmt record.
	STMT_CAPTURED,
	/// \brief A GCC-style AsmStmt record.
	STMT_GCCASM,
	/// \brief An MS-style AsmStmt record.
	STMT_MSASM,
	/// \brief A PredefinedExpr record.
	EXPR_PREDEFINED,
	/// \brief A DeclRefExpr record.
	EXPR_DECL_REF,
	/// \brief An IntegerLiteral record.
	EXPR_INTEGER_LITERAL,
	/// \brief A FloatingLiteral record.
	EXPR_FLOATING_LITERAL,
	/// \brief An ImaginaryLiteral record.
	EXPR_IMAGINARY_LITERAL,
	/// \brief A StringLiteral record.
	EXPR_STRING_LITERAL,
	/// \brief A CharacterLiteral record.
	EXPR_CHARACTER_LITERAL,
	/// \brief A ParenExpr record.
	EXPR_PAREN,
	/// \brief A ParenListExpr record.
	EXPR_PAREN_LIST,
	/// \brief A UnaryOperator record.
	EXPR_UNARY_OPERATOR,
	/// \brief An OffsetOfExpr record.
	EXPR_OFFSETOF,
	/// \brief A SizeOfAlignOfExpr record.
	EXPR_SIZEOF_ALIGN_OF,
	/// \brief An ArraySubscriptExpr record.
	EXPR_ARRAY_SUBSCRIPT,
	/// \brief A CallExpr record.
	EXPR_CALL,
	/// \brief A MemberExpr record.
	EXPR_MEMBER,
	/// \brief A BinaryOperator record.
	EXPR_BINARY_OPERATOR,
	/// \brief A CompoundAssignOperator record.
	EXPR_COMPOUND_ASSIGN_OPERATOR,
	/// \brief A ConditionalOperator record.
	EXPR_CONDITIONAL_OPERATOR,
	/// \brief An ImplicitCastExpr record.
	EXPR_IMPLICIT_CAST,
	/// \brief A CStyleCastExpr record.
	EXPR_CSTYLE_CAST,
	/// \brief A CompoundLiteralExpr record.
	EXPR_COMPOUND_LITERAL,
	/// \brief An ExtVectorElementExpr record.
	EXPR_EXT_VECTOR_ELEMENT,
	/// \brief An InitListExpr record.
	EXPR_INIT_LIST,
	/// \brief A DesignatedInitExpr record.
	EXPR_DESIGNATED_INIT,
	/// \brief A DesignatedInitUpdateExpr record.
	EXPR_DESIGNATED_INIT_UPDATE,
	/// \brief A NoInitExpr record.
	EXPR_NO_INIT,
	/// \brief An ArrayInitLoopExpr record.
	EXPR_ARRAY_INIT_LOOP,
	/// \brief An ArrayInitIndexExpr record.
	EXPR_ARRAY_INIT_INDEX,
	/// \brief An ImplicitValueInitExpr record.
	EXPR_IMPLICIT_VALUE_INIT,
	/// \brief A VAArgExpr record.
	EXPR_VA_ARG,
	/// \brief An AddrLabelExpr record.
	EXPR_ADDR_LABEL,
	/// \brief A StmtExpr record.
	EXPR_STMT,
	/// \brief A ChooseExpr record.
	EXPR_CHOOSE,
	/// \brief A GNUNullExpr record.
	EXPR_GNU_NULL,
	/// \brief A ShuffleVectorExpr record.
	EXPR_SHUFFLE_VECTOR,
	/// \brief A ConvertVectorExpr record.
	EXPR_CONVERT_VECTOR,
	/// \brief A BlockExpr record.
	EXPR_BLOCK,
	/// \brief A GenericSelectionExpr record.
	EXPR_GENERIC_SELECTION,
	/// \brief A PseudoObjectExpr record.
	EXPR_PSEUDO_OBJECT,
	/// \brief An AtomicExpr record.
	EXPR_ATOMIC,

	// Objective-C

	/// \brief An ObjCStringLiteral record.
	EXPR_OBJC_STRING_LITERAL,

	EXPR_OBJC_BOXED_EXPRESSION,
	EXPR_OBJC_ARRAY_LITERAL,
	EXPR_OBJC_DICTIONARY_LITERAL,


	/// \brief An ObjCEncodeExpr record.
	EXPR_OBJC_ENCODE,
	/// \brief An ObjCSelectorExpr record.
	EXPR_OBJC_SELECTOR_EXPR,
	/// \brief An ObjCProtocolExpr record.
	EXPR_OBJC_PROTOCOL_EXPR,
	/// \brief An ObjCIvarRefExpr record.
	EXPR_OBJC_IVAR_REF_EXPR,
	/// \brief An ObjCPropertyRefExpr record.
	EXPR_OBJC_PROPERTY_REF_EXPR,
	/// \brief An ObjCSubscriptRefExpr record.
	EXPR_OBJC_SUBSCRIPT_REF_EXPR,
	/// \brief UNUSED
	EXPR_OBJC_KVC_REF_EXPR,
	/// \brief An ObjCMessageExpr record.
	EXPR_OBJC_MESSAGE_EXPR,
	/// \brief An ObjCIsaExpr record.
	EXPR_OBJC_ISA,
	/// \brief An ObjCIndirectCopyRestoreExpr record.
	EXPR_OBJC_INDIRECT_COPY_RESTORE,

	/// \brief An ObjCForCollectionStmt record.
	STMT_OBJC_FOR_COLLECTION,
	/// \brief An ObjCAtCatchStmt record.
	STMT_OBJC_CATCH,
	/// \brief An ObjCAtFinallyStmt record.
	STMT_OBJC_FINALLY,
	/// \brief An ObjCAtTryStmt record.
	STMT_OBJC_AT_TRY,
	/// \brief An ObjCAtSynchronizedStmt record.
	STMT_OBJC_AT_SYNCHRONIZED,
	/// \brief An ObjCAtThrowStmt record.
	STMT_OBJC_AT_THROW,
	/// \brief An ObjCAutoreleasePoolStmt record.
	STMT_OBJC_AUTORELEASE_POOL,
	/// \brief An ObjCBoolLiteralExpr record.
	EXPR_OBJC_BOOL_LITERAL,
	/// \brief An ObjCAvailabilityCheckExpr record.
	EXPR_OBJC_AVAILABILITY_CHECK,

	// C++

	/// \brief A CXXCatchStmt record.
	STMT_CXX_CATCH,
	/// \brief A CXXTryStmt record.
	STMT_CXX_TRY,
	/// \brief A CXXForRangeStmt record.
	STMT_CXX_FOR_RANGE,

	/// \brief A CXXOperatorCallExpr record.
	EXPR_CXX_OPERATOR_CALL,
	/// \brief A CXXMemberCallExpr record.
	EXPR_CXX_MEMBER_CALL,
	/// \brief A CXXConstructExpr record.
	EXPR_CXX_CONSTRUCT,
	/// \brief A CXXInheritedCtorInitExpr record.
	EXPR_CXX_INHERITED_CTOR_INIT,
	/// \brief A CXXTemporaryObjectExpr record.
	EXPR_CXX_TEMPORARY_OBJECT,
	/// \brief A CXXStaticCastExpr record.
	EXPR_CXX_STATIC_CAST,
	/// \brief A CXXDynamicCastExpr record.
	EXPR_CXX_DYNAMIC_CAST,
	/// \brief A CXXReinterpretCastExpr record.
	EXPR_CXX_REINTERPRET_CAST,
	/// \brief A CXXConstCastExpr record.
	EXPR_CXX_CONST_CAST,
	/// \brief A CXXFunctionalCastExpr record.
	EXPR_CXX_FUNCTIONAL_CAST,
	/// \brief A UserDefinedLiteral record.
	EXPR_USER_DEFINED_LITERAL,
	/// \brief A CXXStdInitializerListExpr record.
	EXPR_CXX_STD_INITIALIZER_LIST,
	/// \brief A CXXBoolLiteralExpr record.
	EXPR_CXX_BOOL_LITERAL,
	EXPR_CXX_NULL_PTR_LITERAL, // CXXNullPtrLiteralExpr
	EXPR_CXX_TYPEID_EXPR, // CXXTypeidExpr (of expr).
	EXPR_CXX_TYPEID_TYPE, // CXXTypeidExpr (of type).
	EXPR_CXX_THIS, // CXXThisExpr
	EXPR_CXX_THROW, // CXXThrowExpr
	EXPR_CXX_DEFAULT_ARG, // CXXDefaultArgExpr
	EXPR_CXX_DEFAULT_INIT, // CXXDefaultInitExpr
	EXPR_CXX_BIND_TEMPORARY, // CXXBindTemporaryExpr

	EXPR_CXX_SCALAR_VALUE_INIT, // CXXScalarValueInitExpr
	EXPR_CXX_NEW, // CXXNewExpr
	EXPR_CXX_DELETE, // CXXDeleteExpr
	EXPR_CXX_PSEUDO_DESTRUCTOR, // CXXPseudoDestructorExpr

	EXPR_EXPR_WITH_CLEANUPS, // ExprWithCleanups

	EXPR_CXX_DEPENDENT_SCOPE_MEMBER, // CXXDependentScopeMemberExpr
	EXPR_CXX_DEPENDENT_SCOPE_DECL_REF, // DependentScopeDeclRefExpr
	EXPR_CXX_UNRESOLVED_CONSTRUCT, // CXXUnresolvedConstructExpr
	EXPR_CXX_UNRESOLVED_MEMBER, // UnresolvedMemberExpr
	EXPR_CXX_UNRESOLVED_LOOKUP, // UnresolvedLookupExpr

	EXPR_CXX_EXPRESSION_TRAIT, // ExpressionTraitExpr
	EXPR_CXX_NOEXCEPT, // CXXNoexceptExpr

	EXPR_OPAQUE_VALUE, // OpaqueValueExpr
	EXPR_BINARY_CONDITIONAL_OPERATOR, // BinaryConditionalOperator
	EXPR_TYPE_TRAIT, // TypeTraitExpr
	EXPR_ARRAY_TYPE_TRAIT, // ArrayTypeTraitIntExpr

	EXPR_PACK_EXPANSION, // PackExpansionExpr
	EXPR_SIZEOF_PACK, // SizeOfPackExpr
	EXPR_SUBST_NON_TYPE_TEMPLATE_PARM, // SubstNonTypeTemplateParmExpr
	EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK,// SubstNonTypeTemplateParmPackExpr
	EXPR_FUNCTION_PARM_PACK, // FunctionParmPackExpr
	EXPR_MATERIALIZE_TEMPORARY, // MaterializeTemporaryExpr
	EXPR_CXX_FOLD, // CXXFoldExpr

	// CUDA
	EXPR_CUDA_KERNEL_CALL, // CUDAKernelCallExpr

	// OpenCL
	EXPR_ASTYPE, // AsTypeExpr

	// Microsoft
	EXPR_CXX_PROPERTY_REF_EXPR, // MSPropertyRefExpr
	EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR, // MSPropertySubscriptExpr
	EXPR_CXX_UUIDOF_EXPR, // CXXUuidofExpr (of expr).
	EXPR_CXX_UUIDOF_TYPE, // CXXUuidofExpr (of type).
	STMT_SEH_LEAVE, // SEHLeaveStmt
	STMT_SEH_EXCEPT, // SEHExceptStmt
	STMT_SEH_FINALLY, // SEHFinallyStmt
	STMT_SEH_TRY, // SEHTryStmt

	// OpenMP directives
	STMT_OMP_PARALLEL_DIRECTIVE,
	STMT_OMP_SIMD_DIRECTIVE,
	STMT_OMP_FOR_DIRECTIVE,
	STMT_OMP_FOR_SIMD_DIRECTIVE,
	STMT_OMP_SECTIONS_DIRECTIVE,
	STMT_OMP_SECTION_DIRECTIVE,
	STMT_OMP_SINGLE_DIRECTIVE,
	STMT_OMP_MASTER_DIRECTIVE,
	STMT_OMP_CRITICAL_DIRECTIVE,
	STMT_OMP_PARALLEL_FOR_DIRECTIVE,
	STMT_OMP_PARALLEL_FOR_SIMD_DIRECTIVE,
	STMT_OMP_PARALLEL_SECTIONS_DIRECTIVE,
	STMT_OMP_TASK_DIRECTIVE,
	STMT_OMP_TASKYIELD_DIRECTIVE,
	STMT_OMP_BARRIER_DIRECTIVE,
	STMT_OMP_TASKWAIT_DIRECTIVE,
	STMT_OMP_FLUSH_DIRECTIVE,
	STMT_OMP_ORDERED_DIRECTIVE,
	STMT_OMP_ATOMIC_DIRECTIVE,
	STMT_OMP_TARGET_DIRECTIVE,
	STMT_OMP_TARGET_DATA_DIRECTIVE,
	STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE,
	STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE,
	STMT_OMP_TARGET_PARALLEL_DIRECTIVE,
	STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE,
	STMT_OMP_TEAMS_DIRECTIVE,
	STMT_OMP_TASKGROUP_DIRECTIVE,
	STMT_OMP_CANCELLATION_POINT_DIRECTIVE,
	STMT_OMP_CANCEL_DIRECTIVE,
	STMT_OMP_TASKLOOP_DIRECTIVE,
	STMT_OMP_TASKLOOP_SIMD_DIRECTIVE,
	STMT_OMP_DISTRIBUTE_DIRECTIVE,
	STMT_OMP_TARGET_UPDATE_DIRECTIVE,
	STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE,
	STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE,
	STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE,
	STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE,
	STMT_OMP_TARGET_SIMD_DIRECTIVE,
	STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE,
	STMT_OMP_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE,
	STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE,
	STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE,
	STMT_OMP_TARGET_TEAMS_DIRECTIVE,
	STMT_OMP_TARGET_TEAMS_DISTRIBUTE_DIRECTIVE,
	STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE,
	STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE,
	STMT_OMP_TARGET_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE,
	EXPR_OMP_ARRAY_SECTION,

	// ARC
	EXPR_OBJC_BRIDGED_CAST, // ObjCBridgedCastExpr
	-
	+
	STMT_MS_DEPENDENT_EXISTS, // MSDependentExistsStmt
	- EXPR_LAMBDA // LambdaExpr
	+ EXPR_LAMBDA, // LambdaExpr
	+ STMT_COROUTINE_BODY,
	+ STMT_CORETURN,
	+ EXPR_COAWAIT,
	+ EXPR_COYIELD,
	+ EXPR_DEPENDENT_COAWAIT,
	};

	/// \brief The kinds of designators that can occur in a
	/// DesignatedInitExpr.
	///
	/// Values are explicit and contiguous, 0 through 3.
	enum DesignatorTypes {
	/// \brief Field designator where only the field name is known.
	DESIG_FIELD_NAME = 0,
	/// \brief Field designator where the field has been resolved to
	/// a declaration.
	DESIG_FIELD_DECL = 1,
	/// \brief Array designator.
	DESIG_ARRAY = 2,
	/// \brief GNU array range designator.
	DESIG_ARRAY_RANGE = 3
	};

	/// \brief The different kinds of data that can occur in a
	/// CtorInitializer.
	///
	/// No enumerator has an explicit value, so they are numbered 0 through 3
	/// in declaration order.
	enum CtorInitializerType {
	CTOR_INITIALIZER_BASE,
	CTOR_INITIALIZER_DELEGATING,
	CTOR_INITIALIZER_MEMBER,
	CTOR_INITIALIZER_INDIRECT_MEMBER
	};

	/// \brief Describes the redeclarations of a declaration.
	///
	/// Instances are ordered by \c FirstID alone; \c Offset never takes part
	/// in any of the comparisons below.
	struct LocalRedeclarationsInfo {
	/// The ID of the first declaration.
	DeclID FirstID;
	/// Offset into the array of redeclaration chains.
	unsigned Offset;

	/// True when \p LHS has the smaller first-declaration ID.
	friend bool operator<(const LocalRedeclarationsInfo &LHS,
	const LocalRedeclarationsInfo &RHS) {
	return LHS.FirstID < RHS.FirstID;
	}

	/// True when \p LHS has the larger first-declaration ID.
	friend bool operator>(const LocalRedeclarationsInfo &LHS,
	const LocalRedeclarationsInfo &RHS) {
	return RHS.FirstID < LHS.FirstID;
	}

	/// True when \p LHS's first-declaration ID does not exceed \p RHS's.
	friend bool operator<=(const LocalRedeclarationsInfo &LHS,
	const LocalRedeclarationsInfo &RHS) {
	return RHS.FirstID >= LHS.FirstID;
	}

	/// True when \p LHS's first-declaration ID is not below \p RHS's.
	friend bool operator>=(const LocalRedeclarationsInfo &LHS,
	const LocalRedeclarationsInfo &RHS) {
	return RHS.FirstID <= LHS.FirstID;
	}
	};

	/// \brief Describes the categories of an Objective-C class.
	///
	/// Instances are ordered by \c DefinitionID alone; \c Offset never takes
	/// part in any of the comparisons below.
	struct ObjCCategoriesInfo {
	/// The ID of the definition.
	DeclID DefinitionID;
	/// Offset into the array of category lists.
	unsigned Offset;

	/// True when \p LHS has the smaller definition ID.
	friend bool operator<(const ObjCCategoriesInfo &LHS,
	const ObjCCategoriesInfo &RHS) {
	return LHS.DefinitionID < RHS.DefinitionID;
	}

	/// True when \p LHS has the larger definition ID.
	friend bool operator>(const ObjCCategoriesInfo &LHS,
	const ObjCCategoriesInfo &RHS) {
	return RHS.DefinitionID < LHS.DefinitionID;
	}

	/// True when \p LHS's definition ID does not exceed \p RHS's.
	friend bool operator<=(const ObjCCategoriesInfo &LHS,
	const ObjCCategoriesInfo &RHS) {
	return RHS.DefinitionID >= LHS.DefinitionID;
	}

	/// True when \p LHS's definition ID is not below \p RHS's.
	friend bool operator>=(const ObjCCategoriesInfo &LHS,
	const ObjCCategoriesInfo &RHS) {
	return RHS.DefinitionID <= LHS.DefinitionID;
	}
	};

	/// \brief A key used when looking up entities by \ref DeclarationName.
	///
	/// Different \ref DeclarationNames are mapped to different keys, but the
	/// same key can occasionally represent multiple names (for names that
	/// contain types, in particular).
	///
	/// A key is a (Kind, Data) pair. How Data is interpreted depends on Kind;
	/// each typed accessor asserts that Kind is one of the kinds it supports.
	class DeclarationNameKey {
	typedef unsigned NameKind;

	NameKind Kind;
	uint64_t Data;

	public:
	/// Creates a key whose Kind and Data are both value-initialized.
	DeclarationNameKey() : Kind(), Data() {}
	/// Creates the key for \p Name (defined out of line).
	DeclarationNameKey(DeclarationName Name);

	/// Creates a key directly from a raw kind and payload.
	DeclarationNameKey(NameKind Kind, uint64_t Data)
	: Kind(Kind), Data(Data) {}

	/// Returns the raw kind stored in this key.
	NameKind getKind() const { return Kind; }

	/// Returns Data cast to an IdentifierInfo pointer. Only valid for
	/// identifier, literal-operator and deduction-guide kinds.
	IdentifierInfo *getIdentifier() const {
	assert(Kind == DeclarationName::Identifier ||
	Kind == DeclarationName::CXXLiteralOperatorName ||
	Kind == DeclarationName::CXXDeductionGuideName);
	return (IdentifierInfo *)Data;
	}
	/// Returns a Selector constructed from Data. Only valid for the three
	/// Objective-C selector kinds.
	Selector getSelector() const {
	assert(Kind == DeclarationName::ObjCZeroArgSelector ||
	Kind == DeclarationName::ObjCOneArgSelector ||
	Kind == DeclarationName::ObjCMultiArgSelector);
	return Selector(Data);
	}
	/// Returns Data cast to an OverloadedOperatorKind. Only valid for
	/// CXXOperatorName keys.
	OverloadedOperatorKind getOperatorKind() const {
	assert(Kind == DeclarationName::CXXOperatorName);
	return (OverloadedOperatorKind)Data;
	}

	/// Compute a fingerprint of this key for use in on-disk hash table.
	unsigned getHash() const;

	/// Two keys are equal when both their kind and their data match.
	friend bool operator==(const DeclarationNameKey &A,
	const DeclarationNameKey &B) {
	return A.Kind == B.Kind && A.Data == B.Data;
	}
	};

	/// @}
	}
	} // end namespace clang

	namespace llvm {
	// DenseMapInfo specialization for DeclarationNameKey: supplies the two
	// sentinel keys, the hash function and the equality test.
	template <> struct DenseMapInfo<clang::serialization::DeclarationNameKey> {
	// Sentinel built from kind -1 and data 1.
	static clang::serialization::DeclarationNameKey getEmptyKey() {
	return clang::serialization::DeclarationNameKey(-1, 1);
	}
	// Sentinel built from kind -1 and data 2, so it differs from the empty key.
	static clang::serialization::DeclarationNameKey getTombstoneKey() {
	return clang::serialization::DeclarationNameKey(-1, 2);
	}
	// Hashing is delegated to the key's own getHash().
	static unsigned
	getHashValue(const clang::serialization::DeclarationNameKey &Key) {
	return Key.getHash();
	}
	// Equality is the key's operator== (kind and data both match).
	static bool isEqual(const clang::serialization::DeclarationNameKey &L,
	const clang::serialization::DeclarationNameKey &R) {
	return L == R;
	}
	};
	}

	#endif
	Index: head/contrib/llvm/tools/clang/lib/AST/ODRHash.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/AST/ODRHash.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/AST/ODRHash.cpp (revision 322320)
	@@ -1,632 +1,636 @@
	//===-- ODRHash.cpp - Hashing to diagnose ODR failures ----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// This file implements the ODRHash class, which calculates a hash based
	/// on AST nodes, which is stable across different runs.
	///
	//===----------------------------------------------------------------------===//

	#include "clang/AST/ODRHash.h"

	#include "clang/AST/DeclVisitor.h"
	#include "clang/AST/NestedNameSpecifier.h"
	#include "clang/AST/StmtVisitor.h"
	#include "clang/AST/TypeVisitor.h"

	using namespace clang;

	// Adds statement S to the hash by handing this hasher and its ID to
	// Stmt::ProcessODRHash. S must be non-null (asserted).
	void ODRHash::AddStmt(const Stmt *S) {
	assert(S && "Expecting non-null pointer.");
	S->ProcessODRHash(ID, *this);
	}

	// Adds the identifier's name string to the hash. II must be non-null
	// (asserted).
	void ODRHash::AddIdentifierInfo(const IdentifierInfo *II) {
	assert(II && "Expecting non-null pointer.");
	ID.AddString(II->getName());
	}

	// Adds a declaration name to the hash. The empty/non-empty flag is always
	// recorded; for a non-empty name the name kind follows, then data specific
	// to that kind.
	void ODRHash::AddDeclarationName(DeclarationName Name) {
	AddBoolean(Name.isEmpty());
	if (Name.isEmpty())
	return;

	auto Kind = Name.getNameKind();
	ID.AddInteger(Kind);
	switch (Kind) {
	case DeclarationName::Identifier:
	AddIdentifierInfo(Name.getAsIdentifierInfo());
	break;
	case DeclarationName::ObjCZeroArgSelector:
	case DeclarationName::ObjCOneArgSelector:
	case DeclarationName::ObjCMultiArgSelector: {
	// Selectors contribute three flags followed by one identifier per argument
	// slot.
	Selector S = Name.getObjCSelector();
	AddBoolean(S.isNull());
	AddBoolean(S.isKeywordSelector());
	AddBoolean(S.isUnarySelector());
	unsigned NumArgs = S.getNumArgs();
	for (unsigned i = 0; i < NumArgs; ++i) {
	AddIdentifierInfo(S.getIdentifierInfoForSlot(i));
	}
	break;
	}
	case DeclarationName::CXXConstructorName:
	case DeclarationName::CXXDestructorName:
	AddQualType(Name.getCXXNameType());
	break;
	case DeclarationName::CXXOperatorName:
	ID.AddInteger(Name.getCXXOverloadedOperator());
	break;
	case DeclarationName::CXXLiteralOperatorName:
	AddIdentifierInfo(Name.getCXXLiteralIdentifier());
	break;
	case DeclarationName::CXXConversionFunctionName:
	AddQualType(Name.getCXXNameType());
	break;
	case DeclarationName::CXXUsingDirective:
	// Nothing beyond the kind is hashed for using-directives.
	break;
	case DeclarationName::CXXDeductionGuideName: {
	// Record whether a template is attached, then the template itself if so.
	auto *Template = Name.getCXXDeductionGuideTemplate();
	AddBoolean(Template);
	if (Template) {
	AddDecl(Template);
	}
	}
	}
	}

	// Adds a nested-name-specifier to the hash. Any prefix is hashed first
	// (recursively), then this specifier's kind and its kind-specific payload.
	// NNS must be non-null (asserted).
	void ODRHash::AddNestedNameSpecifier(const NestedNameSpecifier *NNS) {
	assert(NNS && "Expecting non-null pointer.");
	const auto *Prefix = NNS->getPrefix();
	AddBoolean(Prefix);
	if (Prefix) {
	AddNestedNameSpecifier(Prefix);
	}
	auto Kind = NNS->getKind();
	ID.AddInteger(Kind);
	switch (Kind) {
	case NestedNameSpecifier::Identifier:
	AddIdentifierInfo(NNS->getAsIdentifier());
	break;
	case NestedNameSpecifier::Namespace:
	AddDecl(NNS->getAsNamespace());
	break;
	case NestedNameSpecifier::NamespaceAlias:
	AddDecl(NNS->getAsNamespaceAlias());
	break;
	case NestedNameSpecifier::TypeSpec:
	case NestedNameSpecifier::TypeSpecWithTemplate:
	AddType(NNS->getAsType());
	break;
	case NestedNameSpecifier::Global:
	case NestedNameSpecifier::Super:
	// Only the kind is hashed for these two.
	break;
	}
	}

	// Adds a template name to the hash: always its kind, and for the plain
	// Template kind also the template declaration. The remaining kinds
	// currently contribute nothing beyond the kind (see the TODO below).
	void ODRHash::AddTemplateName(TemplateName Name) {
	auto Kind = Name.getKind();
	ID.AddInteger(Kind);

	switch (Kind) {
	case TemplateName::Template:
	AddDecl(Name.getAsTemplateDecl());
	break;
	// TODO: Support these cases.
	case TemplateName::OverloadedTemplate:
	case TemplateName::QualifiedTemplate:
	case TemplateName::DependentTemplate:
	case TemplateName::SubstTemplateTemplateParm:
	case TemplateName::SubstTemplateTemplateParmPack:
	break;
	}
	}

	// Adds a template argument to the hash: its kind first, then kind-specific
	// content. A Null argument is treated as unreachable.
	void ODRHash::AddTemplateArgument(TemplateArgument TA) {
	const auto Kind = TA.getKind();
	ID.AddInteger(Kind);

	switch (Kind) {
	case TemplateArgument::Null:
	llvm_unreachable("Expected valid TemplateArgument");
	case TemplateArgument::Type:
	AddQualType(TA.getAsType());
	break;
	case TemplateArgument::Declaration:
	case TemplateArgument::NullPtr:
	case TemplateArgument::Integral:
	// Only the kind is hashed for these; their values are not added here.
	break;
	case TemplateArgument::Template:
	case TemplateArgument::TemplateExpansion:
	AddTemplateName(TA.getAsTemplateOrTemplatePattern());
	break;
	case TemplateArgument::Expression:
	AddStmt(TA.getAsExpr());
	break;
	case TemplateArgument::Pack:
	// Hash the element count, then each element recursively.
	ID.AddInteger(TA.pack_size());
	for (auto SubTA : TA.pack_elements()) {
	AddTemplateArgument(SubTA);
	}
	break;
	}
	}

	// Currently a no-op: nothing from TPL is added to the hash.
	void ODRHash::AddTemplateParameterList(const TemplateParameterList *TPL) {}

	// Resets the hasher: empties the Decl and Type index maps, the pending
	// booleans and the accumulated ID.
	void ODRHash::clear() {
	DeclMap.clear();
	TypeMap.clear();
	Bools.clear();
	ID.clear();
	}

	// Flushes the pending booleans into ID as packed integers, clears them and
	// returns ID.ComputeHash().
	unsigned ODRHash::CalculateHash() {
	  // Append the bools to the end of the data segment backwards. This allows
	  // for the bools data to be compressed 32 times smaller compared to using
	  // ID.AddBoolean
	  const unsigned unsigned_bits = sizeof(unsigned) * CHAR_BIT;
	  const unsigned size = Bools.size();
	  const unsigned remainder = size % unsigned_bits;
	  const unsigned loops = size / unsigned_bits;
	  auto I = Bools.rbegin();

	  // First word: the trailing bools that do not fill a whole unsigned. It is
	  // added even when there are none, in which case it is 0.
	  unsigned value = 0;
	  for (unsigned i = 0; i < remainder; ++i) {
	    value <<= 1;
	    value |= *I;
	    ++I;
	  }
	  ID.AddInteger(value);

	  // Remaining words: full groups of unsigned_bits bools each.
	  for (unsigned i = 0; i < loops; ++i) {
	    value = 0;
	    for (unsigned j = 0; j < unsigned_bits; ++j) {
	      value <<= 1;
	      value |= *I;
	      ++I;
	    }
	    ID.AddInteger(value);
	  }

	  // Every bool must have been consumed exactly once.
	  assert(I == Bools.rend());
	  Bools.clear();
	  return ID.ComputeHash();
	}

	// Process a Decl pointer. Add* methods call back into ODRHash while Visit*
	// methods process the relevant parts of the Decl.
	class ODRDeclVisitor : public ConstDeclVisitor<ODRDeclVisitor> {
	typedef ConstDeclVisitor<ODRDeclVisitor> Inherited;
	// Hash input that integers are added to directly.
	llvm::FoldingSetNodeID &ID;
	// Hasher that booleans and nested entities are routed through.
	ODRHash &Hash;

	public:
	ODRDeclVisitor(llvm::FoldingSetNodeID &ID, ODRHash &Hash)
	: ID(ID), Hash(Hash) {}

	// Null-tolerant wrapper: records whether S is present, then hashes it if so.
	void AddStmt(const Stmt *S) {
	Hash.AddBoolean(S);
	if (S) {
	Hash.AddStmt(S);
	}
	}

	// Null-tolerant wrapper: records whether II is present, then hashes it if so.
	void AddIdentifierInfo(const IdentifierInfo *II) {
	Hash.AddBoolean(II);
	if (II) {
	Hash.AddIdentifierInfo(II);
	}
	}

	// Straight forward to ODRHash::AddQualType.
	void AddQualType(QualType T) {
	Hash.AddQualType(T);
	}

	// Null-tolerant wrapper: records whether D is present, then hashes it if so.
	void AddDecl(const Decl *D) {
	Hash.AddBoolean(D);
	if (D) {
	Hash.AddDecl(D);
	}
	}

	// Entry point: hash the Decl kind, then dispatch to the Visit* methods.
	void Visit(const Decl *D) {
	ID.AddInteger(D->getKind());
	Inherited::Visit(D);
	}

	void VisitNamedDecl(const NamedDecl *D) {
	Hash.AddDeclarationName(D->getDeclName());
	Inherited::VisitNamedDecl(D);
	}

	// The declared type is hashed for every ValueDecl except functions;
	// VisitFunctionDecl hashes parameters and return type instead.
	void VisitValueDecl(const ValueDecl *D) {
	if (!isa<FunctionDecl>(D)) {
	AddQualType(D->getType());
	}
	Inherited::VisitValueDecl(D);
	}

	// Hashes the static-local and constexpr flags, whether an initializer
	// exists, and the initializer itself when present.
	void VisitVarDecl(const VarDecl *D) {
	Hash.AddBoolean(D->isStaticLocal());
	Hash.AddBoolean(D->isConstexpr());
	const bool HasInit = D->hasInit();
	Hash.AddBoolean(HasInit);
	if (HasInit) {
	AddStmt(D->getInit());
	}
	Inherited::VisitVarDecl(D);
	}

	void VisitParmVarDecl(const ParmVarDecl *D) {
	// TODO: Handle default arguments.
	Inherited::VisitParmVarDecl(D);
	}

	void VisitAccessSpecDecl(const AccessSpecDecl *D) {
	ID.AddInteger(D->getAccess());
	Inherited::VisitAccessSpecDecl(D);
	}

	// Hashes the asserted expression and the message; either may be null, which
	// the AddStmt wrapper above records as a flag.
	void VisitStaticAssertDecl(const StaticAssertDecl *D) {
	AddStmt(D->getAssertExpr());
	AddStmt(D->getMessage());

	Inherited::VisitStaticAssertDecl(D);
	}

	// Hashes the bit-field flag (and the width expression for bit-fields), the
	// mutable flag and the in-class initializer.
	void VisitFieldDecl(const FieldDecl *D) {
	const bool IsBitfield = D->isBitField();
	Hash.AddBoolean(IsBitfield);

	if (IsBitfield) {
	AddStmt(D->getBitWidth());
	}

	Hash.AddBoolean(D->isMutable());
	AddStmt(D->getInClassInitializer());

	Inherited::VisitFieldDecl(D);
	}

	// Hashes the storage class, the inline/virtual/pure/deleted flags, the
	// parameter count, each parameter as a sub-declaration, and the return type.
	void VisitFunctionDecl(const FunctionDecl *D) {
	ID.AddInteger(D->getStorageClass());
	Hash.AddBoolean(D->isInlineSpecified());
	Hash.AddBoolean(D->isVirtualAsWritten());
	Hash.AddBoolean(D->isPure());
	Hash.AddBoolean(D->isDeletedAsWritten());

	ID.AddInteger(D->param_size());

	for (auto *Param : D->parameters()) {
	Hash.AddSubDecl(Param);
	}

	AddQualType(D->getReturnType());

	Inherited::VisitFunctionDecl(D);
	}

	// Adds the const and volatile flags of the method.
	void VisitCXXMethodDecl(const CXXMethodDecl *D) {
	Hash.AddBoolean(D->isConst());
	Hash.AddBoolean(D->isVolatile());

	Inherited::VisitCXXMethodDecl(D);
	}

	void VisitTypedefNameDecl(const TypedefNameDecl *D) {
	AddQualType(D->getUnderlyingType());

	Inherited::VisitTypedefNameDecl(D);
	}

	// Adds nothing of its own; only forwards to the inherited visitor.
	void VisitTypedefDecl(const TypedefDecl *D) {
	Inherited::VisitTypedefDecl(D);
	}

	// Adds nothing of its own; only forwards to the inherited visitor.
	void VisitTypeAliasDecl(const TypeAliasDecl *D) {
	Inherited::VisitTypeAliasDecl(D);
	}

	// A friend is either a type or a declaration: record which, then hash that
	// one. Unlike the other Visit* methods this does not call the inherited
	// visitor.
	void VisitFriendDecl(const FriendDecl *D) {
	TypeSourceInfo *TSI = D->getFriendType();
	Hash.AddBoolean(TSI);
	if (TSI) {
	AddQualType(TSI->getType());
	} else {
	AddDecl(D->getFriendDecl());
	}
	}
	};

	// Only allow a small portion of Decl's to be processed. Remove this once
	// all Decl's can be handled.
	bool ODRHash::isWhitelistedDecl(const Decl D, const CXXRecordDecl Parent) {
	if (D->isImplicit()) return false;
	if (D->getDeclContext() != Parent) return false;

	switch (D->getKind()) {
	default:
	return false;
	case Decl::AccessSpec:
	case Decl::CXXConstructor:
	case Decl::CXXDestructor:
	case Decl::CXXMethod:
	case Decl::Field:
	case Decl::Friend:
	case Decl::StaticAssert:
	case Decl::TypeAlias:
	case Decl::Typedef:
	case Decl::Var:
	return true;
	}
	}

	// Hashes a member declaration in full: first its AddDecl entry, then the
	// parts visited by ODRDeclVisitor. D must be non-null (asserted).
	void ODRHash::AddSubDecl(const Decl *D) {
	assert(D && "Expecting non-null pointer.");
	AddDecl(D);

	ODRDeclVisitor(ID, *this).Visit(D);
	}

	// Hashes a class definition: the record itself, the number of whitelisted
	// member declarations, then each of those members via AddSubDecl.
	// Per the '+' lines of this hunk, nothing is hashed when the record or any
	// enclosing DeclContext is a ClassTemplateSpecializationDecl; the '-' lines
	// checked only the record itself.
	void ODRHash::AddCXXRecordDecl(const CXXRecordDecl *Record) {
	assert(Record && Record->hasDefinition() &&
	"Expected non-null record to be a definition.");

	- if (isa<ClassTemplateSpecializationDecl>(Record)) {
	- return;
	+ const DeclContext *DC = Record;
	+ while (DC) {
	+ if (isa<ClassTemplateSpecializationDecl>(DC)) {
	+ return;
	+ }
	+ DC = DC->getParent();
	}

	AddDecl(Record);

	// Filter out sub-Decls which will not be processed in order to get an
	// accurate count of Decl's.
	llvm::SmallVector<const Decl *, 16> Decls;
	for (const Decl *SubDecl : Record->decls()) {
	if (isWhitelistedDecl(SubDecl, Record)) {
	Decls.push_back(SubDecl);
	}
	}

	ID.AddInteger(Decls.size());
	for (auto SubDecl : Decls) {
	AddSubDecl(SubDecl);
	}
	}

	// Adds a reference to declaration D. Each distinct pointer gets an index
	// equal to the map size at first insertion; that index is always hashed.
	// The kind and (for named declarations) the name are hashed on first
	// encounter only. D must be non-null (asserted).
	void ODRHash::AddDecl(const Decl *D) {
	assert(D && "Expecting non-null pointer.");
	auto Result = DeclMap.insert(std::make_pair(D, DeclMap.size()));
	ID.AddInteger(Result.first->second);
	// On first encounter of a Decl pointer, process it. Every time afterwards,
	// only the index value is needed.
	if (!Result.second) {
	return;
	}

	ID.AddInteger(D->getKind());

	if (const NamedDecl *ND = dyn_cast<NamedDecl>(D)) {
	AddDeclarationName(ND->getDeclName());
	}
	}

	// Process a Type pointer. Add* methods call back into ODRHash while Visit*
	// methods process the relevant parts of the Type.
	class ODRTypeVisitor : public TypeVisitor<ODRTypeVisitor> {
	typedef TypeVisitor<ODRTypeVisitor> Inherited;
	// Hash input that integers are added to directly.
	llvm::FoldingSetNodeID &ID;
	// Hasher that booleans and nested entities are routed through.
	ODRHash &Hash;

	public:
	ODRTypeVisitor(llvm::FoldingSetNodeID &ID, ODRHash &Hash)
	: ID(ID), Hash(Hash) {}

	// The Add* wrappers below (except AddQualType) are null-tolerant: each
	// records a presence flag and hashes the entity only when it is non-null.
	void AddStmt(Stmt *S) {
	Hash.AddBoolean(S);
	if (S) {
	Hash.AddStmt(S);
	}
	}

	void AddDecl(Decl *D) {
	Hash.AddBoolean(D);
	if (D) {
	Hash.AddDecl(D);
	}
	}

	// Straight forward to ODRHash::AddQualType.
	void AddQualType(QualType T) {
	Hash.AddQualType(T);
	}

	void AddType(const Type *T) {
	Hash.AddBoolean(T);
	if (T) {
	Hash.AddType(T);
	}
	}

	void AddNestedNameSpecifier(const NestedNameSpecifier *NNS) {
	Hash.AddBoolean(NNS);
	if (NNS) {
	Hash.AddNestedNameSpecifier(NNS);
	}
	}

	void AddIdentifierInfo(const IdentifierInfo *II) {
	Hash.AddBoolean(II);
	if (II) {
	Hash.AddIdentifierInfo(II);
	}
	}

	// Adds the qualifier set as its opaque integer value.
	void VisitQualifiers(Qualifiers Quals) {
	ID.AddInteger(Quals.getAsOpaqueValue());
	}

	// Entry point: hash the type class, then dispatch to the Visit* methods.
	void Visit(const Type *T) {
	ID.AddInteger(T->getTypeClass());
	Inherited::Visit(T);
	}

	// End of every Visit* chain below; adds nothing itself.
	void VisitType(const Type *T) {}

	void VisitAdjustedType(const AdjustedType *T) {
	AddQualType(T->getOriginalType());
	AddQualType(T->getAdjustedType());
	VisitType(T);
	}

	// Adds the decayed and pointee types, then everything VisitAdjustedType adds.
	void VisitDecayedType(const DecayedType *T) {
	AddQualType(T->getDecayedType());
	AddQualType(T->getPointeeType());
	VisitAdjustedType(T);
	}

	// Common part of all array types: element type, size modifier and index
	// type qualifiers.
	void VisitArrayType(const ArrayType *T) {
	AddQualType(T->getElementType());
	ID.AddInteger(T->getSizeModifier());
	VisitQualifiers(T->getIndexTypeQualifiers());
	VisitType(T);
	}
	// Adds the constant size before the common array part.
	void VisitConstantArrayType(const ConstantArrayType *T) {
	T->getSize().Profile(ID);
	VisitArrayType(T);
	}

	// Adds the size expression before the common array part.
	void VisitDependentSizedArrayType(const DependentSizedArrayType *T) {
	AddStmt(T->getSizeExpr());
	VisitArrayType(T);
	}

	void VisitIncompleteArrayType(const IncompleteArrayType *T) {
	VisitArrayType(T);
	}

	// Adds the size expression before the common array part.
	void VisitVariableArrayType(const VariableArrayType *T) {
	AddStmt(T->getSizeExpr());
	VisitArrayType(T);
	}

	void VisitBuiltinType(const BuiltinType *T) {
	ID.AddInteger(T->getKind());
	VisitType(T);
	}

	// Common part of function types: return type, ExtInfo profile and the
	// const/volatile/restrict flags.
	void VisitFunctionType(const FunctionType *T) {
	AddQualType(T->getReturnType());
	T->getExtInfo().Profile(ID);
	Hash.AddBoolean(T->isConst());
	Hash.AddBoolean(T->isVolatile());
	Hash.AddBoolean(T->isRestrict());
	VisitType(T);
	}

	void VisitFunctionNoProtoType(const FunctionNoProtoType *T) {
	VisitFunctionType(T);
	}

	// Adds the parameter count and each parameter type before the common
	// function part.
	void VisitFunctionProtoType(const FunctionProtoType *T) {
	ID.AddInteger(T->getNumParams());
	for (auto ParamType : T->getParamTypes())
	AddQualType(ParamType);

	VisitFunctionType(T);
	}

	// Adds the typedef declaration, the qualifiers of its immediate underlying
	// type, then the first non-typedef type reached by following the chain of
	// underlying typedefs.
	void VisitTypedefType(const TypedefType *T) {
	AddDecl(T->getDecl());
	QualType UnderlyingType = T->getDecl()->getUnderlyingType();
	VisitQualifiers(UnderlyingType.getQualifiers());
	while (const TypedefType *Underlying =
	dyn_cast<TypedefType>(UnderlyingType.getTypePtr())) {
	UnderlyingType = Underlying->getDecl()->getUnderlyingType();
	}
	AddType(UnderlyingType.getTypePtr());
	VisitType(T);
	}

	void VisitTagType(const TagType *T) {
	AddDecl(T->getDecl());
	VisitType(T);
	}

	void VisitRecordType(const RecordType *T) { VisitTagType(T); }
	void VisitEnumType(const EnumType *T) { VisitTagType(T); }

	// Common part of keyword-carrying types: the elaborated keyword.
	void VisitTypeWithKeyword(const TypeWithKeyword *T) {
	ID.AddInteger(T->getKeyword());
	VisitType(T);
	};

	void VisitDependentNameType(const DependentNameType *T) {
	AddNestedNameSpecifier(T->getQualifier());
	AddIdentifierInfo(T->getIdentifier());
	VisitTypeWithKeyword(T);
	}

	// Adds the identifier, qualifier, argument count and each template
	// argument, then the keyword.
	void VisitDependentTemplateSpecializationType(
	const DependentTemplateSpecializationType *T) {
	AddIdentifierInfo(T->getIdentifier());
	AddNestedNameSpecifier(T->getQualifier());
	ID.AddInteger(T->getNumArgs());
	for (const auto &TA : T->template_arguments()) {
	Hash.AddTemplateArgument(TA);
	}
	VisitTypeWithKeyword(T);
	}

	void VisitElaboratedType(const ElaboratedType *T) {
	AddNestedNameSpecifier(T->getQualifier());
	AddQualType(T->getNamedType());
	VisitTypeWithKeyword(T);
	}

	// Adds the argument count, each template argument, then the template name.
	void VisitTemplateSpecializationType(const TemplateSpecializationType *T) {
	ID.AddInteger(T->getNumArgs());
	for (const auto &TA : T->template_arguments()) {
	Hash.AddTemplateArgument(TA);
	}
	Hash.AddTemplateName(T->getTemplateName());
	VisitType(T);
	}

	// Adds depth, index, the parameter-pack flag and the parameter's
	// declaration. Unlike the other Visit* methods this does not chain to
	// VisitType.
	void VisitTemplateTypeParmType(const TemplateTypeParmType *T) {
	ID.AddInteger(T->getDepth());
	ID.AddInteger(T->getIndex());
	Hash.AddBoolean(T->isParameterPack());
	AddDecl(T->getDecl());
	}
	};

	// Adds type T to the hash. Each distinct pointer gets an index equal to the
	// map size at first insertion; that index is always hashed, and the type's
	// contents are visited on first encounter only. T must be non-null
	// (asserted).
	void ODRHash::AddType(const Type *T) {
	assert(T && "Expecting non-null pointer.");
	auto Result = TypeMap.insert(std::make_pair(T, TypeMap.size()));
	ID.AddInteger(Result.first->second);
	// On first encounter of a Type pointer, process it. Every time afterwards,
	// only the index value is needed.
	if (!Result.second) {
	return;
	}

	ODRTypeVisitor(ID, *this).Visit(T);
	}

	// Adds a qualified type to the hash. The null flag is always recorded; for
	// a non-null type the qualifiers (as an opaque integer) and the unqualified
	// type from the split follow.
	void ODRHash::AddQualType(QualType T) {
	AddBoolean(T.isNull());
	if (T.isNull())
	return;
	SplitQualType split = T.split();
	ID.AddInteger(split.Quals.getAsOpaqueValue());
	AddType(split.Ty);
	}

	// Queues a boolean in Bools; CalculateHash later packs the queued values
	// into integers and adds them to ID.
	void ODRHash::AddBoolean(bool Value) {
	Bools.push_back(Value);
	}
	Index: head/contrib/llvm/tools/clang/lib/AST/StmtCXX.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/AST/StmtCXX.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/AST/StmtCXX.cpp (revision 322320)
	@@ -1,117 +1,131 @@
	//===--- StmtCXX.cpp - Classes for representing C++ statements ------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the subclasses of Stmt class declared in StmtCXX.h
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/StmtCXX.h"

	#include "clang/AST/ASTContext.h"

	using namespace clang;

	// Returns the type of the exception declaration, or a default-constructed
	// QualType when this handler has no exception declaration.
	QualType CXXCatchStmt::getCaughtType() const {
	  return ExceptionDecl ? ExceptionDecl->getType() : QualType();
	}

	// Allocates a CXXTryStmt from the ASTContext with trailing room for the try
	// block and every handler, then constructs it in that storage.
	CXXTryStmt *CXXTryStmt::Create(const ASTContext &C, SourceLocation tryLoc,
	                               Stmt *tryBlock, ArrayRef<Stmt *> handlers) {
	  std::size_t Size = sizeof(CXXTryStmt);
	  // One Stmt* slot for the try block plus one per handler.
	  Size += ((handlers.size() + 1) * sizeof(Stmt *));

	  void *Mem = C.Allocate(Size, alignof(CXXTryStmt));
	  return new (Mem) CXXTryStmt(tryLoc, tryBlock, handlers);
	}

	// Allocates an empty CXXTryStmt from the ASTContext with trailing room for
	// the try block and numHandlers handlers.
	CXXTryStmt *CXXTryStmt::Create(const ASTContext &C, EmptyShell Empty,
	                               unsigned numHandlers) {
	  // Trailing storage: one Stmt* for the try block plus one per handler.
	  const std::size_t TrailingBytes = (numHandlers + 1) * sizeof(Stmt *);
	  void *Storage =
	      C.Allocate(sizeof(CXXTryStmt) + TrailingBytes, alignof(CXXTryStmt));
	  return new (Storage) CXXTryStmt(Empty, numHandlers);
	}

	// Stores the try block and the handlers in the storage that immediately
	// follows this object (sized by CXXTryStmt::Create): slot 0 is the try
	// block, slots 1..N are the handlers in order.
	CXXTryStmt::CXXTryStmt(SourceLocation tryLoc, Stmt *tryBlock,
	                       ArrayRef<Stmt *> handlers)
	    : Stmt(CXXTryStmtClass), TryLoc(tryLoc), NumHandlers(handlers.size()) {
	  Stmt **Stmts = reinterpret_cast<Stmt **>(this + 1);
	  Stmts[0] = tryBlock;
	  std::copy(handlers.begin(), handlers.end(), Stmts + 1);
	}

	// Records the four source locations and stores each sub-statement in its
	// named SubExprs slot.
	CXXForRangeStmt::CXXForRangeStmt(DeclStmt *Range,
	                                 DeclStmt *BeginStmt, DeclStmt *EndStmt,
	                                 Expr *Cond, Expr *Inc, DeclStmt *LoopVar,
	                                 Stmt *Body, SourceLocation FL,
	                                 SourceLocation CAL, SourceLocation CL,
	                                 SourceLocation RPL)
	    : Stmt(CXXForRangeStmtClass), ForLoc(FL), CoawaitLoc(CAL), ColonLoc(CL),
	      RParenLoc(RPL) {
	  SubExprs[RANGE] = Range;
	  SubExprs[BEGINSTMT] = BeginStmt;
	  SubExprs[ENDSTMT] = EndStmt;
	  SubExprs[COND] = Cond;
	  SubExprs[INC] = Inc;
	  SubExprs[LOOPVAR] = LoopVar;
	  SubExprs[BODY] = Body;
	}

	// Returns the initializer of the range variable. The range statement is
	// expected to declare exactly one VarDecl (asserted).
	Expr *CXXForRangeStmt::getRangeInit() {
	  VarDecl *RangeVar =
	      dyn_cast_or_null<VarDecl>(getRangeStmt()->getSingleDecl());
	  assert(RangeVar && "for-range should have a single var decl");
	  return RangeVar->getInit();
	}

	// Const overload: casts away constness to reuse the non-const
	// getRangeInit() and returns its result as a pointer-to-const.
	const Expr *CXXForRangeStmt::getRangeInit() const {
	return const_cast<CXXForRangeStmt *>(this)->getRangeInit();
	}

	// Returns the loop variable: the single declaration of the loop-variable
	// statement, which must exist (asserted) and be a VarDecl.
	VarDecl *CXXForRangeStmt::getLoopVariable() {
	  auto *LoopVarDeclStmt = cast<DeclStmt>(getLoopVarStmt());
	  Decl *LoopVar = LoopVarDeclStmt->getSingleDecl();
	  assert(LoopVar && "No loop variable in CXXForRangeStmt");
	  return cast<VarDecl>(LoopVar);
	}

	// Const overload: casts away constness to reuse the non-const
	// getLoopVariable() and returns its result as a pointer-to-const.
	const VarDecl *CXXForRangeStmt::getLoopVariable() const {
	return const_cast<CXXForRangeStmt *>(this)->getLoopVariable();
	}

	// Allocates a CoroutineBodyStmt from the ASTContext with trailing room for
	// the fixed sub-statements plus one entry per parameter move, then
	// constructs it from Args.
	CoroutineBodyStmt *CoroutineBodyStmt::Create(
	    const ASTContext &C, CoroutineBodyStmt::CtorArgs const &Args) {
	  const std::size_t Bytes = totalSizeToAlloc<Stmt *>(
	      CoroutineBodyStmt::FirstParamMove + Args.ParamMoves.size());
	  void *Storage = C.Allocate(Bytes, alignof(CoroutineBodyStmt));
	  return new (Storage) CoroutineBodyStmt(Args);
	}

	// Empty-shell factory (added in this revision): allocates room for the fixed
	// sub-statements plus NumParams parameter moves, constructs the statement
	// from default CtorArgs, then sets NumParams and null-fills the
	// parameter-move slots.
	+CoroutineBodyStmt *CoroutineBodyStmt::Create(const ASTContext &C, EmptyShell,
	+ unsigned NumParams) {
	+ std::size_t Size = totalSizeToAlloc<Stmt *>(
	+ CoroutineBodyStmt::FirstParamMove + NumParams);
	+
	+ void *Mem = C.Allocate(Size, alignof(CoroutineBodyStmt));
	+ auto *Result = new (Mem) CoroutineBodyStmt(CtorArgs());
	+ Result->NumParams = NumParams;
	+ auto *ParamBegin = Result->getStoredStmts() + SubStmt::FirstParamMove;
	+ std::uninitialized_fill(ParamBegin, ParamBegin + NumParams,
	+ static_cast<Stmt *>(nullptr));
	+ return Result;
	+}
	+
	// Copies every sub-statement from Args into its named slot of the stored
	// statement array, then copies the parameter moves into the range returned
	// by getParamMoves(). NumParams is taken from Args.ParamMoves.size().
	CoroutineBodyStmt::CoroutineBodyStmt(CoroutineBodyStmt::CtorArgs const &Args)
	: Stmt(CoroutineBodyStmtClass), NumParams(Args.ParamMoves.size()) {
	Stmt **SubStmts = getStoredStmts();
	SubStmts[CoroutineBodyStmt::Body] = Args.Body;
	SubStmts[CoroutineBodyStmt::Promise] = Args.Promise;
	SubStmts[CoroutineBodyStmt::InitSuspend] = Args.InitialSuspend;
	SubStmts[CoroutineBodyStmt::FinalSuspend] = Args.FinalSuspend;
	SubStmts[CoroutineBodyStmt::OnException] = Args.OnException;
	SubStmts[CoroutineBodyStmt::OnFallthrough] = Args.OnFallthrough;
	SubStmts[CoroutineBodyStmt::Allocate] = Args.Allocate;
	SubStmts[CoroutineBodyStmt::Deallocate] = Args.Deallocate;
	SubStmts[CoroutineBodyStmt::ReturnValue] = Args.ReturnValue;
	SubStmts[CoroutineBodyStmt::ResultDecl] = Args.ResultDecl;
	SubStmts[CoroutineBodyStmt::ReturnStmt] = Args.ReturnStmt;
	SubStmts[CoroutineBodyStmt::ReturnStmtOnAllocFailure] =
	Args.ReturnStmtOnAllocFailure;
	// getParamMoves() is cast to a mutable pointer so it can be used as the
	// copy destination.
	std::copy(Args.ParamMoves.begin(), Args.ParamMoves.end(),
	const_cast<Stmt **>(getParamMoves().data()));
	}
	Index: head/contrib/llvm/tools/clang/lib/Basic/Targets.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Basic/Targets.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Basic/Targets.cpp (revision 322320)
	@@ -1,10038 +1,10032 @@
	//===--- Targets.cpp - Implement target feature support -------------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements construction of a TargetInfo object from a
	// target triple.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/Basic/Builtins.h"
	#include "clang/Basic/Cuda.h"
	#include "clang/Basic/Diagnostic.h"
	#include "clang/Basic/LangOptions.h"
	#include "clang/Basic/MacroBuilder.h"
	#include "clang/Basic/TargetBuiltins.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Basic/TargetOptions.h"
	#include "clang/Basic/Version.h"
	#include "clang/Frontend/CodeGenOptions.h"
	#include "llvm/ADT/APFloat.h"
	#include "llvm/ADT/STLExtras.h"
	#include "llvm/ADT/StringExtras.h"
	#include "llvm/ADT/StringRef.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/ADT/Triple.h"
	#include "llvm/MC/MCSectionMachO.h"
	#include "llvm/Support/ErrorHandling.h"
	#include "llvm/Support/TargetParser.h"
	#include <algorithm>
	#include <memory>

	using namespace clang;

	//===----------------------------------------------------------------------===//
	// Common code shared among targets.
	//===----------------------------------------------------------------------===//

	/// DefineStd - Define a macro name and standard variants. For example if
	/// MacroName is "unix", then this will define "__unix", "__unix__", and "unix"
	/// when in GNU mode.
	///
	/// MacroName must not begin with an underscore (asserted). The plain name is
	/// defined only when Opts.GNUMode is set; the two underscore-decorated forms
	/// are always defined.
	static void DefineStd(MacroBuilder &Builder, StringRef MacroName,
	const LangOptions &Opts) {
	assert(MacroName[0] != '_' && "Identifier should be in the user's namespace");

	// If in GNU mode (e.g. -std=gnu99 but not -std=c99) define the raw identifier
	// in the user's namespace.
	if (Opts.GNUMode)
	Builder.defineMacro(MacroName);

	// Define __unix.
	Builder.defineMacro("__" + MacroName);

	// Define __unix__.
	Builder.defineMacro("__" + MacroName + "__");
	}

	/// Defines "__<CPUName>" and "__<CPUName>__", and additionally
	/// "__tune_<CPUName>__" when Tuning is true (the default).
	static void defineCPUMacros(MacroBuilder &Builder, StringRef CPUName,
	bool Tuning = true) {
	Builder.defineMacro("__" + CPUName);
	Builder.defineMacro("__" + CPUName + "__");
	if (Tuning)
	Builder.defineMacro("__tune_" + CPUName + "__");
	}

	// Forward declaration only; the definition is not in this part of the file.
	static TargetInfo *AllocateTarget(const llvm::Triple &Triple,
	const TargetOptions &Opts);

	//===----------------------------------------------------------------------===//
	// Defines specific to certain operating systems.
	//===----------------------------------------------------------------------===//

	namespace {
	// Mixin that layers operating-system macro definitions on top of an
	// architecture target TgtInfo. Subclasses implement getOSDefines().
	template<typename TgtInfo>
	class OSTargetInfo : public TgtInfo {
	protected:
	// Hook for the OS-specific macros; pure virtual.
	virtual void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const=0;
	public:
	OSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: TgtInfo(Triple, Opts) {}
	// Emits the base target's defines first, then the OS defines for this
	// target's triple.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	TgtInfo::getTargetDefines(Opts, Builder);
	getOSDefines(Opts, TgtInfo::getTriple(), Builder);
	}

	};

	// CloudABI Target
	// Defines __CloudABI__, __ELF__ and the ISO 10646 / UTF-16 / UTF-32 macros;
	// Opts and Triple are not consulted.
	template <typename Target>
	class CloudABITargetInfo : public OSTargetInfo<Target> {
	protected:
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__CloudABI__");
	Builder.defineMacro("__ELF__");

	// CloudABI uses ISO/IEC 10646:2012 for wchar_t, char16_t and char32_t.
	Builder.defineMacro("__STDC_ISO_10646__", "201206L");
	Builder.defineMacro("__STDC_UTF_16__");
	Builder.defineMacro("__STDC_UTF_32__");
	}

	public:
	CloudABITargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}
	};

	// Ananas target
	// Defines __Ananas__ and __ELF__; Opts and Triple are not consulted.
	template<typename Target>
	class AnanasTargetInfo : public OSTargetInfo<Target> {
	protected:
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// Ananas defines
	Builder.defineMacro("__Ananas__");
	Builder.defineMacro("__ELF__");
	}
	public:
	AnanasTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {}
	};

	/// Emits the predefined macros shared by the Darwin-family targets and
	/// reports the platform back to the caller.
	///
	/// \param Builder receives the macro definitions.
	/// \param Opts language options consulted for sanitizer, ObjC, static and
	///        thread settings.
	/// \param Triple target triple; supplies the OS type and version.
	/// \param PlatformName [out] set to "macos" for macOS triples, otherwise to
	///        the triple's OS type name.
	/// \param PlatformMinVersion [out] set to the version read from the triple.
	static void getDarwinDefines(MacroBuilder &Builder, const LangOptions &Opts,
	                             const llvm::Triple &Triple,
	                             StringRef &PlatformName,
	                             VersionTuple &PlatformMinVersion) {
	  Builder.defineMacro("__APPLE_CC__", "6000");
	  Builder.defineMacro("__APPLE__");
	  Builder.defineMacro("__STDC_NO_THREADS__");
	  Builder.defineMacro("OBJC_NEW_PROPERTIES");
	  // AddressSanitizer doesn't play well with source fortification, which is on
	  // by default on Darwin.
	  if (Opts.Sanitize.has(SanitizerKind::Address))
	    Builder.defineMacro("_FORTIFY_SOURCE", "0");

	  // Darwin defines __weak, __strong, and __unsafe_unretained even in C mode.
	  if (!Opts.ObjC1) {
	    // __weak is always defined, for use in blocks and with objc pointers.
	    Builder.defineMacro("__weak", "__attribute__((objc_gc(weak)))");
	    Builder.defineMacro("__strong", "");
	    Builder.defineMacro("__unsafe_unretained", "");
	  }

	  if (Opts.Static)
	    Builder.defineMacro("__STATIC__");
	  else
	    Builder.defineMacro("__DYNAMIC__");

	  if (Opts.POSIXThreads)
	    Builder.defineMacro("_REENTRANT");

	  // Get the platform type and version number from the triple.
	  unsigned Maj, Min, Rev;
	  if (Triple.isMacOSX()) {
	    Triple.getMacOSXVersion(Maj, Min, Rev);
	    PlatformName = "macos";
	  } else {
	    Triple.getOSVersion(Maj, Min, Rev);
	    PlatformName = llvm::Triple::getOSTypeName(Triple.getOS());
	  }

	  // If -target arch-pc-win32-macho option specified, we're
	  // generating code for Win32 ABI. No need to emit
	  // __ENVIRONMENT_XX_OS_VERSION_MIN_REQUIRED__.
	  if (PlatformName == "win32") {
	    PlatformMinVersion = VersionTuple(Maj, Min, Rev);
	    return;
	  }

	  // Set the appropriate OS version define. Each version string is built
	  // digit by digit into a small fixed buffer; the asserts bound every
	  // component so each fits in its allotted digits.
	  if (Triple.isiOS()) {
	    assert(Maj < 100 && Min < 100 && Rev < 100 && "Invalid version!");
	    char Str[7];
	    if (Maj < 10) {
	      Str[0] = '0' + Maj;
	      Str[1] = '0' + (Min / 10);
	      Str[2] = '0' + (Min % 10);
	      Str[3] = '0' + (Rev / 10);
	      Str[4] = '0' + (Rev % 10);
	      Str[5] = '\0';
	    } else {
	      // Handle versions >= 10.
	      Str[0] = '0' + (Maj / 10);
	      Str[1] = '0' + (Maj % 10);
	      Str[2] = '0' + (Min / 10);
	      Str[3] = '0' + (Min % 10);
	      Str[4] = '0' + (Rev / 10);
	      Str[5] = '0' + (Rev % 10);
	      Str[6] = '\0';
	    }
	    if (Triple.isTvOS())
	      Builder.defineMacro("__ENVIRONMENT_TV_OS_VERSION_MIN_REQUIRED__", Str);
	    else
	      Builder.defineMacro("__ENVIRONMENT_IPHONE_OS_VERSION_MIN_REQUIRED__",
	                          Str);

	  } else if (Triple.isWatchOS()) {
	    assert(Maj < 10 && Min < 100 && Rev < 100 && "Invalid version!");
	    char Str[6];
	    Str[0] = '0' + Maj;
	    Str[1] = '0' + (Min / 10);
	    Str[2] = '0' + (Min % 10);
	    Str[3] = '0' + (Rev / 10);
	    Str[4] = '0' + (Rev % 10);
	    Str[5] = '\0';
	    Builder.defineMacro("__ENVIRONMENT_WATCH_OS_VERSION_MIN_REQUIRED__", Str);
	  } else if (Triple.isMacOSX()) {
	    // Note that the Driver allows versions which aren't representable in the
	    // define (because we only get a single digit for the minor and micro
	    // revision numbers). So, we limit them to the maximum representable
	    // version.
	    assert(Maj < 100 && Min < 100 && Rev < 100 && "Invalid version!");
	    char Str[7];
	    if (Maj < 10 || (Maj == 10 && Min < 10)) {
	      Str[0] = '0' + (Maj / 10);
	      Str[1] = '0' + (Maj % 10);
	      Str[2] = '0' + std::min(Min, 9U);
	      Str[3] = '0' + std::min(Rev, 9U);
	      Str[4] = '\0';
	    } else {
	      // Handle versions > 10.9.
	      Str[0] = '0' + (Maj / 10);
	      Str[1] = '0' + (Maj % 10);
	      Str[2] = '0' + (Min / 10);
	      Str[3] = '0' + (Min % 10);
	      Str[4] = '0' + (Rev / 10);
	      Str[5] = '0' + (Rev % 10);
	      Str[6] = '\0';
	    }
	    Builder.defineMacro("__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__", Str);
	  }

	  // Tell users about the kernel if there is one.
	  if (Triple.isOSDarwin())
	    Builder.defineMacro("__MACH__");

	  // The Watch ABI uses Dwarf EH.
	  if (Triple.isWatchABI())
	    Builder.defineMacro("__ARM_DWARF_EH__");

	  PlatformMinVersion = VersionTuple(Maj, Min, Rev);
	}

	// Darwin-family target: OS defines come from getDarwinDefines(), and the
	// constructor decides TLS support from the triple's OS, version and
	// architecture.
	template<typename Target>
	class DarwinTargetInfo : public OSTargetInfo<Target> {
	protected:
	  // Delegates to getDarwinDefines(), which also fills in this target's
	  // PlatformName and PlatformMinVersion.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    getDarwinDefines(Builder, Opts, Triple, this->PlatformName,
	                     this->PlatformMinVersion);
	  }

	public:
	  DarwinTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // By default, no TLS, and we whitelist permitted architecture/OS
	    // combinations.
	    this->TLSSupported = false;

	    if (Triple.isMacOSX())
	      this->TLSSupported = !Triple.isMacOSXVersionLT(10, 7);
	    else if (Triple.isiOS()) {
	      // 64-bit iOS supported it from 8 onwards, 32-bit from 9 onwards.
	      if (Triple.getArch() == llvm::Triple::x86_64 ||
	          Triple.getArch() == llvm::Triple::aarch64)
	        this->TLSSupported = !Triple.isOSVersionLT(8);
	      else if (Triple.getArch() == llvm::Triple::x86 ||
	               Triple.getArch() == llvm::Triple::arm ||
	               Triple.getArch() == llvm::Triple::thumb)
	        this->TLSSupported = !Triple.isOSVersionLT(9);
	    } else if (Triple.isWatchOS())
	      this->TLSSupported = !Triple.isOSVersionLT(2);

	    this->MCountName = "\01mcount";
	  }

	  // Returns whatever MCSectionMachO::ParseSectionSpecifier reports for SR;
	  // the parsed pieces themselves are discarded.
	  std::string isValidSectionSpecifier(StringRef SR) const override {
	    // Let MCSectionMachO validate this.
	    StringRef Segment, Section;
	    unsigned TAA, StubSize;
	    bool HasTAA;
	    return llvm::MCSectionMachO::ParseSectionSpecifier(SR, Segment, Section,
	                                                       TAA, HasTAA, StubSize);
	  }

	  const char *getStaticInitSectionSpecifier() const override {
	    // FIXME: We should return 0 when building kexts.
	    return "__TEXT,__StaticInit,regular,pure_instructions";
	  }

	  /// Darwin does not support protected visibility. Darwin's "default"
	  /// is very similar to ELF's "protected"; Darwin requires a "weak"
	  /// attribute on declarations that can be dynamically replaced.
	  bool hasProtectedVisibility() const override {
	    return false;
	  }

	  unsigned getExnObjectAlignment() const override {
	    // The alignment of an exception object is 8-bytes for darwin since
	    // libc++abi doesn't declare _Unwind_Exception with __attribute__((aligned))
	    // and therefore doesn't guarantee 16-byte alignment.
	    // NOTE(review): the 64 returned here is presumably a bit count (8 bytes);
	    // confirm against the base-class contract.
	    return 64;
	  }
	};


	// DragonFlyBSD Target
	//
	// Predefines the DragonFly identification macros and fixes the name of the
	// profiling hook.
	template<typename Target>
	class DragonFlyBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the DragonFly predefines; the list is based off of gcc output.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__DragonFly__");
	    Builder.defineMacro("__DragonFly_cc_version", "100001");
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
	    Builder.defineMacro("__tune_i386__");
	    DefineStd(Builder, "unix", Opts);
	  }

	public:
	  DragonFlyBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // x86, x86_64 and every other architecture share one hook name, so no
	    // dispatch on the architecture is needed.
	    this->MCountName = ".mcount";
	  }
	};

	// Fallback for builds that do not supply FREEBSD_CC_VERSION. The value 0U
	// makes FreeBSDTargetInfo::getOSDefines derive __FreeBSD_cc_version from the
	// OS release instead.
	#ifndef FREEBSD_CC_VERSION
	#define FREEBSD_CC_VERSION 0U
	#endif

	// FreeBSD Target
	//
	// Predefines the FreeBSD identification macros and selects the
	// architecture-specific name of the profiling hook.
	template<typename Target>
	class FreeBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the FreeBSD predefines; the list is based off of gcc output.
	  ///
	  /// __FreeBSD__ carries the major OS version from the triple, or 8 when the
	  /// triple reports 0. __FreeBSD_cc_version is FREEBSD_CC_VERSION when that
	  /// is non-zero, and <major> * 100000 + 1 otherwise.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    const unsigned TripleMajor = Triple.getOSMajorVersion();
	    const unsigned Major = TripleMajor != 0U ? TripleMajor : 8U;

	    const unsigned Configured = FREEBSD_CC_VERSION;
	    const unsigned CompilerVersion =
	        Configured != 0U ? Configured : Major * 100000U + 1U;

	    Builder.defineMacro("__FreeBSD__", Twine(Major));
	    Builder.defineMacro("__FreeBSD_cc_version", Twine(CompilerVersion));
	    Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");

	    // On FreeBSD, wchar_t contains the number of the code point as
	    // used by the character set of the locale. These character sets are
	    // not necessarily a superset of ASCII.
	    //
	    // FIXME: This is wrong; the macro refers to the numerical values
	    // of wchar_t literals, which are not locale-dependent. However,
	    // FreeBSD systems apparently depend on us getting this wrong, and
	    // setting this to 1 is conforming even if all the basic source
	    // character literals have the same encoding as char and wchar_t.
	    Builder.defineMacro("__STDC_MB_MIGHT_NEQ_WC__", "1");
	  }

	public:
	  /// Pick the profiling hook name that matches the triple's architecture.
	  FreeBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    switch (Triple.getArch()) {
	    case llvm::Triple::mips:
	    case llvm::Triple::mipsel:
	    case llvm::Triple::ppc:
	    case llvm::Triple::ppc64:
	    case llvm::Triple::ppc64le:
	      this->MCountName = "_mcount";
	      break;
	    case llvm::Triple::arm:
	      this->MCountName = "__mcount";
	      break;
	    case llvm::Triple::x86:
	    case llvm::Triple::x86_64:
	    default:
	      // x86 and anything not listed above.
	      this->MCountName = ".mcount";
	      break;
	    }
	  }
	};

	// GNU/kFreeBSD Target
	//
	// Predefines the macros for a FreeBSD kernel paired with glibc.
	template<typename Target>
	class KFreeBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the GNU/kFreeBSD predefines; the list is based off of gcc output.
	  /// _REENTRANT depends on POSIX threads and _GNU_SOURCE on C++ mode.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__FreeBSD_kernel__");
	    Builder.defineMacro("__GLIBC__");
	    Builder.defineMacro("__ELF__");

	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	  }

	public:
	  KFreeBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {}
	};

	// Haiku Target
	//
	// Predefines the Haiku macros and overrides several basic type choices.
	template<typename Target>
	class HaikuTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the Haiku predefines; the list is based off of gcc output.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__HAIKU__");
	    Builder.defineMacro("__ELF__");
	    DefineStd(Builder, "unix", Opts);
	  }

	public:
	  HaikuTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // size_t is unsigned long; intptr_t, ptrdiff_t and the process id type
	    // are all signed long.
	    this->SizeType = TargetInfo::UnsignedLong;
	    this->IntPtrType = TargetInfo::SignedLong;
	    this->PtrDiffType = TargetInfo::SignedLong;
	    this->ProcessIDType = TargetInfo::SignedLong;

	    // Thread-local storage is reported as unavailable.
	    this->TLSSupported = false;
	  }
	};

	// Minix Target
	//
	// Predefines the Minix identification macro and the _EM_* size macros.
	template<typename Target>
	class MinixTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the Minix predefines.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__minix", "3");

	    // _EM_*SIZE macros, each with a fixed value.
	    Builder.defineMacro("_EM_WSIZE", "4");
	    Builder.defineMacro("_EM_PSIZE", "4");
	    Builder.defineMacro("_EM_SSIZE", "2");
	    Builder.defineMacro("_EM_LSIZE", "4");
	    Builder.defineMacro("_EM_FSIZE", "4");
	    Builder.defineMacro("_EM_DSIZE", "8");

	    Builder.defineMacro("__ELF__");
	    DefineStd(Builder, "unix", Opts);
	  }

	public:
	  MinixTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {}
	};

	// Linux target
	//
	// Predefines the Linux (and, where applicable, Android) macros, selects
	// wint_t, and applies per-architecture profiling-hook and __float128
	// settings.
	template<typename Target>
	class LinuxTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the Linux predefines; the list is based off of gcc output.
	  ///
	  /// For Android triples this also records the platform name and minimum
	  /// version on the target, and defines __ANDROID_API__ when the
	  /// environment's major version is non-zero.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    DefineStd(Builder, "unix", Opts);
	    DefineStd(Builder, "linux", Opts);
	    Builder.defineMacro("__gnu_linux__");
	    Builder.defineMacro("__ELF__");

	    if (Triple.isAndroid()) {
	      Builder.defineMacro("__ANDROID__", "1");

	      unsigned Major, Minor, Micro;
	      Triple.getEnvironmentVersion(Major, Minor, Micro);
	      this->PlatformName = "android";
	      this->PlatformMinVersion = VersionTuple(Major, Minor, Micro);

	      if (Major) {
	        Builder.defineMacro("__ANDROID_API__", Twine(Major));
	      }
	    }

	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	    if (this->HasFloat128) {
	      Builder.defineMacro("__FLOAT128__");
	    }
	  }

	public:
	  /// Set wint_t to unsigned int, then apply the architecture-specific
	  /// profiling hook name or __float128 availability.
	  LinuxTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->WIntType = TargetInfo::UnsignedInt;

	    switch (Triple.getArch()) {
	    case llvm::Triple::mips:
	    case llvm::Triple::mipsel:
	    case llvm::Triple::mips64:
	    case llvm::Triple::mips64el:
	    case llvm::Triple::ppc:
	    case llvm::Triple::ppc64:
	    case llvm::Triple::ppc64le:
	      this->MCountName = "_mcount";
	      break;
	    case llvm::Triple::x86:
	    case llvm::Triple::x86_64:
	    case llvm::Triple::systemz:
	      this->HasFloat128 = true;
	      break;
	    default:
	      // Other architectures keep the inherited settings.
	      break;
	    }
	  }

	  /// Section specifier used for static initialization code.
	  const char *getStaticInitSectionSpecifier() const override {
	    return ".text.startup";
	  }
	};

	// NetBSD Target
	//
	// Predefines the NetBSD macros and fixes the name of the profiling hook.
	template<typename Target>
	class NetBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the NetBSD predefines; the list is based off of gcc output.
	  /// The ARM and Thumb architectures (both endiannesses) additionally get
	  /// __ARM_DWARF_EH__.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__NetBSD__");
	    Builder.defineMacro("__unix__");
	    Builder.defineMacro("__ELF__");
	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }

	    switch (Triple.getArch()) {
	    case llvm::Triple::arm:
	    case llvm::Triple::armeb:
	    case llvm::Triple::thumb:
	    case llvm::Triple::thumbeb:
	      Builder.defineMacro("__ARM_DWARF_EH__");
	      break;
	    default:
	      break;
	    }
	  }

	public:
	  NetBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->MCountName = "_mcount";
	  }
	};

	// OpenBSD Target
	//
	// Predefines the OpenBSD macros and applies per-architecture __float128
	// and profiling-hook settings.
	template<typename Target>
	class OpenBSDTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the OpenBSD predefines; the list is based off of gcc output.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__OpenBSD__");
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");

	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (this->HasFloat128) {
	      Builder.defineMacro("__FLOAT128__");
	    }
	  }

	public:
	  /// Enable __float128 on x86 and x86_64, and choose the profiling hook
	  /// name: "_mcount" on mips64, mips64el, ppc and sparcv9, "__mcount" on
	  /// everything else (x86 and x86_64 included).
	  OpenBSDTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    const auto Arch = Triple.getArch();

	    if (Arch == llvm::Triple::x86 || Arch == llvm::Triple::x86_64) {
	      this->HasFloat128 = true;
	    }

	    switch (Arch) {
	    case llvm::Triple::mips64:
	    case llvm::Triple::mips64el:
	    case llvm::Triple::ppc:
	    case llvm::Triple::sparcv9:
	      this->MCountName = "_mcount";
	      break;
	    default:
	      this->MCountName = "__mcount";
	      break;
	    }
	  }
	};

	// Bitrig Target
	//
	// Predefines the Bitrig macros and fixes the name of the profiling hook.
	template<typename Target>
	class BitrigTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the Bitrig predefines; the list is based off of gcc output.
	  /// The ARM and Thumb architectures (both endiannesses) additionally get
	  /// __ARM_DWARF_EH__.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__Bitrig__");
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");
	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }

	    switch (Triple.getArch()) {
	    case llvm::Triple::arm:
	    case llvm::Triple::armeb:
	    case llvm::Triple::thumb:
	    case llvm::Triple::thumbeb:
	      Builder.defineMacro("__ARM_DWARF_EH__");
	      break;
	    default:
	      break;
	    }
	  }

	public:
	  BitrigTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->MCountName = "__mcount";
	  }
	};

	// PSP Target
	//
	// Predefines the PSP identification macros.
	template<typename Target>
	class PSPTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the PSP predefines; the list is based on the output of the pspdev
	  /// gcc toolchain. Neither \p Opts nor \p Triple influences the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("PSP");
	    Builder.defineMacro("_PSP");
	    Builder.defineMacro("__psp__");
	    Builder.defineMacro("__ELF__");
	  }

	public:
	  // NOTE(review): the other OS target templates in this file forward
	  // (Triple, Opts) to OSTargetInfo; this one forwards Triple only. Confirm
	  // that OSTargetInfo still offers a matching one-argument constructor.
	  PSPTargetInfo(const llvm::Triple &Triple)
	      : OSTargetInfo<Target>(Triple) {}
	};

	// PS3 PPU Target
	//
	// Predefines the PS3 PPU macros and configures 32-bit long and pointer
	// types.
	template<typename Target>
	class PS3PPUTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the PS3 PPU predefines. Neither \p Opts nor \p Triple influences
	  /// the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__PPC__");
	    Builder.defineMacro("__PPU__");
	    Builder.defineMacro("__CELLOS_LV2__");
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__LP32__");
	    Builder.defineMacro("_ARCH_PPC64");
	    Builder.defineMacro("__powerpc64__");
	  }

	public:
	  PS3PPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // long and pointers are both 32 bits wide and 32-bit aligned.
	    this->LongAlign = 32;
	    this->LongWidth = 32;
	    this->PointerAlign = 32;
	    this->PointerWidth = 32;

	    // The 64-bit integer types map onto long long; size_t is unsigned int.
	    this->IntMaxType = TargetInfo::SignedLongLong;
	    this->Int64Type = TargetInfo::SignedLongLong;
	    this->SizeType = TargetInfo::UnsignedInt;

	    this->resetDataLayout("E-m:e-p:32:32-i64:64-n32:64");
	  }
	};

	/// OS layer for the PS4.
	///
	/// Predefines FreeBSD 9 style macros plus __ORBIS__, and applies the
	/// PS4-specific type, TLS alignment and bit-field settings.
	template <typename Target>
	class PS4OSTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the PS4 predefines. \p Triple does not influence the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__FreeBSD__", "9");
	    Builder.defineMacro("__FreeBSD_cc_version", "900001");
	    Builder.defineMacro("__KPRINTF_ATTRIBUTE__");
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__ORBIS__");
	  }

	public:
	  PS4OSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // wchar_t is unsigned short.
	    this->WCharType = this->UnsignedShort;

	    // On PS4, TLS variable cannot be aligned to more than 32 bytes (256 bits).
	    this->MaxTLSAlign = 256;

	    // On PS4, do not honor explicit bit field alignment,
	    // as in "__attribute__((aligned(2))) int b : 1;".
	    this->UseExplicitBitFieldAlignment = false;

	    // x86_64 and every other architecture share one profiling hook name.
	    this->MCountName = ".mcount";
	  }
	};

	// Solaris target
	//
	// Predefines the Solaris / SVR4 macros and the feature-test macros the
	// system headers expect.
	template<typename Target>
	class SolarisTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the Solaris predefines. \p Triple does not influence the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    DefineStd(Builder, "sun", Opts);
	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__svr4__");
	    Builder.defineMacro("__SVR4");

	    // Solaris headers require _XOPEN_SOURCE to be set to 600 for C99 and
	    // newer, but to 500 for everything else. feature_test.h has a check to
	    // ensure that you are not using C99 with an old version of X/Open or C89
	    // with a new version.
	    Builder.defineMacro("_XOPEN_SOURCE", Opts.C99 ? "600" : "500");

	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("__C99FEATURES__");
	    }

	    Builder.defineMacro("_LARGEFILE_SOURCE");
	    Builder.defineMacro("_LARGEFILE64_SOURCE");
	    Builder.defineMacro("__EXTENSIONS__");
	    Builder.defineMacro("_REENTRANT");
	  }

	public:
	  SolarisTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // wchar_t is signed int.
	    this->WCharType = this->SignedInt;
	    // FIXME: WIntType should be SignedLong
	  }
	};

	// Windows target
	//
	// Predefines _WIN32 and offers a helper that emits the macros Visual
	// Studio predefines.
	template<typename Target>
	class WindowsTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the single macro common to all Windows targets.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("_WIN32");
	  }

	  /// Emit the Visual Studio predefined macros selected by \p Opts.
	  ///
	  /// The _MSC_* version macros are only produced when
	  /// Opts.MSCompatibilityVersion is non-zero, and the extension macros only
	  /// when Opts.MicrosoftExt is set. _INTEGRAL_MAX_BITS is always defined.
	  void getVisualStudioDefines(const LangOptions &Opts,
	                              MacroBuilder &Builder) const {
	    // C++-only feature indicators.
	    if (Opts.CPlusPlus && Opts.RTTIData) {
	      Builder.defineMacro("_CPPRTTI");
	    }
	    if (Opts.CPlusPlus && Opts.CXXExceptions) {
	      Builder.defineMacro("_CPPUNWIND");
	    }

	    if (Opts.Bool) {
	      Builder.defineMacro("__BOOL_DEFINED");
	    }

	    if (!Opts.CharIsSigned) {
	      Builder.defineMacro("_CHAR_UNSIGNED");
	    }

	    // FIXME: POSIXThreads isn't exactly the option this should be defined for,
	    // but it works for now.
	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_MT");
	    }

	    if (Opts.MSCompatibilityVersion) {
	      // _MSC_VER keeps the leading digits of the full version number.
	      Builder.defineMacro("_MSC_VER",
	                          Twine(Opts.MSCompatibilityVersion / 100000));
	      Builder.defineMacro("_MSC_FULL_VER", Twine(Opts.MSCompatibilityVersion));
	      // FIXME We cannot encode the revision information into 32-bits
	      Builder.defineMacro("_MSC_BUILD", Twine(1));

	      if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) {
	        if (Opts.CPlusPlus11) {
	          Builder.defineMacro("_HAS_CHAR16_T_LANGUAGE_SUPPORT", Twine(1));
	        }

	        if (Opts.CPlusPlus1z) {
	          Builder.defineMacro("_MSVC_LANG", "201403L");
	        } else if (Opts.CPlusPlus14) {
	          Builder.defineMacro("_MSVC_LANG", "201402L");
	        }
	      }
	    }

	    if (Opts.MicrosoftExt) {
	      Builder.defineMacro("_MSC_EXTENSIONS");

	      if (Opts.CPlusPlus11) {
	        Builder.defineMacro("_RVALUE_REFERENCES_V2_SUPPORTED");
	        Builder.defineMacro("_RVALUE_REFERENCES_SUPPORTED");
	        Builder.defineMacro("_NATIVE_NULLPTR_SUPPORTED");
	      }
	    }

	    Builder.defineMacro("_INTEGRAL_MAX_BITS", "64");
	  }

	public:
	  WindowsTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {}
	};

	/// OS layer for Native Client.
	///
	/// Predefines the NaCl macros and sets the type widths, alignments and
	/// (for some architectures) the data layout string.
	template <typename Target>
	class NaClTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the NaCl predefines. _REENTRANT depends on POSIX threads and
	  /// _GNU_SOURCE on C++ mode; \p Triple does not influence the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }

	    DefineStd(Builder, "unix", Opts);
	    Builder.defineMacro("__ELF__");
	    Builder.defineMacro("__native_client__");
	  }

	public:
	  /// Apply the NaCl type configuration, then install the data layout for
	  /// the architectures that are not handled by their own target code.
	  NaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    // long and pointers: 32 bits wide, 32-bit aligned.
	    this->LongAlign = 32;
	    this->LongWidth = 32;
	    this->PointerAlign = 32;
	    this->PointerWidth = 32;
	    this->IntMaxType = TargetInfo::SignedLongLong;
	    this->Int64Type = TargetInfo::SignedLongLong;

	    // double, long double and long long: 64-bit settings.
	    this->DoubleAlign = 64;
	    this->LongDoubleWidth = 64;
	    this->LongDoubleAlign = 64;
	    this->LongLongWidth = 64;
	    this->LongLongAlign = 64;

	    this->SizeType = TargetInfo::UnsignedInt;
	    this->PtrDiffType = TargetInfo::SignedInt;
	    this->IntPtrType = TargetInfo::SignedInt;
	    // RegParmMax is inherited from the underlying architecture.
	    this->LongDoubleFormat = &llvm::APFloat::IEEEdouble();

	    switch (Triple.getArch()) {
	    case llvm::Triple::arm:
	      // Handled in ARM's setABI().
	      break;
	    case llvm::Triple::x86:
	      this->resetDataLayout("e-m:e-p:32:32-i64:64-n8:16:32-S128");
	      break;
	    case llvm::Triple::x86_64:
	      this->resetDataLayout("e-m:e-p:32:32-i64:64-n8:16:32:64-S128");
	      break;
	    case llvm::Triple::mipsel:
	      // Handled on mips' setDataLayout.
	      break;
	    default:
	      // le32 is the only remaining architecture expected here.
	      assert(Triple.getArch() == llvm::Triple::le32);
	      this->resetDataLayout("e-p:32:32-i64:64");
	      break;
	    }
	  }
	};

	// Fuchsia Target
	//
	// Predefines the Fuchsia macros and fixes the name of the profiling hook.
	template<typename Target>
	class FuchsiaTargetInfo : public OSTargetInfo<Target> {
	protected:
	  /// Emit the Fuchsia predefines. \p Triple does not influence the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__Fuchsia__");
	    Builder.defineMacro("__ELF__");

	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }

	    // Required by the libc++ locale support.
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	  }

	public:
	  FuchsiaTargetInfo(const llvm::Triple &Triple,
	                    const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->MCountName = "__mcount";
	  }
	};

	// WebAssembly target
	//
	// Predefines the common platform macros, names the static-init section,
	// and selects the WebAssembly C++ ABI. Both overrides below are final and
	// sit in the class's default (private) section.
	template <typename Target>
	class WebAssemblyOSTargetInfo : public OSTargetInfo<Target> {
	  /// Emit the OS-level predefines. \p Triple does not influence the result.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const final {
	    // A common platform macro.
	    if (Opts.POSIXThreads) {
	      Builder.defineMacro("_REENTRANT");
	    }

	    // Follow g++ convention and predefine _GNU_SOURCE for C++.
	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	  }

	  /// As an optimization, group static init code together in a section.
	  const char *getStaticInitSectionSpecifier() const final {
	    return ".text.__startup";
	  }

	public:
	  explicit WebAssemblyOSTargetInfo(const llvm::Triple &Triple,
	                                   const TargetOptions &Opts)
	      : OSTargetInfo<Target>(Triple, Opts) {
	    this->MCountName = "__mcount";
	    this->TheCXXABI.set(TargetCXXABI::WebAssembly);
	  }
	};

	//===----------------------------------------------------------------------===//
	// Specific target implementations.
	//===----------------------------------------------------------------------===//

	// PPC abstract base class
	//
	// Holds the PowerPC state that is independent of pointer width: the
	// selected CPU name, the feature flags set by handleTargetFeatures, the ABI
	// string, CPU-name validation, and inline-asm constraint handling.
	class PPCTargetInfo : public TargetInfo {
	// Builtin table, defined after the class from BuiltinsPPC.def.
	static const Builtin::Info BuiltinInfo[];
	static const char * const GCCRegNames[];
	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	// CPU name accepted by the most recent successful setCPU call.
	std::string CPU;

	// Target cpu features.
	// All start out false and are switched on by handleTargetFeatures.
	bool HasAltivec;
	bool HasVSX;
	bool HasP8Vector;
	bool HasP8Crypto;
	bool HasDirectMove;
	bool HasQPX;
	bool HasHTM;
	bool HasBPERMD;
	bool HasExtDiv;
	bool HasP9Vector;

	protected:
	// ABI name reported by getABI(); this class never assigns it.
	std::string ABI;

	public:
	// Clears every feature flag and installs the defaults: 128-bit suitable
	// and SIMD alignment, and a 128-bit long double in PPC double-double
	// format. The TargetOptions argument is unused.
	PPCTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple), HasAltivec(false), HasVSX(false), HasP8Vector(false),
	HasP8Crypto(false), HasDirectMove(false), HasQPX(false), HasHTM(false),
	HasBPERMD(false), HasExtDiv(false), HasP9Vector(false) {
	SuitableAlign = 128;
	SimdDefaultAlign = 128;
	LongDoubleWidth = LongDoubleAlign = 128;
	LongDoubleFormat = &llvm::APFloat::PPCDoubleDouble();
	}

	/// \brief Flags for architecture specific defines.
	///
	/// Each enumerator is a distinct bit, so values can be OR-ed together into
	/// a mask describing every _ARCH_* macro a CPU should get.
	typedef enum {
	ArchDefineNone = 0,
	ArchDefineName = 1 << 0, // <name> is substituted for arch name.
	ArchDefinePpcgr = 1 << 1,
	ArchDefinePpcsq = 1 << 2,
	ArchDefine440 = 1 << 3,
	ArchDefine603 = 1 << 4,
	ArchDefine604 = 1 << 5,
	ArchDefinePwr4 = 1 << 6,
	ArchDefinePwr5 = 1 << 7,
	ArchDefinePwr5x = 1 << 8,
	ArchDefinePwr6 = 1 << 9,
	ArchDefinePwr6x = 1 << 10,
	ArchDefinePwr7 = 1 << 11,
	ArchDefinePwr8 = 1 << 12,
	ArchDefinePwr9 = 1 << 13,
	ArchDefineA2 = 1 << 14,
	ArchDefineA2q = 1 << 15
	} ArchDefineTypes;

	// Set the language option for altivec based on our value.
	// The base-class adjust() runs afterwards.
	void adjust(LangOptions &Opts) override {
	if (HasAltivec)
	Opts.AltiVec = 1;
	TargetInfo::adjust(Opts);
	}

	// Accepts Name if it is one of the CPU names listed below, storing it in
	// CPU, and returns whether it was accepted. An unknown name leaves CPU
	// untouched.
	//
	// Note: GCC recognizes the following additional cpus:
	// 401, 403, 405, 405fp, 440fp, 464, 464fp, 476, 476fp, 505, 740, 801,
	// 821, 823, 8540, 8548, e300c2, e300c3, e500mc64, e6500, 860, cell,
	// titan, rs64.
	bool setCPU(const std::string &Name) override {
	bool CPUKnown = llvm::StringSwitch<bool>(Name)
	.Case("generic", true)
	.Case("440", true)
	.Case("450", true)
	.Case("601", true)
	.Case("602", true)
	.Case("603", true)
	.Case("603e", true)
	.Case("603ev", true)
	.Case("604", true)
	.Case("604e", true)
	.Case("620", true)
	.Case("630", true)
	.Case("g3", true)
	.Case("7400", true)
	.Case("g4", true)
	.Case("7450", true)
	.Case("g4+", true)
	.Case("750", true)
	.Case("970", true)
	.Case("g5", true)
	.Case("a2", true)
	.Case("a2q", true)
	.Case("e500mc", true)
	.Case("e5500", true)
	.Case("power3", true)
	.Case("pwr3", true)
	.Case("power4", true)
	.Case("pwr4", true)
	.Case("power5", true)
	.Case("pwr5", true)
	.Case("power5x", true)
	.Case("pwr5x", true)
	.Case("power6", true)
	.Case("pwr6", true)
	.Case("power6x", true)
	.Case("pwr6x", true)
	.Case("power7", true)
	.Case("pwr7", true)
	.Case("power8", true)
	.Case("pwr8", true)
	.Case("power9", true)
	.Case("pwr9", true)
	.Case("powerpc", true)
	.Case("ppc", true)
	.Case("powerpc64", true)
	.Case("ppc64", true)
	.Case("powerpc64le", true)
	.Case("ppc64le", true)
	.Default(false);

	if (CPUKnown)
	CPU = Name;

	return CPUKnown;
	}


	// Returns the ABI string held in the protected ABI member.
	StringRef getABI() const override { return ABI; }

	// Exposes the builtin table; its length is the number of target-specific
	// builtin IDs between FirstTSBuiltin and PPC::LastTSBuiltin.
	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::PPC::LastTSBuiltin-Builtin::FirstTSBuiltin);
	}

	bool isCLZForZeroUndef() const override { return false; }

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override;

	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override;

	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override;
	bool hasFeature(StringRef Feature) const override;
	void setFeatureEnabled(llvm::StringMap<bool> &Features, StringRef Name,
	bool Enabled) const override;

	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
	// Validates the inline-asm constraint starting at *Name and records in
	// Info whether it allows a register and/or memory operand. Returns false
	// for a constraint that is not recognized. For the two-character
	// constraints ("w?" and "es") Name is advanced by one so that it is left
	// pointing at the second character.
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default: return false;
	case 'O': // Zero
	break;
	case 'b': // Base register
	case 'f': // Floating point register
	Info.setAllowsRegister();
	break;
	// FIXME: The following are added to allow parsing.
	// I just took a guess at what the actions should be.
	// Also, is more specific checking needed? I.e. specific registers?
	case 'd': // Floating point register (containing 64-bit value)
	case 'v': // Altivec vector register
	Info.setAllowsRegister();
	break;
	case 'w':
	// 'w' is only valid as the first half of a two-character constraint.
	switch (Name[1]) {
	case 'd':// VSX vector register to hold vector double data
	case 'f':// VSX vector register to hold vector float data
	case 's':// VSX vector register to hold scalar float data
	case 'a':// Any VSX register
	case 'c':// An individual CR bit
	break;
	default:
	return false;
	}
	Info.setAllowsRegister();
	Name++; // Skip over 'w'.
	break;
	case 'h': // `MQ', `CTR', or `LINK' register
	case 'q': // `MQ' register
	case 'c': // `CTR' register
	case 'l': // `LINK' register
	case 'x': // `CR' register (condition register) number 0
	case 'y': // `CR' register (condition register)
	case 'z': // `XER[CA]' carry bit (part of the XER register)
	Info.setAllowsRegister();
	break;
	case 'I': // Signed 16-bit constant
	case 'J': // Unsigned 16-bit constant shifted left 16 bits
	// (use `L' instead for SImode constants)
	case 'K': // Unsigned 16-bit constant
	case 'L': // Signed 16-bit constant shifted left 16 bits
	case 'M': // Constant larger than 31
	case 'N': // Exact power of 2
	case 'P': // Constant whose negation is a signed 16-bit constant
	case 'G': // Floating point constant that can be loaded into a
	// register with one instruction per word
	case 'H': // Integer/Floating point constant that can be loaded
	// into a register using three instructions
	break;
	// NOTE(review): 'm' has no break and falls through into the 'e' case, so
	// an 'm' that reaches this function is rejected unless the next character
	// is 's'. Presumably plain 'm' is consumed before this hook is called;
	// confirm against the caller.
	case 'm': // Memory operand. Note that on PowerPC targets, m can
	// include addresses that update the base register. It
	// is therefore only safe to use `m' in an asm statement
	// if that asm statement accesses the operand exactly once.
	// The asm statement must also use `%U<opno>' as a
	// placeholder for the "update" flag in the corresponding
	// load or store instruction. For example:
	// asm ("st%U0 %1,%0" : "=m" (mem) : "r" (val));
	// is correct but:
	// asm ("st %1,%0" : "=m" (mem) : "r" (val));
	// is not. Use es rather than m if you don't want the base
	// register to be updated.
	case 'e':
	if (Name[1] != 's')
	return false;
	// es: A "stable" memory operand; that is, one which does not
	// include any automodification of the base register. Unlike
	// `m', this constraint can be used in asm statements that
	// might access the operand several times, or that might not
	// access it at all.
	Info.setAllowsMemory();
	Name++; // Skip over 'e'.
	break;
	case 'Q': // Memory operand that is an offset from a register (it is
	// usually better to use `m' or `es' in asm statements)
	case 'Z': // Memory operand that is an indexed or indirect from a
	// register (it is usually better to use `m' or `es' in
	// asm statements)
	Info.setAllowsMemory();
	Info.setAllowsRegister();
	break;
	case 'R': // AIX TOC entry
	case 'a': // Address operand that is an indexed or indirect from a
	// register (`p' is preferable for asm statements)
	case 'S': // Constant suitable as a 64-bit mask operand
	case 'T': // Constant suitable as a 32-bit mask operand
	case 'U': // System V Release 4 small data area reference
	case 't': // AND masks that can be performed by two rldic{l, r}
	// instructions
	case 'W': // Vector constant that does not require memory
	case 'j': // Vector constant that is all zeros.
	break;
	// End FIXME.
	}
	return true;
	}
	// Rewrites a constraint for later parsing. Constraints starting with 'e'
	// or 'w' are emitted as "^" followed by both characters, and Constraint
	// is advanced by one; everything else is delegated to the base class.
	std::string convertConstraint(const char *&Constraint) const override {
	std::string R;
	switch (*Constraint) {
	case 'e':
	case 'w':
	// Two-character constraint; add "^" hint for later parsing.
	R = std::string("^") + std::string(Constraint, 2);
	Constraint++;
	break;
	default:
	return TargetInfo::convertConstraint(Constraint);
	}
	return R;
	}
	// No clobbers are reported for this target.
	const char *getClobbers() const override {
	return "";
	}
	// Maps EH data register 0 to 3 and 1 to 4; any other index yields -1.
	int getEHDataRegisterNumber(unsigned RegNo) const override {
	if (RegNo == 0) return 3;
	if (RegNo == 1) return 4;
	return -1;
	}

	bool hasSjLjLowering() const override {
	return true;
	}

	// True only when long double is the 128-bit PPC double-double format and
	// the triple's object format is ELF.
	bool useFloat128ManglingForLongDouble() const override {
	return LongDoubleWidth == 128 &&
	LongDoubleFormat == &llvm::APFloat::PPCDoubleDouble() &&
	getTriple().isOSBinFormatELF();
	}
	};

	// Table of PowerPC builtins. Each BUILTIN / LIBBUILTIN entry in
	// clang/Basic/BuiltinsPPC.def expands to one Builtin::Info initializer, in
	// the order the .def file lists them.
	const Builtin::Info PPCTargetInfo::BuiltinInfo[] = {
	// BUILTIN entries carry no header name (nullptr in the fourth field).
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	// LIBBUILTIN entries record HEADER in the fourth field instead.
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsPPC.def"
	};

	/// handleTargetFeatures - Record which PowerPC features were requested.
	///
	/// Walks \p Features and, for every recognized "+name" entry, switches on
	/// the matching Has* flag. Entries that match none of the names below are
	/// ignored, and no flag is ever cleared here. \p Diags is not used.
	///
	/// \returns true unconditionally.
	bool PPCTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
	                                         DiagnosticsEngine &Diags) {
	  for (const std::string &Name : Features) {
	    if (Name == "+altivec")
	      HasAltivec = true;
	    else if (Name == "+vsx")
	      HasVSX = true;
	    else if (Name == "+bpermd")
	      HasBPERMD = true;
	    else if (Name == "+extdiv")
	      HasExtDiv = true;
	    else if (Name == "+power8-vector")
	      HasP8Vector = true;
	    else if (Name == "+crypto")
	      HasP8Crypto = true;
	    else if (Name == "+direct-move")
	      HasDirectMove = true;
	    else if (Name == "+qpx")
	      HasQPX = true;
	    else if (Name == "+htm")
	      HasHTM = true;
	    else if (Name == "+float128")
	      HasFloat128 = true;
	    else if (Name == "+power9-vector")
	      HasP9Vector = true;
	    // TODO: Finish this list and add an assert that we've handled them
	    // all.
	  }

	  return true;
	}

	/// PPCTargetInfo::getTargetDefines - Return a set of the PowerPC-specific
	/// #defines that are not tied to a specific subtarget.
	void PPCTargetInfo::getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	// Target identification.
	Builder.defineMacro("__ppc__");
	Builder.defineMacro("__PPC__");
	Builder.defineMacro("_ARCH_PPC");
	Builder.defineMacro("__powerpc__");
	Builder.defineMacro("__POWERPC__");
	if (PointerWidth == 64) {
	Builder.defineMacro("_ARCH_PPC64");
	Builder.defineMacro("__powerpc64__");
	Builder.defineMacro("__ppc64__");
	Builder.defineMacro("__PPC64__");
	}

	// Target properties.
	if (getTriple().getArch() == llvm::Triple::ppc64le) {
	Builder.defineMacro("_LITTLE_ENDIAN");
	} else {
	if (getTriple().getOS() != llvm::Triple::NetBSD &&
	getTriple().getOS() != llvm::Triple::OpenBSD)
	Builder.defineMacro("_BIG_ENDIAN");
	}

	// ABI options.
	if (ABI == "elfv1" \|\| ABI == "elfv1-qpx")
	Builder.defineMacro("_CALL_ELF", "1");
	if (ABI == "elfv2")
	Builder.defineMacro("_CALL_ELF", "2");

	// This typically is only for a new enough linker (bfd >= 2.16.2 or gold), but
	// our support post-dates this and it should work on all 64-bit ppc linux
	// platforms. It is guaranteed to work on all elfv2 platforms.
	if (getTriple().getOS() == llvm::Triple::Linux && PointerWidth == 64)
	Builder.defineMacro("_CALL_LINUX", "1");

	// Subtarget options.
	Builder.defineMacro("__NATURAL_ALIGNMENT__");
	Builder.defineMacro("__REGISTER_PREFIX__", "");

	// FIXME: Should be controlled by command line option.
	if (LongDoubleWidth == 128) {
	Builder.defineMacro("__LONG_DOUBLE_128__");
	Builder.defineMacro("__LONGDOUBLE128");
	}

	// Define this for elfv2 (64-bit only) or 64-bit darwin.
	if (ABI == "elfv2" \|\|
	(getTriple().getOS() == llvm::Triple::Darwin && PointerWidth == 64))
	Builder.defineMacro("__STRUCT_PARM_ALIGN__", "16");

	// CPU identification.
	ArchDefineTypes defs =
	(ArchDefineTypes)llvm::StringSwitch<int>(CPU)
	.Case("440", ArchDefineName)
	.Case("450", ArchDefineName \| ArchDefine440)
	.Case("601", ArchDefineName)
	.Case("602", ArchDefineName \| ArchDefinePpcgr)
	.Case("603", ArchDefineName \| ArchDefinePpcgr)
	.Case("603e", ArchDefineName \| ArchDefine603 \| ArchDefinePpcgr)
	.Case("603ev", ArchDefineName \| ArchDefine603 \| ArchDefinePpcgr)
	.Case("604", ArchDefineName \| ArchDefinePpcgr)
	.Case("604e", ArchDefineName \| ArchDefine604 \| ArchDefinePpcgr)
	.Case("620", ArchDefineName \| ArchDefinePpcgr)
	.Case("630", ArchDefineName \| ArchDefinePpcgr)
	.Case("7400", ArchDefineName \| ArchDefinePpcgr)
	.Case("7450", ArchDefineName \| ArchDefinePpcgr)
	.Case("750", ArchDefineName \| ArchDefinePpcgr)
	.Case("970", ArchDefineName \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("a2", ArchDefineA2)
	.Case("a2q", ArchDefineName \| ArchDefineA2 \| ArchDefineA2q)
	.Case("pwr3", ArchDefinePpcgr)
	.Case("pwr4", ArchDefineName \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("pwr5", ArchDefineName \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("pwr5x", ArchDefineName \| ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("pwr6", ArchDefineName \| ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("pwr6x", ArchDefineName \| ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("pwr7", ArchDefineName \| ArchDefinePwr6x \| ArchDefinePwr6 \|
	ArchDefinePwr5x \| ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("pwr8", ArchDefineName \| ArchDefinePwr7 \| ArchDefinePwr6x \|
	ArchDefinePwr6 \| ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("pwr9", ArchDefineName \| ArchDefinePwr8 \| ArchDefinePwr7 \|
	ArchDefinePwr6x \| ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("power3", ArchDefinePpcgr)
	.Case("power4", ArchDefinePwr4 \| ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("power5", ArchDefinePwr5 \| ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("power5x", ArchDefinePwr5x \| ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("power6", ArchDefinePwr6 \| ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("power6x", ArchDefinePwr6x \| ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("power7", ArchDefinePwr7 \| ArchDefinePwr6x \| ArchDefinePwr6 \|
	ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	.Case("power8", ArchDefinePwr8 \| ArchDefinePwr7 \| ArchDefinePwr6x \|
	ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Case("power9", ArchDefinePwr9 \| ArchDefinePwr8 \| ArchDefinePwr7 \|
	ArchDefinePwr6x \| ArchDefinePwr6 \|
	ArchDefinePwr5x \| ArchDefinePwr5 \|
	ArchDefinePwr4 \| ArchDefinePpcgr \|
	ArchDefinePpcsq)
	// powerpc64le automatically defaults to at least power8.
	.Case("ppc64le", ArchDefinePwr8 \| ArchDefinePwr7 \| ArchDefinePwr6x \|
	ArchDefinePwr6 \| ArchDefinePwr5x \|
	ArchDefinePwr5 \| ArchDefinePwr4 \|
	ArchDefinePpcgr \| ArchDefinePpcsq)
	.Default(ArchDefineNone);

	if (defs & ArchDefineName)
	Builder.defineMacro(Twine("_ARCH_", StringRef(CPU).upper()));
	if (defs & ArchDefinePpcgr)
	Builder.defineMacro("_ARCH_PPCGR");
	if (defs & ArchDefinePpcsq)
	Builder.defineMacro("_ARCH_PPCSQ");
	if (defs & ArchDefine440)
	Builder.defineMacro("_ARCH_440");
	if (defs & ArchDefine603)
	Builder.defineMacro("_ARCH_603");
	if (defs & ArchDefine604)
	Builder.defineMacro("_ARCH_604");
	if (defs & ArchDefinePwr4)
	Builder.defineMacro("_ARCH_PWR4");
	if (defs & ArchDefinePwr5)
	Builder.defineMacro("_ARCH_PWR5");
	if (defs & ArchDefinePwr5x)
	Builder.defineMacro("_ARCH_PWR5X");
	if (defs & ArchDefinePwr6)
	Builder.defineMacro("_ARCH_PWR6");
	if (defs & ArchDefinePwr6x)
	Builder.defineMacro("_ARCH_PWR6X");
	if (defs & ArchDefinePwr7)
	Builder.defineMacro("_ARCH_PWR7");
	if (defs & ArchDefinePwr8)
	Builder.defineMacro("_ARCH_PWR8");
	if (defs & ArchDefinePwr9)
	Builder.defineMacro("_ARCH_PWR9");
	if (defs & ArchDefineA2)
	Builder.defineMacro("_ARCH_A2");
	if (defs & ArchDefineA2q) {
	Builder.defineMacro("_ARCH_A2Q");
	Builder.defineMacro("_ARCH_QP");
	}

	if (getTriple().getVendor() == llvm::Triple::BGQ) {
	Builder.defineMacro("__bg__");
	Builder.defineMacro("__THW_BLUEGENE__");
	Builder.defineMacro("__bgq__");
	Builder.defineMacro("__TOS_BGQ__");
	}

	if (HasAltivec) {
	Builder.defineMacro("__VEC__", "10206");
	Builder.defineMacro("__ALTIVEC__");
	}
	if (HasVSX)
	Builder.defineMacro("__VSX__");
	if (HasP8Vector)
	Builder.defineMacro("__POWER8_VECTOR__");
	if (HasP8Crypto)
	Builder.defineMacro("__CRYPTO__");
	if (HasHTM)
	Builder.defineMacro("__HTM__");
	if (HasFloat128)
	Builder.defineMacro("__FLOAT128__");
	if (HasP9Vector)
	Builder.defineMacro("__POWER9_VECTOR__");

	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	if (PointerWidth == 64)
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");

	// We have support for the bswap intrinsics so we can define this.
	Builder.defineMacro("__HAVE_BSWAP__", "1");

	// FIXME: The following are not yet generated here by Clang, but are
	// generated by GCC:
	//
	// _SOFT_FLOAT_
	// __RECIP_PRECISION__
	// __APPLE_ALTIVEC__
	// __RECIP__
	// __RECIPF__
	// __RSQRTE__
	// __RSQRTEF__
	// _SOFT_DOUBLE_
	// __NO_LWSYNC__
	// __CMODEL_MEDIUM__
	// __CMODEL_LARGE__
	// _CALL_SYSV
	// _CALL_DARWIN
	// __NO_FPRS__
	}

	// Validates the feature flags that were written explicitly by the user.
	// Each of the features below is treated as depending on VSX, so requesting
	// one of them together with "-vsx" is contradictory:
	//   - power8-vector
	//   - direct-move
	//   - float128
	//   - power9-vector
	// The first conflict found (checked in the order listed) is reported through
	// Diags as err_opt_not_valid_with_opt and false is returned.  Returns true
	// when the options are compatible.
	static bool ppcUserFeaturesCheck(DiagnosticsEngine &Diags,
	                                 const std::vector<std::string> &FeaturesVec) {
	  auto IsListed = [&FeaturesVec](const char *Flag) {
	    return std::find(FeaturesVec.begin(), FeaturesVec.end(), Flag) !=
	           FeaturesVec.end();
	  };

	  // Nothing can conflict unless VSX was explicitly switched off.
	  if (!IsListed("-vsx"))
	    return true;

	  // {feature spelling, driver option named in the diagnostic}
	  static const char *const VSXDependents[][2] = {
	      {"+power8-vector", "-mpower8-vector"},
	      {"+direct-move", "-mdirect-move"},
	      {"+float128", "-mfloat128"},
	      {"+power9-vector", "-mpower9-vector"},
	  };

	  for (const auto &Dependent : VSXDependents) {
	    if (!IsListed(Dependent[0]))
	      continue;
	    Diags.Report(diag::err_opt_not_valid_with_opt) << Dependent[1]
	                                                   << "-mno-vsx";
	    return false;
	  }

	  return true;
	}

	// Seeds Features with the defaults implied by CPU, rejects contradictory
	// user-supplied flags via ppcUserFeaturesCheck(), and finally defers to
	// TargetInfo::initFeatureMap() with the same arguments.
	// Returns false if the user flags conflict; otherwise whatever the base
	// implementation returns.
	bool PPCTargetInfo::initFeatureMap(
	    llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
	    const std::vector<std::string> &FeaturesVec) const {
	  // CPU tiers that share default features.  "ppc64le" gets the same
	  // defaults as "pwr8" in every entry below.
	  const bool IsPwr9 = CPU == "pwr9";
	  const bool IsPwr8OrNewer = IsPwr9 || CPU == "pwr8" || CPU == "ppc64le";
	  const bool IsPwr7OrNewer = IsPwr8OrNewer || CPU == "pwr7";

	  Features["altivec"] = IsPwr7OrNewer ||
	                        llvm::StringSwitch<bool>(CPU)
	                            .Cases("7400", "g4", "7450", "g4+", true)
	                            .Cases("970", "g5", "pwr6", "ppc64", true)
	                            .Default(false);

	  Features["qpx"] = (CPU == "a2q");
	  Features["power9-vector"] = IsPwr9;
	  Features["crypto"] = IsPwr8OrNewer;
	  Features["power8-vector"] = IsPwr8OrNewer;
	  Features["bpermd"] = IsPwr7OrNewer;
	  Features["extdiv"] = IsPwr7OrNewer;
	  Features["direct-move"] = IsPwr8OrNewer;
	  Features["vsx"] = IsPwr7OrNewer;
	  Features["htm"] = IsPwr8OrNewer;

	  // Diagnose incompatible explicit options before applying them.
	  const bool UserFlagsOK = ppcUserFeaturesCheck(Diags, FeaturesVec);
	  if (!UserFlagsOK)
	    return false;

	  return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
	}

	// Reports whether the named feature is active on this target.  "powerpc"
	// is always reported as present; each other recognised name mirrors the
	// corresponding Has* member, and unrecognised names yield false.
	bool PPCTargetInfo::hasFeature(StringRef Feature) const {
	  if (Feature == "powerpc")
	    return true;

	  const bool IsActive = llvm::StringSwitch<bool>(Feature)
	                            .Case("altivec", HasAltivec)
	                            .Case("vsx", HasVSX)
	                            .Case("power8-vector", HasP8Vector)
	                            .Case("power9-vector", HasP9Vector)
	                            .Case("float128", HasFloat128)
	                            .Case("direct-move", HasDirectMove)
	                            .Case("crypto", HasP8Crypto)
	                            .Case("htm", HasHTM)
	                            .Case("qpx", HasQPX)
	                            .Case("bpermd", HasBPERMD)
	                            .Case("extdiv", HasExtDiv)
	                            .Default(false);
	  return IsActive;
	}

	// Sets Features[Name] to Enabled and keeps the features tied to it
	// consistent:
	//  * enabling vsx, direct-move, power8-vector, power9-vector or float128
	//    also enables vsx and altivec (problems are diagnosed later);
	//  * enabling power9-vector also enables power8-vector;
	//  * disabling altivec or vsx also disables vsx, direct-move,
	//    power8-vector, float128 and power9-vector;
	//  * disabling power8-vector also disables power9-vector.
	void PPCTargetInfo::setFeatureEnabled(llvm::StringMap<bool> &Features,
	                                      StringRef Name, bool Enabled) const {
	  if (!Enabled) {
	    if (Name == "altivec" || Name == "vsx") {
	      static const char *const VSXFamily[] = {
	          "vsx", "direct-move", "power8-vector", "float128", "power9-vector"};
	      for (const char *Member : VSXFamily)
	        Features[Member] = false;
	    }
	    if (Name == "power8-vector")
	      Features["power9-vector"] = false;
	    Features[Name] = false;
	    return;
	  }

	  const bool NeedsVSX = Name == "vsx" || Name == "direct-move" ||
	                        Name == "power8-vector" || Name == "power9-vector" ||
	                        Name == "float128";
	  if (NeedsVSX) {
	    Features["altivec"] = true;
	    Features["vsx"] = true;
	  }
	  if (Name == "power9-vector")
	    Features["power8-vector"] = true;
	  Features[Name] = true;
	}

	// Register names recognised for PowerPC; returned to callers by
	// getGCCRegNames(). Alternate spellings live in GCCRegAliases.
	const char * const PPCTargetInfo::GCCRegNames[] = {
	// r0-r31
	"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
	"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	"r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
	"r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
	// f0-f31
	"f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7",
	"f8", "f9", "f10", "f11", "f12", "f13", "f14", "f15",
	"f16", "f17", "f18", "f19", "f20", "f21", "f22", "f23",
	"f24", "f25", "f26", "f27", "f28", "f29", "f30", "f31",
	"mq", "lr", "ctr", "ap",
	// cr0-cr7
	"cr0", "cr1", "cr2", "cr3", "cr4", "cr5", "cr6", "cr7",
	"xer",
	// v0-v31
	"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
	"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
	"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
	"vrsave", "vscr",
	"spe_acc", "spefscr",
	"sfp"
	};

	// Exposes the static GCCRegNames table as an ArrayRef covering the whole
	// array.
	ArrayRef<const char*> PPCTargetInfo::getGCCRegNames() const {
	  return ArrayRef<const char *>(GCCRegNames);
	}

	// Alternate register spellings, each paired with the name it stands for in
	// GCCRegNames: bare numbers "0".."31" map to r0..r31, "fr0".."fr31" map to
	// f0..f31, and "cc" maps to cr0. Returned by getGCCRegAliases().
	const TargetInfo::GCCRegAlias PPCTargetInfo::GCCRegAliases[] = {
	// While some of these aliases do map to different registers
	// they still share the same register name.
	{ { "0" }, "r0" },
	{ { "1"}, "r1" },
	{ { "2" }, "r2" },
	{ { "3" }, "r3" },
	{ { "4" }, "r4" },
	{ { "5" }, "r5" },
	{ { "6" }, "r6" },
	{ { "7" }, "r7" },
	{ { "8" }, "r8" },
	{ { "9" }, "r9" },
	{ { "10" }, "r10" },
	{ { "11" }, "r11" },
	{ { "12" }, "r12" },
	{ { "13" }, "r13" },
	{ { "14" }, "r14" },
	{ { "15" }, "r15" },
	{ { "16" }, "r16" },
	{ { "17" }, "r17" },
	{ { "18" }, "r18" },
	{ { "19" }, "r19" },
	{ { "20" }, "r20" },
	{ { "21" }, "r21" },
	{ { "22" }, "r22" },
	{ { "23" }, "r23" },
	{ { "24" }, "r24" },
	{ { "25" }, "r25" },
	{ { "26" }, "r26" },
	{ { "27" }, "r27" },
	{ { "28" }, "r28" },
	{ { "29" }, "r29" },
	{ { "30" }, "r30" },
	{ { "31" }, "r31" },
	{ { "fr0" }, "f0" },
	{ { "fr1" }, "f1" },
	{ { "fr2" }, "f2" },
	{ { "fr3" }, "f3" },
	{ { "fr4" }, "f4" },
	{ { "fr5" }, "f5" },
	{ { "fr6" }, "f6" },
	{ { "fr7" }, "f7" },
	{ { "fr8" }, "f8" },
	{ { "fr9" }, "f9" },
	{ { "fr10" }, "f10" },
	{ { "fr11" }, "f11" },
	{ { "fr12" }, "f12" },
	{ { "fr13" }, "f13" },
	{ { "fr14" }, "f14" },
	{ { "fr15" }, "f15" },
	{ { "fr16" }, "f16" },
	{ { "fr17" }, "f17" },
	{ { "fr18" }, "f18" },
	{ { "fr19" }, "f19" },
	{ { "fr20" }, "f20" },
	{ { "fr21" }, "f21" },
	{ { "fr22" }, "f22" },
	{ { "fr23" }, "f23" },
	{ { "fr24" }, "f24" },
	{ { "fr25" }, "f25" },
	{ { "fr26" }, "f26" },
	{ { "fr27" }, "f27" },
	{ { "fr28" }, "f28" },
	{ { "fr29" }, "f29" },
	{ { "fr30" }, "f30" },
	{ { "fr31" }, "f31" },
	{ { "cc" }, "cr0" },
	};

	// Exposes the static GCCRegAliases table as an ArrayRef covering the whole
	// array.
	ArrayRef<TargetInfo::GCCRegAlias> PPCTargetInfo::getGCCRegAliases() const {
	  return ArrayRef<TargetInfo::GCCRegAlias>(GCCRegAliases);
	}

	// Target description for 32-bit PowerPC: installs the 32-bit data layout,
	// applies per-OS type choices and limits inline atomics to 32 bits.
	class PPC32TargetInfo : public PPCTargetInfo {
	public:
	  PPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : PPCTargetInfo(Triple, Opts) {
	    resetDataLayout("E-m:e-p:32:32-i64:64-n32");

	    const llvm::Triple::OSType OS = getTriple().getOS();

	    // On Linux, FreeBSD and NetBSD size_t, ptrdiff_t and intptr_t are
	    // int-based; every other OS keeps the inherited choices.
	    if (OS == llvm::Triple::Linux || OS == llvm::Triple::FreeBSD ||
	        OS == llvm::Triple::NetBSD) {
	      SizeType = UnsignedInt;
	      PtrDiffType = SignedInt;
	      IntPtrType = SignedInt;
	    }

	    // On FreeBSD long double is a 64-bit IEEE double.
	    if (OS == llvm::Triple::FreeBSD) {
	      LongDoubleAlign = 64;
	      LongDoubleWidth = 64;
	      LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	    }

	    // PPC32 supports atomics up to 4 bytes.
	    MaxAtomicInlineWidth = 32;
	    MaxAtomicPromoteWidth = 32;
	  }

	  // This is the ELF definition; the Darwin sub-target overrides it.
	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::PowerABIBuiltinVaList;
	  }
	};

	// Target description for 64-bit PowerPC, covering both byte orders.
	// Note: ABI differences may eventually require a separate TargetInfo for
	// little endian.
	class PPC64TargetInfo : public PPCTargetInfo {
	public:
	  PPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : PPCTargetInfo(Triple, Opts) {
	    LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
	    IntMaxType = SignedLong;
	    Int64Type = SignedLong;

	    // ppc64le uses the little-endian layout and defaults to the ELFv2 ABI;
	    // everything else is big-endian with ELFv1.
	    const bool IsLittleEndian = Triple.getArch() == llvm::Triple::ppc64le;
	    if (IsLittleEndian)
	      resetDataLayout("e-m:e-i64:64-n32:64");
	    else
	      resetDataLayout("E-m:e-i64:64-n32:64");
	    ABI = IsLittleEndian ? "elfv2" : "elfv1";

	    const llvm::Triple::OSType OS = getTriple().getOS();
	    if (OS == llvm::Triple::FreeBSD) {
	      // On FreeBSD long double is a 64-bit IEEE double.
	      LongDoubleAlign = 64;
	      LongDoubleWidth = 64;
	      LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	    } else if (OS == llvm::Triple::NetBSD) {
	      // NetBSD uses long long for intmax_t and int64_t.
	      IntMaxType = SignedLongLong;
	      Int64Type = SignedLongLong;
	    }

	    // PPC64 supports atomics up to 8 bytes.
	    MaxAtomicInlineWidth = 64;
	    MaxAtomicPromoteWidth = 64;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }

	  // PPC64 Linux-specific ABI options.  Accepts "elfv1", "elfv1-qpx" and
	  // "elfv2"; any other name is rejected and leaves ABI untouched.
	  bool setABI(const std::string &Name) override {
	    const bool IsKnownABI =
	        Name == "elfv1" || Name == "elfv1-qpx" || Name == "elfv2";
	    if (IsKnownABI)
	      ABI = Name;
	    return IsKnownABI;
	  }
	};

	// 32-bit PowerPC on Darwin: PPC32TargetInfo wrapped in DarwinTargetInfo,
	// with Darwin-specific type layout overrides.
	class DarwinPPC32TargetInfo : public DarwinTargetInfo<PPC32TargetInfo> {
	  using Base = DarwinTargetInfo<PPC32TargetInfo>;

	public:
	  DarwinPPC32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : Base(Triple, Opts) {
	    resetDataLayout("E-m:o-p:32:32-f64:32:64-n32");
	    HasAlignMac68kSupport = true;

	    // XXX support -mone-byte-bool?
	    BoolAlign = 32;
	    BoolWidth = 32;

	    // See http://llvm.org/bugs/show_bug.cgi?id=15726
	    PtrDiffType = SignedInt;

	    LongLongAlign = 32;
	  }

	  // Darwin uses a plain char* va_list instead of the ELF PowerPC one.
	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }
	};

	// 64-bit PowerPC on Darwin: PPC64TargetInfo wrapped in DarwinTargetInfo,
	// with mac68k alignment support and the Darwin data layout.
	class DarwinPPC64TargetInfo : public DarwinTargetInfo<PPC64TargetInfo> {
	  using Base = DarwinTargetInfo<PPC64TargetInfo>;

	public:
	  DarwinPPC64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : Base(Triple, Opts) {
	    resetDataLayout("E-m:o-i64:64-n32:64");
	    HasAlignMac68kSupport = true;
	  }
	};

	// Target address space number for each language address space on NVPTX;
	// each entry's trailing comment names the language address space it is for.
	// Installed as AddrSpaceMap by the NVPTXTargetInfo constructor.
	static const unsigned NVPTXAddrSpaceMap[] = {
	0, // Default
	1, // opencl_global
	3, // opencl_local
	4, // opencl_constant
	// FIXME: generic has to be added to the target
	0, // opencl_generic
	1, // cuda_device
	4, // cuda_constant
	3, // cuda_shared
	};

	// TargetInfo for NVPTX with 32- or 64-bit pointers. When Opts.HostTriple is
	// not itself NVPTX, a TargetInfo for the host is allocated and most type
	// widths, alignments and typedef choices are copied from it.
	class NVPTXTargetInfo : public TargetInfo {
	static const char *const GCCRegNames[];
	static const Builtin::Info BuiltinInfo[];
	// Selected GPU architecture; starts as SM_20 and is replaced by setCPU().
	CudaArch GPU;
	// TargetInfo for the host triple; not allocated when the host triple is
	// itself NVPTX.
	std::unique_ptr<TargetInfo> HostTarget;

	public:
	// TargetPointerWidth must be 32 or 64 (asserted); it selects the data
	// layout and, when no host target is available, the pointer-sized types.
	NVPTXTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts,
	unsigned TargetPointerWidth)
	: TargetInfo(Triple) {
	assert((TargetPointerWidth == 32 \|\| TargetPointerWidth == 64) &&
	"NVPTX only supports 32- and 64-bit modes.");

	TLSSupported = false;
	AddrSpaceMap = &NVPTXAddrSpaceMap;
	UseAddrSpaceMapMangling = true;

	// Define available target features
	// These must be defined in sorted order!
	NoAsmVariants = true;
	GPU = CudaArch::SM_20;

	if (TargetPointerWidth == 32)
	resetDataLayout("e-p:32:32-i64:64-v16:16-v32:32-n16:32:64");
	else
	resetDataLayout("e-i64:64-v16:16-v32:32-n16:32:64");

	// If possible, get a TargetInfo for our host triple, so we can match its
	// types.
	llvm::Triple HostTriple(Opts.HostTriple);
	if (!HostTriple.isNVPTX())
	HostTarget.reset(AllocateTarget(llvm::Triple(Opts.HostTriple), Opts));

	// If no host target, make some guesses about the data layout and return.
	if (!HostTarget) {
	LongWidth = LongAlign = TargetPointerWidth;
	PointerWidth = PointerAlign = TargetPointerWidth;
	switch (TargetPointerWidth) {
	case 32:
	SizeType = TargetInfo::UnsignedInt;
	PtrDiffType = TargetInfo::SignedInt;
	IntPtrType = TargetInfo::SignedInt;
	break;
	case 64:
	SizeType = TargetInfo::UnsignedLong;
	PtrDiffType = TargetInfo::SignedLong;
	IntPtrType = TargetInfo::SignedLong;
	break;
	default:
	llvm_unreachable("TargetPointerWidth must be 32 or 64");
	}
	return;
	}

	// Copy properties from host target.
	PointerWidth = HostTarget->getPointerWidth(/* AddrSpace = */ 0);
	PointerAlign = HostTarget->getPointerAlign(/* AddrSpace = */ 0);
	BoolWidth = HostTarget->getBoolWidth();
	BoolAlign = HostTarget->getBoolAlign();
	IntWidth = HostTarget->getIntWidth();
	IntAlign = HostTarget->getIntAlign();
	HalfWidth = HostTarget->getHalfWidth();
	HalfAlign = HostTarget->getHalfAlign();
	FloatWidth = HostTarget->getFloatWidth();
	FloatAlign = HostTarget->getFloatAlign();
	DoubleWidth = HostTarget->getDoubleWidth();
	DoubleAlign = HostTarget->getDoubleAlign();
	LongWidth = HostTarget->getLongWidth();
	LongAlign = HostTarget->getLongAlign();
	LongLongWidth = HostTarget->getLongLongWidth();
	LongLongAlign = HostTarget->getLongLongAlign();
	MinGlobalAlign = HostTarget->getMinGlobalAlign();
	NewAlign = HostTarget->getNewAlign();
	DefaultAlignForAttributeAligned =
	HostTarget->getDefaultAlignForAttributeAligned();
	SizeType = HostTarget->getSizeType();
	IntMaxType = HostTarget->getIntMaxType();
	PtrDiffType = HostTarget->getPtrDiffType(/* AddrSpace = */ 0);
	IntPtrType = HostTarget->getIntPtrType();
	WCharType = HostTarget->getWCharType();
	WIntType = HostTarget->getWIntType();
	Char16Type = HostTarget->getChar16Type();
	Char32Type = HostTarget->getChar32Type();
	Int64Type = HostTarget->getInt64Type();
	SigAtomicType = HostTarget->getSigAtomicType();
	ProcessIDType = HostTarget->getProcessIDType();

	UseBitFieldTypeAlignment = HostTarget->useBitFieldTypeAlignment();
	UseZeroLengthBitfieldAlignment =
	HostTarget->useZeroLengthBitfieldAlignment();
	UseExplicitBitFieldAlignment = HostTarget->useExplicitBitFieldAlignment();
	ZeroLengthBitfieldBoundary = HostTarget->getZeroLengthBitfieldBoundary();

	// This is a bit of a lie, but it controls __GCC_ATOMIC_XXX_LOCK_FREE, and
	// we need those macros to be identical on host and device, because (among
	// other things) they affect which standard library classes are defined, and
	// we need all classes to be defined on both the host and device.
	MaxAtomicInlineWidth = HostTarget->getMaxAtomicInlineWidth();

	// Properties intentionally not copied from host:
	// - LargeArrayMinWidth, LargeArrayAlign: Not visible across the
	// host/device boundary.
	// - SuitableAlign: Not visible across the host/device boundary, and may
	// correctly be different on host/device, e.g. if host has wider vector
	// types than device.
	// - LongDoubleWidth, LongDoubleAlign: nvptx's long double type is the same
	// as its double type, but that's not necessarily true on the host.
	// TODO: nvcc emits a warning when using long double on device; we should
	// do the same.
	}
	// Always defines __PTX__ and __NVPTX__. When Opts.CUDAIsDevice is set, also
	// defines __CUDA_ARCH__ to the numeric code that corresponds to GPU.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__PTX__");
	Builder.defineMacro("__NVPTX__");
	if (Opts.CUDAIsDevice) {
	// Set __CUDA_ARCH__ for the GPU specified.
	std::string CUDAArchCode = [this] {
	switch (GPU) {
	case CudaArch::UNKNOWN:
	assert(false && "No GPU arch when compiling CUDA device code.");
	return "";
	case CudaArch::SM_20:
	return "200";
	case CudaArch::SM_21:
	return "210";
	case CudaArch::SM_30:
	return "300";
	case CudaArch::SM_32:
	return "320";
	case CudaArch::SM_35:
	return "350";
	case CudaArch::SM_37:
	return "370";
	case CudaArch::SM_50:
	return "500";
	case CudaArch::SM_52:
	return "520";
	case CudaArch::SM_53:
	return "530";
	case CudaArch::SM_60:
	return "600";
	case CudaArch::SM_61:
	return "610";
	case CudaArch::SM_62:
	return "620";
	}
	llvm_unreachable("unhandled CudaArch");
	}();
	Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
	}
	}
	// Returns the BuiltinInfo table, sized as the distance between
	// Builtin::FirstTSBuiltin and clang::NVPTX::LastTSBuiltin.
	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::NVPTX::LastTSBuiltin - Builtin::FirstTSBuiltin);
	}
	// Sets the "satom" default from the selected GPU (true for SM_60 and
	// above), then lets the base implementation process FeaturesVec.
	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override {
	Features["satom"] = GPU >= CudaArch::SM_60;
	return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
	}

	// "ptx" and "nvptx" are always present; "satom" depends on the selected
	// GPU; every other name yields false.
	bool hasFeature(StringRef Feature) const override {
	return llvm::StringSwitch<bool>(Feature)
	.Cases("ptx", "nvptx", true)
	.Case("satom", GPU >= CudaArch::SM_60) // Atomics w/ scope.
	.Default(false);
	}

	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	// No aliases.
	return None;
	}
	// Accepts the constraint letters c, h, r, l, f and d, marking each as
	// allowing a register; any other first character is rejected.
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default:
	return false;
	case 'c':
	case 'h':
	case 'r':
	case 'l':
	case 'f':
	case 'd':
	Info.setAllowsRegister();
	return true;
	}
	}
	const char *getClobbers() const override {
	// FIXME: Is this really right?
	return "";
	}
	BuiltinVaListKind getBuiltinVaListKind() const override {
	// FIXME: implement
	return TargetInfo::CharPtrBuiltinVaList;
	}
	// Parses Name with StringToCudaArch() and stores the result in GPU.
	// Returns false when the result is CudaArch::UNKNOWN.
	bool setCPU(const std::string &Name) override {
	GPU = StringToCudaArch(Name);
	return GPU != CudaArch::UNKNOWN;
	}
	// Marks the OpenCL extensions listed below as supported on this target.
	void setSupportedOpenCLOpts() override {
	auto &Opts = getSupportedOpenCLOpts();
	Opts.support("cl_clang_storage_class_specifiers");
	Opts.support("cl_khr_gl_sharing");
	Opts.support("cl_khr_icd");

	Opts.support("cl_khr_fp64");
	Opts.support("cl_khr_byte_addressable_store");
	Opts.support("cl_khr_global_int32_base_atomics");
	Opts.support("cl_khr_global_int32_extended_atomics");
	Opts.support("cl_khr_local_int32_base_atomics");
	Opts.support("cl_khr_local_int32_extended_atomics");
	}

	// Defers to the host target's answer when one exists; otherwise every
	// calling convention gets CCCR_Warning.
	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	// CUDA compilations support all of the host's calling conventions.
	//
	// TODO: We should warn if you apply a non-default CC to anything other than
	// a host function.
	if (HostTarget)
	return HostTarget->checkCallingConvention(CC);
	return CCCR_Warning;
	}
	};

	// Builtin table for NVPTX, generated by expanding BuiltinsNVPTX.def with
	// the macros below: each BUILTIN / LIBBUILTIN / TARGET_BUILTIN entry in
	// the .def file becomes one Builtin::Info initializer. LIBBUILTIN fills in
	// the header field and TARGET_BUILTIN fills in the required-feature field;
	// the other forms leave those fields null.
	// NOTE(review): the macros are not #undef'd here; presumably the .def file
	// does that itself -- confirm.
	const Builtin::Info NVPTXTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
	#include "clang/Basic/BuiltinsNVPTX.def"
	};

	// Register name table for NVPTX; it lists a single name, "r0".
	const char *const NVPTXTargetInfo::GCCRegNames[] = {"r0"};

	// Exposes the static GCCRegNames table as an ArrayRef covering the whole
	// array.
	ArrayRef<const char *> NVPTXTargetInfo::getGCCRegNames() const {
	  return ArrayRef<const char *>(GCCRegNames);
	}

	// Language-to-target address space map for the numbering where private is
	// target address space 0 and generic is 4, with the Default language
	// address space mapped to generic (4). Each entry's trailing comment names
	// the language address space it is for.
	static const LangAS::Map AMDGPUPrivIsZeroDefIsGenMap = {
	4, // Default
	1, // opencl_global
	3, // opencl_local
	2, // opencl_constant
	4, // opencl_generic
	1, // cuda_device
	2, // cuda_constant
	3 // cuda_shared
	};
	// Language-to-target address space map for the numbering where generic is
	// target address space 0, with the Default language address space mapped
	// to generic (0).
	static const LangAS::Map AMDGPUGenIsZeroDefIsGenMap = {
	0, // Default
	1, // opencl_global
	3, // opencl_local
	2, // opencl_constant
	0, // opencl_generic
	1, // cuda_device
	2, // cuda_constant
	3 // cuda_shared
	};
	// Language-to-target address space map for the numbering where private is
	// target address space 0 and generic is 4, with the Default language
	// address space mapped to private (0).
	static const LangAS::Map AMDGPUPrivIsZeroDefIsPrivMap = {
	0, // Default
	1, // opencl_global
	3, // opencl_local
	2, // opencl_constant
	4, // opencl_generic
	1, // cuda_device
	2, // cuda_constant
	3 // cuda_shared
	};
	// Language-to-target address space map for the numbering where generic is
	// target address space 0 and private is 5, with the Default language
	// address space mapped to private (5).
	static const LangAS::Map AMDGPUGenIsZeroDefIsPrivMap = {
	5, // Default
	1, // opencl_global
	3, // opencl_local
	2, // opencl_constant
	0, // opencl_generic
	1, // cuda_device
	2, // cuda_constant
	3 // cuda_shared
	};

	// If you edit the description strings, make sure you update
	// getPointerWidthV().

	// Data layout used by AMDGPUTargetInfo when the triple's architecture is
	// not amdgcn.
	static const char *const DataLayoutStringR600 =
	"e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
	"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";

	// Data layout used by AMDGPUTargetInfo for amdgcn triples whose
	// environment is neither "amdgiz" nor "amdgizcl".
	static const char *const DataLayoutStringSIPrivateIsZero =
	"e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
	"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
	"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";

	// Data layout used by AMDGPUTargetInfo for amdgcn triples whose
	// environment is "amdgiz" or "amdgizcl". The trailing "-A5" matches the
	// constructor's assertion that the alloca address space equals AS.Private.
	static const char *const DataLayoutStringSIGenericIsZero =
	"e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
	"-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
	"-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";

	class AMDGPUTargetInfo final : public TargetInfo {
	static const Builtin::Info BuiltinInfo[];
	static const char * const GCCRegNames[];

	struct AddrSpace {
	unsigned Generic, Global, Local, Constant, Private;
	AddrSpace(bool IsGenericZero_ = false){
	if (IsGenericZero_) {
	Generic = 0;
	Global = 1;
	Local = 3;
	Constant = 2;
	Private = 5;
	} else {
	Generic = 4;
	Global = 1;
	Local = 3;
	Constant = 2;
	Private = 0;
	}
	}
	};

	/// \brief The GPU profiles supported by the AMDGPU target.
	enum GPUKind {
	GK_NONE,
	GK_R600,
	GK_R600_DOUBLE_OPS,
	GK_R700,
	GK_R700_DOUBLE_OPS,
	GK_EVERGREEN,
	GK_EVERGREEN_DOUBLE_OPS,
	GK_NORTHERN_ISLANDS,
	GK_CAYMAN,
	GK_GFX6,
	GK_GFX7,
	GK_GFX8,
	GK_GFX9
	} GPU;

	bool hasFP64:1;
	bool hasFMAF:1;
	bool hasLDEXPF:1;
	const AddrSpace AS;

	static bool hasFullSpeedFMAF32(StringRef GPUName) {
	return parseAMDGCNName(GPUName) >= GK_GFX9;
	}

	static bool isAMDGCN(const llvm::Triple &TT) {
	return TT.getArch() == llvm::Triple::amdgcn;
	}

	static bool isGenericZero(const llvm::Triple &TT) {
	return TT.getEnvironmentName() == "amdgiz" \|\|
	TT.getEnvironmentName() == "amdgizcl";
	}
	public:
	AMDGPUTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: TargetInfo(Triple) ,
	GPU(isAMDGCN(Triple) ? GK_GFX6 : GK_R600),
	hasFP64(false),
	hasFMAF(false),
	hasLDEXPF(false),
	AS(isGenericZero(Triple)){
	if (getTriple().getArch() == llvm::Triple::amdgcn) {
	hasFP64 = true;
	hasFMAF = true;
	hasLDEXPF = true;
	}
	auto IsGenericZero = isGenericZero(Triple);
	resetDataLayout(getTriple().getArch() == llvm::Triple::amdgcn ?
	(IsGenericZero ? DataLayoutStringSIGenericIsZero :
	DataLayoutStringSIPrivateIsZero)
	: DataLayoutStringR600);
	assert(DataLayout->getAllocaAddrSpace() == AS.Private);

	setAddressSpaceMap(Triple.getOS() == llvm::Triple::Mesa3D \|\|
	Triple.getEnvironment() == llvm::Triple::OpenCL \|\|
	Triple.getEnvironmentName() == "amdgizcl" \|\|
	!isAMDGCN(Triple));
	UseAddrSpaceMapMangling = true;

	// Set pointer width and alignment for target address space 0.
	PointerWidth = PointerAlign = DataLayout->getPointerSizeInBits();
	if (getMaxPointerWidth() == 64) {
	LongWidth = LongAlign = 64;
	SizeType = UnsignedLong;
	PtrDiffType = SignedLong;
	IntPtrType = SignedLong;
	}
	}

	void setAddressSpaceMap(bool DefaultIsPrivate) {
	if (isGenericZero(getTriple())) {
	AddrSpaceMap = DefaultIsPrivate ? &AMDGPUGenIsZeroDefIsPrivMap
	: &AMDGPUGenIsZeroDefIsGenMap;
	} else {
	AddrSpaceMap = DefaultIsPrivate ? &AMDGPUPrivIsZeroDefIsPrivMap
	: &AMDGPUPrivIsZeroDefIsGenMap;
	}
	}

	void adjust(LangOptions &Opts) override {
	TargetInfo::adjust(Opts);
	setAddressSpaceMap(Opts.OpenCL \|\| !isAMDGCN(getTriple()));
	}

	uint64_t getPointerWidthV(unsigned AddrSpace) const override {
	if (GPU <= GK_CAYMAN)
	return 32;

	if (AddrSpace == AS.Private \|\| AddrSpace == AS.Local) {
	return 32;
	}
	return 64;
	}

	uint64_t getPointerAlignV(unsigned AddrSpace) const override {
	return getPointerWidthV(AddrSpace);
	}

	uint64_t getMaxPointerWidth() const override {
	return getTriple().getArch() == llvm::Triple::amdgcn ? 64 : 32;
	}

	const char * getClobbers() const override {
	return "";
	}

	ArrayRef<const char *> getGCCRegNames() const override;

	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	return None;
	}

	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default: break;
	case 'v': // vgpr
	case 's': // sgpr
	Info.setAllowsRegister();
	return true;
	}
	return false;
	}

	bool initFeatureMap(llvm::StringMap<bool> &Features,
	DiagnosticsEngine &Diags, StringRef CPU,
	const std::vector<std::string> &FeatureVec) const override;

	void adjustTargetOptions(const CodeGenOptions &CGOpts,
	TargetOptions &TargetOpts) const override {
	bool hasFP32Denormals = false;
	bool hasFP64Denormals = false;
	for (auto &I : TargetOpts.FeaturesAsWritten) {
	if (I == "+fp32-denormals" \|\| I == "-fp32-denormals")
	hasFP32Denormals = true;
	if (I == "+fp64-fp16-denormals" \|\| I == "-fp64-fp16-denormals")
	hasFP64Denormals = true;
	}
	if (!hasFP32Denormals)
	TargetOpts.Features.push_back(
	(Twine(hasFullSpeedFMAF32(TargetOpts.CPU) &&
	!CGOpts.FlushDenorm ? '+' : '-') + Twine("fp32-denormals")).str());
	// Always do not flush fp64 or fp16 denorms.
	if (!hasFP64Denormals && hasFP64)
	TargetOpts.Features.push_back("+fp64-fp16-denormals");
	}

	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::AMDGPU::LastTSBuiltin - Builtin::FirstTSBuiltin);
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	if (getTriple().getArch() == llvm::Triple::amdgcn)
	Builder.defineMacro("__AMDGCN__");
	else
	Builder.defineMacro("__R600__");

	if (hasFMAF)
	Builder.defineMacro("__HAS_FMAF__");
	if (hasLDEXPF)
	Builder.defineMacro("__HAS_LDEXPF__");
	if (hasFP64)
	Builder.defineMacro("__HAS_FP64__");
	}

	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::CharPtrBuiltinVaList;
	}

	static GPUKind parseR600Name(StringRef Name) {
	return llvm::StringSwitch<GPUKind>(Name)
	.Case("r600" , GK_R600)
	.Case("rv610", GK_R600)
	.Case("rv620", GK_R600)
	.Case("rv630", GK_R600)
	.Case("rv635", GK_R600)
	.Case("rs780", GK_R600)
	.Case("rs880", GK_R600)
	.Case("rv670", GK_R600_DOUBLE_OPS)
	.Case("rv710", GK_R700)
	.Case("rv730", GK_R700)
	.Case("rv740", GK_R700_DOUBLE_OPS)
	.Case("rv770", GK_R700_DOUBLE_OPS)
	.Case("palm", GK_EVERGREEN)
	.Case("cedar", GK_EVERGREEN)
	.Case("sumo", GK_EVERGREEN)
	.Case("sumo2", GK_EVERGREEN)
	.Case("redwood", GK_EVERGREEN)
	.Case("juniper", GK_EVERGREEN)
	.Case("hemlock", GK_EVERGREEN_DOUBLE_OPS)
	.Case("cypress", GK_EVERGREEN_DOUBLE_OPS)
	.Case("barts", GK_NORTHERN_ISLANDS)
	.Case("turks", GK_NORTHERN_ISLANDS)
	.Case("caicos", GK_NORTHERN_ISLANDS)
	.Case("cayman", GK_CAYMAN)
	.Case("aruba", GK_CAYMAN)
	.Default(GK_NONE);
	}

	static GPUKind parseAMDGCNName(StringRef Name) {
	return llvm::StringSwitch<GPUKind>(Name)
	.Case("tahiti", GK_GFX6)
	.Case("pitcairn", GK_GFX6)
	.Case("verde", GK_GFX6)
	.Case("oland", GK_GFX6)
	.Case("hainan", GK_GFX6)
	.Case("bonaire", GK_GFX7)
	.Case("kabini", GK_GFX7)
	.Case("kaveri", GK_GFX7)
	.Case("hawaii", GK_GFX7)
	.Case("mullins", GK_GFX7)
	.Case("gfx700", GK_GFX7)
	.Case("gfx701", GK_GFX7)
	.Case("gfx702", GK_GFX7)
	.Case("tonga", GK_GFX8)
	.Case("iceland", GK_GFX8)
	.Case("carrizo", GK_GFX8)
	.Case("fiji", GK_GFX8)
	.Case("stoney", GK_GFX8)
	.Case("polaris10", GK_GFX8)
	.Case("polaris11", GK_GFX8)
	.Case("gfx800", GK_GFX8)
	.Case("gfx801", GK_GFX8)
	.Case("gfx802", GK_GFX8)
	.Case("gfx803", GK_GFX8)
	.Case("gfx804", GK_GFX8)
	.Case("gfx810", GK_GFX8)
	.Case("gfx900", GK_GFX9)
	.Case("gfx901", GK_GFX9)
	.Default(GK_NONE);
	}

	bool setCPU(const std::string &Name) override {
	if (getTriple().getArch() == llvm::Triple::amdgcn)
	GPU = parseAMDGCNName(Name);
	else
	GPU = parseR600Name(Name);

	return GPU != GK_NONE;
	}

	void setSupportedOpenCLOpts() override {
	auto &Opts = getSupportedOpenCLOpts();
	Opts.support("cl_clang_storage_class_specifiers");
	Opts.support("cl_khr_icd");

	if (hasFP64)
	Opts.support("cl_khr_fp64");
	if (GPU >= GK_EVERGREEN) {
	Opts.support("cl_khr_byte_addressable_store");
	Opts.support("cl_khr_global_int32_base_atomics");
	Opts.support("cl_khr_global_int32_extended_atomics");
	Opts.support("cl_khr_local_int32_base_atomics");
	Opts.support("cl_khr_local_int32_extended_atomics");
	}
	if (GPU >= GK_GFX6) {
	Opts.support("cl_khr_fp16");
	Opts.support("cl_khr_int64_base_atomics");
	Opts.support("cl_khr_int64_extended_atomics");
	Opts.support("cl_khr_mipmap_image");
	Opts.support("cl_khr_subgroups");
	Opts.support("cl_khr_3d_image_writes");
	Opts.support("cl_amd_media_ops");
	Opts.support("cl_amd_media_ops2");
	}
	}

	LangAS::ID getOpenCLImageAddrSpace() const override {
	return LangAS::opencl_constant;
	}

	llvm::Optional<unsigned> getConstantAddressSpace() const override {
	return LangAS::FirstTargetAddressSpace + AS.Constant;
	}

	/// \returns Target specific vtbl ptr address space.
	unsigned getVtblPtrAddressSpace() const override { return AS.Constant; }

	/// Maps a target address space number onto a DWARF address space.
	///
	/// \returns 1 when \p AddressSpace equals AS.Private and 2 when it equals
	/// AS.Local, i.e. the target specific DWARF address space the address has
	/// to be converted to in order to be used.
	///
	/// \returns None for every other address space, in which case no
	/// conversion will be emitted in the DWARF.
	Optional<unsigned> getDWARFAddressSpace(
	    unsigned AddressSpace) const override {
	  const unsigned DWARF_Private = 1;
	  const unsigned DWARF_Local = 2;

	  if (AddressSpace == AS.Private)
	    return DWARF_Private;
	  if (AddressSpace == AS.Local)
	    return DWARF_Local;
	  return None;
	}

	/// Classifies calling convention \p CC for this target.
	///
	/// \returns CCCR_OK for CC_C and CC_OpenCLKernel, CCCR_Warning for every
	/// other convention.
	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	  const bool Accepted = CC == CC_C || CC == CC_OpenCLKernel;
	  return Accepted ? CCCR_OK : CCCR_Warning;
	}

	/// \returns The bit pattern of a null pointer in address space \p AS:
	/// all ones (~0 widened to 64 bits) for LangAS::opencl_local, zero for
	/// every other address space.
	///
	/// In amdgcn target the null pointer in global, constant, and generic
	/// address space has value 0 but in private and local address space has
	/// value ~0.
	// NOTE(review): only LangAS::opencl_local is tested below although the
	// description above also names the private address space - confirm whether
	// private is meant to be handled here. The parameter also shadows the AS
	// member used by the neighbouring methods.
	uint64_t getNullPointerValue(unsigned AS) const override {
	  if (AS == LangAS::opencl_local)
	    return ~0;
	  return 0;
	}
	};

	// Table of AMDGPU builtins, produced by expanding the entries of
	// BuiltinsAMDGPU.def. BUILTIN entries carry no required feature (nullptr in
	// the last field); TARGET_BUILTIN entries record FEATURE there. Both kinds
	// are marked ALL_LANGUAGES and have no associated header (nullptr).
	const Builtin::Info AMDGPUTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
	#include "clang/Basic/BuiltinsAMDGPU.def"
	};
	// Register names exposed through getGCCRegNames(): the 256 registers
	// v0-v255, then the 128 registers s0-s127, then the special registers
	// (exec, vcc, scc, m0, flat_scratch and their _lo/_hi halves).
	const char * const AMDGPUTargetInfo::GCCRegNames[] = {
	"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
	"v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
	"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
	"v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
	"v32", "v33", "v34", "v35", "v36", "v37", "v38", "v39",
	"v40", "v41", "v42", "v43", "v44", "v45", "v46", "v47",
	"v48", "v49", "v50", "v51", "v52", "v53", "v54", "v55",
	"v56", "v57", "v58", "v59", "v60", "v61", "v62", "v63",
	"v64", "v65", "v66", "v67", "v68", "v69", "v70", "v71",
	"v72", "v73", "v74", "v75", "v76", "v77", "v78", "v79",
	"v80", "v81", "v82", "v83", "v84", "v85", "v86", "v87",
	"v88", "v89", "v90", "v91", "v92", "v93", "v94", "v95",
	"v96", "v97", "v98", "v99", "v100", "v101", "v102", "v103",
	"v104", "v105", "v106", "v107", "v108", "v109", "v110", "v111",
	"v112", "v113", "v114", "v115", "v116", "v117", "v118", "v119",
	"v120", "v121", "v122", "v123", "v124", "v125", "v126", "v127",
	"v128", "v129", "v130", "v131", "v132", "v133", "v134", "v135",
	"v136", "v137", "v138", "v139", "v140", "v141", "v142", "v143",
	"v144", "v145", "v146", "v147", "v148", "v149", "v150", "v151",
	"v152", "v153", "v154", "v155", "v156", "v157", "v158", "v159",
	"v160", "v161", "v162", "v163", "v164", "v165", "v166", "v167",
	"v168", "v169", "v170", "v171", "v172", "v173", "v174", "v175",
	"v176", "v177", "v178", "v179", "v180", "v181", "v182", "v183",
	"v184", "v185", "v186", "v187", "v188", "v189", "v190", "v191",
	"v192", "v193", "v194", "v195", "v196", "v197", "v198", "v199",
	"v200", "v201", "v202", "v203", "v204", "v205", "v206", "v207",
	"v208", "v209", "v210", "v211", "v212", "v213", "v214", "v215",
	"v216", "v217", "v218", "v219", "v220", "v221", "v222", "v223",
	"v224", "v225", "v226", "v227", "v228", "v229", "v230", "v231",
	"v232", "v233", "v234", "v235", "v236", "v237", "v238", "v239",
	"v240", "v241", "v242", "v243", "v244", "v245", "v246", "v247",
	"v248", "v249", "v250", "v251", "v252", "v253", "v254", "v255",
	"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
	"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
	"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
	"s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",
	"s32", "s33", "s34", "s35", "s36", "s37", "s38", "s39",
	"s40", "s41", "s42", "s43", "s44", "s45", "s46", "s47",
	"s48", "s49", "s50", "s51", "s52", "s53", "s54", "s55",
	"s56", "s57", "s58", "s59", "s60", "s61", "s62", "s63",
	"s64", "s65", "s66", "s67", "s68", "s69", "s70", "s71",
	"s72", "s73", "s74", "s75", "s76", "s77", "s78", "s79",
	"s80", "s81", "s82", "s83", "s84", "s85", "s86", "s87",
	"s88", "s89", "s90", "s91", "s92", "s93", "s94", "s95",
	"s96", "s97", "s98", "s99", "s100", "s101", "s102", "s103",
	"s104", "s105", "s106", "s107", "s108", "s109", "s110", "s111",
	"s112", "s113", "s114", "s115", "s116", "s117", "s118", "s119",
	"s120", "s121", "s122", "s123", "s124", "s125", "s126", "s127",
	"exec", "vcc", "scc", "m0", "flat_scratch", "exec_lo", "exec_hi",
	"vcc_lo", "vcc_hi", "flat_scratch_lo", "flat_scratch_hi"
	};

	/// \returns A non-owning view over the whole AMDGPUTargetInfo::GCCRegNames
	/// table.
	ArrayRef<const char *> AMDGPUTargetInfo::getGCCRegNames() const {
	return llvm::makeArrayRef(GCCRegNames);
	}

	/// Seeds \p Features with the default subtarget features implied by the
	/// device name \p CPU, then defers to TargetInfo::initFeatureMap().
	///
	/// An empty \p CPU is replaced by "tahiti" on amdgcn triples and by "r600"
	/// on all others. The name is parsed with parseAMDGCNName() or
	/// parseR600Name() accordingly.
	///
	/// \returns false if the name parses to GK_NONE; otherwise whatever
	/// TargetInfo::initFeatureMap() returns.
	bool AMDGPUTargetInfo::initFeatureMap(
	    llvm::StringMap<bool> &Features,
	    DiagnosticsEngine &Diags, StringRef CPU,
	    const std::vector<std::string> &FeatureVec) const {

	  // XXX - What does the member GPU mean if device name string passed here?
	  const bool IsAMDGCN = getTriple().getArch() == llvm::Triple::amdgcn;

	  // Fall back to the default device of the selected architecture.
	  if (CPU.empty())
	    CPU = IsAMDGCN ? "tahiti" : "r600";

	  if (!IsAMDGCN) {
	    switch (parseR600Name(CPU)) {
	    case GK_NONE:
	      return false;

	    case GK_R600_DOUBLE_OPS:
	    case GK_R700_DOUBLE_OPS:
	    case GK_EVERGREEN_DOUBLE_OPS:
	    case GK_CAYMAN:
	      Features["fp64"] = true;
	      break;

	    case GK_R600:
	    case GK_R700:
	    case GK_EVERGREEN:
	    case GK_NORTHERN_ISLANDS:
	      // No extra features for these devices.
	      break;

	    default:
	      llvm_unreachable("unhandled subtarget");
	    }
	  } else {
	    switch (parseAMDGCNName(CPU)) {
	    case GK_NONE:
	      return false;

	    case GK_GFX9:
	      // GFX9 gets its own instructions plus everything GFX8 enables.
	      Features["gfx9-insts"] = true;
	      LLVM_FALLTHROUGH;
	    case GK_GFX8:
	      Features["s-memrealtime"] = true;
	      Features["16-bit-insts"] = true;
	      Features["dpp"] = true;
	      break;

	    case GK_GFX6:
	    case GK_GFX7:
	      // No extra features for these devices.
	      break;

	    default:
	      llvm_unreachable("unhandled subtarget");
	    }
	  }

	  return TargetInfo::initFeatureMap(Features, Diags, CPU, FeatureVec);
	}

	// Table of X86 builtins: the entries of BuiltinsX86.def followed by those of
	// BuiltinsX86_64.def. BUILTIN and TARGET_BUILTIN entries have no header
	// (nullptr) and are marked ALL_LANGUAGES; TARGET_HEADER_BUILTIN entries
	// supply their own HEADER and LANGS. The last field is the required FEATURE,
	// or nullptr for plain BUILTIN entries. The macros are defined again before
	// the second include.
	const Builtin::Info BuiltinInfoX86[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
	#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \
	{ #ID, TYPE, ATTRS, HEADER, LANGS, FEATURE },
	#include "clang/Basic/BuiltinsX86.def"

	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
	#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \
	{ #ID, TYPE, ATTRS, HEADER, LANGS, FEATURE },
	#include "clang/Basic/BuiltinsX86_64.def"
	};


	// X86 register names returned by X86TargetInfo::getGCCRegNames(). The
	// position of a name in this table matters: AddlRegNames refers to entries
	// by index (e.g. 0 is "ax", 38 is "r8"), so do not reorder.
	static const char* const GCCRegNames[] = {
	"ax", "dx", "cx", "bx", "si", "di", "bp", "sp",
	"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)",
	"argp", "flags", "fpcr", "fpsr", "dirflag", "frame",
	"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
	"mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7",
	"r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15",
	"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7",
	"ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15",
	"xmm16", "xmm17", "xmm18", "xmm19", "xmm20", "xmm21", "xmm22", "xmm23",
	"xmm24", "xmm25", "xmm26", "xmm27", "xmm28", "xmm29", "xmm30", "xmm31",
	"ymm16", "ymm17", "ymm18", "ymm19", "ymm20", "ymm21", "ymm22", "ymm23",
	"ymm24", "ymm25", "ymm26", "ymm27", "ymm28", "ymm29", "ymm30", "ymm31",
	"zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5", "zmm6", "zmm7",
	"zmm8", "zmm9", "zmm10", "zmm11", "zmm12", "zmm13", "zmm14", "zmm15",
	"zmm16", "zmm17", "zmm18", "zmm19", "zmm20", "zmm21", "zmm22", "zmm23",
	"zmm24", "zmm25", "zmm26", "zmm27", "zmm28", "zmm29", "zmm30", "zmm31",
	"k0", "k1", "k2", "k3", "k4", "k5", "k6", "k7",
	};

	// Additional spellings of the general purpose registers. Each entry lists
	// the extra names followed by a number that matches the position of the
	// base register in GCCRegNames (0 = "ax", 1 = "dx", 2 = "cx", 3 = "bx",
	// 4 = "si", 5 = "di", 6 = "bp", 7 = "sp", 38-45 = "r8"-"r15").
	const TargetInfo::AddlRegName AddlRegNames[] = {
	{ { "al", "ah", "eax", "rax" }, 0 },
	{ { "bl", "bh", "ebx", "rbx" }, 3 },
	{ { "cl", "ch", "ecx", "rcx" }, 2 },
	{ { "dl", "dh", "edx", "rdx" }, 1 },
	{ { "esi", "rsi" }, 4 },
	{ { "edi", "rdi" }, 5 },
	{ { "esp", "rsp" }, 7 },
	{ { "ebp", "rbp" }, 6 },
	{ { "r8d", "r8w", "r8b" }, 38 },
	{ { "r9d", "r9w", "r9b" }, 39 },
	{ { "r10d", "r10w", "r10b" }, 40 },
	{ { "r11d", "r11w", "r11b" }, 41 },
	{ { "r12d", "r12w", "r12b" }, 42 },
	{ { "r13d", "r13w", "r13b" }, 43 },
	{ { "r14d", "r14w", "r14b" }, 44 },
	{ { "r15d", "r15w", "r15b" }, 45 },
	};

	// X86 target abstract base class; x86-32 and x86-64 are very close, so
	// most of the implementation can be shared.
	class X86TargetInfo : public TargetInfo {
	enum X86SSEEnum {
	NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
	} SSELevel = NoSSE;
	enum MMX3DNowEnum {
	NoMMX3DNow, MMX, AMD3DNow, AMD3DNowAthlon
	} MMX3DNowLevel = NoMMX3DNow;
	enum XOPEnum {
	NoXOP,
	SSE4A,
	FMA4,
	XOP
	} XOPLevel = NoXOP;

	bool HasAES = false;
	bool HasPCLMUL = false;
	bool HasLZCNT = false;
	bool HasRDRND = false;
	bool HasFSGSBASE = false;
	bool HasBMI = false;
	bool HasBMI2 = false;
	bool HasPOPCNT = false;
	bool HasRTM = false;
	bool HasPRFCHW = false;
	bool HasRDSEED = false;
	bool HasADX = false;
	bool HasTBM = false;
	bool HasLWP = false;
	bool HasFMA = false;
	bool HasF16C = false;
	bool HasAVX512CD = false;
	bool HasAVX512VPOPCNTDQ = false;
	bool HasAVX512ER = false;
	bool HasAVX512PF = false;
	bool HasAVX512DQ = false;
	bool HasAVX512BW = false;
	bool HasAVX512VL = false;
	bool HasAVX512VBMI = false;
	bool HasAVX512IFMA = false;
	bool HasSHA = false;
	bool HasMPX = false;
	bool HasSGX = false;
	bool HasCX16 = false;
	bool HasFXSR = false;
	bool HasXSAVE = false;
	bool HasXSAVEOPT = false;
	bool HasXSAVEC = false;
	bool HasXSAVES = false;
	bool HasMWAITX = false;
	bool HasCLZERO = false;
	bool HasPKU = false;
	bool HasCLFLUSHOPT = false;
	bool HasCLWB = false;
	bool HasMOVBE = false;
	bool HasPREFETCHWT1 = false;

	/// \brief Enumeration of all of the X86 CPUs supported by Clang.
	///
	/// Each enumeration represents a particular CPU supported by Clang. These
	/// loosely correspond to the options passed to '-march' or '-mtune' flags.
	enum CPUKind {
	CK_Generic,

	/// \name i386
	/// i386-generation processors.
	//@{
	CK_i386,
	//@}

	/// \name i486
	/// i486-generation processors.
	//@{
	CK_i486,
	CK_WinChipC6,
	CK_WinChip2,
	CK_C3,
	//@}

	/// \name i586
	/// i586-generation processors, P5 microarchitecture based.
	//@{
	CK_i586,
	CK_Pentium,
	CK_PentiumMMX,
	//@}

	/// \name i686
	/// i686-generation processors, P6 / Pentium M microarchitecture based.
	//@{
	CK_i686,
	CK_PentiumPro,
	CK_Pentium2,
	CK_Pentium3,
	CK_Pentium3M,
	CK_PentiumM,
	CK_C3_2,

	/// This enumerator is a bit odd, as GCC no longer accepts -march=yonah.
	/// Clang however has some logic to support this.
	// FIXME: Warn, deprecate, and potentially remove this.
	CK_Yonah,
	//@}

	/// \name Netburst
	/// Netburst microarchitecture based processors.
	//@{
	CK_Pentium4,
	CK_Pentium4M,
	CK_Prescott,
	CK_Nocona,
	//@}

	/// \name Core
	/// Core microarchitecture based processors.
	//@{
	CK_Core2,

	/// This enumerator, like \see CK_Yonah, is a bit odd. It is another
	/// codename which GCC no longer accepts as an option to -march, but Clang
	/// has some logic for recognizing it.
	// FIXME: Warn, deprecate, and potentially remove this.
	CK_Penryn,
	//@}

	/// \name Atom
	/// Atom processors
	//@{
	CK_Bonnell,
	CK_Silvermont,
	CK_Goldmont,
	//@}

	/// \name Nehalem
	/// Nehalem microarchitecture based processors.
	CK_Nehalem,

	/// \name Westmere
	/// Westmere microarchitecture based processors.
	CK_Westmere,

	/// \name Sandy Bridge
	/// Sandy Bridge microarchitecture based processors.
	CK_SandyBridge,

	/// \name Ivy Bridge
	/// Ivy Bridge microarchitecture based processors.
	CK_IvyBridge,

	/// \name Haswell
	/// Haswell microarchitecture based processors.
	CK_Haswell,

	/// \name Broadwell
	/// Broadwell microarchitecture based processors.
	CK_Broadwell,

	/// \name Skylake Client
	/// Skylake client microarchitecture based processors.
	CK_SkylakeClient,

	/// \name Skylake Server
	/// Skylake server microarchitecture based processors.
	CK_SkylakeServer,

	/// \name Cannonlake Client
	/// Cannonlake client microarchitecture based processors.
	CK_Cannonlake,

	/// \name Knights Landing
	/// Knights Landing processor.
	CK_KNL,

	/// \name Lakemont
	/// Lakemont microarchitecture based processors.
	CK_Lakemont,

	/// \name K6
	/// K6 architecture processors.
	//@{
	CK_K6,
	CK_K6_2,
	CK_K6_3,
	//@}

	/// \name K7
	/// K7 architecture processors.
	//@{
	CK_Athlon,
	CK_AthlonThunderbird,
	CK_Athlon4,
	CK_AthlonXP,
	CK_AthlonMP,
	//@}

	/// \name K8
	/// K8 architecture processors.
	//@{
	CK_Athlon64,
	CK_Athlon64SSE3,
	CK_AthlonFX,
	CK_K8,
	CK_K8SSE3,
	CK_Opteron,
	CK_OpteronSSE3,
	CK_AMDFAM10,
	//@}

	/// \name Bobcat
	/// Bobcat architecture processors.
	//@{
	CK_BTVER1,
	CK_BTVER2,
	//@}

	/// \name Bulldozer
	/// Bulldozer architecture processors.
	//@{
	CK_BDVER1,
	CK_BDVER2,
	CK_BDVER3,
	CK_BDVER4,
	//@}

	/// \name zen
	/// Zen architecture processors.
	//@{
	CK_ZNVER1,
	//@}

	/// This specification is deprecated and will be removed in the future.
	/// Users should prefer \see CK_K8.
	// FIXME: Warn on this when the CPU is set to it.
	//@{
	CK_x86_64,
	//@}

	/// \name Geode
	/// Geode processors.
	//@{
	CK_Geode
	//@}
	} CPU = CK_Generic;

	/// Maps a CPU name string onto its CPUKind enumerator.
	///
	/// Several enumerators are reachable through more than one spelling; the
	/// older spellings are marked "Legacy name" below.
	///
	/// \returns The matching CPUKind, or CK_Generic when \p CPU matches none of
	/// the listed names.
	CPUKind getCPUKind(StringRef CPU) const {
	return llvm::StringSwitch<CPUKind>(CPU)
	.Case("i386", CK_i386)
	.Case("i486", CK_i486)
	.Case("winchip-c6", CK_WinChipC6)
	.Case("winchip2", CK_WinChip2)
	.Case("c3", CK_C3)
	.Case("i586", CK_i586)
	.Case("pentium", CK_Pentium)
	.Case("pentium-mmx", CK_PentiumMMX)
	.Case("i686", CK_i686)
	.Case("pentiumpro", CK_PentiumPro)
	.Case("pentium2", CK_Pentium2)
	.Case("pentium3", CK_Pentium3)
	.Case("pentium3m", CK_Pentium3M)
	.Case("pentium-m", CK_PentiumM)
	.Case("c3-2", CK_C3_2)
	.Case("yonah", CK_Yonah)
	.Case("pentium4", CK_Pentium4)
	.Case("pentium4m", CK_Pentium4M)
	.Case("prescott", CK_Prescott)
	.Case("nocona", CK_Nocona)
	.Case("core2", CK_Core2)
	.Case("penryn", CK_Penryn)
	.Case("bonnell", CK_Bonnell)
	.Case("atom", CK_Bonnell) // Legacy name.
	.Case("silvermont", CK_Silvermont)
	.Case("slm", CK_Silvermont) // Legacy name.
	.Case("goldmont", CK_Goldmont)
	.Case("nehalem", CK_Nehalem)
	.Case("corei7", CK_Nehalem) // Legacy name.
	.Case("westmere", CK_Westmere)
	.Case("sandybridge", CK_SandyBridge)
	.Case("corei7-avx", CK_SandyBridge) // Legacy name.
	.Case("ivybridge", CK_IvyBridge)
	.Case("core-avx-i", CK_IvyBridge) // Legacy name.
	.Case("haswell", CK_Haswell)
	.Case("core-avx2", CK_Haswell) // Legacy name.
	.Case("broadwell", CK_Broadwell)
	.Case("skylake", CK_SkylakeClient)
	.Case("skylake-avx512", CK_SkylakeServer)
	.Case("skx", CK_SkylakeServer) // Legacy name.
	.Case("cannonlake", CK_Cannonlake)
	.Case("knl", CK_KNL)
	.Case("lakemont", CK_Lakemont)
	.Case("k6", CK_K6)
	.Case("k6-2", CK_K6_2)
	.Case("k6-3", CK_K6_3)
	.Case("athlon", CK_Athlon)
	.Case("athlon-tbird", CK_AthlonThunderbird)
	.Case("athlon-4", CK_Athlon4)
	.Case("athlon-xp", CK_AthlonXP)
	.Case("athlon-mp", CK_AthlonMP)
	.Case("athlon64", CK_Athlon64)
	.Case("athlon64-sse3", CK_Athlon64SSE3)
	.Case("athlon-fx", CK_AthlonFX)
	.Case("k8", CK_K8)
	.Case("k8-sse3", CK_K8SSE3)
	.Case("opteron", CK_Opteron)
	.Case("opteron-sse3", CK_OpteronSSE3)
	.Case("barcelona", CK_AMDFAM10)
	.Case("amdfam10", CK_AMDFAM10)
	.Case("btver1", CK_BTVER1)
	.Case("btver2", CK_BTVER2)
	.Case("bdver1", CK_BDVER1)
	.Case("bdver2", CK_BDVER2)
	.Case("bdver3", CK_BDVER3)
	.Case("bdver4", CK_BDVER4)
	.Case("znver1", CK_ZNVER1)
	.Case("x86-64", CK_x86_64)
	.Case("geode", CK_Geode)
	.Default(CK_Generic);
	}

	enum FPMathKind {
	FP_Default,
	FP_SSE,
	FP_387
	} FPMath = FP_Default;

	public:
	/// Constructs the shared x86 target description for \p Triple and selects
	/// the x87 double-extended format for long double. The TargetOptions
	/// argument is not used here.
	X86TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple) {
	LongDoubleFormat = &llvm::APFloat::x87DoubleExtended();
	}
	/// \returns 2 when no SSE level is enabled (SSELevel == NoSSE), 0 in every
	/// other case.
	unsigned getFloatEvalMethod() const override {
	  // X87 evaluates with 80 bits "long double" precision.
	  if (SSELevel == NoSSE)
	    return 2;
	  return 0;
	}
	/// \returns A non-owning view over the file-level GCCRegNames table.
	ArrayRef<const char *> getGCCRegNames() const override {
	return llvm::makeArrayRef(GCCRegNames);
	}
	/// \returns None: no register aliases are reported here. Alternative
	/// spellings are provided through getGCCAddlRegNames() instead.
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	return None;
	}
	/// \returns A non-owning view over the file-level AddlRegNames table.
	ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override {
	return llvm::makeArrayRef(AddlRegNames);
	}
	bool validateCpuSupports(StringRef Name) const override;
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &info) const override;

	/// Checks whether \p RegName may be used for a global register variable.
	///
	/// \param RegName Name of the requested register.
	/// \param RegSize Size in bits of the variable being bound to it.
	/// \param HasSizeMismatch Set to true when the register is accepted but
	/// \p RegSize is not 32; left untouched when the register is rejected.
	///
	/// \returns true for "esp" and "ebp", false for every other name.
	bool validateGlobalRegisterVariable(StringRef RegName,
	                                    unsigned RegSize,
	                                    bool &HasSizeMismatch) const override {
	  // esp and ebp are the only 32-bit registers the x86 backend can currently
	  // handle.
	  if (RegName.equals("esp") || RegName.equals("ebp")) {
	    // Check that the register size is 32-bit.
	    HasSizeMismatch = RegSize != 32;
	    return true;
	  }

	  return false;
	}

	bool validateOutputSize(StringRef Constraint, unsigned Size) const override;

	bool validateInputSize(StringRef Constraint, unsigned Size) const override;

	virtual bool validateOperandSize(StringRef Constraint, unsigned Size) const;

	std::string convertConstraint(const char *&Constraint) const override;
	/// \returns The fixed clobber string naming the dirflag, fpsr and flags
	/// registers.
	// NOTE(review): presumably these are added implicitly to every inline asm
	// statement - confirm against the TargetInfo::getClobbers() contract.
	const char *getClobbers() const override {
	return "~{dirflag},~{fpsr},~{flags}";
	}

	/// Returns the register implied by an inline-asm constraint string.
	///
	/// The first alphabetic character of \p Constraint selects the result:
	/// 'a', 'b', 'c', 'd', 'S' and 'D' yield "ax", "bx", "cx", "dx", "si" and
	/// "di"; 'r' yields \p Expression unchanged.
	///
	/// \returns The register name, or an empty string when the constraint
	/// contains no alphabetic character or names no specific register.
	StringRef getConstraintRegister(const StringRef &Constraint,
	                                const StringRef &Expression) const override {
	  StringRef::iterator I, E;
	  for (I = Constraint.begin(), E = Constraint.end(); I != E; ++I) {
	    // The <cctype> classifiers require a value representable as unsigned
	    // char; passing a negative plain char is undefined behaviour.
	    if (isalpha(static_cast<unsigned char>(*I)))
	      break;
	  }
	  if (I == E)
	    return "";
	  switch (*I) {
	  // For the register constraints, return the matching register name
	  case 'a':
	    return "ax";
	  case 'b':
	    return "bx";
	  case 'c':
	    return "cx";
	  case 'd':
	    return "dx";
	  case 'S':
	    return "si";
	  case 'D':
	    return "di";
	  // In case the constraint is 'r' we need to return Expression
	  case 'r':
	    return Expression;
	  default:
	    // Default value if there is no constraint for the register
	    return "";
	  }
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override;
	static void setSSELevel(llvm::StringMap<bool> &Features, X86SSEEnum Level,
	bool Enabled);
	static void setMMXLevel(llvm::StringMap<bool> &Features, MMX3DNowEnum Level,
	bool Enabled);
	static void setXOPLevel(llvm::StringMap<bool> &Features, XOPEnum Level,
	bool Enabled);
	/// Virtual entry point for toggling feature \p Name in \p Features; simply
	/// forwards to the static setFeatureEnabledImpl().
	void setFeatureEnabled(llvm::StringMap<bool> &Features,
	StringRef Name, bool Enabled) const override {
	setFeatureEnabledImpl(Features, Name, Enabled);
	}
	// This exists purely to cut down on the number of virtual calls in
	// initFeatureMap which calls this repeatedly.
	static void setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
	StringRef Name, bool Enabled);
	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override;
	bool hasFeature(StringRef Feature) const override;
	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override;
	/// \returns The ABI name derived from the architecture and enabled vector
	/// features: "avx512" or "avx" on x86_64 when SSELevel is at least AVX512F
	/// or AVX respectively, "no-mmx" on 32-bit x86 when neither MMX nor 3DNow!
	/// is enabled, and an empty string otherwise.
	StringRef getABI() const override {
	  const auto Arch = getTriple().getArch();

	  if (Arch == llvm::Triple::x86_64) {
	    if (SSELevel >= AVX512F)
	      return "avx512";
	    if (SSELevel >= AVX)
	      return "avx";
	    return "";
	  }

	  if (Arch == llvm::Triple::x86 && MMX3DNowLevel == NoMMX3DNow)
	    return "no-mmx";
	  return "";
	}
	bool setCPU(const std::string &Name) override {
	CPU = getCPUKind(Name);

	// Perform any per-CPU checks necessary to determine if this CPU is
	// acceptable.
	// FIXME: This results in terrible diagnostics. Clang just says the CPU is
	// invalid without explaining why.
	switch (CPU) {
	case CK_Generic:
	// No processor selected!
	return false;

	case CK_i386:
	case CK_i486:
	case CK_WinChipC6:
	case CK_WinChip2:
	case CK_C3:
	case CK_i586:
	case CK_Pentium:
	case CK_PentiumMMX:
	case CK_i686:
	case CK_PentiumPro:
	case CK_Pentium2:
	case CK_Pentium3:
	case CK_Pentium3M:
	case CK_PentiumM:
	case CK_Yonah:
	case CK_C3_2:
	case CK_Pentium4:
	case CK_Pentium4M:
	case CK_Lakemont:
	case CK_Prescott:
	case CK_K6:
	case CK_K6_2:
	case CK_K6_3:
	case CK_Athlon:
	case CK_AthlonThunderbird:
	case CK_Athlon4:
	case CK_AthlonXP:
	case CK_AthlonMP:
	case CK_Geode:
	// Only accept certain architectures when compiling in 32-bit mode.
	if (getTriple().getArch() != llvm::Triple::x86)
	return false;

	// Fallthrough
	case CK_Nocona:
	case CK_Core2:
	case CK_Penryn:
	case CK_Bonnell:
	case CK_Silvermont:
	case CK_Goldmont:
	case CK_Nehalem:
	case CK_Westmere:
	case CK_SandyBridge:
	case CK_IvyBridge:
	case CK_Haswell:
	case CK_Broadwell:
	case CK_SkylakeClient:
	case CK_SkylakeServer:
	case CK_Cannonlake:
	case CK_KNL:
	case CK_Athlon64:
	case CK_Athlon64SSE3:
	case CK_AthlonFX:
	case CK_K8:
	case CK_K8SSE3:
	case CK_Opteron:
	case CK_OpteronSSE3:
	case CK_AMDFAM10:
	case CK_BTVER1:
	case CK_BTVER2:
	case CK_BDVER1:
	case CK_BDVER2:
	case CK_BDVER3:
	case CK_BDVER4:
	case CK_ZNVER1:
	case CK_x86_64:
	return true;
	}
	llvm_unreachable("Unhandled CPU kind");
	}

	bool setFPMath(StringRef Name) override;

	/// Classifies calling convention \p CC for x86 targets.
	///
	/// \returns CCCR_OK for the conventions listed below, CCCR_Warning for all
	/// others.
	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	  // Most of the non-ARM calling conventions are i386 conventions.
	  switch (CC) {
	  default:
	    return CCCR_Warning;
	  case CC_C:
	  case CC_Swift:
	  case CC_OpenCLKernel:
	  case CC_IntelOclBicc:
	  case CC_X86ThisCall:
	  case CC_X86FastCall:
	  case CC_X86StdCall:
	  case CC_X86VectorCall:
	  case CC_X86RegCall:
	  case CC_X86Pascal:
	    return CCCR_OK;
	  }
	}

	/// \returns CC_X86ThisCall when \p MT is CCMT_Member, CC_C for any other
	/// method type.
	CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
	  if (MT == CCMT_Member)
	    return CC_X86ThisCall;
	  return CC_C;
	}

	/// \returns true unconditionally for x86 targets.
	bool hasSjLjLowering() const override {
	return true;
	}

	/// Marks every OpenCL option as supported by calling supportAll() on the
	/// target's supported-options set.
	void setSupportedOpenCLOpts() override {
	getSupportedOpenCLOpts().supportAll();
	}
	};

	/// Selects the floating point math unit named by \p Name.
	///
	/// "387" selects FP_387 and "sse" selects FP_SSE. Any other name leaves
	/// FPMath unchanged.
	///
	/// \returns true if \p Name was one of the two accepted spellings.
	bool X86TargetInfo::setFPMath(StringRef Name) {
	  const bool Is387 = Name == "387";
	  if (!Is387 && Name != "sse")
	    return false;

	  FPMath = Is387 ? FP_387 : FP_SSE;
	  return true;
	}

	/// Builds the feature map for \p CPU.
	///
	/// The steps are, in order:
	///  1. enable sse2 when the triple's architecture is x86_64;
	///  2. enable x87 unless the CPU is Lakemont;
	///  3. enable the per-CPU defaults from the switch below, where newer CPUs
	///     fall through into the cases of their predecessors;
	///  4. let TargetInfo::initFeatureMap() apply \p FeaturesVec;
	///  5. turn on popcnt, prfchw and mmx when sse4.2, 3dnow and sse
	///     respectively ended up enabled and the user did not explicitly
	///     disable them in \p FeaturesVec.
	///
	/// \returns false if TargetInfo::initFeatureMap() fails, true otherwise.
	bool X86TargetInfo::initFeatureMap(
	llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const {
	// FIXME: This really should not be here.
	// X86_64 always has SSE2.
	if (getTriple().getArch() == llvm::Triple::x86_64)
	setFeatureEnabledImpl(Features, "sse2", true);

	const CPUKind Kind = getCPUKind(CPU);

	// Enable X87 for all X86 processors but Lakemont.
	if (Kind != CK_Lakemont)
	setFeatureEnabledImpl(Features, "x87", true);

	switch (Kind) {
	case CK_Generic:
	case CK_i386:
	case CK_i486:
	case CK_i586:
	case CK_Pentium:
	case CK_i686:
	case CK_PentiumPro:
	case CK_Lakemont:
	break;
	case CK_PentiumMMX:
	case CK_Pentium2:
	case CK_K6:
	case CK_WinChipC6:
	setFeatureEnabledImpl(Features, "mmx", true);
	break;
	case CK_Pentium3:
	case CK_Pentium3M:
	case CK_C3_2:
	setFeatureEnabledImpl(Features, "sse", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;
	case CK_PentiumM:
	case CK_Pentium4:
	case CK_Pentium4M:
	case CK_x86_64:
	setFeatureEnabledImpl(Features, "sse2", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;
	case CK_Yonah:
	case CK_Prescott:
	case CK_Nocona:
	setFeatureEnabledImpl(Features, "sse3", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	break;
	case CK_Core2:
	setFeatureEnabledImpl(Features, "ssse3", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	break;
	case CK_Penryn:
	setFeatureEnabledImpl(Features, "sse4.1", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	break;
	// Cannonlake down to Nehalem form one fallthrough chain: each case adds
	// its own features and then inherits everything below it.
	case CK_Cannonlake:
	setFeatureEnabledImpl(Features, "avx512ifma", true);
	setFeatureEnabledImpl(Features, "avx512vbmi", true);
	setFeatureEnabledImpl(Features, "sha", true);
	LLVM_FALLTHROUGH;
	case CK_SkylakeServer:
	setFeatureEnabledImpl(Features, "avx512f", true);
	setFeatureEnabledImpl(Features, "avx512cd", true);
	setFeatureEnabledImpl(Features, "avx512dq", true);
	setFeatureEnabledImpl(Features, "avx512bw", true);
	setFeatureEnabledImpl(Features, "avx512vl", true);
	setFeatureEnabledImpl(Features, "pku", true);
	setFeatureEnabledImpl(Features, "clwb", true);
	LLVM_FALLTHROUGH;
	case CK_SkylakeClient:
	setFeatureEnabledImpl(Features, "xsavec", true);
	setFeatureEnabledImpl(Features, "xsaves", true);
	setFeatureEnabledImpl(Features, "mpx", true);
	setFeatureEnabledImpl(Features, "sgx", true);
	setFeatureEnabledImpl(Features, "clflushopt", true);
	setFeatureEnabledImpl(Features, "rtm", true);
	LLVM_FALLTHROUGH;
	case CK_Broadwell:
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "adx", true);
	LLVM_FALLTHROUGH;
	case CK_Haswell:
	setFeatureEnabledImpl(Features, "avx2", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	LLVM_FALLTHROUGH;
	case CK_IvyBridge:
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	LLVM_FALLTHROUGH;
	case CK_SandyBridge:
	setFeatureEnabledImpl(Features, "avx", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	LLVM_FALLTHROUGH;
	case CK_Westmere:
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	LLVM_FALLTHROUGH;
	case CK_Nehalem:
	setFeatureEnabledImpl(Features, "sse4.2", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	break;
	// Atom chain: Goldmont -> Silvermont -> Bonnell.
	case CK_Goldmont:
	setFeatureEnabledImpl(Features, "sha", true);
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "xsavec", true);
	setFeatureEnabledImpl(Features, "xsaves", true);
	setFeatureEnabledImpl(Features, "clflushopt", true);
	setFeatureEnabledImpl(Features, "mpx", true);
	LLVM_FALLTHROUGH;
	case CK_Silvermont:
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "sse4.2", true);
	LLVM_FALLTHROUGH;
	case CK_Bonnell:
	setFeatureEnabledImpl(Features, "movbe", true);
	setFeatureEnabledImpl(Features, "ssse3", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	break;
	case CK_KNL:
	setFeatureEnabledImpl(Features, "avx512f", true);
	setFeatureEnabledImpl(Features, "avx512cd", true);
	setFeatureEnabledImpl(Features, "avx512er", true);
	setFeatureEnabledImpl(Features, "avx512pf", true);
	setFeatureEnabledImpl(Features, "prefetchwt1", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "adx", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "rtm", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	break;
	case CK_K6_2:
	case CK_K6_3:
	case CK_WinChip2:
	case CK_C3:
	setFeatureEnabledImpl(Features, "3dnow", true);
	break;
	case CK_Athlon:
	case CK_AthlonThunderbird:
	case CK_Geode:
	setFeatureEnabledImpl(Features, "3dnowa", true);
	break;
	case CK_Athlon4:
	case CK_AthlonXP:
	case CK_AthlonMP:
	setFeatureEnabledImpl(Features, "sse", true);
	setFeatureEnabledImpl(Features, "3dnowa", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;
	case CK_K8:
	case CK_Opteron:
	case CK_Athlon64:
	case CK_AthlonFX:
	setFeatureEnabledImpl(Features, "sse2", true);
	setFeatureEnabledImpl(Features, "3dnowa", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;
	case CK_AMDFAM10:
	setFeatureEnabledImpl(Features, "sse4a", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "popcnt", true);
	LLVM_FALLTHROUGH;
	case CK_K8SSE3:
	case CK_OpteronSSE3:
	case CK_Athlon64SSE3:
	setFeatureEnabledImpl(Features, "sse3", true);
	setFeatureEnabledImpl(Features, "3dnowa", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;
	case CK_BTVER2:
	setFeatureEnabledImpl(Features, "avx", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	LLVM_FALLTHROUGH;
	case CK_BTVER1:
	setFeatureEnabledImpl(Features, "ssse3", true);
	setFeatureEnabledImpl(Features, "sse4a", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "popcnt", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	break;
	case CK_ZNVER1:
	setFeatureEnabledImpl(Features, "adx", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "avx2", true);
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "clflushopt", true);
	setFeatureEnabledImpl(Features, "clzero", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "mwaitx", true);
	setFeatureEnabledImpl(Features, "movbe", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "popcnt", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "rdrnd", true);
	setFeatureEnabledImpl(Features, "rdseed", true);
	setFeatureEnabledImpl(Features, "sha", true);
	setFeatureEnabledImpl(Features, "sse4a", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	setFeatureEnabledImpl(Features, "xsavec", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	setFeatureEnabledImpl(Features, "xsaves", true);
	break;
	// Bulldozer chain: bdver4 -> bdver3 -> bdver2 -> bdver1.
	case CK_BDVER4:
	setFeatureEnabledImpl(Features, "avx2", true);
	setFeatureEnabledImpl(Features, "bmi2", true);
	setFeatureEnabledImpl(Features, "mwaitx", true);
	LLVM_FALLTHROUGH;
	case CK_BDVER3:
	setFeatureEnabledImpl(Features, "fsgsbase", true);
	setFeatureEnabledImpl(Features, "xsaveopt", true);
	LLVM_FALLTHROUGH;
	case CK_BDVER2:
	setFeatureEnabledImpl(Features, "bmi", true);
	setFeatureEnabledImpl(Features, "fma", true);
	setFeatureEnabledImpl(Features, "f16c", true);
	setFeatureEnabledImpl(Features, "tbm", true);
	LLVM_FALLTHROUGH;
	case CK_BDVER1:
	// xop implies avx, sse4a and fma4.
	setFeatureEnabledImpl(Features, "xop", true);
	setFeatureEnabledImpl(Features, "lwp", true);
	setFeatureEnabledImpl(Features, "lzcnt", true);
	setFeatureEnabledImpl(Features, "aes", true);
	setFeatureEnabledImpl(Features, "pclmul", true);
	setFeatureEnabledImpl(Features, "prfchw", true);
	setFeatureEnabledImpl(Features, "cx16", true);
	setFeatureEnabledImpl(Features, "fxsr", true);
	setFeatureEnabledImpl(Features, "xsave", true);
	break;
	}
	if (!TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec))
	return false;

	// Can't do this earlier because we need to be able to explicitly enable
	// or disable these features and the things that they depend upon.

	// Enable popcnt if sse4.2 is enabled and popcnt is not explicitly disabled.
	auto I = Features.find("sse4.2");
	if (I != Features.end() && I->getValue() &&
	std::find(FeaturesVec.begin(), FeaturesVec.end(), "-popcnt") ==
	FeaturesVec.end())
	Features["popcnt"] = true;

	// Enable prfchw if 3DNow! is enabled and prfchw is not explicitly disabled.
	I = Features.find("3dnow");
	if (I != Features.end() && I->getValue() &&
	std::find(FeaturesVec.begin(), FeaturesVec.end(), "-prfchw") ==
	FeaturesVec.end())
	Features["prfchw"] = true;

	// Additionally, if SSE is enabled and mmx is not explicitly disabled,
	// then enable MMX.
	I = Features.find("sse");
	if (I != Features.end() && I->getValue() &&
	std::find(FeaturesVec.begin(), FeaturesVec.end(), "-mmx") ==
	FeaturesVec.end())
	Features["mmx"] = true;

	return true;
	}

	/// Enable or disable the SSE/AVX feature ladder relative to \p Level.
	///
	/// When \p Enabled is true, \p Level and every rung listed after it in the
	/// first switch (AVX512F, AVX2, AVX, SSE4.2, SSE4.1, SSSE3, SSE3, SSE2, SSE)
	/// is set to true in \p Features; reaching the AVX rung also sets "xsave".
	/// When \p Enabled is false, \p Level and every rung listed after it in the
	/// second switch is set to false, together with the extra features cleared
	/// at each rung.
	///
	/// \param Features feature-name -> enabled map that is updated in place.
	/// \param Level    the rung of the ladder the request refers to.
	/// \param Enabled  true to switch features on, false to switch them off.
	void X86TargetInfo::setSSELevel(llvm::StringMap<bool> &Features,
	X86SSEEnum Level, bool Enabled) {
	if (Enabled) {
	// Enabling: start at the requested rung and fall through every case
	// below it, so the requested level drags in all the older ones.
	switch (Level) {
	case AVX512F:
	Features["avx512f"] = true;
	LLVM_FALLTHROUGH;
	case AVX2:
	Features["avx2"] = true;
	LLVM_FALLTHROUGH;
	case AVX:
	Features["avx"] = true;
	// Turning on AVX (or anything above it) also turns on xsave.
	Features["xsave"] = true;
	LLVM_FALLTHROUGH;
	case SSE42:
	Features["sse4.2"] = true;
	LLVM_FALLTHROUGH;
	case SSE41:
	Features["sse4.1"] = true;
	LLVM_FALLTHROUGH;
	case SSSE3:
	Features["ssse3"] = true;
	LLVM_FALLTHROUGH;
	case SSE3:
	Features["sse3"] = true;
	LLVM_FALLTHROUGH;
	case SSE2:
	Features["sse2"] = true;
	LLVM_FALLTHROUGH;
	case SSE1:
	Features["sse"] = true;
	LLVM_FALLTHROUGH;
	case NoSSE:
	break;
	}
	return;
	}

	// Disabling: the cases are listed in the opposite order, so the requested
	// rung and everything newer than it is cleared by falling through.
	switch (Level) {
	case NoSSE:
	case SSE1:
	Features["sse"] = false;
	LLVM_FALLTHROUGH;
	case SSE2:
	// pclmul, aes and sha are cleared together with sse2.
	Features["sse2"] = Features["pclmul"] = Features["aes"] =
	Features["sha"] = false;
	LLVM_FALLTHROUGH;
	case SSE3:
	Features["sse3"] = false;
	// Clears sse4a, fma4 and xop (see setXOPLevel's disable path).
	setXOPLevel(Features, NoXOP, false);
	LLVM_FALLTHROUGH;
	case SSSE3:
	Features["ssse3"] = false;
	LLVM_FALLTHROUGH;
	case SSE41:
	Features["sse4.1"] = false;
	LLVM_FALLTHROUGH;
	case SSE42:
	Features["sse4.2"] = false;
	LLVM_FALLTHROUGH;
	case AVX:
	// fma, f16c, xsave and xsaveopt are cleared together with avx.
	Features["fma"] = Features["avx"] = Features["f16c"] = Features["xsave"] =
	Features["xsaveopt"] = false;
	// Clears fma4 and xop but leaves sse4a alone.
	setXOPLevel(Features, FMA4, false);
	LLVM_FALLTHROUGH;
	case AVX2:
	Features["avx2"] = false;
	LLVM_FALLTHROUGH;
	case AVX512F:
	// Every AVX-512 sub-feature listed here goes away with avx512f.
	Features["avx512f"] = Features["avx512cd"] = Features["avx512er"] =
	Features["avx512pf"] = Features["avx512dq"] = Features["avx512bw"] =
	Features["avx512vl"] = Features["avx512vbmi"] =
	Features["avx512ifma"] = Features["avx512vpopcntdq"] = false;
	break;
	}
	}

	/// Switch the MMX / 3DNow! feature family on or off relative to \p Level.
	///
	/// Enabling turns on \p Level and every level beneath it; disabling turns
	/// off \p Level and every level above it. The comparisons rely on the
	/// enumerators being ordered NoMMX3DNow < MMX < AMD3DNow < AMD3DNowAthlon,
	/// the same ordering hasFeature() depends on.
	void X86TargetInfo::setMMXLevel(llvm::StringMap<bool> &Features,
	                                MMX3DNowEnum Level, bool Enabled) {
	  if (Enabled) {
	    // Highest level first, so the map is touched in the same order as a
	    // top-down walk of the ladder.
	    if (Level >= AMD3DNowAthlon)
	      Features["3dnowa"] = true;
	    if (Level >= AMD3DNow)
	      Features["3dnow"] = true;
	    if (Level >= MMX)
	      Features["mmx"] = true;
	    return;
	  }

	  // Disabling walks the ladder bottom-up: NoMMX3DNow behaves like MMX.
	  if (Level <= MMX)
	    Features["mmx"] = false;
	  if (Level <= AMD3DNow)
	    Features["3dnow"] = false;
	  // The top level is cleared whatever level was requested.
	  Features["3dnowa"] = false;
	}

	/// Switch the AMD SSE4A / FMA4 / XOP feature family on or off relative to
	/// \p Level.
	///
	/// Enabling turns on \p Level and every level beneath it, and pulls in the
	/// SSE level each one needs (AVX for FMA4 and above, SSE3 for SSE4A and
	/// above). Disabling turns off \p Level and every level above it. The
	/// comparisons rely on the enumerators being ordered
	/// NoXOP < SSE4A < FMA4 < XOP, the same ordering hasFeature() depends on.
	void X86TargetInfo::setXOPLevel(llvm::StringMap<bool> &Features, XOPEnum Level,
	                                bool Enabled) {
	  if (Enabled) {
	    if (Level >= XOP)
	      Features["xop"] = true;
	    if (Level >= FMA4) {
	      Features["fma4"] = true;
	      setSSELevel(Features, AVX, true);
	    }
	    if (Level >= SSE4A) {
	      Features["sse4a"] = true;
	      setSSELevel(Features, SSE3, true);
	    }
	    return;
	  }

	  // Disabling walks the ladder bottom-up: NoXOP behaves like SSE4A.
	  if (Level <= SSE4A)
	    Features["sse4a"] = false;
	  if (Level <= FMA4)
	    Features["fma4"] = false;
	  // The top level is cleared whatever level was requested.
	  Features["xop"] = false;
	}

	/// Record \p Name as enabled or disabled in \p Features and propagate the
	/// change to the features tied to it.
	///
	/// Every name except "sse4" is first stored verbatim in the map. Names that
	/// belong to one of the feature ladders are then forwarded to setSSELevel,
	/// setMMXLevel or setXOPLevel. Features that merely require a ladder level
	/// (aes, pclmul, sha, fma, f16c, the AVX-512 sub-features) only pull that
	/// level in when being enabled; disabling them leaves the ladder alone.
	///
	/// \param Features feature-name -> enabled map that is updated in place.
	/// \param Name     feature name without a leading '+' or '-'.
	/// \param Enabled  true to switch the feature on, false to switch it off.
	void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
	                                          StringRef Name, bool Enabled) {
	  // This is a bit of a hack to deal with the sse4 target feature when used
	  // as part of the target attribute. We handle sse4 correctly everywhere
	  // else. See below for more information on how we handle the sse4 options.
	  if (Name != "sse4")
	    Features[Name] = Enabled;

	  if (Name == "mmx") {
	    setMMXLevel(Features, MMX, Enabled);
	  } else if (Name == "sse") {
	    setSSELevel(Features, SSE1, Enabled);
	  } else if (Name == "sse2") {
	    setSSELevel(Features, SSE2, Enabled);
	  } else if (Name == "sse3") {
	    setSSELevel(Features, SSE3, Enabled);
	  } else if (Name == "ssse3") {
	    setSSELevel(Features, SSSE3, Enabled);
	  } else if (Name == "sse4.2") {
	    setSSELevel(Features, SSE42, Enabled);
	  } else if (Name == "sse4.1") {
	    setSSELevel(Features, SSE41, Enabled);
	  } else if (Name == "3dnow") {
	    setMMXLevel(Features, AMD3DNow, Enabled);
	  } else if (Name == "3dnowa") {
	    setMMXLevel(Features, AMD3DNowAthlon, Enabled);
	  } else if (Name == "aes") {
	    if (Enabled)
	      setSSELevel(Features, SSE2, Enabled);
	  } else if (Name == "pclmul") {
	    if (Enabled)
	      setSSELevel(Features, SSE2, Enabled);
	  } else if (Name == "avx") {
	    setSSELevel(Features, AVX, Enabled);
	  } else if (Name == "avx2") {
	    setSSELevel(Features, AVX2, Enabled);
	  } else if (Name == "avx512f") {
	    setSSELevel(Features, AVX512F, Enabled);
	  } else if (Name == "avx512cd" || Name == "avx512er" || Name == "avx512pf" ||
	             Name == "avx512dq" || Name == "avx512bw" || Name == "avx512vl" ||
	             Name == "avx512vbmi" || Name == "avx512ifma" ||
	             Name == "avx512vpopcntdq") {
	    // Any AVX-512 sub-feature needs the AVX-512 foundation when enabled.
	    if (Enabled)
	      setSSELevel(Features, AVX512F, Enabled);
	    // Enable BWI instruction if VBMI is being enabled.
	    if (Name == "avx512vbmi" && Enabled)
	      Features["avx512bw"] = true;
	    // Also disable VBMI if BWI is being disabled.
	    if (Name == "avx512bw" && !Enabled)
	      Features["avx512vbmi"] = false;
	  } else if (Name == "fma") {
	    if (Enabled)
	      setSSELevel(Features, AVX, Enabled);
	  } else if (Name == "fma4") {
	    setXOPLevel(Features, FMA4, Enabled);
	  } else if (Name == "xop") {
	    setXOPLevel(Features, XOP, Enabled);
	  } else if (Name == "sse4a") {
	    setXOPLevel(Features, SSE4A, Enabled);
	  } else if (Name == "f16c") {
	    if (Enabled)
	      setSSELevel(Features, AVX, Enabled);
	  } else if (Name == "sha") {
	    if (Enabled)
	      setSSELevel(Features, SSE2, Enabled);
	  } else if (Name == "sse4") {
	    // We can get here via the __target__ attribute since that's not controlled
	    // via the -msse4/-mno-sse4 command line alias. Handle this the same way
	    // here - turn on the sse4.2 if enabled, turn off the sse4.1 level if
	    // disabled.
	    if (Enabled)
	      setSSELevel(Features, SSE42, Enabled);
	    else
	      setSSELevel(Features, SSE41, Enabled);
	  } else if (Name == "xsave") {
	    // Turning xsave off takes xsaveopt with it.
	    if (!Enabled)
	      Features["xsaveopt"] = false;
	  } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") {
	    // Each xsave variant pulls in plain xsave when enabled.
	    if (Enabled)
	      Features["xsave"] = true;
	  }
	}

	/// handleTargetFeatures - Perform initialization based on the user
	/// configured set of features.
	///
	/// Walks \p Features, ignoring every entry that does not start with '+', and
	/// for each enabled feature either sets the matching Has* flag or raises
	/// SSELevel / MMX3DNowLevel / XOPLevel to the highest level seen. Afterwards
	/// it checks that the requested FPMath mode agrees with the resulting SSE
	/// level and derives SimdDefaultAlign from the AVX state.
	///
	/// \param Features list of "+name" / "-name" feature strings.
	/// \param Diags    receives err_target_unsupported_fpmath on a mismatch.
	/// \returns false if the fpmath mode is incompatible with the SSE level,
	///          true otherwise.
	bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
	                                         DiagnosticsEngine &Diags) {
	  for (const auto &Feature : Features) {
	    // Only enabled features contribute; "-name" entries are skipped.
	    if (Feature[0] != '+')
	      continue;

	    if (Feature == "+aes") {
	      HasAES = true;
	    } else if (Feature == "+pclmul") {
	      HasPCLMUL = true;
	    } else if (Feature == "+lzcnt") {
	      HasLZCNT = true;
	    } else if (Feature == "+rdrnd") {
	      HasRDRND = true;
	    } else if (Feature == "+fsgsbase") {
	      HasFSGSBASE = true;
	    } else if (Feature == "+bmi") {
	      HasBMI = true;
	    } else if (Feature == "+bmi2") {
	      HasBMI2 = true;
	    } else if (Feature == "+popcnt") {
	      HasPOPCNT = true;
	    } else if (Feature == "+rtm") {
	      HasRTM = true;
	    } else if (Feature == "+prfchw") {
	      HasPRFCHW = true;
	    } else if (Feature == "+rdseed") {
	      HasRDSEED = true;
	    } else if (Feature == "+adx") {
	      HasADX = true;
	    } else if (Feature == "+tbm") {
	      HasTBM = true;
	    } else if (Feature == "+lwp") {
	      HasLWP = true;
	    } else if (Feature == "+fma") {
	      HasFMA = true;
	    } else if (Feature == "+f16c") {
	      HasF16C = true;
	    } else if (Feature == "+avx512cd") {
	      HasAVX512CD = true;
	    } else if (Feature == "+avx512vpopcntdq") {
	      HasAVX512VPOPCNTDQ = true;
	    } else if (Feature == "+avx512er") {
	      HasAVX512ER = true;
	    } else if (Feature == "+avx512pf") {
	      HasAVX512PF = true;
	    } else if (Feature == "+avx512dq") {
	      HasAVX512DQ = true;
	    } else if (Feature == "+avx512bw") {
	      HasAVX512BW = true;
	    } else if (Feature == "+avx512vl") {
	      HasAVX512VL = true;
	    } else if (Feature == "+avx512vbmi") {
	      HasAVX512VBMI = true;
	    } else if (Feature == "+avx512ifma") {
	      HasAVX512IFMA = true;
	    } else if (Feature == "+sha") {
	      HasSHA = true;
	    } else if (Feature == "+mpx") {
	      HasMPX = true;
	    } else if (Feature == "+movbe") {
	      HasMOVBE = true;
	    } else if (Feature == "+sgx") {
	      HasSGX = true;
	    } else if (Feature == "+cx16") {
	      HasCX16 = true;
	    } else if (Feature == "+fxsr") {
	      HasFXSR = true;
	    } else if (Feature == "+xsave") {
	      HasXSAVE = true;
	    } else if (Feature == "+xsaveopt") {
	      HasXSAVEOPT = true;
	    } else if (Feature == "+xsavec") {
	      HasXSAVEC = true;
	    } else if (Feature == "+xsaves") {
	      HasXSAVES = true;
	    } else if (Feature == "+mwaitx") {
	      HasMWAITX = true;
	    } else if (Feature == "+pku") {
	      HasPKU = true;
	    } else if (Feature == "+clflushopt") {
	      HasCLFLUSHOPT = true;
	    } else if (Feature == "+clwb") {
	      HasCLWB = true;
	    } else if (Feature == "+prefetchwt1") {
	      HasPREFETCHWT1 = true;
	    } else if (Feature == "+clzero") {
	      HasCLZERO = true;
	    }

	    // The ladder features are tracked as a single "highest level seen"
	    // value each; a feature that is not on a ladder maps to the lowest
	    // level and therefore never lowers the running maximum.
	    X86SSEEnum Level = llvm::StringSwitch<X86SSEEnum>(Feature)
	                           .Case("+avx512f", AVX512F)
	                           .Case("+avx2", AVX2)
	                           .Case("+avx", AVX)
	                           .Case("+sse4.2", SSE42)
	                           .Case("+sse4.1", SSE41)
	                           .Case("+ssse3", SSSE3)
	                           .Case("+sse3", SSE3)
	                           .Case("+sse2", SSE2)
	                           .Case("+sse", SSE1)
	                           .Default(NoSSE);
	    SSELevel = std::max(SSELevel, Level);

	    MMX3DNowEnum ThreeDNowLevel =
	        llvm::StringSwitch<MMX3DNowEnum>(Feature)
	            .Case("+3dnowa", AMD3DNowAthlon)
	            .Case("+3dnow", AMD3DNow)
	            .Case("+mmx", MMX)
	            .Default(NoMMX3DNow);
	    MMX3DNowLevel = std::max(MMX3DNowLevel, ThreeDNowLevel);

	    XOPEnum XLevel = llvm::StringSwitch<XOPEnum>(Feature)
	                         .Case("+xop", XOP)
	                         .Case("+fma4", FMA4)
	                         .Case("+sse4a", SSE4A)
	                         .Default(NoXOP);
	    XOPLevel = std::max(XOPLevel, XLevel);
	  }

	  // LLVM doesn't have a separate switch for fpmath, so only accept it if it
	  // matches the selected sse level.
	  if ((FPMath == FP_SSE && SSELevel < SSE1) ||
	      (FPMath == FP_387 && SSELevel >= SSE1)) {
	    Diags.Report(diag::err_target_unsupported_fpmath) <<
	      (FPMath == FP_SSE ? "sse" : "387");
	    return false;
	  }

	  // Default SIMD alignment follows the widest vector unit that is enabled.
	  SimdDefaultAlign =
	      hasFeature("avx512f") ? 512 : hasFeature("avx") ? 256 : 128;
	  return true;
	}

	/// X86TargetInfo::getTargetDefines - Return the set of the X86-specific macro
	/// definitions for this particular subtarget.
	///
	/// Macros are added to \p Builder in four groups, in this order: target
	/// identification (x86_64 vs. i386), per-CPU macros selected by the CPU
	/// member, feature macros driven by the Has* flags and the XOP / SSE /
	/// MMX-3DNow! levels, and finally the __GCC_HAVE_SYNC_COMPARE_AND_SWAP_N and
	/// __SIZEOF_FLOAT128__ macros.
	///
	/// \param Opts    language options; used for DefineStd and for the
	///                MicrosoftExt-only _M_IX86_FP macro.
	/// \param Builder receives every macro definition.
	void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	// Target identification.
	if (getTriple().getArch() == llvm::Triple::x86_64) {
	Builder.defineMacro("__amd64__");
	Builder.defineMacro("__amd64");
	Builder.defineMacro("__x86_64");
	Builder.defineMacro("__x86_64__");
	// The x86_64h architecture name gets two extra macros on top of the
	// generic x86_64 ones.
	if (getTriple().getArchName() == "x86_64h") {
	Builder.defineMacro("__x86_64h");
	Builder.defineMacro("__x86_64h__");
	}
	} else {
	DefineStd(Builder, "i386", Opts);
	}

	// Subtarget options.
	// FIXME: We are hard-coding the tune parameters based on the CPU, but they
	// truly should be based on -mtune options.
	// NOTE(review): defineCPUMacros is defined outside this chunk; it
	// presumably emits the CPU-name and tune macros for the given name --
	// confirm against its definition.
	switch (CPU) {
	case CK_Generic:
	break;
	case CK_i386:
	// The rest are coming from the i386 define above.
	Builder.defineMacro("__tune_i386__");
	break;
	case CK_i486:
	case CK_WinChipC6:
	case CK_WinChip2:
	case CK_C3:
	defineCPUMacros(Builder, "i486");
	break;
	case CK_PentiumMMX:
	Builder.defineMacro("__pentium_mmx__");
	Builder.defineMacro("__tune_pentium_mmx__");
	LLVM_FALLTHROUGH;
	case CK_i586:
	case CK_Pentium:
	defineCPUMacros(Builder, "i586");
	defineCPUMacros(Builder, "pentium");
	break;
	// The Pentium 3 -> Pentium 2 -> Pentium Pro -> i686 cases below form one
	// fallthrough chain: newer CPUs accumulate the tune macros of older ones.
	case CK_Pentium3:
	case CK_Pentium3M:
	case CK_PentiumM:
	Builder.defineMacro("__tune_pentium3__");
	LLVM_FALLTHROUGH;
	case CK_Pentium2:
	case CK_C3_2:
	Builder.defineMacro("__tune_pentium2__");
	LLVM_FALLTHROUGH;
	case CK_PentiumPro:
	Builder.defineMacro("__tune_i686__");
	Builder.defineMacro("__tune_pentiumpro__");
	LLVM_FALLTHROUGH;
	case CK_i686:
	Builder.defineMacro("__i686");
	Builder.defineMacro("__i686__");
	// Strangely, __tune_i686__ isn't defined by GCC when CPU == i686.
	Builder.defineMacro("__pentiumpro");
	Builder.defineMacro("__pentiumpro__");
	break;
	case CK_Pentium4:
	case CK_Pentium4M:
	defineCPUMacros(Builder, "pentium4");
	break;
	case CK_Yonah:
	case CK_Prescott:
	case CK_Nocona:
	defineCPUMacros(Builder, "nocona");
	break;
	case CK_Core2:
	case CK_Penryn:
	defineCPUMacros(Builder, "core2");
	break;
	case CK_Bonnell:
	defineCPUMacros(Builder, "atom");
	break;
	case CK_Silvermont:
	defineCPUMacros(Builder, "slm");
	break;
	case CK_Goldmont:
	defineCPUMacros(Builder, "goldmont");
	break;
	case CK_Nehalem:
	case CK_Westmere:
	case CK_SandyBridge:
	case CK_IvyBridge:
	case CK_Haswell:
	case CK_Broadwell:
	case CK_SkylakeClient:
	// FIXME: Historically, we defined this legacy name, it would be nice to
	// remove it at some point. We've never exposed fine-grained names for
	// recent primary x86 CPUs, and we should keep it that way.
	defineCPUMacros(Builder, "corei7");
	break;
	case CK_SkylakeServer:
	defineCPUMacros(Builder, "skx");
	break;
	// No CPU-specific macros are defined for Cannonlake.
	case CK_Cannonlake:
	break;
	case CK_KNL:
	defineCPUMacros(Builder, "knl");
	break;
	case CK_Lakemont:
	Builder.defineMacro("__tune_lakemont__");
	break;
	case CK_K6_2:
	Builder.defineMacro("__k6_2__");
	Builder.defineMacro("__tune_k6_2__");
	LLVM_FALLTHROUGH;
	case CK_K6_3:
	if (CPU != CK_K6_2) { // In case of fallthrough
	// FIXME: GCC may be enabling these in cases where some other k6
	// architecture is specified but -m3dnow is explicitly provided. The
	// exact semantics need to be determined and emulated here.
	Builder.defineMacro("__k6_3__");
	Builder.defineMacro("__tune_k6_3__");
	}
	LLVM_FALLTHROUGH;
	case CK_K6:
	defineCPUMacros(Builder, "k6");
	break;
	case CK_Athlon:
	case CK_AthlonThunderbird:
	case CK_Athlon4:
	case CK_AthlonXP:
	case CK_AthlonMP:
	defineCPUMacros(Builder, "athlon");
	// The SSE-flavoured Athlon macros depend on the selected SSE level, not
	// on which Athlon variant was named.
	if (SSELevel != NoSSE) {
	Builder.defineMacro("__athlon_sse__");
	Builder.defineMacro("__tune_athlon_sse__");
	}
	break;
	case CK_K8:
	case CK_K8SSE3:
	case CK_x86_64:
	case CK_Opteron:
	case CK_OpteronSSE3:
	case CK_Athlon64:
	case CK_Athlon64SSE3:
	case CK_AthlonFX:
	defineCPUMacros(Builder, "k8");
	break;
	case CK_AMDFAM10:
	defineCPUMacros(Builder, "amdfam10");
	break;
	case CK_BTVER1:
	defineCPUMacros(Builder, "btver1");
	break;
	case CK_BTVER2:
	defineCPUMacros(Builder, "btver2");
	break;
	case CK_BDVER1:
	defineCPUMacros(Builder, "bdver1");
	break;
	case CK_BDVER2:
	defineCPUMacros(Builder, "bdver2");
	break;
	case CK_BDVER3:
	defineCPUMacros(Builder, "bdver3");
	break;
	case CK_BDVER4:
	defineCPUMacros(Builder, "bdver4");
	break;
	case CK_ZNVER1:
	defineCPUMacros(Builder, "znver1");
	break;
	case CK_Geode:
	defineCPUMacros(Builder, "geode");
	break;
	}

	// Target properties.
	Builder.defineMacro("__REGISTER_PREFIX__", "");

	// Define __NO_MATH_INLINES on linux/x86 so that we don't get inline
	// functions in glibc header files that use FP Stack inline asm which the
	// backend can't deal with (PR879).
	Builder.defineMacro("__NO_MATH_INLINES");

	// One macro per individually tracked feature flag.
	if (HasAES)
	Builder.defineMacro("__AES__");

	if (HasPCLMUL)
	Builder.defineMacro("__PCLMUL__");

	if (HasLZCNT)
	Builder.defineMacro("__LZCNT__");

	if (HasRDRND)
	Builder.defineMacro("__RDRND__");

	if (HasFSGSBASE)
	Builder.defineMacro("__FSGSBASE__");

	if (HasBMI)
	Builder.defineMacro("__BMI__");

	if (HasBMI2)
	Builder.defineMacro("__BMI2__");

	if (HasPOPCNT)
	Builder.defineMacro("__POPCNT__");

	if (HasRTM)
	Builder.defineMacro("__RTM__");

	if (HasPRFCHW)
	Builder.defineMacro("__PRFCHW__");

	if (HasRDSEED)
	Builder.defineMacro("__RDSEED__");

	if (HasADX)
	Builder.defineMacro("__ADX__");

	if (HasTBM)
	Builder.defineMacro("__TBM__");

	if (HasLWP)
	Builder.defineMacro("__LWP__");

	if (HasMWAITX)
	Builder.defineMacro("__MWAITX__");

	// Each XOP level also defines the macros of the levels below it.
	switch (XOPLevel) {
	case XOP:
	Builder.defineMacro("__XOP__");
	LLVM_FALLTHROUGH;
	case FMA4:
	Builder.defineMacro("__FMA4__");
	LLVM_FALLTHROUGH;
	case SSE4A:
	Builder.defineMacro("__SSE4A__");
	LLVM_FALLTHROUGH;
	case NoXOP:
	break;
	}

	if (HasFMA)
	Builder.defineMacro("__FMA__");

	if (HasF16C)
	Builder.defineMacro("__F16C__");

	if (HasAVX512CD)
	Builder.defineMacro("__AVX512CD__");
	if (HasAVX512VPOPCNTDQ)
	Builder.defineMacro("__AVX512VPOPCNTDQ__");
	if (HasAVX512ER)
	Builder.defineMacro("__AVX512ER__");
	if (HasAVX512PF)
	Builder.defineMacro("__AVX512PF__");
	if (HasAVX512DQ)
	Builder.defineMacro("__AVX512DQ__");
	if (HasAVX512BW)
	Builder.defineMacro("__AVX512BW__");
	if (HasAVX512VL)
	Builder.defineMacro("__AVX512VL__");
	if (HasAVX512VBMI)
	Builder.defineMacro("__AVX512VBMI__");
	if (HasAVX512IFMA)
	Builder.defineMacro("__AVX512IFMA__");

	if (HasSHA)
	Builder.defineMacro("__SHA__");

	if (HasFXSR)
	Builder.defineMacro("__FXSR__");
	if (HasXSAVE)
	Builder.defineMacro("__XSAVE__");
	if (HasXSAVEOPT)
	Builder.defineMacro("__XSAVEOPT__");
	if (HasXSAVEC)
	Builder.defineMacro("__XSAVEC__");
	if (HasXSAVES)
	Builder.defineMacro("__XSAVES__");
	if (HasPKU)
	Builder.defineMacro("__PKU__");
	// cx16 is surfaced as the 16-byte compare-and-swap macro rather than as
	// a __CX16__-style feature macro.
	if (HasCX16)
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_16");
	if (HasCLFLUSHOPT)
	Builder.defineMacro("__CLFLUSHOPT__");
	if (HasCLWB)
	Builder.defineMacro("__CLWB__");
	if (HasMPX)
	Builder.defineMacro("__MPX__");
	if (HasSGX)
	Builder.defineMacro("__SGX__");
	if (HasPREFETCHWT1)
	Builder.defineMacro("__PREFETCHWT1__");
	if (HasCLZERO)
	Builder.defineMacro("__CLZERO__");

	// Each case falls through to the previous one here.
	switch (SSELevel) {
	case AVX512F:
	Builder.defineMacro("__AVX512F__");
	LLVM_FALLTHROUGH;
	case AVX2:
	Builder.defineMacro("__AVX2__");
	LLVM_FALLTHROUGH;
	case AVX:
	Builder.defineMacro("__AVX__");
	LLVM_FALLTHROUGH;
	case SSE42:
	Builder.defineMacro("__SSE4_2__");
	LLVM_FALLTHROUGH;
	case SSE41:
	Builder.defineMacro("__SSE4_1__");
	LLVM_FALLTHROUGH;
	case SSSE3:
	Builder.defineMacro("__SSSE3__");
	LLVM_FALLTHROUGH;
	case SSE3:
	Builder.defineMacro("__SSE3__");
	LLVM_FALLTHROUGH;
	case SSE2:
	Builder.defineMacro("__SSE2__");
	Builder.defineMacro("__SSE2_MATH__"); // -mfp-math=sse always implied.
	LLVM_FALLTHROUGH;
	case SSE1:
	Builder.defineMacro("__SSE__");
	Builder.defineMacro("__SSE_MATH__"); // -mfp-math=sse always implied.
	LLVM_FALLTHROUGH;
	case NoSSE:
	break;
	}

	// With Microsoft extensions on 32-bit x86, also define _M_IX86_FP:
	// 2 for SSE2 and above, 1 for SSE only, 0 otherwise.
	if (Opts.MicrosoftExt && getTriple().getArch() == llvm::Triple::x86) {
	switch (SSELevel) {
	case AVX512F:
	case AVX2:
	case AVX:
	case SSE42:
	case SSE41:
	case SSSE3:
	case SSE3:
	case SSE2:
	Builder.defineMacro("_M_IX86_FP", Twine(2));
	break;
	case SSE1:
	Builder.defineMacro("_M_IX86_FP", Twine(1));
	break;
	default:
	Builder.defineMacro("_M_IX86_FP", Twine(0));
	break;
	}
	}

	// Each case falls through to the previous one here.
	switch (MMX3DNowLevel) {
	case AMD3DNowAthlon:
	Builder.defineMacro("__3dNOW_A__");
	LLVM_FALLTHROUGH;
	case AMD3DNow:
	Builder.defineMacro("__3dNOW__");
	LLVM_FALLTHROUGH;
	case MMX:
	Builder.defineMacro("__MMX__");
	LLVM_FALLTHROUGH;
	case NoMMX3DNow:
	break;
	}

	// NOTE(review): these comparisons depend on the declaration order of the
	// CPU kind enumerators, which is not visible in this chunk -- confirm
	// before reordering that enum.
	if (CPU >= CK_i486) {
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	}
	if (CPU >= CK_i586)
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");

	if (HasFloat128)
	Builder.defineMacro("__SIZEOF_FLOAT128__", "16");
	}

	/// Report whether the named feature is available on the configured target.
	///
	/// Individually tracked features answer from their Has* flag; ladder
	/// features (SSE/AVX, MMX/3DNow!, SSE4A/FMA4/XOP) answer by comparing the
	/// recorded level against the level the name stands for. "x86" is always
	/// true and "x86_32"/"x86_64" reflect the triple's architecture. Unknown
	/// names yield false.
	///
	/// "mwaitx" is answered from HasMWAITX so that this query agrees with
	/// handleTargetFeatures (which records the flag) and getTargetDefines
	/// (which emits __MWAITX__ from it).
	///
	/// \param Feature feature name without a leading '+' or '-'.
	bool X86TargetInfo::hasFeature(StringRef Feature) const {
	  return llvm::StringSwitch<bool>(Feature)
	      .Case("aes", HasAES)
	      .Case("avx", SSELevel >= AVX)
	      .Case("avx2", SSELevel >= AVX2)
	      .Case("avx512f", SSELevel >= AVX512F)
	      .Case("avx512cd", HasAVX512CD)
	      .Case("avx512vpopcntdq", HasAVX512VPOPCNTDQ)
	      .Case("avx512er", HasAVX512ER)
	      .Case("avx512pf", HasAVX512PF)
	      .Case("avx512dq", HasAVX512DQ)
	      .Case("avx512bw", HasAVX512BW)
	      .Case("avx512vl", HasAVX512VL)
	      .Case("avx512vbmi", HasAVX512VBMI)
	      .Case("avx512ifma", HasAVX512IFMA)
	      .Case("bmi", HasBMI)
	      .Case("bmi2", HasBMI2)
	      .Case("clflushopt", HasCLFLUSHOPT)
	      .Case("clwb", HasCLWB)
	      .Case("clzero", HasCLZERO)
	      .Case("cx16", HasCX16)
	      .Case("f16c", HasF16C)
	      .Case("fma", HasFMA)
	      .Case("fma4", XOPLevel >= FMA4)
	      .Case("fsgsbase", HasFSGSBASE)
	      .Case("fxsr", HasFXSR)
	      .Case("lzcnt", HasLZCNT)
	      // The 3DNow! levels are queried under an "mm" prefixed spelling.
	      .Case("mm3dnow", MMX3DNowLevel >= AMD3DNow)
	      .Case("mm3dnowa", MMX3DNowLevel >= AMD3DNowAthlon)
	      .Case("mmx", MMX3DNowLevel >= MMX)
	      .Case("movbe", HasMOVBE)
	      .Case("mpx", HasMPX)
	      .Case("mwaitx", HasMWAITX)
	      .Case("pclmul", HasPCLMUL)
	      .Case("pku", HasPKU)
	      .Case("popcnt", HasPOPCNT)
	      .Case("prefetchwt1", HasPREFETCHWT1)
	      .Case("prfchw", HasPRFCHW)
	      .Case("rdrnd", HasRDRND)
	      .Case("rdseed", HasRDSEED)
	      .Case("rtm", HasRTM)
	      .Case("sgx", HasSGX)
	      .Case("sha", HasSHA)
	      .Case("sse", SSELevel >= SSE1)
	      .Case("sse2", SSELevel >= SSE2)
	      .Case("sse3", SSELevel >= SSE3)
	      .Case("ssse3", SSELevel >= SSSE3)
	      .Case("sse4.1", SSELevel >= SSE41)
	      .Case("sse4.2", SSELevel >= SSE42)
	      .Case("sse4a", XOPLevel >= SSE4A)
	      .Case("tbm", HasTBM)
	      .Case("lwp", HasLWP)
	      .Case("x86", true)
	      .Case("x86_32", getTriple().getArch() == llvm::Triple::x86)
	      .Case("x86_64", getTriple().getArch() == llvm::Triple::x86_64)
	      .Case("xop", XOPLevel >= XOP)
	      .Case("xsave", HasXSAVE)
	      .Case("xsavec", HasXSAVEC)
	      .Case("xsaves", HasXSAVES)
	      .Case("xsaveopt", HasXSAVEOPT)
	      .Default(false);
	}

	// The set of names accepted here is deliberately narrower than the set of
	// subtarget features accepted in the target attribute: the bitfield
	// structure initialized by the runtime only covers the names below, so a
	// generic validation scheme cannot be used. (See X86TargetInfo::hasFeature
	// for a somewhat comprehensive list of subtarget features.)
	bool X86TargetInfo::validateCpuSupports(StringRef FeatureStr) const {
	  static const char *const AcceptedNames[] = {
	      "cmov",     "mmx",      "popcnt",   "sse",
	      "sse2",     "sse3",     "ssse3",    "sse4.1",
	      "sse4.2",   "avx",      "avx2",     "sse4a",
	      "fma4",     "xop",      "fma",      "avx512f",
	      "bmi",      "bmi2",     "aes",      "pclmul",
	      "avx512vl", "avx512bw", "avx512dq", "avx512cd",
	      "avx512vpopcntdq",      "avx512er", "avx512pf",
	      "avx512vbmi",           "avx512ifma"};

	  // Exact, case-sensitive match against the table; anything else is
	  // rejected.
	  for (const char *Accepted : AcceptedNames)
	    if (FeatureStr == Accepted)
	      return true;
	  return false;
	}

	/// Validate one x86 inline-asm constraint and record its requirements.
	///
	/// Dispatches on the character \p Name points at. Immediate constraints
	/// record the accepted range (or value list) in \p Info, register
	/// constraints mark \p Info as allowing a register, and the floating point
	/// constant constraints 'C' and 'G' are accepted without touching \p Info.
	///
	/// \param Name points at the constraint letter. For the two-character 'Y'
	///             constraints the pointer is advanced to the second character,
	///             and the caller observes that because it is passed by
	///             reference.
	/// \param Info constraint description that is updated on success.
	/// \returns true if the constraint is recognized, false otherwise.
	bool
	X86TargetInfo::validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const {
	switch (*Name) {
	default: return false;
	// Constant constraints.
	case 'e': // 32-bit signed integer constant for use with sign-extending x86_64
	// instructions.
	case 'Z': // 32-bit unsigned integer constant for use with zero-extending
	// x86_64 instructions.
	case 's':
	// No range is recorded for these three; only "must be an immediate".
	Info.setRequiresImmediate();
	return true;
	case 'I':
	Info.setRequiresImmediate(0, 31);
	return true;
	case 'J':
	Info.setRequiresImmediate(0, 63);
	return true;
	case 'K':
	Info.setRequiresImmediate(-128, 127);
	return true;
	case 'L':
	// Exactly one of the three listed mask values, not a range.
	Info.setRequiresImmediate({ int(0xff), int(0xffff), int(0xffffffff) });
	return true;
	case 'M':
	Info.setRequiresImmediate(0, 3);
	return true;
	case 'N':
	Info.setRequiresImmediate(0, 255);
	return true;
	case 'O':
	Info.setRequiresImmediate(0, 127);
	return true;
	// Register constraints.
	case 'Y': // 'Y' is the first character for several 2-character constraints.
	// Shift the pointer to the second character of the constraint.
	Name++;
	// Every path of the inner switch returns, so control never falls into
	// the 'f' case below.
	switch (*Name) {
	default:
	return false;
	case '0': // First SSE register.
	case 't': // Any SSE register, when SSE2 is enabled.
	case 'i': // Any SSE register, when SSE2 and inter-unit moves enabled.
	case 'm': // Any MMX register, when inter-unit moves enabled.
	case 'k': // AVX512 arch mask registers: k1-k7.
	Info.setAllowsRegister();
	return true;
	}
	case 'f': // Any x87 floating point stack register.
	// Constraint 'f' cannot be used for output operands.
	if (Info.ConstraintStr[0] == '=')
	return false;
	Info.setAllowsRegister();
	return true;
	case 'a': // eax.
	case 'b': // ebx.
	case 'c': // ecx.
	case 'd': // edx.
	case 'S': // esi.
	case 'D': // edi.
	case 'A': // edx:eax.
	case 't': // Top of floating point stack.
	case 'u': // Second from top of floating point stack.
	case 'q': // Any register accessible as [r]l: a, b, c, and d.
	case 'y': // Any MMX register.
	case 'v': // Any {X,Y,Z}MM register (Arch & context dependent)
	case 'x': // Any SSE register.
	case 'k': // Any AVX512 mask register (same as Yk, additionally allows k0
	// for intermediate k reg operations).
	case 'Q': // Any register accessible as [r]h: a, b, c, and d.
	case 'R': // "Legacy" registers: ax, bx, cx, dx, di, si, sp, bp.
	case 'l': // "Index" registers: any general register that can be used as an
	// index in a base+index memory access.
	Info.setAllowsRegister();
	return true;
	// Floating point constant constraints.
	case 'C': // SSE floating point constant.
	case 'G': // x87 floating point constant.
	return true;
	}
	}

	/// Check that an output operand of \p Size bits fits the register class
	/// named by \p Constraint.
	///
	/// Leading output modifiers ('=', '+', '&') are removed first so that the
	/// size check sees the bare constraint letter.
	///
	/// \param Constraint constraint string, possibly prefixed with modifiers.
	/// \param Size       operand size in bits.
	/// \returns the verdict of validateOperandSize on the stripped constraint.
	bool X86TargetInfo::validateOutputSize(StringRef Constraint,
	                                       unsigned Size) const {
	  // Strip off constraint modifiers.
	  while (Constraint[0] == '=' ||
	         Constraint[0] == '+' ||
	         Constraint[0] == '&')
	    Constraint = Constraint.substr(1);

	  return validateOperandSize(Constraint, Size);
	}

	/// Check that an input operand of \p Size bits fits the register class named
	/// by \p Constraint. The constraint is handed to validateOperandSize as is.
	bool X86TargetInfo::validateInputSize(StringRef Constraint,
	                                      unsigned Size) const {
	  const bool FitsRegister = validateOperandSize(Constraint, Size);
	  return FitsRegister;
	}

	/// Decide whether an operand of \p Size bits fits the register class that
	/// \p Constraint names.
	///
	/// Mask and MMX registers hold up to 64 bits, x87 stack registers up to 128
	/// bits, and vector registers up to 128, 256 or 512 bits depending on the
	/// current SSE level. Constraints not listed here are not size-checked and
	/// yield true.
	///
	/// \param Constraint constraint string with modifiers already stripped.
	/// \param Size       operand size in bits.
	bool X86TargetInfo::validateOperandSize(StringRef Constraint,
	                                        unsigned Size) const {
	  // Width limit for 'v'/'x' style vector registers at the current level:
	  // zmm with AVX512F, ymm with AVX, xmm otherwise.
	  auto FitsVectorRegister = [this, Size]() {
	    if (SSELevel >= AVX512F)
	      return Size <= 512U;
	    if (SSELevel >= AVX)
	      return Size <= 256U;
	    return Size <= 128U;
	  };

	  const char Letter = Constraint[0];

	  // Registers k0-k7 (AVX512) and MMX registers are limited to 64 bits.
	  if (Letter == 'k' || Letter == 'y')
	    return Size <= 64;

	  if (Letter == 'f' || Letter == 't' || Letter == 'u')
	    return Size <= 128;

	  if (Letter == 'v' || Letter == 'x')
	    return FitsVectorRegister();

	  if (Letter == 'Y') {
	    // 'Y' is the first character for several 2-character constraints.
	    const char Second = Constraint[1];

	    // 'Ym' is synonymous with 'y'; 'Yk' shares the same 64-bit limit.
	    if (Second == 'm' || Second == 'k')
	      return Size <= 64;

	    // 'Yi' and 'Yt' are synonymous with 'x' when SSE2 is enabled.
	    if (Second == 'i' || Second == 't') {
	      if (SSELevel >= AVX)
	        return FitsVectorRegister();
	      return SSELevel >= SSE2 && Size <= 128U;
	    }
	  }

	  return true;
	}

	/// Translate one constraint letter into the string handed on to LLVM.
	///
	/// Single-register letters become an explicit "{reg}" name, 'p' becomes
	/// "im", and the two-character "Yk" constraint is emitted as "^Yk" with
	/// \p Constraint advanced by one character. Every other constraint is
	/// returned unchanged as a one-character string.
	///
	/// \param Constraint points at the constraint letter; passed by reference
	///                   because the "Yk" case advances it.
	std::string
	X86TargetInfo::convertConstraint(const char *&Constraint) const {
	  switch (*Constraint) {
	  case 'a':
	    return "{ax}";
	  case 'b':
	    return "{bx}";
	  case 'c':
	    return "{cx}";
	  case 'd':
	    return "{dx}";
	  case 'S':
	    return "{si}";
	  case 'D':
	    return "{di}";
	  case 'p': // address
	    return "im";
	  case 't': // top of floating point stack.
	    return "{st}";
	  case 'u': // second from top of floating point stack.
	    return "{st(1)}";
	  case 'Y':
	    if (Constraint[1] == 'k') {
	      // "^" hints llvm that this is a 2 letter constraint. The pointer is
	      // moved on by one so the caller resumes after the second letter.
	      const char *TwoLetter = Constraint;
	      ++Constraint;
	      return std::string("^") + std::string(TwoLetter, 2);
	    }
	    // Any other 'Y' constraint is copied one character at a time below.
	    break;
	  default:
	    break;
	  }

	  return std::string(1, *Constraint);
	}

	// X86-32 generic target
	//
	// Configures the 32-bit type layout (32-bit aligned double and long long,
	// 96-bit long double, 32-bit size/pointer-difference types), the ELF data
	// layout string, and tightens the inline-asm operand size checks for the
	// general purpose register constraints.
	class X86_32TargetInfo : public X86TargetInfo {
	public:
	  X86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86TargetInfo(Triple, Opts) {
	    DoubleAlign = LongLongAlign = 32;
	    LongDoubleWidth = 96;
	    LongDoubleAlign = 32;
	    SuitableAlign = 128;
	    resetDataLayout("e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128");
	    SizeType = UnsignedInt;
	    PtrDiffType = SignedInt;
	    IntPtrType = SignedInt;
	    RegParmMax = 3;

	    // Use fpret for all types.
	    RealTypeUsesObjCFPRet = ((1 << TargetInfo::Float) |
	                             (1 << TargetInfo::Double) |
	                             (1 << TargetInfo::LongDouble));

	    // x86-32 has atomics up to 8 bytes
	    // FIXME: Check that we actually have cmpxchg8b before setting
	    // MaxAtomicInlineWidth. (cmpxchg8b is an i586 instruction.)
	    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }

	  // Maps exception-handling data register 0 to register number 0 and
	  // register 1 to register number 2; any other index has no mapping (-1).
	  int getEHDataRegisterNumber(unsigned RegNo) const override {
	    if (RegNo == 0) return 0;
	    if (RegNo == 1) return 2;
	    return -1;
	  }

	  // On x86-32 the general purpose register constraints hold at most 32
	  // bits and the 'A' register pair at most 64; everything else is deferred
	  // to the common X86 check.
	  bool validateOperandSize(StringRef Constraint,
	                           unsigned Size) const override {
	    switch (Constraint[0]) {
	    default: break;
	    case 'R':
	    case 'q':
	    case 'Q':
	    case 'a':
	    case 'b':
	    case 'c':
	    case 'd':
	    case 'S':
	    case 'D':
	      return Size <= 32;
	    case 'A':
	      return Size <= 64;
	    }

	    return X86TargetInfo::validateOperandSize(Constraint, Size);
	  }

	  // Exposes the builtin table up to and including the last builtin shared
	  // by the x86 targets.
	  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	    return llvm::makeArrayRef(BuiltinInfoX86, clang::X86::LastX86CommonBuiltin -
	                                                  Builtin::FirstTSBuiltin + 1);
	  }
	};

	// NetBSD on 32-bit x86. Only the floating point evaluation method differs
	// from the generic target, and only for older releases.
	class NetBSDI386TargetInfo : public NetBSDTargetInfo<X86_32TargetInfo> {
	public:
	  NetBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : NetBSDTargetInfo<X86_32TargetInfo>(Triple, Opts) {}

	  // Returns the inherited evaluation method for NetBSD 7 and later, for
	  // 6.99.26 and later within the 6.99 series, and when the triple reports
	  // a major version of 0; every other version gets method 1.
	  unsigned getFloatEvalMethod() const override {
	    unsigned Major, Minor, Micro;
	    getTriple().getOSVersion(Major, Minor, Micro);
	    // New NetBSD uses the default rounding mode.
	    if (Major >= 7 || (Major == 6 && Minor == 99 && Micro >= 26) || Major == 0)
	      return X86_32TargetInfo::getFloatEvalMethod();
	    // NetBSD before 6.99.26 defaults to "double" rounding.
	    return 1;
	  }
	};

	// OpenBSD on 32-bit x86: identical to the generic target except that the
	// size, pointer-difference and pointer-sized integer types are 'long'
	// based.
	class OpenBSDI386TargetInfo : public OpenBSDTargetInfo<X86_32TargetInfo> {
	  using Base = OpenBSDTargetInfo<X86_32TargetInfo>;

	public:
	  OpenBSDI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : Base(Triple, Opts) {
	    // Unsigned flavour for sizes, signed flavour for the other two.
	    SizeType = UnsignedLong;
	    IntPtrType = SignedLong;
	    PtrDiffType = SignedLong;
	  }
	};

	// Bitrig on 32-bit x86: identical to the generic target except that the
	// size, pointer-difference and pointer-sized integer types are 'long'
	// based.
	class BitrigI386TargetInfo : public BitrigTargetInfo<X86_32TargetInfo> {
	  using Base = BitrigTargetInfo<X86_32TargetInfo>;

	public:
	  BitrigI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : Base(Triple, Opts) {
	    // Unsigned flavour for sizes, signed flavour for the other two.
	    SizeType = UnsignedLong;
	    IntPtrType = SignedLong;
	    PtrDiffType = SignedLong;
	  }
	};

	// Darwin on 32-bit x86: 128-bit long double, 'long' based size and
	// pointer-sized integer types, the Mach-O data layout, and a maximum vector
	// alignment that is refined once the feature set is known.
	class DarwinI386TargetInfo : public DarwinTargetInfo<X86_32TargetInfo> {
	  using Base = DarwinTargetInfo<X86_32TargetInfo>;

	public:
	  DarwinI386TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : Base(Triple, Opts) {
	    LongDoubleWidth = 128;
	    LongDoubleAlign = 128;
	    SuitableAlign = 128;
	    // Provisional value; handleTargetFeatures() recomputes it.
	    MaxVectorAlign = 256;
	    // The watchOS simulator uses the builtin bool type for Objective-C.
	    if (Triple.isWatchOS())
	      UseSignedCharForObjCBool = false;
	    SizeType = UnsignedLong;
	    IntPtrType = SignedLong;
	    resetDataLayout("e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128");
	    HasAlignMac68kSupport = true;
	  }

	  bool handleTargetFeatures(std::vector<std::string> &Features,
	                            DiagnosticsEngine &Diags) override {
	    const bool BaseAccepted = Base::handleTargetFeatures(Features, Diags);
	    if (!BaseAccepted)
	      return false;

	    // We now know the features we have: we can decide how to align vectors.
	    if (hasFeature("avx512f"))
	      MaxVectorAlign = 512;
	    else if (hasFeature("avx"))
	      MaxVectorAlign = 256;
	    else
	      MaxVectorAlign = 128;
	    return true;
	  }
	};

	// x86-32 Windows target
	//
	// Uses a 16-bit unsigned wchar_t, 64-bit aligned double and long long, and
	// a data layout whose mangling mode depends on whether the object format is
	// Windows COFF.
	class WindowsX86_32TargetInfo : public WindowsTargetInfo<X86_32TargetInfo> {
	  using Base = WindowsTargetInfo<X86_32TargetInfo>;

	public:
	  WindowsX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : Base(Triple, Opts) {
	    WCharType = UnsignedShort;
	    DoubleAlign = LongLongAlign = 64;

	    // The two layouts differ only in the mangling component ("m:x" vs.
	    // "m:e").
	    const bool IsWinCOFF =
	        getTriple().isOSWindows() && getTriple().isOSBinFormatCOFF();
	    if (IsWinCOFF)
	      resetDataLayout("e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
	    else
	      resetDataLayout("e-m:e-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
	  }

	  // Adds nothing of its own; forwards to the Windows base implementation.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Base::getTargetDefines(Opts, Builder);
	  }
	};

	// x86-32 Windows Visual Studio target
	//
	// Long double is a 64-bit IEEE double here, and the Visual Studio macro set
	// plus _M_IX86 are defined on top of the generic Windows macros.
	class MicrosoftX86_32TargetInfo : public WindowsX86_32TargetInfo {
	public:
	  MicrosoftX86_32TargetInfo(const llvm::Triple &Triple,
	                            const TargetOptions &Opts)
	      : WindowsX86_32TargetInfo(Triple, Opts) {
	    LongDoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    // Generic Windows macros first, then the Visual Studio specific ones.
	    WindowsX86_32TargetInfo::getTargetDefines(Opts, Builder);
	    WindowsX86_32TargetInfo::getVisualStudioDefines(Opts, Builder);

	    // _M_IX86 reflects the processor type:
	    //   300=386, 400=486, 500=Pentium, 600=Blend (default)
	    // We lost the original triple, so we use the default.
	    Builder.defineMacro("_M_IX86", "600");
	  }
	};

	static void addCygMingDefines(const LangOptions &Opts, MacroBuilder &Builder) {
	// Mingw and cygwin define __declspec(a) to __attribute__((a)). Clang
	// supports __declspec natively under -fms-extensions, but we define a no-op
	// __declspec macro anyway for pre-processor compatibility.
	if (Opts.MicrosoftExt)
	Builder.defineMacro("__declspec", "__declspec");
	else
	Builder.defineMacro("__declspec(a)", "__attribute__((a))");

	if (!Opts.MicrosoftExt) {
	// Provide macros for all the calling convention keywords. Provide both
	// single and double underscore prefixed variants. These are available on
	// x64 as well as x86, even though they have no effect.
	const char *CCs[] = {"cdecl", "stdcall", "fastcall", "thiscall", "pascal"};
	for (const char *CC : CCs) {
	std::string GCCSpelling = "__attribute__((__";
	GCCSpelling += CC;
	GCCSpelling += "__))";
	Builder.defineMacro(Twine("_") + CC, GCCSpelling);
	Builder.defineMacro(Twine("__") + CC, GCCSpelling);
	}
	}
	}

	/// Adds the macros common to the 32- and 64-bit MinGW targets:
	/// __MSVCRT__, __MINGW32__, and the shared Cygwin/MinGW defines.
	static void addMinGWDefines(const LangOptions &Opts, MacroBuilder &Builder) {
	Builder.defineMacro("__MSVCRT__");
	Builder.defineMacro("__MINGW32__");
	addCygMingDefines(Opts, Builder);
	}

	// x86-32 MinGW target: Windows x86-32 settings plus __float128 support.
	class MinGWX86_32TargetInfo : public WindowsX86_32TargetInfo {
	public:
	  MinGWX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsX86_32TargetInfo(Triple, Opts) {
	    HasFloat128 = true;
	  }

	  /// Emits the Windows defines, then the WIN32/WINNT standard spellings,
	  /// _X86_, and finally the common MinGW macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WindowsX86_32TargetInfo::getTargetDefines(Opts, Builder);

	    DefineStd(Builder, "WIN32", Opts);
	    DefineStd(Builder, "WINNT", Opts);
	    Builder.defineMacro("_X86_");

	    addMinGWDefines(Opts, Builder);
	  }
	};

	// x86-32 Cygwin target. Derives from the plain x86-32 target, not from the
	// Windows one.
	class CygwinX86_32TargetInfo : public X86_32TargetInfo {
	public:
	  CygwinX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_32TargetInfo(Triple, Opts) {
	    WCharType = UnsignedShort;
	    LongLongAlign = 64;
	    DoubleAlign = 64;
	    resetDataLayout("e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32");
	  }

	  /// Emits the x86-32 defines followed by the Cygwin identification macros,
	  /// the shared Cygwin/MinGW macros and the "unix" standard spellings.
	  /// _GNU_SOURCE is added for C++ only.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_32TargetInfo::getTargetDefines(Opts, Builder);

	    Builder.defineMacro("_X86_");
	    Builder.defineMacro("__CYGWIN__");
	    Builder.defineMacro("__CYGWIN32__");
	    addCygMingDefines(Opts, Builder);
	    DefineStd(Builder, "unix", Opts);

	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	  }
	};

	// x86-32 Haiku target: the Haiku defaults plus the __INTEL__ macro.
	class HaikuX86_32TargetInfo : public HaikuTargetInfo<X86_32TargetInfo> {
	public:
	  HaikuX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : HaikuTargetInfo<X86_32TargetInfo>(Triple, Opts) {}

	  /// Emits the Haiku base defines and then __INTEL__.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    HaikuTargetInfo<X86_32TargetInfo>::getTargetDefines(Opts, Builder);
	    Builder.defineMacro("__INTEL__");
	  }
	};

	// X86-32 MCU target: long double is a 64-bit IEEE double and wint_t is
	// unsigned int.
	class MCUX86_32TargetInfo : public X86_32TargetInfo {
	public:
	  MCUX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_32TargetInfo(Triple, Opts) {
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	    WIntType = UnsignedInt;
	    resetDataLayout("e-m:e-p:32:32-i64:32-f64:32-f128:32-n8:16:32-a:0:32-S32");
	  }

	  /// Only the C calling convention is supported on MCU; every other
	  /// convention yields a warning result.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    if (CC == CC_C)
	      return CCCR_OK;
	    return CCCR_Warning;
	  }

	  /// Emits the x86-32 defines followed by __iamcu and __iamcu__.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_32TargetInfo::getTargetDefines(Opts, Builder);
	    Builder.defineMacro("__iamcu");
	    Builder.defineMacro("__iamcu__");
	  }

	  /// Always false for this target.
	  bool allowsLargerPreferedTypeAlignment() const override { return false; }
	};

	// RTEMS Target
	//
	// OS mix-in for RTEMS: wraps an architecture target `Target` and contributes
	// the RTEMS OS macros.
	template<typename Target>
	class RTEMSTargetInfo : public OSTargetInfo<Target> {
	protected:
	// Defines the OS-identification macros __rtems__ and __ELF__. The Opts and
	// Triple parameters are not consulted.
	void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	MacroBuilder &Builder) const override {
	// RTEMS defines; list based off of gcc output

	Builder.defineMacro("__rtems__");
	Builder.defineMacro("__ELF__");
	}

	public:
	// NOTE(review): every branch of the switch below contains only a
	// commented-out MCountName assignment followed by `break`, so the
	// constructor currently does nothing beyond constructing the base class.
	// The switch records which mcount symbol each architecture would use.
	RTEMSTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: OSTargetInfo<Target>(Triple, Opts) {
	switch (Triple.getArch()) {
	default:
	case llvm::Triple::x86:
	// this->MCountName = ".mcount";
	break;
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	case llvm::Triple::ppc:
	case llvm::Triple::ppc64:
	case llvm::Triple::ppc64le:
	// this->MCountName = "_mcount";
	break;
	case llvm::Triple::arm:
	// this->MCountName = "__mcount";
	break;
	}
	}
	};

	// x86-32 RTEMS target: size_t, intptr_t and ptrdiff_t are all based on long.
	class RTEMSX86_32TargetInfo : public X86_32TargetInfo {
	public:
	  RTEMSX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_32TargetInfo(Triple, Opts) {
	    SizeType = UnsignedLong;
	    PtrDiffType = SignedLong;
	    IntPtrType = SignedLong;
	  }

	  /// Emits the x86-32 defines followed by __INTEL__ and __rtems__.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_32TargetInfo::getTargetDefines(Opts, Builder);
	    Builder.defineMacro("__INTEL__");
	    Builder.defineMacro("__rtems__");
	  }
	};

	// x86-64 generic target. Also covers the x32 environment, where long and
	// pointers are 32 bits wide.
	class X86_64TargetInfo : public X86TargetInfo {
	public:
	  X86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86TargetInfo(Triple, Opts) {
	    const llvm::Triple &TT = getTriple();
	    const bool IsX32 = TT.getEnvironment() == llvm::Triple::GNUX32;
	    const bool IsWinCOFF = TT.isOSWindows() && TT.isOSBinFormatCOFF();

	    // long and pointers share one width: 32 bits on x32, 64 otherwise.
	    const unsigned WordBits = IsX32 ? 32 : 64;
	    PointerAlign = WordBits;
	    PointerWidth = WordBits;
	    LongAlign = WordBits;
	    LongWidth = WordBits;

	    LongDoubleWidth = 128;
	    LongDoubleAlign = 128;
	    LargeArrayMinWidth = 128;
	    LargeArrayAlign = 128;
	    SuitableAlign = 128;

	    if (IsX32) {
	      SizeType = UnsignedInt;
	      PtrDiffType = SignedInt;
	      IntPtrType = SignedInt;
	      IntMaxType = SignedLongLong;
	      Int64Type = SignedLongLong;
	    } else {
	      SizeType = UnsignedLong;
	      PtrDiffType = SignedLong;
	      IntPtrType = SignedLong;
	      IntMaxType = SignedLong;
	      Int64Type = SignedLong;
	    }
	    RegParmMax = 6;

	    // Pointers are 32-bit in x32. Otherwise only the mangling component
	    // depends on whether this is a Windows COFF target.
	    if (IsX32)
	      resetDataLayout("e-m:e-p:32:32-i64:64-f80:128-n8:16:32:64-S128");
	    else if (IsWinCOFF)
	      resetDataLayout("e-m:w-i64:64-f80:128-n8:16:32:64-S128");
	    else
	      resetDataLayout("e-m:e-i64:64-f80:128-n8:16:32:64-S128");

	    // Use fpret only for long double.
	    RealTypeUsesObjCFPRet = (1 << TargetInfo::LongDouble);

	    // Use fp2ret for _Complex long double.
	    ComplexLongDoubleUsesFP2Ret = true;

	    // Make __builtin_ms_va_list available.
	    HasBuiltinMSVaList = true;

	    // x86-64 has atomics up to 16 bytes.
	    MaxAtomicPromoteWidth = 128;
	    MaxAtomicInlineWidth = 128;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::X86_64ABIBuiltinVaList;
	  }

	  /// Register numbers 0 and 1 map to themselves; any other number yields -1.
	  int getEHDataRegisterNumber(unsigned RegNo) const override {
	    switch (RegNo) {
	    case 0:
	      return 0;
	    case 1:
	      return 1;
	    default:
	      return -1;
	    }
	  }

	  /// The listed conventions are accepted; any other one yields a warning
	  /// result.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    switch (CC) {
	    default:
	      return CCCR_Warning;
	    case CC_C:
	    case CC_Swift:
	    case CC_X86VectorCall:
	    case CC_IntelOclBicc:
	    case CC_Win64:
	    case CC_PreserveMost:
	    case CC_PreserveAll:
	    case CC_X86RegCall:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    }
	  }

	  /// The default calling convention is CC_C regardless of the method type.
	  CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
	    return CC_C;
	  }

	  // for x32 we need it here explicitly
	  bool hasInt128Type() const override { return true; }
	  unsigned getUnwindWordWidth() const override { return 64; }
	  unsigned getRegisterWidth() const override { return 64; }

	  /// Accepts "rsp" and "rbp" as 64-bit global register variables and defers
	  /// every other name to the X86 base class. HasSizeMismatch is set when a
	  /// 64-bit register is requested with a size other than 64.
	  bool validateGlobalRegisterVariable(StringRef RegName,
	                                      unsigned RegSize,
	                                      bool &HasSizeMismatch) const override {
	    // rsp and rbp are the only 64-bit registers the x86 backend can currently
	    // handle.
	    const bool Is64BitReg = RegName.equals("rsp") || RegName.equals("rbp");
	    if (!Is64BitReg) {
	      // Check if the register is a 32-bit register the backend can handle.
	      return X86TargetInfo::validateGlobalRegisterVariable(RegName, RegSize,
	                                                           HasSizeMismatch);
	    }

	    // Check that the register size is 64-bit.
	    HasSizeMismatch = RegSize != 64;
	    return true;
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	    const auto NumBuiltins = X86::LastTSBuiltin - Builtin::FirstTSBuiltin;
	    return llvm::makeArrayRef(BuiltinInfoX86, NumBuiltins);
	  }
	};

	// x86-64 Windows target: long stays 32 bits wide and the 64-bit integer
	// typedefs are based on long long.
	class WindowsX86_64TargetInfo : public WindowsTargetInfo<X86_64TargetInfo> {
	public:
	  WindowsX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    WCharType = UnsignedShort;

	    LongAlign = 32;
	    LongWidth = 32;
	    LongLongAlign = 64;
	    DoubleAlign = 64;

	    IntMaxType = SignedLongLong;
	    Int64Type = SignedLongLong;
	    SizeType = UnsignedLongLong;
	    PtrDiffType = SignedLongLong;
	    IntPtrType = SignedLongLong;
	  }

	  /// Emits the Windows base defines and then _WIN64.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WindowsTargetInfo<X86_64TargetInfo>::getTargetDefines(Opts, Builder);
	    Builder.defineMacro("_WIN64");
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }

	  /// stdcall, thiscall and fastcall are ignored on this target; the listed
	  /// conventions are accepted; any other one yields a warning result.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    switch (CC) {
	    default:
	      return CCCR_Warning;

	    case CC_X86StdCall:
	    case CC_X86ThisCall:
	    case CC_X86FastCall:
	      return CCCR_Ignore;

	    case CC_C:
	    case CC_X86VectorCall:
	    case CC_IntelOclBicc:
	    case CC_X86_64SysV:
	    case CC_Swift:
	    case CC_X86RegCall:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    }
	  }
	};

	// x86-64 Windows Visual Studio target: long double uses the IEEE double
	// format (64 bits wide, 64-bit aligned).
	class MicrosoftX86_64TargetInfo : public WindowsX86_64TargetInfo {
	public:
	  MicrosoftX86_64TargetInfo(const llvm::Triple &Triple,
	                            const TargetOptions &Opts)
	      : WindowsX86_64TargetInfo(Triple, Opts) {
	    LongDoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	  }

	  /// Emits the Windows x86-64 defines, the Visual Studio defines, and the
	  /// _M_X64 / _M_AMD64 macros (both with value 100).
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WindowsX86_64TargetInfo::getTargetDefines(Opts, Builder);
	    WindowsX86_64TargetInfo::getVisualStudioDefines(Opts, Builder);

	    Builder.defineMacro("_M_X64", "100");
	    Builder.defineMacro("_M_AMD64", "100");
	  }
	};

	// x86-64 MinGW target
	class MinGWX86_64TargetInfo : public WindowsX86_64TargetInfo {
	public:
	  MinGWX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsX86_64TargetInfo(Triple, Opts) {
	    // Mingw64 rounds long double size and alignment up to 16 bytes while
	    // still using the x87 extended format for the value itself.
	    LongDoubleAlign = 128;
	    LongDoubleWidth = 128;
	    LongDoubleFormat = &llvm::APFloat::x87DoubleExtended();
	    HasFloat128 = true;
	  }

	  /// Emits the Windows x86-64 defines, the WIN64 standard spellings,
	  /// __MINGW64__ and the common MinGW macros. __SEH__ is added unless SjLj
	  /// exceptions are selected.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WindowsX86_64TargetInfo::getTargetDefines(Opts, Builder);
	    DefineStd(Builder, "WIN64", Opts);
	    Builder.defineMacro("__MINGW64__");
	    addMinGWDefines(Opts, Builder);

	    // GCC defines this macro when it is using __gxx_personality_seh0.
	    const bool UsesSEH = !Opts.SjLjExceptions;
	    if (UsesSEH)
	      Builder.defineMacro("__SEH__");
	  }
	};

	// x86-64 Cygwin target: TLS is marked unsupported and wchar_t is a 16-bit
	// unsigned type.
	class CygwinX86_64TargetInfo : public X86_64TargetInfo {
	public:
	  CygwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : X86_64TargetInfo(Triple, Opts) {
	    WCharType = UnsignedShort;
	    TLSSupported = false;
	  }

	  /// Emits the x86-64 defines, the Cygwin identification macros, the shared
	  /// Cygwin/MinGW macros and the "unix" standard spellings. _GNU_SOURCE is
	  /// added for C++ only, and __SEH__ unless SjLj exceptions are selected.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    X86_64TargetInfo::getTargetDefines(Opts, Builder);

	    Builder.defineMacro("__x86_64__");
	    Builder.defineMacro("__CYGWIN__");
	    Builder.defineMacro("__CYGWIN64__");
	    addCygMingDefines(Opts, Builder);
	    DefineStd(Builder, "unix", Opts);

	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }

	    // GCC defines this macro when it is using __gxx_personality_seh0.
	    const bool UsesSEH = !Opts.SjLjExceptions;
	    if (UsesSEH) {
	      Builder.defineMacro("__SEH__");
	    }
	  }
	};

	// x86-64 Darwin target.
	class DarwinX86_64TargetInfo : public DarwinTargetInfo<X86_64TargetInfo> {
	public:
	  DarwinX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : DarwinTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    Int64Type = SignedLongLong;

	    // The 64-bit iOS simulator uses the builtin bool type for Objective-C.
	    if (Triple.isiOS())
	      UseSignedCharForObjCBool = false;

	    resetDataLayout("e-m:o-i64:64-f80:128-n8:16:32:64-S128");
	  }

	  /// Runs the base-class feature handling, then picks the maximum vector
	  /// alignment from the widest enabled extension: 512 with avx512f, 256 with
	  /// avx, 128 otherwise. Returns false if the base class rejects the
	  /// features.
	  bool handleTargetFeatures(std::vector<std::string> &Features,
	                            DiagnosticsEngine &Diags) override {
	    const bool BaseAccepted =
	        DarwinTargetInfo<X86_64TargetInfo>::handleTargetFeatures(Features,
	                                                                 Diags);
	    if (!BaseAccepted)
	      return false;

	    if (hasFeature("avx512f"))
	      MaxVectorAlign = 512;
	    else if (hasFeature("avx"))
	      MaxVectorAlign = 256;
	    else
	      MaxVectorAlign = 128;
	    return true;
	  }
	};

	// x86-64 OpenBSD target: intmax_t and int64_t are based on long long.
	class OpenBSDX86_64TargetInfo : public OpenBSDTargetInfo<X86_64TargetInfo> {
	public:
	  OpenBSDX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : OpenBSDTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    Int64Type = SignedLongLong;
	    IntMaxType = SignedLongLong;
	  }
	};

	// x86-64 Bitrig target: intmax_t and int64_t are based on long long.
	class BitrigX86_64TargetInfo : public BitrigTargetInfo<X86_64TargetInfo> {
	public:
	  BitrigX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : BitrigTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    Int64Type = SignedLongLong;
	    IntMaxType = SignedLongLong;
	  }
	};

	class ARMTargetInfo : public TargetInfo {
	// Possible FPU choices.
	// Each enumerator is a distinct bit so several can be OR-ed together in the
	// 5-bit `FPU` member (see handleTargetFeatures).
	enum FPUMode {
	VFP2FPU = (1 << 0),
	VFP3FPU = (1 << 1),
	VFP4FPU = (1 << 2),
	NeonFPU = (1 << 3),
	FPARMV8 = (1 << 4)
	};

	// Possible HWDiv features.
	// Bit flags stored in the 2-bit `HWDiv` member: one bit for the "+hwdiv"
	// feature (HWDivThumb) and one for "+hwdiv-arm" (HWDivARM).
	enum HWDivMode {
	HWDivThumb = (1 << 0),
	HWDivARM = (1 << 1)
	};

	/// True when \p Mode has at least one of the FPUMode bits set.
	static bool FPUModeIsVFP(FPUMode Mode) {
	  const unsigned AnyFPUBit = VFP2FPU | VFP3FPU | VFP4FPU | NeonFPU | FPARMV8;
	  return (Mode & AnyFPUBit) != 0;
	}

	// Register alias and name tables; declared here, defined elsewhere.
	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	static const char * const GCCRegNames[];

	// Selected ABI name (see setABI) and CPU name (see setArchInfo / setCPU).
	std::string ABI, CPU;

	// Strings cached by setArchInfo(unsigned) from getCPUProfile() and
	// getCPUAttr().
	StringRef CPUProfile;
	StringRef CPUAttr;

	// Floating-point math unit selection; FP_Default until changed.
	enum {
	FP_Default,
	FP_VFP,
	FP_Neon
	} FPMath;

	// Architecture information cached from llvm::ARM target-parser queries.
	// ArchKind defaults to ARMv4T and is only replaced by a valid parse result
	// in setArchInfo().
	unsigned ArchISA;
	unsigned ArchKind = llvm::ARM::AK_ARMV4T;
	unsigned ArchProfile;
	unsigned ArchVersion;

	// Bitmask of FPUMode flags.
	unsigned FPU : 5;

	// IsAAPCS is set by setABIAAPCS()/setABIAPCS(); HWDiv is a bitmask of
	// HWDivMode flags.
	unsigned IsAAPCS : 1;
	unsigned HWDiv : 2;

	// Initialized via features.
	unsigned SoftFloat : 1;
	unsigned SoftFloatABI : 1;

	// Single-bit feature flags, (re)computed in handleTargetFeatures().
	unsigned CRC : 1;
	unsigned Crypto : 1;
	unsigned DSP : 1;
	unsigned Unaligned : 1;

	// Access sizes used to build the LDREX bitmask below.
	enum {
	LDREX_B = (1 << 0), /// byte (8-bit)
	LDREX_H = (1 << 1), /// half (16-bit)
	LDREX_W = (1 << 2), /// word (32-bit)
	LDREX_D = (1 << 3), /// double (64-bit)
	};

	// Bitmask of LDREX_* flags; reported through __ARM_FEATURE_LDREX.
	uint32_t LDREX;

	// ACLE 6.5.1 Hardware floating point
	// Note that the flags start at bit 1; bit 0 is not used.
	enum {
	HW_FP_HP = (1 << 1), /// half (16-bit)
	HW_FP_SP = (1 << 2), /// single (32-bit)
	HW_FP_DP = (1 << 3), /// double (64-bit)
	};
	// Bitmask of HW_FP_* flags; reported through __ARM_FP.
	uint32_t HW_FP;

	// Target builtin table; declared here, defined elsewhere.
	static const Builtin::Info BuiltinInfo[];

	void setABIAAPCS() {
	IsAAPCS = true;

	DoubleAlign = LongLongAlign = LongDoubleAlign = SuitableAlign = 64;
	const llvm::Triple &T = getTriple();

	// size_t is unsigned long on MachO-derived environments, NetBSD,
	// OpenBSD and Bitrig.
	if (T.isOSBinFormatMachO() \|\| T.getOS() == llvm::Triple::NetBSD \|\|
	T.getOS() == llvm::Triple::OpenBSD \|\|
	T.getOS() == llvm::Triple::Bitrig)
	SizeType = UnsignedLong;
	else
	SizeType = UnsignedInt;

	switch (T.getOS()) {
	case llvm::Triple::NetBSD:
	case llvm::Triple::OpenBSD:
	WCharType = SignedInt;
	break;
	case llvm::Triple::Win32:
	WCharType = UnsignedShort;
	break;
	case llvm::Triple::Linux:
	default:
	// AAPCS 7.1.1, ARM-Linux ABI 2.4: type of wchar_t is unsigned int.
	WCharType = UnsignedInt;
	break;
	}

	UseBitFieldTypeAlignment = true;

	ZeroLengthBitfieldBoundary = 0;

	// Thumb1 add sp, #imm requires the immediate value be multiple of 4,
	// so set preferred for small types to 32.
	if (T.isOSBinFormatMachO()) {
	resetDataLayout(BigEndian
	? "E-m:o-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
	: "e-m:o-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
	} else if (T.isOSWindows()) {
	assert(!BigEndian && "Windows on ARM does not support big endian");
	resetDataLayout("e"
	"-m:w"
	"-p:32:32"
	"-i64:64"
	"-v128:64:128"
	"-a:0:32"
	"-n32"
	"-S64");
	} else if (T.isOSNaCl()) {
	assert(!BigEndian && "NaCl on ARM does not support big endian");
	resetDataLayout("e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S128");
	} else {
	resetDataLayout(BigEndian
	? "E-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
	: "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
	}

	// FIXME: Enumerated types are variable width in straight AAPCS.
	}

	/// Configures type sizes, alignments, bit-field rules and the data layout
	/// for the APCS-style ABIs. \p IsAAPCS16 selects the "aapcs16" variant,
	/// which uses 64-bit instead of 32-bit alignment for the wide types.
	void setABIAPCS(bool IsAAPCS16) {
	  const llvm::Triple &T = getTriple();

	  IsAAPCS = false;

	  const unsigned WideAlign = IsAAPCS16 ? 64 : 32;
	  SuitableAlign = WideAlign;
	  LongDoubleAlign = WideAlign;
	  LongLongAlign = WideAlign;
	  DoubleAlign = WideAlign;

	  // size_t is unsigned int on FreeBSD, unsigned long elsewhere.
	  if (T.getOS() == llvm::Triple::FreeBSD)
	    SizeType = UnsignedInt;
	  else
	    SizeType = UnsignedLong;

	  // Revert to using SignedInt on apcs-gnu to comply with existing behaviour.
	  WCharType = SignedInt;

	  // Do not respect the alignment of bit-field types when laying out
	  // structures. This corresponds to PCC_BITFIELD_TYPE_MATTERS in gcc.
	  UseBitFieldTypeAlignment = false;

	  // gcc forces the alignment to 4 bytes, regardless of the type of the
	  // zero length bitfield. This corresponds to EMPTY_FIELD_BOUNDARY in gcc.
	  ZeroLengthBitfieldBoundary = 32;

	  const bool IsMachO = T.isOSBinFormatMachO();
	  if (IsMachO && IsAAPCS16) {
	    assert(!BigEndian && "AAPCS16 does not support big-endian");
	    resetDataLayout("e-m:o-p:32:32-i64:64-a:0:32-n32-S128");
	  } else if (IsMachO) {
	    if (BigEndian)
	      resetDataLayout(
	          "E-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32");
	    else
	      resetDataLayout(
	          "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32");
	  } else {
	    if (BigEndian)
	      resetDataLayout(
	          "E-m:e-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32");
	    else
	      resetDataLayout(
	          "e-m:e-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32");
	  }

	  // FIXME: Override "preferred align" for double and long long.
	}

	void setArchInfo() {
	StringRef ArchName = getTriple().getArchName();

	ArchISA = llvm::ARM::parseArchISA(ArchName);
	CPU = llvm::ARM::getDefaultCPU(ArchName);
	unsigned AK = llvm::ARM::parseArch(ArchName);
	if (AK != llvm::ARM::AK_INVALID)
	ArchKind = AK;
	setArchInfo(ArchKind);
	}

	void setArchInfo(unsigned Kind) {
	StringRef SubArch;

	// cache TargetParser info
	ArchKind = Kind;
	SubArch = llvm::ARM::getSubArch(ArchKind);
	ArchProfile = llvm::ARM::parseArchProfile(SubArch);
	ArchVersion = llvm::ARM::parseArchVersion(SubArch);

	// cache CPU related strings
	CPUAttr = getCPUAttr();
	CPUProfile = getCPUProfile();
	}

	/// Sets the maximum atomic promote width, and the maximum inline atomic
	/// width when inline atomics are usable for the cached architecture.
	/// MaxAtomicInlineWidth is left untouched otherwise.
	void setAtomic() {
	  // When the triple does not specify a sub arch we are not using inline
	  // atomics: they need ARM v6+ or Thumb v7+.
	  const bool ArmInline = ArchISA == llvm::ARM::IK_ARM && ArchVersion >= 6;
	  const bool ThumbInline = ArchISA == llvm::ARM::IK_THUMB && ArchVersion >= 7;

	  // Cortex M does not support 8 byte atomics, while general Thumb2 does.
	  const unsigned MaxWidth = (ArchProfile == llvm::ARM::PK_M) ? 32 : 64;

	  MaxAtomicPromoteWidth = MaxWidth;
	  if (ArmInline || ThumbInline)
	    MaxAtomicInlineWidth = MaxWidth;
	}

	/// True when the cached instruction set (ArchISA) is Thumb.
	bool isThumb() const {
	return (ArchISA == llvm::ARM::IK_THUMB);
	}

	/// True when the CPU attribute string contains a 'T' or the architecture
	/// version is at least 6.
	bool supportsThumb() const {
	  const bool AttrHasT = CPUAttr.count('T') != 0;
	  return AttrHasT || ArchVersion >= 6;
	}

	/// True for the "6T2" CPU attribute and for every architecture version >= 7
	/// other than "8M_BASE".
	bool supportsThumb2() const {
	  if (CPUAttr.equals("6T2"))
	    return true;
	  if (ArchVersion < 7)
	    return false;
	  return !CPUAttr.equals("8M_BASE");
	}

	/// Returns the CPU attribute string for the cached architecture kind.
	StringRef getCPUAttr() const {
	  // For Cortex variants the name differs slightly from the build attribute
	  // CPU name, so those kinds are spelled out here.
	  switch (ArchKind) {
	  case llvm::ARM::AK_ARMV6M:         return "6M";
	  case llvm::ARM::AK_ARMV7S:         return "7S";
	  case llvm::ARM::AK_ARMV7A:         return "7A";
	  case llvm::ARM::AK_ARMV7R:         return "7R";
	  case llvm::ARM::AK_ARMV7M:         return "7M";
	  case llvm::ARM::AK_ARMV7EM:        return "7EM";
	  case llvm::ARM::AK_ARMV7VE:        return "7VE";
	  case llvm::ARM::AK_ARMV8A:         return "8A";
	  case llvm::ARM::AK_ARMV8_1A:       return "8_1A";
	  case llvm::ARM::AK_ARMV8_2A:       return "8_2A";
	  case llvm::ARM::AK_ARMV8MBaseline: return "8M_BASE";
	  case llvm::ARM::AK_ARMV8MMainline: return "8M_MAIN";
	  case llvm::ARM::AK_ARMV8R:         return "8R";
	  default:
	    // For most sub-arches, the build attribute CPU name is enough.
	    return llvm::ARM::getCPUAttr(ArchKind);
	  }
	}

	/// Returns "A", "R" or "M" for the cached architecture profile, or an empty
	/// string for any other profile value.
	StringRef getCPUProfile() const {
	  if (ArchProfile == llvm::ARM::PK_A)
	    return "A";
	  if (ArchProfile == llvm::ARM::PK_R)
	    return "R";
	  if (ArchProfile == llvm::ARM::PK_M)
	    return "M";
	  return "";
	}

	public:
	// Builds the ARM target description for Triple. The steps are order
	// dependent: the architecture info is cached first (ArchProfile is read
	// while choosing the default ABI), then the default ABI is applied (setABI
	// updates IsAAPCS, which is read afterwards for MaxVectorAlign). Opts is
	// only consulted for the EABI version when choosing the mcount symbol name.
	ARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: TargetInfo(Triple), FPMath(FP_Default), IsAAPCS(true), LDREX(0),
	HW_FP(0) {

	// ptrdiff_t is long on NetBSD and OpenBSD, int elsewhere.
	switch (getTriple().getOS()) {
	case llvm::Triple::NetBSD:
	case llvm::Triple::OpenBSD:
	PtrDiffType = SignedLong;
	break;
	default:
	PtrDiffType = SignedInt;
	break;
	}

	// Cache arch related info.
	setArchInfo();

	// {} in inline assembly are neon specifiers, not assembly variant
	// specifiers.
	NoAsmVariants = true;

	// FIXME: This duplicates code from the driver that sets the -target-abi
	// option - this code is used if -target-abi isn't passed and should
	// be unified in some way.
	if (Triple.isOSBinFormatMachO()) {
	// The backend is hardwired to assume AAPCS for M-class processors, ensure
	// the frontend matches that.
	if (Triple.getEnvironment() == llvm::Triple::EABI \|\|
	Triple.getOS() == llvm::Triple::UnknownOS \|\|
	ArchProfile == llvm::ARM::PK_M) {
	setABI("aapcs");
	} else if (Triple.isWatchABI()) {
	setABI("aapcs16");
	} else {
	setABI("apcs-gnu");
	}
	} else if (Triple.isOSWindows()) {
	// FIXME: this is invalid for WindowsCE
	setABI("aapcs");
	} else {
	// Select the default based on the platform.
	switch (Triple.getEnvironment()) {
	case llvm::Triple::Android:
	case llvm::Triple::GNUEABI:
	case llvm::Triple::GNUEABIHF:
	case llvm::Triple::MuslEABI:
	case llvm::Triple::MuslEABIHF:
	setABI("aapcs-linux");
	break;
	case llvm::Triple::EABIHF:
	case llvm::Triple::EABI:
	setABI("aapcs");
	break;
	case llvm::Triple::GNU:
	setABI("apcs-gnu");
	break;
	default:
	// No recognised environment: fall back to a per-OS default.
	if (Triple.getOS() == llvm::Triple::NetBSD)
	setABI("apcs-gnu");
	else if (Triple.getOS() == llvm::Triple::OpenBSD)
	setABI("aapcs-linux");
	else
	setABI("aapcs");
	break;
	}
	}

	// ARM targets default to using the ARM C++ ABI.
	TheCXXABI.set(TargetCXXABI::GenericARM);

	// ARM has atomics up to 8 bytes
	setAtomic();

	// Maximum alignment for ARM NEON data types should be 64-bits (AAPCS)
	if (IsAAPCS && (Triple.getEnvironment() != llvm::Triple::Android))
	MaxVectorAlign = 64;

	// Do force alignment of members that follow zero length bitfields. If
	// the alignment of the zero-length bitfield is greater than the member
	// that follows it, `bar', `bar' will be aligned as the type of the
	// zero length bitfield.
	UseZeroLengthBitfieldAlignment = true;

	// On Linux and OS-less triples, pick the mcount symbol by EABI version.
	if (Triple.getOS() == llvm::Triple::Linux \|\|
	Triple.getOS() == llvm::Triple::UnknownOS)
	this->MCountName =
	Opts.EABIVersion == llvm::EABI::GNU ? "\01__gnu_mcount_nc" : "\01mcount";
	}

	// Returns the ABI name most recently passed to setABI().
	StringRef getABI() const override { return ABI; }

	/// Selects the ABI called \p Name and applies its type/layout settings.
	///
	/// Returns true for "apcs-gnu", "aapcs16", "aapcs", "aapcs-vfp" and
	/// "aapcs-linux"; false for any other name. Note that the name is stored in
	/// ABI even when it is not recognised.
	bool setABI(const std::string &Name) override {
	  ABI = Name;

	  // The defaults are for AAPCS; check whether they need to change.
	  //
	  // FIXME: We need support for -meabi... we could just mangle it into the
	  // name.
	  const bool IsAAPCS16 = (Name == "aapcs16");
	  if (IsAAPCS16 || Name == "apcs-gnu") {
	    setABIAPCS(IsAAPCS16);
	    return true;
	  }

	  const bool IsAAPCSFamily =
	      Name == "aapcs" || Name == "aapcs-vfp" || Name == "aapcs-linux";
	  if (!IsAAPCSFamily)
	    return false;

	  setABIAAPCS();
	  return true;
	}

	/// Seeds \p Features with the default FPU and extension features for
	/// \p CPU, records the thumb-mode setting, and defers to the base class
	/// with "+arm"/"+thumb" rewritten as thumb-mode features.
	// FIXME: This should be based on Arch attributes, not CPU names.
	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	               StringRef CPU,
	               const std::vector<std::string> &FeaturesVec) const override {
	  const unsigned Arch = llvm::ARM::parseArch(getTriple().getArchName());
	  std::vector<StringRef> DefaultFeatures;

	  // Default FPU features.
	  const unsigned FPUKind = llvm::ARM::getDefaultFPU(CPU, Arch);
	  llvm::ARM::getFPUFeatures(FPUKind, DefaultFeatures);

	  // Default extension features.
	  const unsigned Extensions = llvm::ARM::getDefaultExtensions(CPU, Arch);
	  llvm::ARM::getExtensionFeatures(Extensions, DefaultFeatures);

	  // Only features with a '+' prefix are enabled in the map.
	  for (StringRef Default : DefaultFeatures) {
	    if (Default[0] == '+')
	      Features[Default.drop_front(1)] = true;
	  }

	  // Enable or disable thumb-mode explicitly per function to enable mixed
	  // ARM and Thumb code generation.
	  Features["thumb-mode"] = isThumb();

	  // Convert user-provided arm and thumb GNU target attributes to
	  // [-|+]thumb-mode target features respectively.
	  std::vector<std::string> Translated;
	  Translated.reserve(FeaturesVec.size());
	  for (const std::string &Requested : FeaturesVec) {
	    if (Requested == "+arm")
	      Translated.push_back("-thumb-mode");
	    else if (Requested == "+thumb")
	      Translated.push_back("+thumb-mode");
	    else
	      Translated.push_back(Requested);
	  }

	  return TargetInfo::initFeatureMap(Features, Diags, CPU, Translated);
	}

	/// Derives the cached feature state (FPU, HWDiv, CRC, Crypto, DSP,
	/// Unaligned, SoftFloat, SoftFloatABI, HW_FP and LDREX) from \p Features
	/// and the cached architecture information.
	///
	/// All feature-derived members are reset first, so the result depends only
	/// on \p Features and the current architecture, not on an earlier call.
	///
	/// \p Features is modified: "+neonfp" or "-neonfp" is appended according to
	/// FPMath, and the first "+soft-float-abi" entry is removed.
	///
	/// \returns false (after reporting a diagnostic) when NEON fp-math was
	/// requested but NEON is not enabled; true otherwise.
	bool handleTargetFeatures(std::vector<std::string> &Features,
	                          DiagnosticsEngine &Diags) override {
	  FPU = 0;
	  CRC = 0;
	  Crypto = 0;
	  DSP = 0;
	  Unaligned = 1;
	  SoftFloat = SoftFloatABI = false;
	  HWDiv = 0;
	  // HW_FP is only ever OR-ed into below and LDREX is only assigned for
	  // architecture versions 6-8, so clear both here; otherwise bits from a
	  // previous call (for example before a CPU change) would survive.
	  HW_FP = 0;
	  LDREX = 0;

	  // This does not diagnose illegal cases like having both
	  // "+vfpv2" and "+vfpv3" or having "+neon" and "+fp-only-sp".
	  uint32_t HW_FP_remove = 0;
	  for (const auto &Feature : Features) {
	    if (Feature == "+soft-float") {
	      SoftFloat = true;
	    } else if (Feature == "+soft-float-abi") {
	      SoftFloatABI = true;
	    } else if (Feature == "+vfp2") {
	      FPU |= VFP2FPU;
	      HW_FP |= HW_FP_SP | HW_FP_DP;
	    } else if (Feature == "+vfp3") {
	      FPU |= VFP3FPU;
	      HW_FP |= HW_FP_SP | HW_FP_DP;
	    } else if (Feature == "+vfp4") {
	      FPU |= VFP4FPU;
	      HW_FP |= HW_FP_SP | HW_FP_DP | HW_FP_HP;
	    } else if (Feature == "+fp-armv8") {
	      FPU |= FPARMV8;
	      HW_FP |= HW_FP_SP | HW_FP_DP | HW_FP_HP;
	    } else if (Feature == "+neon") {
	      FPU |= NeonFPU;
	      HW_FP |= HW_FP_SP | HW_FP_DP;
	    } else if (Feature == "+hwdiv") {
	      HWDiv |= HWDivThumb;
	    } else if (Feature == "+hwdiv-arm") {
	      HWDiv |= HWDivARM;
	    } else if (Feature == "+crc") {
	      CRC = 1;
	    } else if (Feature == "+crypto") {
	      Crypto = 1;
	    } else if (Feature == "+dsp") {
	      DSP = 1;
	    } else if (Feature == "+fp-only-sp") {
	      // Applied after the loop so it wins regardless of feature order.
	      HW_FP_remove |= HW_FP_DP;
	    } else if (Feature == "+strict-align") {
	      Unaligned = 0;
	    } else if (Feature == "+fp16") {
	      HW_FP |= HW_FP_HP;
	    }
	  }
	  HW_FP &= ~HW_FP_remove;

	  // Exclusive load/store sizes available per architecture version and
	  // profile. Versions outside 6-8 keep the cleared value.
	  switch (ArchVersion) {
	  case 6:
	    if (ArchProfile == llvm::ARM::PK_M)
	      LDREX = 0;
	    else if (ArchKind == llvm::ARM::AK_ARMV6K)
	      LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B;
	    else
	      LDREX = LDREX_W;
	    break;
	  case 7:
	    if (ArchProfile == llvm::ARM::PK_M)
	      LDREX = LDREX_W | LDREX_H | LDREX_B;
	    else
	      LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B;
	    break;
	  case 8:
	    LDREX = LDREX_D | LDREX_W | LDREX_H | LDREX_B;
	    break;
	  default:
	    break;
	  }

	  if (!(FPU & NeonFPU) && FPMath == FP_Neon) {
	    Diags.Report(diag::err_target_unsupported_fpmath) << "neon";
	    return false;
	  }

	  if (FPMath == FP_Neon)
	    Features.push_back("+neonfp");
	  else if (FPMath == FP_VFP)
	    Features.push_back("-neonfp");

	  // Remove front-end specific options which the backend handles differently.
	  auto Feature =
	      std::find(Features.begin(), Features.end(), "+soft-float-abi");
	  if (Feature != Features.end())
	    Features.erase(Feature);

	  return true;
	}

	/// Reports whether the named feature is present. "arm" and "aarch32" are
	/// always true; "neon" and "vfp" additionally require that soft-float is
	/// off; unknown names yield false.
	bool hasFeature(StringRef Feature) const override {
	  const bool HardFloat = !SoftFloat;
	  const bool HasNeon = (FPU & NeonFPU) && HardFloat;
	  const bool HasVFP = FPU && HardFloat;
	  const bool HasThumbDiv = (HWDiv & HWDivThumb) != 0;
	  const bool HasARMDiv = (HWDiv & HWDivARM) != 0;

	  return llvm::StringSwitch<bool>(Feature)
	      .Case("arm", true)
	      .Case("aarch32", true)
	      .Case("softfloat", SoftFloat)
	      .Case("thumb", isThumb())
	      .Case("neon", HasNeon)
	      .Case("vfp", HasVFP)
	      .Case("hwdiv", HasThumbDiv)
	      .Case("hwdiv-arm", HasARMDiv)
	      .Default(false);
	}

	/// Selects the CPU called \p Name.
	///
	/// For any name other than "generic" the architecture information is
	/// re-derived from the CPU. The name is validated before anything is
	/// changed, so an unknown CPU returns false and leaves the cached
	/// architecture state (ArchKind, profile, version, CPU strings) intact
	/// instead of overwriting it with the invalid kind.
	///
	/// \returns true on success, false if \p Name is not a known CPU.
	bool setCPU(const std::string &Name) override {
	  if (Name != "generic") {
	    const unsigned Kind = llvm::ARM::parseCPUArch(Name);
	    if (Kind == llvm::ARM::AK_INVALID)
	      return false;
	    setArchInfo(Kind);
	  }

	  // Defensive: never accept a CPU while the architecture kind is invalid.
	  if (ArchKind == llvm::ARM::AK_INVALID)
	    return false;

	  // Atomic widths depend on the architecture info refreshed above.
	  setAtomic();
	  CPU = Name;
	  return true;
	}

	bool setFPMath(StringRef Name) override;

	// Adds the ARMv8.1-A specific macro __ARM_FEATURE_QRDMX. Opts is not
	// consulted.
	void getTargetDefinesARMV81A(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	Builder.defineMacro("__ARM_FEATURE_QRDMX", "1");
	}

	// Adds the ARMv8.2-A macros. At present this is exactly the ARMv8.1-A set;
	// nothing 8.2-specific is defined here.
	void getTargetDefinesARMV82A(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	// Also include the ARMv8.1-A defines
	getTargetDefinesARMV81A(Opts, Builder);
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	// Target identification.
	Builder.defineMacro("__arm");
	Builder.defineMacro("__arm__");
	// For bare-metal none-eabi.
	if (getTriple().getOS() == llvm::Triple::UnknownOS &&
	(getTriple().getEnvironment() == llvm::Triple::EABI \|\|
	getTriple().getEnvironment() == llvm::Triple::EABIHF))
	Builder.defineMacro("__ELF__");


	// Target properties.
	Builder.defineMacro("__REGISTER_PREFIX__", "");

	// Unfortunately, __ARM_ARCH_7K__ is now more of an ABI descriptor. The CPU
	// happens to be Cortex-A7 though, so it should still get __ARM_ARCH_7A__.
	if (getTriple().isWatchABI())
	Builder.defineMacro("__ARM_ARCH_7K__", "2");

	if (!CPUAttr.empty())
	Builder.defineMacro("__ARM_ARCH_" + CPUAttr + "__");

	// ACLE 6.4.1 ARM/Thumb instruction set architecture
	// __ARM_ARCH is defined as an integer value indicating the current ARM ISA
	Builder.defineMacro("__ARM_ARCH", Twine(ArchVersion));

	if (ArchVersion >= 8) {
	// ACLE 6.5.7 Crypto Extension
	if (Crypto)
	Builder.defineMacro("__ARM_FEATURE_CRYPTO", "1");
	// ACLE 6.5.8 CRC32 Extension
	if (CRC)
	Builder.defineMacro("__ARM_FEATURE_CRC32", "1");
	// ACLE 6.5.10 Numeric Maximum and Minimum
	Builder.defineMacro("__ARM_FEATURE_NUMERIC_MAXMIN", "1");
	// ACLE 6.5.9 Directed Rounding
	Builder.defineMacro("__ARM_FEATURE_DIRECTED_ROUNDING", "1");
	}

	// __ARM_ARCH_ISA_ARM is defined to 1 if the core supports the ARM ISA. It
	// is not defined for the M-profile.
	// NOTE that the default profile is assumed to be 'A'
	if (CPUProfile.empty() \|\| ArchProfile != llvm::ARM::PK_M)
	Builder.defineMacro("__ARM_ARCH_ISA_ARM", "1");

	// __ARM_ARCH_ISA_THUMB is defined to 1 if the core supports the original
	// Thumb ISA (including v6-M and v8-M Baseline). It is set to 2 if the
	// core supports the Thumb-2 ISA as found in the v6T2 architecture and all
	// v7 and v8 architectures excluding v8-M Baseline.
	if (supportsThumb2())
	Builder.defineMacro("__ARM_ARCH_ISA_THUMB", "2");
	else if (supportsThumb())
	Builder.defineMacro("__ARM_ARCH_ISA_THUMB", "1");

	// __ARM_32BIT_STATE is defined to 1 if code is being generated for a 32-bit
	// instruction set such as ARM or Thumb.
	Builder.defineMacro("__ARM_32BIT_STATE", "1");

	// ACLE 6.4.2 Architectural Profile (A, R, M or pre-Cortex)

	// __ARM_ARCH_PROFILE is defined as 'A', 'R', 'M' or 'S', or unset.
	if (!CPUProfile.empty())
	Builder.defineMacro("__ARM_ARCH_PROFILE", "'" + CPUProfile + "'");

	// ACLE 6.4.3 Unaligned access supported in hardware
	if (Unaligned)
	Builder.defineMacro("__ARM_FEATURE_UNALIGNED", "1");

	// ACLE 6.4.4 LDREX/STREX
	if (LDREX)
	Builder.defineMacro("__ARM_FEATURE_LDREX", "0x" + llvm::utohexstr(LDREX));

	// ACLE 6.4.5 CLZ
	if (ArchVersion == 5 \|\|
	(ArchVersion == 6 && CPUProfile != "M") \|\|
	ArchVersion > 6)
	Builder.defineMacro("__ARM_FEATURE_CLZ", "1");

	// ACLE 6.5.1 Hardware Floating Point
	if (HW_FP)
	Builder.defineMacro("__ARM_FP", "0x" + llvm::utohexstr(HW_FP));

	// ACLE predefines.
	Builder.defineMacro("__ARM_ACLE", "200");

	// FP16 support (we currently only support IEEE format).
	Builder.defineMacro("__ARM_FP16_FORMAT_IEEE", "1");
	Builder.defineMacro("__ARM_FP16_ARGS", "1");

	// ACLE 6.5.3 Fused multiply-accumulate (FMA)
	if (ArchVersion >= 7 && (FPU & VFP4FPU))
	Builder.defineMacro("__ARM_FEATURE_FMA", "1");

	// Subtarget options.

	// FIXME: It's more complicated than this and we don't really support
	// interworking.
	// Windows on ARM does not "support" interworking
	if (5 <= ArchVersion && ArchVersion <= 8 && !getTriple().isOSWindows())
	Builder.defineMacro("__THUMB_INTERWORK__");

	if (ABI == "aapcs" \|\| ABI == "aapcs-linux" \|\| ABI == "aapcs-vfp") {
	// Embedded targets on Darwin follow AAPCS, but not EABI.
	// Windows on ARM follows AAPCS VFP, but does not conform to EABI.
	if (!getTriple().isOSBinFormatMachO() && !getTriple().isOSWindows())
	Builder.defineMacro("__ARM_EABI__");
	Builder.defineMacro("__ARM_PCS", "1");
	}

	if ((!SoftFloat && !SoftFloatABI) \|\| ABI == "aapcs-vfp" \|\|
	ABI == "aapcs16")
	Builder.defineMacro("__ARM_PCS_VFP", "1");

	if (SoftFloat)
	Builder.defineMacro("__SOFTFP__");

	if (ArchKind == llvm::ARM::AK_XSCALE)
	Builder.defineMacro("__XSCALE__");

	if (isThumb()) {
	Builder.defineMacro("__THUMBEL__");
	Builder.defineMacro("__thumb__");
	if (supportsThumb2())
	Builder.defineMacro("__thumb2__");
	}

	// ACLE 6.4.9 32-bit SIMD instructions
	if (ArchVersion >= 6 && (CPUProfile != "M" \|\| CPUAttr == "7EM"))
	Builder.defineMacro("__ARM_FEATURE_SIMD32", "1");

	// ACLE 6.4.10 Hardware Integer Divide
	if (((HWDiv & HWDivThumb) && isThumb()) \|\|
	((HWDiv & HWDivARM) && !isThumb())) {
	Builder.defineMacro("__ARM_FEATURE_IDIV", "1");
	Builder.defineMacro("__ARM_ARCH_EXT_IDIV__", "1");
	}

	// Note, this is always on in gcc, even though it doesn't make sense.
	Builder.defineMacro("__APCS_32__");

	if (FPUModeIsVFP((FPUMode) FPU)) {
	Builder.defineMacro("__VFP_FP__");
	if (FPU & VFP2FPU)
	Builder.defineMacro("__ARM_VFPV2__");
	if (FPU & VFP3FPU)
	Builder.defineMacro("__ARM_VFPV3__");
	if (FPU & VFP4FPU)
	Builder.defineMacro("__ARM_VFPV4__");
	if (FPU & FPARMV8)
	Builder.defineMacro("__ARM_FPV5__");
	}

	// This only gets set when Neon instructions are actually available, unlike
	// the VFP define, hence the soft float and arch check. This is subtly
	// different from gcc, we follow the intent which was that it should be set
	// when Neon instructions are actually available.
	if ((FPU & NeonFPU) && !SoftFloat && ArchVersion >= 7) {
	Builder.defineMacro("__ARM_NEON", "1");
	Builder.defineMacro("__ARM_NEON__");
	// current AArch32 NEON implementations do not support double-precision
	// floating-point even when it is present in VFP.
	Builder.defineMacro("__ARM_NEON_FP",
	"0x" + llvm::utohexstr(HW_FP & ~HW_FP_DP));
	}

	Builder.defineMacro("__ARM_SIZEOF_WCHAR_T",
	Opts.ShortWChar ? "2" : "4");

	Builder.defineMacro("__ARM_SIZEOF_MINIMAL_ENUM",
	Opts.ShortEnums ? "1" : "4");

	if (ArchVersion >= 6 && CPUAttr != "6M" && CPUAttr != "8M_BASE") {
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
	}

	// ACLE 6.4.7 DSP instructions
	if (DSP) {
	Builder.defineMacro("__ARM_FEATURE_DSP", "1");
	}

	// ACLE 6.4.8 Saturation instructions
	bool SAT = false;
	if ((ArchVersion == 6 && CPUProfile != "M") \|\| ArchVersion > 6 ) {
	Builder.defineMacro("__ARM_FEATURE_SAT", "1");
	SAT = true;
	}

	// ACLE 6.4.6 Q (saturation) flag
	if (DSP \|\| SAT)
	Builder.defineMacro("__ARM_FEATURE_QBIT", "1");

	if (Opts.UnsafeFPMath)
	Builder.defineMacro("__ARM_FP_FAST", "1");

	switch(ArchKind) {
	default: break;
	case llvm::ARM::AK_ARMV8_1A:
	getTargetDefinesARMV81A(Opts, Builder);
	break;
	case llvm::ARM::AK_ARMV8_2A:
	getTargetDefinesARMV82A(Opts, Builder);
	break;
	}
	}

	// Exposes the target-specific builtin table as an array view.
	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	  // The table length is the distance between the first target-specific
	  // builtin ID and the end of the ARM builtin range.
	  const auto NumBuiltins =
	      clang::ARM::LastTSBuiltin - Builtin::FirstTSBuiltin;
	  return llvm::makeArrayRef(BuiltinInfo, NumBuiltins);
	}
	// Always answers false to the isCLZForZeroUndef query.
	bool isCLZForZeroUndef() const override {
	  return false;
	}
	// Picks the va_list kind: the AAPCS form when IsAAPCS is set, otherwise a
	// char* on watch-ABI triples and a void* everywhere else.
	BuiltinVaListKind getBuiltinVaListKind() const override {
	  if (IsAAPCS)
	    return AAPCSABIBuiltinVaList;
	  if (getTriple().isWatchABI())
	    return TargetInfo::CharPtrBuiltinVaList;
	  return TargetInfo::VoidPtrBuiltinVaList;
	}
	// Register-name and register-alias tables; both accessors are declared
	// here without a body.
	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
	// Checks a single inline-asm constraint at *Name and records in Info
	// whether it permits a register or a memory operand. Two-character "U?"
	// constraints advance Name by one on success. Returns false for anything
	// not recognised.
	bool validateAsmConstraint(const char *&Name,
	                           TargetInfo::ConstraintInfo &Info) const override {
	  const char Lead = *Name;

	  // Register classes: 'l' (r0-r7), 'h' (r8-r15), 't' (VFP single
	  // precision) and 'w' (VFP double precision).
	  if (Lead == 'l' || Lead == 'h' || Lead == 't' || Lead == 'w') {
	    Info.setAllowsRegister();
	    return true;
	  }

	  switch (Lead) {
	  case 'I':
	  case 'J':
	  case 'K':
	  case 'L':
	  case 'M':
	    // FIXME
	    return true;
	  case 'Q': // A memory address that is a single base register.
	    Info.setAllowsMemory();
	    return true;
	  case 'U': // A memory reference; the second character selects the kind.
	    break;
	  default:
	    return false;
	  }

	  switch (Name[1]) {
	  case 'q': // ARMV4 ldrsb
	  case 'v': // VFP load/store (reg+constant offset)
	  case 'y': // iWMMXt load/store
	  case 't': // address valid for load/store opaque types wider than
	            // 128-bits
	  case 'n': // valid address for Neon doubleword vector load/store
	  case 'm': // valid address for Neon element and structure load/store
	  case 's': // valid address for non-offset loads/stores of quad-word
	            // values in four ARM registers
	    Info.setAllowsMemory();
	    ++Name;
	    return true;
	  default:
	    return false;
	  }
	}
	// Translates one constraint into the string form handed on for later
	// parsing. 'U' constraints are two characters long: they are emitted with
	// a leading "^" hint and Constraint is advanced by one. 'p' becomes "r".
	// Any other constraint is returned unchanged as a one-character string.
	std::string convertConstraint(const char *&Constraint) const override {
	  const char Lead = *Constraint;

	  if (Lead == 'U') {
	    std::string Hinted("^");
	    Hinted.append(Constraint, 2);
	    ++Constraint;
	    return Hinted;
	  }

	  if (Lead == 'p')
	    return std::string("r");

	  return std::string(1, Lead);
	}
	// Decides whether Modifier is acceptable for an operand of Size bits
	// bound to Constraint. Only 'r' constraints are restricted: the 'q'
	// modifier is always rejected, and otherwise the operand must be an
	// output, an in/out, or at most 64 bits wide. SuggestedModifier is never
	// written.
	bool
	validateConstraintModifier(StringRef Constraint, char Modifier, unsigned Size,
	                           std::string &SuggestedModifier) const override {
	  // Remember the direction marker before it is stripped below.
	  const char Direction = Constraint[0];
	  const bool WritesOperand = Direction == '=' || Direction == '+';

	  // Strip off constraint modifiers.
	  while (Constraint[0] == '=' || Constraint[0] == '+' ||
	         Constraint[0] == '&')
	    Constraint = Constraint.substr(1);

	  if (Constraint[0] != 'r')
	    return true;

	  // A register of size 32 cannot fit a vector type.
	  if (Modifier == 'q')
	    return false;

	  return WritesOperand || Size <= 64;
	}
	// Returns the clobber list for inline asm, which is empty here.
	const char *getClobbers() const override {
	  // FIXME: Is this really right?
	  const char *const NoClobbers = "";
	  return NoClobbers;
	}

	// Accepts the AAPCS, AAPCS-VFP, Swift and OpenCL-kernel conventions; every
	// other calling convention yields a warning result.
	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	  const bool Accepted = CC == CC_AAPCS || CC == CC_AAPCS_VFP ||
	                        CC == CC_Swift || CC == CC_OpenCLKernel;
	  return Accepted ? CCCR_OK : CCCR_Warning;
	}

	// Maps EH data register index 0 or 1 to the same number; any other index
	// yields -1.
	int getEHDataRegisterNumber(unsigned RegNo) const override {
	  if (RegNo > 1)
	    return -1;
	  return static_cast<int>(RegNo);
	}

	// Always answers true to the hasSjLjLowering query.
	bool hasSjLjLowering() const override { return true; }
	};

	// Selects the floating-point math unit by name. "neon" selects FP_Neon;
	// "vfp", "vfp2", "vfp3" and "vfp4" all select FP_VFP. Returns false and
	// leaves FPMath untouched for any other name.
	bool ARMTargetInfo::setFPMath(StringRef Name) {
	  if (Name == "neon") {
	    FPMath = FP_Neon;
	    return true;
	  }

	  const bool IsVFPName = Name == "vfp" || Name == "vfp2" ||
	                         Name == "vfp3" || Name == "vfp4";
	  if (!IsVFPName)
	    return false;

	  FPMath = FP_VFP;
	  return true;
	}

	// Register names returned by getGCCRegNames(): 16 integer registers,
	// 32 single-precision (s), 32 double-precision (d) and 16 quad (q)
	// registers.
	const char * const ARMTargetInfo::GCCRegNames[] = {
	// Integer registers
	"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7",
	"r8", "r9", "r10", "r11", "r12", "sp", "lr", "pc",

	// Float registers
	"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7",
	"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15",
	"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23",
	"s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",

	// Double registers
	"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	"d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15",
	"d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
	"d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",

	// Quad registers
	"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
	"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
	};

	// Returns an array view over the static GCCRegNames table.
	ArrayRef<const char *> ARMTargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Alternative spellings for the integer registers. Each entry lists the
	// alias name(s) first and the register name they map to second.
	const TargetInfo::GCCRegAlias ARMTargetInfo::GCCRegAliases[] = {
	{ { "a1" }, "r0" },
	{ { "a2" }, "r1" },
	{ { "a3" }, "r2" },
	{ { "a4" }, "r3" },
	{ { "v1" }, "r4" },
	{ { "v2" }, "r5" },
	{ { "v3" }, "r6" },
	{ { "v4" }, "r7" },
	{ { "v5" }, "r8" },
	{ { "v6", "rfp" }, "r9" },
	{ { "sl" }, "r10" },
	{ { "fp" }, "r11" },
	{ { "ip" }, "r12" },
	{ { "r13" }, "sp" },
	{ { "r14" }, "lr" },
	{ { "r15" }, "pc" },
	// The S, D and Q registers overlap, but aren't really aliases; we
	// don't want to substitute one of these for a different-sized one.
	};

	// Returns an array view over the static GCCRegAliases table.
	ArrayRef<TargetInfo::GCCRegAlias> ARMTargetInfo::getGCCRegAliases() const {
	  ArrayRef<TargetInfo::GCCRegAlias> Aliases =
	      llvm::makeArrayRef(GCCRegAliases);
	  return Aliases;
	}

	// Builtin descriptor table, generated by expanding the entry macros over
	// the NEON .def file and then the ARM .def file, in that order.
	// NOTE(review): BUILTIN/LIBBUILTIN are defined again before the second
	// include, so the .def files presumably #undef them - confirm.
	const Builtin::Info ARMTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsNEON.def"

	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LANGBUILTIN(ID, TYPE, ATTRS, LANG) \
	{ #ID, TYPE, ATTRS, nullptr, LANG, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANGS, FEATURE) \
	{ #ID, TYPE, ATTRS, HEADER, LANGS, FEATURE },
	#include "clang/Basic/BuiltinsARM.def"
	};

	// ARMTargetInfo variant that defines __ARMEL__ ahead of the common ARM
	// macros.
	class ARMleTargetInfo : public ARMTargetInfo {
	public:
	ARMleTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: ARMTargetInfo(Triple, Opts) {}
	// Emits __ARMEL__ first, then delegates to the base class.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__ARMEL__");
	ARMTargetInfo::getTargetDefines(Opts, Builder);
	}
	};

	// ARMTargetInfo variant that defines __ARMEB__ and __ARM_BIG_ENDIAN ahead
	// of the common ARM macros.
	class ARMbeTargetInfo : public ARMTargetInfo {
	public:
	ARMbeTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: ARMTargetInfo(Triple, Opts) {}
	// Emits the two big-endian macros first, then delegates to the base class.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__ARMEB__");
	Builder.defineMacro("__ARM_BIG_ENDIAN");
	ARMTargetInfo::getTargetDefines(Opts, Builder);
	}
	};

	// Shared base for the Windows-on-ARM targets. Keeps its own copy of the
	// triple so that the Visual Studio macros can be derived from the
	// architecture name.
	class WindowsARMTargetInfo : public WindowsTargetInfo<ARMleTargetInfo> {
	  const llvm::Triple Triple;

	public:
	  WindowsARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : WindowsTargetInfo<ARMleTargetInfo>(Triple, Opts), Triple(Triple) {
	    WCharType = UnsignedShort;
	    SizeType = UnsignedInt;
	  }

	  // Emits the generic Visual Studio macros followed by the ARM-specific
	  // _M_ARM* family.
	  void getVisualStudioDefines(const LangOptions &Opts,
	                              MacroBuilder &Builder) const {
	    WindowsTargetInfo<ARMleTargetInfo>::getVisualStudioDefines(Opts, Builder);

	    // FIXME: this is invalid for WindowsCE
	    Builder.defineMacro("_M_ARM_NT", "1");
	    Builder.defineMacro("_M_ARMT", "_M_ARM");
	    Builder.defineMacro("_M_THUMB", "_M_ARM");

	    const llvm::Triple::ArchType Arch = Triple.getArch();
	    assert((Arch == llvm::Triple::arm || Arch == llvm::Triple::thumb) &&
	           "invalid architecture for Windows ARM target info");

	    // _M_ARM is the architecture name with its first 4 characters dropped
	    // for arm and its first 6 dropped for thumb.
	    const unsigned Offset = (Arch == llvm::Triple::arm) ? 4 : 6;
	    Builder.defineMacro("_M_ARM", Triple.getArchName().substr(Offset));

	    // TODO map the complete set of values
	    // 31: VFPv3 40: VFPv4
	    Builder.defineMacro("_M_ARM_FP", "31");
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }

	  // CC_C and CC_OpenCLKernel are accepted, the four x86 conventions are
	  // ignored, and everything else yields a warning result.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    switch (CC) {
	    case CC_C:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    case CC_X86StdCall:
	    case CC_X86ThisCall:
	    case CC_X86FastCall:
	    case CC_X86VectorCall:
	      return CCCR_Ignore;
	    default:
	      return CCCR_Warning;
	    }
	  }
	};

	// Windows ARM target using the Itanium-family (GenericARM) C++ ABI.
	class ItaniumWindowsARMleTargetInfo : public WindowsARMTargetInfo {
	public:
	  ItaniumWindowsARMleTargetInfo(const llvm::Triple &Triple,
	                                const TargetOptions &Opts)
	      : WindowsARMTargetInfo(Triple, Opts) {
	    TheCXXABI.set(TargetCXXABI::GenericARM);
	  }

	  // Emits the common Windows ARM macros; the Visual Studio macros are
	  // added only when MSVC compatibility is enabled.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WindowsARMTargetInfo::getTargetDefines(Opts, Builder);

	    if (!Opts.MSVCCompat)
	      return;

	    WindowsARMTargetInfo::getVisualStudioDefines(Opts, Builder);
	  }
	};

	// Windows ARM, MS (C++) ABI
	class MicrosoftARMleTargetInfo : public WindowsARMTargetInfo {
	public:
	// Selects the Microsoft C++ ABI on top of the common Windows ARM setup.
	MicrosoftARMleTargetInfo(const llvm::Triple &Triple,
	const TargetOptions &Opts)
	: WindowsARMTargetInfo(Triple, Opts) {
	TheCXXABI.set(TargetCXXABI::Microsoft);
	}

	// Emits the common Windows ARM macros and then, unconditionally, the
	// Visual Studio macros.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	WindowsARMTargetInfo::getTargetDefines(Opts, Builder);
	WindowsARMTargetInfo::getVisualStudioDefines(Opts, Builder);
	}
	};

	// ARM MinGW target
	class MinGWARMTargetInfo : public WindowsARMTargetInfo {
	public:
	// Selects the GenericARM C++ ABI on top of the common Windows ARM setup.
	MinGWARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: WindowsARMTargetInfo(Triple, Opts) {
	TheCXXABI.set(TargetCXXABI::GenericARM);
	}

	// Emits the common Windows ARM macros, then the WIN32/WINNT names via
	// DefineStd, _ARM_, and finally the MinGW macros.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	WindowsARMTargetInfo::getTargetDefines(Opts, Builder);
	DefineStd(Builder, "WIN32", Opts);
	DefineStd(Builder, "WINNT", Opts);
	Builder.defineMacro("_ARM_");
	addMinGWDefines(Opts, Builder);
	}
	};

	// ARM Cygwin target
	class CygwinARMTargetInfo : public ARMleTargetInfo {
	public:
	  // Disables TLS, uses a 16-bit unsigned wchar_t, aligns double and
	  // long long to 64 bits and installs the matching data layout.
	  CygwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : ARMleTargetInfo(Triple, Opts) {
	    TLSSupported = false;
	    WCharType = UnsignedShort;
	    LongLongAlign = 64;
	    DoubleAlign = 64;
	    resetDataLayout("e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64");
	  }

	  // Emits the little-endian ARM macros followed by the Cygwin
	  // identification macros; _GNU_SOURCE is added for C++ only.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    ARMleTargetInfo::getTargetDefines(Opts, Builder);

	    Builder.defineMacro("_ARM_");
	    Builder.defineMacro("__CYGWIN__");
	    Builder.defineMacro("__CYGWIN32__");
	    DefineStd(Builder, "unix", Opts);

	    if (Opts.CPlusPlus) {
	      Builder.defineMacro("_GNU_SOURCE");
	    }
	  }
	};

	// 32-bit ARM target for Darwin platforms.
	class DarwinARMTargetInfo : public DarwinTargetInfo<ARMleTargetInfo> {
	protected:
	  // Forwards to getDarwinDefines with this target's platform name and
	  // minimum version.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    getDarwinDefines(Builder, Opts, Triple, PlatformName, PlatformMinVersion);
	  }

	public:
	  DarwinARMTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : DarwinTargetInfo<ARMleTargetInfo>(Triple, Opts) {
	    HasAlignMac68kSupport = true;

	    // iOS always has 64-bit atomic instructions.
	    // FIXME: This should be based off of the target features in
	    // ARMleTargetInfo.
	    MaxAtomicInlineWidth = 64;

	    if (!Triple.isWatchABI()) {
	      TheCXXABI.set(TargetCXXABI::iOS);
	      return;
	    }

	    // Watch-ABI triples use the WatchOS variant of the ARM C++ ABI.
	    TheCXXABI.set(TargetCXXABI::WatchOS);

	    // The 32-bit ABI is silent on what ptrdiff_t should be, but given that
	    // size_t is long, it's a bit weird for it to be int.
	    PtrDiffType = SignedLong;

	    // BOOL should be a real boolean on the new ABI
	    UseSignedCharForObjCBool = false;
	  }
	};

	class AArch64TargetInfo : public TargetInfo {
	virtual void setDataLayout() = 0;
	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	static const char *const GCCRegNames[];

	enum FPUModeEnum {
	FPUMode,
	NeonMode = (1 << 0),
	SveMode = (1 << 1)
	};

	unsigned FPU;
	unsigned CRC;
	unsigned Crypto;
	unsigned Unaligned;
	unsigned HasFullFP16;
	llvm::AArch64::ArchKind ArchKind;

	static const Builtin::Info BuiltinInfo[];

	std::string ABI;

	public:
	AArch64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: TargetInfo(Triple), ABI("aapcs") {
	if (getTriple().getOS() == llvm::Triple::NetBSD \|\|
	getTriple().getOS() == llvm::Triple::OpenBSD) {
	WCharType = SignedInt;

	// NetBSD apparently prefers consistency across ARM targets to consistency
	// across 64-bit targets.
	Int64Type = SignedLongLong;
	IntMaxType = SignedLongLong;
	} else {
	WCharType = UnsignedInt;
	Int64Type = SignedLong;
	IntMaxType = SignedLong;
	}

	LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
	MaxVectorAlign = 128;
	MaxAtomicInlineWidth = 128;
	MaxAtomicPromoteWidth = 128;

	LongDoubleWidth = LongDoubleAlign = SuitableAlign = 128;
	LongDoubleFormat = &llvm::APFloat::IEEEquad();

	// Make __builtin_ms_va_list available.
	HasBuiltinMSVaList = true;

	// {} in inline assembly are neon specifiers, not assembly variant
	// specifiers.
	NoAsmVariants = true;

	// AAPCS gives rules for bitfields. 7.1.7 says: "The container type
	// contributes to the alignment of the containing aggregate in the same way
	// a plain (non bit-field) member of that type would, without exception for
	// zero-sized or anonymous bit-fields."
	assert(UseBitFieldTypeAlignment && "bitfields affect type alignment");
	UseZeroLengthBitfieldAlignment = true;

	// AArch64 targets default to using the ARM C++ ABI.
	TheCXXABI.set(TargetCXXABI::GenericAArch64);

	if (Triple.getOS() == llvm::Triple::Linux)
	this->MCountName = "\01_mcount";
	else if (Triple.getOS() == llvm::Triple::UnknownOS)
	this->MCountName = Opts.EABIVersion == llvm::EABI::GNU ? "\01_mcount" : "mcount";
	}

	StringRef getABI() const override { return ABI; }
	bool setABI(const std::string &Name) override {
	if (Name != "aapcs" && Name != "darwinpcs")
	return false;

	ABI = Name;
	return true;
	}

	bool setCPU(const std::string &Name) override {
	return Name == "generic" \|\|
	llvm::AArch64::parseCPUArch(Name) !=
	static_cast<unsigned>(llvm::AArch64::ArchKind::AK_INVALID);
	}

	void getTargetDefinesARMV81A(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	Builder.defineMacro("__ARM_FEATURE_QRDMX", "1");
	}

	void getTargetDefinesARMV82A(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	// Also include the ARMv8.1 defines
	getTargetDefinesARMV81A(Opts, Builder);
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	// Target identification.
	Builder.defineMacro("__aarch64__");
	// For bare-metal none-eabi.
	if (getTriple().getOS() == llvm::Triple::UnknownOS &&
	(getTriple().getEnvironment() == llvm::Triple::EABI \|\|
	getTriple().getEnvironment() == llvm::Triple::EABIHF))
	Builder.defineMacro("__ELF__");

	// Target properties.
	Builder.defineMacro("_LP64");
	Builder.defineMacro("__LP64__");

	// ACLE predefines. Many can only have one possible value on v8 AArch64.
	Builder.defineMacro("__ARM_ACLE", "200");
	Builder.defineMacro("__ARM_ARCH", "8");
	Builder.defineMacro("__ARM_ARCH_PROFILE", "'A'");

	Builder.defineMacro("__ARM_64BIT_STATE", "1");
	Builder.defineMacro("__ARM_PCS_AAPCS64", "1");
	Builder.defineMacro("__ARM_ARCH_ISA_A64", "1");

	Builder.defineMacro("__ARM_FEATURE_CLZ", "1");
	Builder.defineMacro("__ARM_FEATURE_FMA", "1");
	Builder.defineMacro("__ARM_FEATURE_LDREX", "0xF");
	Builder.defineMacro("__ARM_FEATURE_IDIV", "1"); // As specified in ACLE
	Builder.defineMacro("__ARM_FEATURE_DIV"); // For backwards compatibility
	Builder.defineMacro("__ARM_FEATURE_NUMERIC_MAXMIN", "1");
	Builder.defineMacro("__ARM_FEATURE_DIRECTED_ROUNDING", "1");

	Builder.defineMacro("__ARM_ALIGN_MAX_STACK_PWR", "4");

	// 0xe implies support for half, single and double precision operations.
	Builder.defineMacro("__ARM_FP", "0xE");

	// PCS specifies this for SysV variants, which is all we support. Other ABIs
	// may choose __ARM_FP16_FORMAT_ALTERNATIVE.
	Builder.defineMacro("__ARM_FP16_FORMAT_IEEE", "1");
	Builder.defineMacro("__ARM_FP16_ARGS", "1");

	if (Opts.UnsafeFPMath)
	Builder.defineMacro("__ARM_FP_FAST", "1");

	Builder.defineMacro("__ARM_SIZEOF_WCHAR_T", Opts.ShortWChar ? "2" : "4");

	Builder.defineMacro("__ARM_SIZEOF_MINIMAL_ENUM",
	Opts.ShortEnums ? "1" : "4");

	if (FPU & NeonMode) {
	Builder.defineMacro("__ARM_NEON", "1");
	// 64-bit NEON supports half, single and double precision operations.
	Builder.defineMacro("__ARM_NEON_FP", "0xE");
	}

	if (FPU & SveMode)
	Builder.defineMacro("__ARM_FEATURE_SVE", "1");

	if (CRC)
	Builder.defineMacro("__ARM_FEATURE_CRC32", "1");

	if (Crypto)
	Builder.defineMacro("__ARM_FEATURE_CRYPTO", "1");

	if (Unaligned)
	Builder.defineMacro("__ARM_FEATURE_UNALIGNED", "1");

	switch(ArchKind) {
	default: break;
	case llvm::AArch64::ArchKind::AK_ARMV8_1A:
	getTargetDefinesARMV81A(Opts, Builder);
	break;
	case llvm::AArch64::ArchKind::AK_ARMV8_2A:
	getTargetDefinesARMV82A(Opts, Builder);
	break;
	}

	// All of the __sync_(bool\|val)_compare_and_swap_(1\|2\|4\|8) builtins work.
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
	}

	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::AArch64::LastTSBuiltin - Builtin::FirstTSBuiltin);
	}

	bool hasFeature(StringRef Feature) const override {
	return Feature == "aarch64" \|\|
	Feature == "arm64" \|\|
	Feature == "arm" \|\|
	(Feature == "neon" && (FPU & NeonMode)) \|\|
	(Feature == "sve" && (FPU & SveMode));
	}

	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override {
	FPU = FPUMode;
	CRC = 0;
	Crypto = 0;
	Unaligned = 1;
	HasFullFP16 = 0;
	ArchKind = llvm::AArch64::ArchKind::AK_ARMV8A;

	for (const auto &Feature : Features) {
	if (Feature == "+neon")
	FPU \|= NeonMode;
	if (Feature == "+sve")
	FPU \|= SveMode;
	if (Feature == "+crc")
	CRC = 1;
	if (Feature == "+crypto")
	Crypto = 1;
	if (Feature == "+strict-align")
	Unaligned = 0;
	if (Feature == "+v8.1a")
	ArchKind = llvm::AArch64::ArchKind::AK_ARMV8_1A;
	if (Feature == "+v8.2a")
	ArchKind = llvm::AArch64::ArchKind::AK_ARMV8_2A;
	if (Feature == "+fullfp16")
	HasFullFP16 = 1;
	}

	setDataLayout();

	return true;
	}

	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	switch (CC) {
	case CC_C:
	case CC_Swift:
	case CC_PreserveMost:
	case CC_PreserveAll:
	case CC_OpenCLKernel:
	case CC_Win64:
	return CCCR_OK;
	default:
	return CCCR_Warning;
	}
	}

	bool isCLZForZeroUndef() const override { return false; }

	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::AArch64ABIBuiltinVaList;
	}

	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;

	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default:
	return false;
	case 'w': // Floating point and SIMD registers (V0-V31)
	Info.setAllowsRegister();
	return true;
	case 'I': // Constant that can be used with an ADD instruction
	case 'J': // Constant that can be used with a SUB instruction
	case 'K': // Constant that can be used with a 32-bit logical instruction
	case 'L': // Constant that can be used with a 64-bit logical instruction
	case 'M': // Constant that can be used as a 32-bit MOV immediate
	case 'N': // Constant that can be used as a 64-bit MOV immediate
	case 'Y': // Floating point constant zero
	case 'Z': // Integer constant zero
	return true;
	case 'Q': // A memory reference with base register and no offset
	Info.setAllowsMemory();
	return true;
	case 'S': // A symbolic address
	Info.setAllowsRegister();
	return true;
	case 'U':
	// Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes.
	// Utf: A memory address suitable for ldp/stp in TF mode.
	// Usa: An absolute symbolic address.
	// Ush: The high part (bits 32:12) of a pc-relative symbolic address.
	llvm_unreachable("FIXME: Unimplemented support for U* constraints.");
	case 'z': // Zero register, wzr or xzr
	Info.setAllowsRegister();
	return true;
	case 'x': // Floating point and SIMD registers (V0-V15)
	Info.setAllowsRegister();
	return true;
	}
	return false;
	}

	bool
	validateConstraintModifier(StringRef Constraint, char Modifier, unsigned Size,
	std::string &SuggestedModifier) const override {
	// Strip off constraint modifiers.
	while (Constraint[0] == '=' \|\| Constraint[0] == '+' \|\| Constraint[0] == '&')
	Constraint = Constraint.substr(1);

	switch (Constraint[0]) {
	default:
	return true;
	case 'z':
	case 'r': {
	switch (Modifier) {
	case 'x':
	case 'w':
	// For now assume that the person knows what they're
	// doing with the modifier.
	return true;
	default:
	// By default an 'r' constraint will be in the 'x'
	// registers.
	if (Size == 64)
	return true;

	SuggestedModifier = "w";
	return false;
	}
	}
	}
	}

	const char *getClobbers() const override { return ""; }

	int getEHDataRegisterNumber(unsigned RegNo) const override {
	if (RegNo == 0)
	return 0;
	if (RegNo == 1)
	return 1;
	return -1;
	}
	};

	// Register names returned by getGCCRegNames(): five groups of 32 entries
	// each (w, x, s, d and v registers).
	const char *const AArch64TargetInfo::GCCRegNames[] = {
	// 32-bit Integer registers
	"w0", "w1", "w2", "w3", "w4", "w5", "w6", "w7", "w8", "w9", "w10",
	"w11", "w12", "w13", "w14", "w15", "w16", "w17", "w18", "w19", "w20", "w21",
	"w22", "w23", "w24", "w25", "w26", "w27", "w28", "w29", "w30", "wsp",

	// 64-bit Integer registers
	"x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10",
	"x11", "x12", "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21",
	"x22", "x23", "x24", "x25", "x26", "x27", "x28", "fp", "lr", "sp",

	// 32-bit floating point registers
	"s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s10",
	"s11", "s12", "s13", "s14", "s15", "s16", "s17", "s18", "s19", "s20", "s21",
	"s22", "s23", "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31",

	// 64-bit floating point registers
	"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9", "d10",
	"d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", "d21",
	"d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",

	// Vector registers
	"v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10",
	"v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
	"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
	};

	// Returns an array view over the static GCCRegNames table.
	ArrayRef<const char *> AArch64TargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Alternative spellings for registers in GCCRegNames. Each entry lists the
	// alias name first and the register name it maps to second.
	const TargetInfo::GCCRegAlias AArch64TargetInfo::GCCRegAliases[] = {
	{ { "w31" }, "wsp" },
	{ { "x29" }, "fp" },
	{ { "x30" }, "lr" },
	{ { "x31" }, "sp" },
	// The S/D/Q and W/X registers overlap, but aren't really aliases; we
	// don't want to substitute one of these for a different-sized one.
	};

	// Returns an array view over the static GCCRegAliases table.
	ArrayRef<TargetInfo::GCCRegAlias> AArch64TargetInfo::getGCCRegAliases() const {
	  ArrayRef<TargetInfo::GCCRegAlias> Aliases =
	      llvm::makeArrayRef(GCCRegAliases);
	  return Aliases;
	}

	// Builtin descriptor table, generated by expanding BUILTIN over the NEON
	// .def file and then the AArch64 .def file, in that order.
	// NOTE(review): BUILTIN is defined again before the second include, so the
	// .def files presumably #undef it - confirm.
	const Builtin::Info AArch64TargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsNEON.def"

	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsAArch64.def"
	};

	// Little-endian AArch64 target.
	class AArch64leTargetInfo : public AArch64TargetInfo {
	  // Installs the little-endian data layout; Mach-O object format gets its
	  // own layout string.
	  void setDataLayout() override {
	    const char *const Layout =
	        getTriple().isOSBinFormatMachO()
	            ? "e-m:o-i64:64-i128:128-n32:64-S128"
	            : "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128";
	    resetDataLayout(Layout);
	  }

	public:
	  AArch64leTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : AArch64TargetInfo(Triple, Opts) {}

	  // Emits __AARCH64EL__ first, then the common AArch64 macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Builder.defineMacro("__AARCH64EL__");
	    AArch64TargetInfo::getTargetDefines(Opts, Builder);
	  }
	};

	// Windows on AArch64 with the Microsoft C++ ABI.
	class MicrosoftARM64TargetInfo
	    : public WindowsTargetInfo<AArch64leTargetInfo> {
	  const llvm::Triple Triple;

	public:
	  MicrosoftARM64TargetInfo(const llvm::Triple &Triple,
	                           const TargetOptions &Opts)
	      : WindowsTargetInfo<AArch64leTargetInfo>(Triple, Opts), Triple(Triple) {

	    // This is an LLP64 platform.
	    // int:4, long:4, long long:8, long double:8.
	    WCharType = UnsignedShort;

	    IntAlign = 32;
	    IntWidth = 32;
	    LongAlign = 32;
	    LongWidth = 32;
	    LongLongAlign = 64;
	    DoubleAlign = 64;
	    LongDoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();

	    IntMaxType = SignedLongLong;
	    Int64Type = SignedLongLong;
	    SizeType = UnsignedLongLong;
	    PtrDiffType = SignedLongLong;
	    IntPtrType = SignedLongLong;

	    TheCXXABI.set(TargetCXXABI::Microsoft);
	  }

	  // Installs the data layout string used for this target.
	  void setDataLayout() override {
	    resetDataLayout("e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128");
	  }

	  // Emits the generic Visual Studio macros, then _WIN32, _WIN64 and
	  // _M_ARM64.
	  void getVisualStudioDefines(const LangOptions &Opts,
	                              MacroBuilder &Builder) const {
	    WindowsTargetInfo<AArch64leTargetInfo>::getVisualStudioDefines(Opts,
	                                                                   Builder);
	    Builder.defineMacro("_WIN32", "1");
	    Builder.defineMacro("_WIN64", "1");
	    Builder.defineMacro("_M_ARM64", "1");
	  }

	  // Emits the inherited Windows target macros followed by the Visual
	  // Studio macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WindowsTargetInfo::getTargetDefines(Opts, Builder);
	    getVisualStudioDefines(Opts, Builder);
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }
	};

	// Big-endian AArch64 target.
	class AArch64beTargetInfo : public AArch64TargetInfo {
	// Installs the big-endian data layout; asserts that the object format is
	// not Mach-O.
	void setDataLayout() override {
	assert(!getTriple().isOSBinFormatMachO());
	resetDataLayout("E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128");
	}

	public:
	AArch64beTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: AArch64TargetInfo(Triple, Opts) {}
	// Emits the three big-endian macros first, then the common AArch64 macros.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__AARCH64EB__");
	Builder.defineMacro("__AARCH_BIG_ENDIAN");
	Builder.defineMacro("__ARM_BIG_ENDIAN");
	AArch64TargetInfo::getTargetDefines(Opts, Builder);
	}
	};

	// AArch64 target for Darwin platforms.
	class DarwinAArch64TargetInfo : public DarwinTargetInfo<AArch64leTargetInfo> {
	protected:
	  // Emits the Darwin arm64 identification macros and then the generic
	  // Darwin macros for this platform name and minimum version.
	  void getOSDefines(const LangOptions &Opts, const llvm::Triple &Triple,
	                    MacroBuilder &Builder) const override {
	    Builder.defineMacro("__AARCH64_SIMD__");
	    Builder.defineMacro("__ARM64_ARCH_8__");
	    Builder.defineMacro("__ARM_NEON__");
	    Builder.defineMacro("__LITTLE_ENDIAN__");
	    Builder.defineMacro("__REGISTER_PREFIX__", "");
	    Builder.defineMacro("__arm64", "1");
	    Builder.defineMacro("__arm64__", "1");

	    getDarwinDefines(Builder, Opts, Triple, PlatformName, PlatformMinVersion);
	  }

	public:
	  DarwinAArch64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : DarwinTargetInfo<AArch64leTargetInfo>(Triple, Opts) {
	    Int64Type = SignedLongLong;
	    WCharType = SignedInt;
	    UseSignedCharForObjCBool = false;

	    // long double is a 64-bit IEEE double on this target.
	    SuitableAlign = 64;
	    LongDoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();

	    TheCXXABI.set(TargetCXXABI::iOS64);
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::CharPtrBuiltinVaList;
	  }
	};

	// Hexagon abstract base class
	class HexagonTargetInfo : public TargetInfo {
	static const Builtin::Info BuiltinInfo[];
	static const char * const GCCRegNames[];
	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	std::string CPU;
	bool HasHVX, HasHVXDouble;
	bool UseLongCalls;

	public:
	HexagonTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple) {
	// Specify the vector alignment explicitly. For v512x1, the calculated
	// alignment would be 512*alignment(i1), which is 512 bytes, instead of
	// the required minimum of 64 bytes.
	resetDataLayout("e-m:e-p:32:32:32-a:0-n16:32-"
	"i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-"
	"v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048");
	SizeType = UnsignedInt;
	PtrDiffType = SignedInt;
	IntPtrType = SignedInt;

	// {} in inline assembly are packet specifiers, not assembly variant
	// specifiers.
	NoAsmVariants = true;

	LargeArrayMinWidth = 64;
	LargeArrayAlign = 64;
	UseBitFieldTypeAlignment = true;
	ZeroLengthBitfieldBoundary = 32;
	HasHVX = HasHVXDouble = false;
	UseLongCalls = false;
	}

	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::Hexagon::LastTSBuiltin-Builtin::FirstTSBuiltin);
	}

	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	case 'v':
	case 'q':
	if (HasHVX) {
	Info.setAllowsRegister();
	return true;
	}
	break;
	case 's':
	// Relocatable constant.
	return true;
	}
	return false;
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override;

	bool isCLZForZeroUndef() const override { return false; }

	bool hasFeature(StringRef Feature) const override {
	return llvm::StringSwitch<bool>(Feature)
	.Case("hexagon", true)
	.Case("hvx", HasHVX)
	.Case("hvx-double", HasHVXDouble)
	.Case("long-calls", UseLongCalls)
	.Default(false);
	}

	bool initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU, const std::vector<std::string> &FeaturesVec)
	const override;

	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override;

	void setFeatureEnabled(llvm::StringMap<bool> &Features, StringRef Name,
	bool Enabled) const override;

	// Hexagon uses the char-pointer flavour of __builtin_va_list.
	BuiltinVaListKind getBuiltinVaListKind() const override {
	  const BuiltinVaListKind Kind = TargetInfo::CharPtrBuiltinVaList;
	  return Kind;
	}
	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
	// No implicit clobbers: returns the empty string.
	const char *getClobbers() const override { return ""; }

	// Maps a full CPU name ("hexagonvNN") to its version suffix ("NN").
	// Returns nullptr when the name is not one of the known CPUs.
	static const char *getHexagonCPUSuffix(StringRef Name) {
	  static const struct {
	    const char *CPUName;
	    const char *Suffix;
	  } KnownCPUs[] = {
	      {"hexagonv4", "4"},   {"hexagonv5", "5"},   {"hexagonv55", "55"},
	      {"hexagonv60", "60"}, {"hexagonv62", "62"},
	  };

	  for (const auto &Entry : KnownCPUs)
	    if (Name == Entry.CPUName)
	      return Entry.Suffix;
	  return nullptr;
	}

	// Records Name as the selected CPU when getHexagonCPUSuffix() recognises
	// it. Returns false, leaving CPU untouched, otherwise.
	bool setCPU(const std::string &Name) override {
	  const bool IsKnown = getHexagonCPUSuffix(Name) != nullptr;
	  if (IsKnown)
	    CPU = Name;
	  return IsKnown;
	}

	// EH data registers 0 and 1 map to themselves; any other index yields -1.
	int getEHDataRegisterNumber(unsigned RegNo) const override {
	  if (RegNo < 2)
	    return RegNo;
	  return -1;
	}
	};

	// Emits the predefined macros for the Hexagon target: the generic
	// markers, the per-CPU architecture macros, and the HVX feature macros.
	void HexagonTargetInfo::getTargetDefines(const LangOptions &Opts,
	                                         MacroBuilder &Builder) const {
	  // Defined for every Hexagon CPU.
	  Builder.defineMacro("__qdsp6__", "1");
	  Builder.defineMacro("__hexagon__", "1");

	  // Per-CPU macros. v4 and v5 emit the __QDSP6_* spellings only when
	  // Opts.HexagonQdsp6Compat is set, v55 and v60 emit them unconditionally,
	  // and v62 does not emit them.
	  if (CPU == "hexagonv4") {
	    Builder.defineMacro("__HEXAGON_V4__");
	    Builder.defineMacro("__HEXAGON_ARCH__", "4");
	    if (Opts.HexagonQdsp6Compat) {
	      Builder.defineMacro("__QDSP6_V4__");
	      Builder.defineMacro("__QDSP6_ARCH__", "4");
	    }
	  } else if (CPU == "hexagonv5") {
	    Builder.defineMacro("__HEXAGON_V5__");
	    Builder.defineMacro("__HEXAGON_ARCH__", "5");
	    if (Opts.HexagonQdsp6Compat) {
	      Builder.defineMacro("__QDSP6_V5__");
	      Builder.defineMacro("__QDSP6_ARCH__", "5");
	    }
	  } else if (CPU == "hexagonv55") {
	    Builder.defineMacro("__HEXAGON_V55__");
	    Builder.defineMacro("__HEXAGON_ARCH__", "55");
	    Builder.defineMacro("__QDSP6_V55__");
	    Builder.defineMacro("__QDSP6_ARCH__", "55");
	  } else if (CPU == "hexagonv60") {
	    Builder.defineMacro("__HEXAGON_V60__");
	    Builder.defineMacro("__HEXAGON_ARCH__", "60");
	    Builder.defineMacro("__QDSP6_V60__");
	    Builder.defineMacro("__QDSP6_ARCH__", "60");
	  } else if (CPU == "hexagonv62") {
	    Builder.defineMacro("__HEXAGON_V62__");
	    Builder.defineMacro("__HEXAGON_ARCH__", "62");
	  }

	  // __HVXDBL__ is only considered once __HVX__ has been defined.
	  if (!hasFeature("hvx"))
	    return;
	  Builder.defineMacro("__HVX__");
	  if (hasFeature("hvx-double"))
	    Builder.defineMacro("__HVXDBL__");
	}

	// Seeds the feature map with every optional Hexagon feature switched off,
	// then defers to TargetInfo::initFeatureMap() for the rest.
	bool HexagonTargetInfo::initFeatureMap(
	    llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
	    const std::vector<std::string> &FeaturesVec) const {
	  // Default for v60: -hvx, -hvx-double.
	  static const char *const OffByDefault[] = {"hvx", "hvx-double",
	                                             "long-calls"};
	  for (const char *FeatureName : OffByDefault)
	    Features[FeatureName] = false;

	  return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
	}

	// Walks the "+name"/"-name" feature list in order and updates the HVX and
	// long-calls flags. Enabling hvx-double also enables hvx; disabling hvx
	// also disables hvx-double. Always returns true.
	bool HexagonTargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
	                                             DiagnosticsEngine &Diags) {
	  for (const std::string &Entry : Features) {
	    if (Entry == "+hvx") {
	      HasHVX = true;
	    } else if (Entry == "-hvx") {
	      HasHVX = false;
	      HasHVXDouble = false;
	    } else if (Entry == "+hvx-double") {
	      HasHVX = true;
	      HasHVXDouble = true;
	    } else if (Entry == "-hvx-double") {
	      HasHVXDouble = false;
	    } else if (Entry == "+long-calls") {
	      UseLongCalls = true;
	    } else if (Entry == "-long-calls") {
	      UseLongCalls = false;
	    }
	  }
	  return true;
	}

	// Sets Features[Name] to Enabled while keeping the HVX pair consistent:
	// turning hvx-double on also turns hvx on, and turning hvx off also turns
	// hvx-double off.
	void HexagonTargetInfo::setFeatureEnabled(llvm::StringMap<bool> &Features,
	                                          StringRef Name, bool Enabled) const {
	  if (Enabled && Name == "hvx-double")
	    Features["hvx"] = true;
	  else if (!Enabled && Name == "hvx")
	    Features["hvx-double"] = false;

	  Features[Name] = Enabled;
	}

	// Register names returned by getGCCRegNames().
	const char *const HexagonTargetInfo::GCCRegNames[] = {
	    // r0 - r31
	    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",  "r8",  "r9",  "r10",
	    "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21",
	    "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
	    // p0 - p3
	    "p0",  "p1",  "p2",  "p3",
	    // Remaining named registers
	    "sa0", "lc0", "sa1", "lc1", "m0",  "m1",  "usr", "ugp"};

	// Exposes the static GCCRegNames table as an ArrayRef.
	ArrayRef<const char *> HexagonTargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Alternate spellings for r29, r30 and r31.
	const TargetInfo::GCCRegAlias HexagonTargetInfo::GCCRegAliases[] = {
	    {{"sp"}, "r29"},
	    {{"fp"}, "r30"},
	    {{"lr"}, "r31"},
	};

	// Exposes the static GCCRegAliases table as an ArrayRef.
	ArrayRef<TargetInfo::GCCRegAlias> HexagonTargetInfo::getGCCRegAliases() const {
	  ArrayRef<TargetInfo::GCCRegAlias> Aliases =
	      llvm::makeArrayRef(GCCRegAliases);
	  return Aliases;
	}


	// Hexagon builtin table. BUILTIN and LIBBUILTIN are defined here so that
	// each use of them inside BuiltinsHexagon.def expands to one Builtin::Info
	// initializer; LIBBUILTIN additionally records a header name, and neither
	// records a required feature (last field is nullptr).
	const Builtin::Info HexagonTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsHexagon.def"
	};

	class LanaiTargetInfo : public TargetInfo {
	// Class for Lanai (32-bit).
	// The CPU profiles supported by the Lanai backend
	enum CPUKind {
	CK_NONE,
	CK_V11,
	} CPU;

	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	static const char *const GCCRegNames[];

	public:
	LanaiTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple) {
	// Description string has to be kept in sync with backend.
	resetDataLayout("E" // Big endian
	"-m:e" // ELF name manging
	"-p:32:32" // 32 bit pointers, 32 bit aligned
	"-i64:64" // 64 bit integers, 64 bit aligned
	"-a:0:32" // 32 bit alignment of objects of aggregate type
	"-n32" // 32 bit native integer width
	"-S64" // 64 bit natural stack alignment
	);

	// Setting RegParmMax equal to what mregparm was set to in the old
	// toolchain
	RegParmMax = 4;

	// Set the default CPU to V11
	CPU = CK_V11;

	// Temporary approach to make everything at least word-aligned and allow for
	// safely casting between pointers with different alignment requirements.
	// TODO: Remove this when there are no more cast align warnings on the
	// firmware.
	MinGlobalAlign = 32;
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	// Define __lanai__ when building for target lanai.
	Builder.defineMacro("__lanai__");

	// Set define for the CPU specified.
	switch (CPU) {
	case CK_V11:
	Builder.defineMacro("__LANAI_V11__");
	break;
	case CK_NONE:
	llvm_unreachable("Unhandled target CPU");
	}
	}

	bool setCPU(const std::string &Name) override {
	CPU = llvm::StringSwitch<CPUKind>(Name)
	.Case("v11", CK_V11)
	.Default(CK_NONE);

	return CPU != CK_NONE;
	}

	bool hasFeature(StringRef Feature) const override {
	return llvm::StringSwitch<bool>(Feature).Case("lanai", true).Default(false);
	}

	ArrayRef<const char *> getGCCRegNames() const override;

	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;

	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::VoidPtrBuiltinVaList;
	}

	ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }

	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &info) const override {
	return false;
	}

	const char *getClobbers() const override { return ""; }
	};

	// Register names returned by getGCCRegNames(): r0 through r31.
	const char *const LanaiTargetInfo::GCCRegNames[] = {
	    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
	    "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
	    "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
	    "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31",
	};

	// Exposes the static GCCRegNames table as an ArrayRef.
	ArrayRef<const char *> LanaiTargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Alternate spellings for a subset of the r-registers.
	const TargetInfo::GCCRegAlias LanaiTargetInfo::GCCRegAliases[] = {
	    {{"pc"}, "r2"},   {{"sp"}, "r4"},   {{"fp"}, "r5"},   {{"rv"}, "r8"},
	    {{"rr1"}, "r10"}, {{"rr2"}, "r11"}, {{"rca"}, "r15"},
	};

	// Exposes the static GCCRegAliases table as an ArrayRef.
	ArrayRef<TargetInfo::GCCRegAlias> LanaiTargetInfo::getGCCRegAliases() const {
	  ArrayRef<TargetInfo::GCCRegAlias> Aliases =
	      llvm::makeArrayRef(GCCRegAliases);
	  return Aliases;
	}

	// Shared base class for SPARC v8 (32-bit) and SPARC v9 (64-bit).
	class SparcTargetInfo : public TargetInfo {
	static const TargetInfo::GCCRegAlias GCCRegAliases[];
	static const char * const GCCRegNames[];
	bool SoftFloat;
	public:
	SparcTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple), SoftFloat(false) {}

	int getEHDataRegisterNumber(unsigned RegNo) const override {
	if (RegNo == 0) return 24;
	if (RegNo == 1) return 25;
	return -1;
	}

	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override {
	// Check if software floating point is enabled
	auto Feature = std::find(Features.begin(), Features.end(), "+soft-float");
	if (Feature != Features.end()) {
	SoftFloat = true;
	}
	return true;
	}
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	DefineStd(Builder, "sparc", Opts);
	Builder.defineMacro("__REGISTER_PREFIX__", "");

	if (SoftFloat)
	Builder.defineMacro("SOFT_FLOAT", "1");
	}

	bool hasFeature(StringRef Feature) const override {
	return llvm::StringSwitch<bool>(Feature)
	.Case("softfloat", SoftFloat)
	.Case("sparc", true)
	.Default(false);
	}

	bool hasSjLjLowering() const override {
	return true;
	}

	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	// FIXME: Implement!
	return None;
	}
	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::VoidPtrBuiltinVaList;
	}
	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &info) const override {
	// FIXME: Implement!
	switch (*Name) {
	case 'I': // Signed 13-bit constant
	case 'J': // Zero
	case 'K': // 32-bit constant with the low 12 bits clear
	case 'L': // A constant in the range supported by movcc (11-bit signed imm)
	case 'M': // A constant in the range supported by movrcc (19-bit signed imm)
	case 'N': // Same as 'K' but zext (required for SIMode)
	case 'O': // The constant 4096
	return true;

	case 'f':
	case 'e':
	info.setAllowsRegister();
	return true;
	}
	return false;
	}
	const char *getClobbers() const override {
	// FIXME: Implement!
	return "";
	}

	// No Sparc V7 for now, the backend doesn't support it anyway.
	enum CPUKind {
	CK_GENERIC,
	CK_V8,
	CK_SUPERSPARC,
	CK_SPARCLITE,
	CK_F934,
	CK_HYPERSPARC,
	CK_SPARCLITE86X,
	CK_SPARCLET,
	CK_TSC701,
	CK_V9,
	CK_ULTRASPARC,
	CK_ULTRASPARC3,
	CK_NIAGARA,
	CK_NIAGARA2,
	CK_NIAGARA3,
	CK_NIAGARA4,
	CK_MYRIAD2100,
	CK_MYRIAD2150,
	CK_MYRIAD2450,
	CK_LEON2,
	CK_LEON2_AT697E,
	CK_LEON2_AT697F,
	CK_LEON3,
	CK_LEON3_UT699,
	CK_LEON3_GR712RC,
	CK_LEON4,
	CK_LEON4_GR740
	} CPU = CK_GENERIC;

	enum CPUGeneration {
	CG_V8,
	CG_V9,
	};

	CPUGeneration getCPUGeneration(CPUKind Kind) const {
	switch (Kind) {
	case CK_GENERIC:
	case CK_V8:
	case CK_SUPERSPARC:
	case CK_SPARCLITE:
	case CK_F934:
	case CK_HYPERSPARC:
	case CK_SPARCLITE86X:
	case CK_SPARCLET:
	case CK_TSC701:
	case CK_MYRIAD2100:
	case CK_MYRIAD2150:
	case CK_MYRIAD2450:
	case CK_LEON2:
	case CK_LEON2_AT697E:
	case CK_LEON2_AT697F:
	case CK_LEON3:
	case CK_LEON3_UT699:
	case CK_LEON3_GR712RC:
	case CK_LEON4:
	case CK_LEON4_GR740:
	return CG_V8;
	case CK_V9:
	case CK_ULTRASPARC:
	case CK_ULTRASPARC3:
	case CK_NIAGARA:
	case CK_NIAGARA2:
	case CK_NIAGARA3:
	case CK_NIAGARA4:
	return CG_V9;
	}
	llvm_unreachable("Unexpected CPU kind");
	}

	CPUKind getCPUKind(StringRef Name) const {
	return llvm::StringSwitch<CPUKind>(Name)
	.Case("v8", CK_V8)
	.Case("supersparc", CK_SUPERSPARC)
	.Case("sparclite", CK_SPARCLITE)
	.Case("f934", CK_F934)
	.Case("hypersparc", CK_HYPERSPARC)
	.Case("sparclite86x", CK_SPARCLITE86X)
	.Case("sparclet", CK_SPARCLET)
	.Case("tsc701", CK_TSC701)
	.Case("v9", CK_V9)
	.Case("ultrasparc", CK_ULTRASPARC)
	.Case("ultrasparc3", CK_ULTRASPARC3)
	.Case("niagara", CK_NIAGARA)
	.Case("niagara2", CK_NIAGARA2)
	.Case("niagara3", CK_NIAGARA3)
	.Case("niagara4", CK_NIAGARA4)
	.Case("ma2100", CK_MYRIAD2100)
	.Case("ma2150", CK_MYRIAD2150)
	.Case("ma2450", CK_MYRIAD2450)
	// FIXME: the myriad2[.n] spellings are obsolete,
	// but a grace period is needed to allow updating dependent builds.
	.Case("myriad2", CK_MYRIAD2100)
	.Case("myriad2.1", CK_MYRIAD2100)
	.Case("myriad2.2", CK_MYRIAD2150)
	.Case("leon2", CK_LEON2)
	.Case("at697e", CK_LEON2_AT697E)
	.Case("at697f", CK_LEON2_AT697F)
	.Case("leon3", CK_LEON3)
	.Case("ut699", CK_LEON3_UT699)
	.Case("gr712rc", CK_LEON3_GR712RC)
	.Case("leon4", CK_LEON4)
	.Case("gr740", CK_LEON4_GR740)
	.Default(CK_GENERIC);
	}

	bool setCPU(const std::string &Name) override {
	CPU = getCPUKind(Name);
	return CPU != CK_GENERIC;
	}
	};

	// Register names returned by getGCCRegNames(): r0 through r31.
	const char *const SparcTargetInfo::GCCRegNames[] = {
	    "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",  "r8",  "r9",  "r10",
	    "r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20", "r21",
	    "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30", "r31"};

	// Exposes the static GCCRegNames table as an ArrayRef.
	ArrayRef<const char *> SparcTargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Alternate spellings for r0-r31: g0-g7, o0-o7, l0-l7 and i0-i7, with
	// "sp" and "fp" as additional names for r14 and r30.
	const TargetInfo::GCCRegAlias SparcTargetInfo::GCCRegAliases[] = {
	    // g0 - g7
	    {{"g0"}, "r0"},
	    {{"g1"}, "r1"},
	    {{"g2"}, "r2"},
	    {{"g3"}, "r3"},
	    {{"g4"}, "r4"},
	    {{"g5"}, "r5"},
	    {{"g6"}, "r6"},
	    {{"g7"}, "r7"},
	    // o0 - o7
	    {{"o0"}, "r8"},
	    {{"o1"}, "r9"},
	    {{"o2"}, "r10"},
	    {{"o3"}, "r11"},
	    {{"o4"}, "r12"},
	    {{"o5"}, "r13"},
	    {{"o6", "sp"}, "r14"},
	    {{"o7"}, "r15"},
	    // l0 - l7
	    {{"l0"}, "r16"},
	    {{"l1"}, "r17"},
	    {{"l2"}, "r18"},
	    {{"l3"}, "r19"},
	    {{"l4"}, "r20"},
	    {{"l5"}, "r21"},
	    {{"l6"}, "r22"},
	    {{"l7"}, "r23"},
	    // i0 - i7
	    {{"i0"}, "r24"},
	    {{"i1"}, "r25"},
	    {{"i2"}, "r26"},
	    {{"i3"}, "r27"},
	    {{"i4"}, "r28"},
	    {{"i5"}, "r29"},
	    {{"i6", "fp"}, "r30"},
	    {{"i7"}, "r31"},
	};

	// Exposes the static GCCRegAliases table as an ArrayRef.
	ArrayRef<TargetInfo::GCCRegAlias> SparcTargetInfo::getGCCRegAliases() const {
	  ArrayRef<TargetInfo::GCCRegAlias> Aliases =
	      llvm::makeArrayRef(GCCRegAliases);
	  return Aliases;
	}

	// SPARC v8 is the 32-bit mode selected by Triple::sparc.
	class SparcV8TargetInfo : public SparcTargetInfo {
	public:
	  SparcV8TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : SparcTargetInfo(Triple, Opts) {
	    resetDataLayout("E-m:e-p:32:32-i64:64-f128:64-n32-S64");

	    // NetBSD / OpenBSD use long (same as llvm default); everyone else uses
	    // int.
	    const llvm::Triple::OSType OS = getTriple().getOS();
	    if (OS == llvm::Triple::NetBSD || OS == llvm::Triple::OpenBSD) {
	      SizeType = UnsignedLong;
	      IntPtrType = SignedLong;
	      PtrDiffType = SignedLong;
	    } else {
	      SizeType = UnsignedInt;
	      IntPtrType = SignedInt;
	      PtrDiffType = SignedInt;
	    }

	    // Up to 32 bits are lock-free atomic, but we're willing to do atomic ops
	    // on up to 64 bits.
	    MaxAtomicPromoteWidth = 64;
	    MaxAtomicInlineWidth = 32;
	  }

	  // Adds the generation-specific macros on top of the common SPARC ones,
	  // plus the Myriad macros when the triple's vendor is Myriad.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    SparcTargetInfo::getTargetDefines(Opts, Builder);

	    // The double-underscore-suffixed spellings are skipped on Solaris.
	    const bool IsSolaris = getTriple().getOS() == llvm::Triple::Solaris;
	    switch (getCPUGeneration(CPU)) {
	    case CG_V8:
	      Builder.defineMacro("__sparcv8");
	      if (!IsSolaris)
	        Builder.defineMacro("__sparcv8__");
	      break;
	    case CG_V9:
	      Builder.defineMacro("__sparcv9");
	      if (!IsSolaris) {
	        Builder.defineMacro("__sparcv9__");
	        Builder.defineMacro("__sparc_v9__");
	      }
	      break;
	    }

	    if (getTriple().getVendor() != llvm::Triple::Myriad)
	      return;

	    Builder.defineMacro("__sparc_v8__");
	    Builder.defineMacro("__leon__");

	    // Any CPU other than ma2150 / ma2450 is treated as ma2100.
	    std::string MyriadArchValue;
	    std::string Myriad2Value;
	    if (CPU == CK_MYRIAD2150) {
	      MyriadArchValue = "__ma2150";
	      Myriad2Value = "2";
	    } else if (CPU == CK_MYRIAD2450) {
	      MyriadArchValue = "__ma2450";
	      Myriad2Value = "2";
	    } else {
	      MyriadArchValue = "__ma2100";
	      Myriad2Value = "1";
	    }
	    Builder.defineMacro(MyriadArchValue, "1");
	    Builder.defineMacro(MyriadArchValue + "__", "1");
	    Builder.defineMacro("__myriad2__", Myriad2Value);
	    Builder.defineMacro("__myriad2", Myriad2Value);
	  }

	  bool hasSjLjLowering() const override { return true; }
	};

	// SPARCV8el is the 32-bit little-endian mode selected by Triple::sparcel.
	// It only replaces the data layout installed by SparcV8TargetInfo with
	// the "e" (little-endian) variant of the same string.
	class SparcV8elTargetInfo : public SparcV8TargetInfo {
	public:
	  SparcV8elTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : SparcV8TargetInfo(Triple, Opts) {
	    resetDataLayout("e-m:e-p:32:32-i64:64-f128:64-n32-S64");
	  }
	};

	// SPARC v9 is the 64-bit mode selected by Triple::sparcv9.
	class SparcV9TargetInfo : public SparcTargetInfo {
	public:
	  SparcV9TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : SparcTargetInfo(Triple, Opts) {
	    // FIXME: Support Sparc quad-precision long double?
	    resetDataLayout("E-m:e-i64:64-n32:64-S128");

	    // This is an LP64 platform.
	    LongWidth = LongAlign = PointerWidth = PointerAlign = 64;

	    // OpenBSD uses long long for int64_t and intmax_t.
	    const bool IsOpenBSD = getTriple().getOS() == llvm::Triple::OpenBSD;
	    IntMaxType = IsOpenBSD ? SignedLongLong : SignedLong;
	    Int64Type = IntMaxType;

	    // The SPARCv8 System V ABI has long double 128-bits in size, but 64-bit
	    // aligned. The SPARCv9 SCD 2.4.1 says 16-byte aligned.
	    LongDoubleWidth = 128;
	    LongDoubleAlign = 128;
	    LongDoubleFormat = &llvm::APFloat::IEEEquad();

	    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	  }

	  // Adds the 64-bit SPARC macros on top of the common SPARC ones.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    SparcTargetInfo::getTargetDefines(Opts, Builder);
	    Builder.defineMacro("__sparcv9");
	    Builder.defineMacro("__arch64__");

	    // Solaris doesn't need these variants, but the BSDs do.
	    if (getTriple().getOS() == llvm::Triple::Solaris)
	      return;
	    Builder.defineMacro("__sparc64__");
	    Builder.defineMacro("__sparc_v9__");
	    Builder.defineMacro("__sparcv9__");
	  }

	  // Accepts only CPU names the base class recognises and that belong to
	  // the V9 generation.
	  bool setCPU(const std::string &Name) override {
	    return SparcTargetInfo::setCPU(Name) && getCPUGeneration(CPU) == CG_V9;
	  }
	};

	class SystemZTargetInfo : public TargetInfo {
	static const Builtin::Info BuiltinInfo[];
	static const char *const GCCRegNames[];
	std::string CPU;
	int ISARevision;
	bool HasTransactionalExecution;
	bool HasVector;

	public:
	SystemZTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple), CPU("z10"), ISARevision(8),
	HasTransactionalExecution(false), HasVector(false) {
	IntMaxType = SignedLong;
	Int64Type = SignedLong;
	TLSSupported = true;
	IntWidth = IntAlign = 32;
	LongWidth = LongLongWidth = LongAlign = LongLongAlign = 64;
	PointerWidth = PointerAlign = 64;
	LongDoubleWidth = 128;
	LongDoubleAlign = 64;
	LongDoubleFormat = &llvm::APFloat::IEEEquad();
	DefaultAlignForAttributeAligned = 64;
	MinGlobalAlign = 16;
	resetDataLayout("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64-a:8:16-n32:64");
	MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	}
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	Builder.defineMacro("__s390__");
	Builder.defineMacro("__s390x__");
	Builder.defineMacro("__zarch__");
	Builder.defineMacro("__LONG_DOUBLE_128__");

	Builder.defineMacro("__ARCH__", Twine(ISARevision));

	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");

	if (HasTransactionalExecution)
	Builder.defineMacro("__HTM__");
	if (HasVector)
	Builder.defineMacro("__VX__");
	if (Opts.ZVector)
	Builder.defineMacro("__VEC__", "10302");
	}
	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::SystemZ::LastTSBuiltin-Builtin::FirstTSBuiltin);
	}

	ArrayRef<const char *> getGCCRegNames() const override;
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	// No aliases.
	return None;
	}
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &info) const override;
	const char *getClobbers() const override {
	// FIXME: Is this really right?
	return "";
	}
	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::SystemZBuiltinVaList;
	}
	int getISARevision(const StringRef &Name) const {
	return llvm::StringSwitch<int>(Name)
	.Cases("arch8", "z10", 8)
	.Cases("arch9", "z196", 9)
	.Cases("arch10", "zEC12", 10)
	.Cases("arch11", "z13", 11)
	.Cases("arch12", "z14", 12)
	.Default(-1);
	}
	bool setCPU(const std::string &Name) override {
	CPU = Name;
	ISARevision = getISARevision(CPU);
	return ISARevision != -1;
	}
	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override {
	int ISARevision = getISARevision(CPU);
	if (ISARevision >= 10)
	Features["transactional-execution"] = true;
	if (ISARevision >= 11)
	Features["vector"] = true;
	if (ISARevision >= 12)
	Features["vector-enhancements-1"] = true;
	return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
	}

	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override {
	HasTransactionalExecution = false;
	HasVector = false;
	for (const auto &Feature : Features) {
	if (Feature == "+transactional-execution")
	HasTransactionalExecution = true;
	else if (Feature == "+vector")
	HasVector = true;
	}
	// If we use the vector ABI, vector types are 64-bit aligned.
	if (HasVector) {
	MaxVectorAlign = 64;
	resetDataLayout("E-m:e-i1:8:16-i8:8:16-i64:64-f128:64"
	"-v128:64-a:8:16-n32:64");
	}
	return true;
	}

	bool hasFeature(StringRef Feature) const override {
	return llvm::StringSwitch<bool>(Feature)
	.Case("systemz", true)
	.Case("arch8", ISARevision >= 8)
	.Case("arch9", ISARevision >= 9)
	.Case("arch10", ISARevision >= 10)
	.Case("arch11", ISARevision >= 11)
	.Case("arch12", ISARevision >= 12)
	.Case("htm", HasTransactionalExecution)
	.Case("vx", HasVector)
	.Default(false);
	}

	CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	switch (CC) {
	case CC_C:
	case CC_Swift:
	case CC_OpenCLKernel:
	return CCCR_OK;
	default:
	return CCCR_Warning;
	}
	}

	StringRef getABI() const override {
	if (HasVector)
	return "vector";
	return "";
	}

	bool useFloat128ManglingForLongDouble() const override {
	return true;
	}
	};

	// SystemZ builtin table. BUILTIN and TARGET_BUILTIN are defined here so
	// that each use of them inside BuiltinsSystemZ.def expands to one
	// Builtin::Info initializer; TARGET_BUILTIN additionally records the
	// FEATURE string in the last field.
	const Builtin::Info SystemZTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE },
	#include "clang/Basic/BuiltinsSystemZ.def"
	};

	// Register names returned by getGCCRegNames(). The f-registers are listed
	// in the interleaved order below rather than numerically; the order is
	// part of the table's contents.
	const char *const SystemZTargetInfo::GCCRegNames[] = {
	    // r0 - r15
	    "r0", "r1", "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
	    "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	    // f-registers
	    "f0", "f2",  "f4",  "f6",  "f1", "f3",  "f5",  "f7",
	    "f8", "f10", "f12", "f14", "f9", "f11", "f13", "f15"};

	// Exposes the static GCCRegNames table as an ArrayRef.
	ArrayRef<const char *> SystemZTargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// Classifies a SystemZ inline-asm constraint letter as a register
	// constraint, a memory constraint, or a plain constant constraint.
	// Returns false for any letter not listed.
	bool SystemZTargetInfo::validateAsmConstraint(
	    const char *&Name, TargetInfo::ConstraintInfo &Info) const {
	  switch (*Name) {
	  case 'a': // Address register
	  case 'd': // Data register (equivalent to 'r')
	  case 'f': // Floating-point register
	    Info.setAllowsRegister();
	    return true;

	  case 'Q': // Memory with base and unsigned 12-bit displacement
	  case 'R': // Likewise, plus an index
	  case 'S': // Memory with base and signed 20-bit displacement
	  case 'T': // Likewise, plus an index
	    Info.setAllowsMemory();
	    return true;

	  case 'I': // Unsigned 8-bit constant
	  case 'J': // Unsigned 12-bit constant
	  case 'K': // Signed 16-bit constant
	  case 'L': // Signed 20-bit displacement (on all targets we support)
	  case 'M': // 0x7fffffff
	    return true;

	  default:
	    return false;
	  }
	}

	// Target description for MSP430: 16-bit int and pointers, 32-bit long,
	// 64-bit long long, all 16-bit aligned.
	class MSP430TargetInfo : public TargetInfo {
	  static const char *const GCCRegNames[];

	public:
	  MSP430TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    TLSSupported = false;

	    // Widths.
	    IntWidth = 16;
	    LongWidth = 32;
	    LongLongWidth = 64;
	    PointerWidth = 16;

	    // Alignments.
	    IntAlign = 16;
	    LongAlign = LongLongAlign = 16;
	    PointerAlign = 16;
	    SuitableAlign = 16;

	    // Underlying integer types.
	    SizeType = UnsignedInt;
	    IntMaxType = SignedLongLong;
	    IntPtrType = SignedInt;
	    PtrDiffType = SignedInt;
	    SigAtomicType = SignedLong;

	    resetDataLayout("e-m:e-p:16:16-i32:16-i64:16-f32:16-f64:16-a:8-n8:16-S16");
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Builder.defineMacro("MSP430");
	    Builder.defineMacro("__MSP430__");
	    // FIXME: defines for different 'flavours' of MCU
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	    // FIXME: Implement.
	    return None;
	  }

	  bool hasFeature(StringRef Feature) const override {
	    const bool IsMSP430 = Feature == "msp430";
	    return IsMSP430;
	  }

	  ArrayRef<const char *> getGCCRegNames() const override;

	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    // No aliases.
	    return None;
	  }

	  // Only the constant constraints 'K', 'L' and 'M' are accepted.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &info) const override {
	    // FIXME: implement
	    switch (*Name) {
	    case 'K': // the constant 1
	    case 'L': // constant -1^20 .. 1^19
	    case 'M': // constant 1-4:
	      return true;
	    default:
	      // No target constraints for now.
	      return false;
	    }
	  }

	  const char *getClobbers() const override {
	    // FIXME: Is this really right?
	    return "";
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    // FIXME: implement
	    return TargetInfo::CharPtrBuiltinVaList;
	  }
	};

	// Register names returned by getGCCRegNames(): r0 through r15.
	const char *const MSP430TargetInfo::GCCRegNames[] = {
	    "r0", "r1", "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
	    "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
	};

	// Exposes the static GCCRegNames table as an ArrayRef.
	ArrayRef<const char *> MSP430TargetInfo::getGCCRegNames() const {
	  ArrayRef<const char *> Names = llvm::makeArrayRef(GCCRegNames);
	  return Names;
	}

	// LLVM and Clang cannot be used directly to output native binaries for
	// the TCE target, but they are used to compile C code to LLVM bitcode
	// with correct type and alignment information.
	//
	// TCE uses the LLVM bitcode as input for generating a customized target
	// processor and program binary. The TCE co-design environment is
	// publicly available at http://tce.cs.tut.fi

	// Target address-space number for each language address space, in the
	// order listed in the per-entry comments. Installed as AddrSpaceMap by
	// TCETargetInfo.
	static const unsigned TCEOpenCLAddrSpaceMap[] = {
	    0, // Default
	    3, // opencl_global
	    4, // opencl_local
	    5, // opencl_constant
	    // FIXME: generic has to be added to the target
	    0, // opencl_generic
	    0, // cuda_device
	    0, // cuda_constant
	    0  // cuda_shared
	};

	// Target description for TCE (big-endian data layout). Every integer,
	// pointer and floating-point type is 32 bits wide and 32-bit aligned, and
	// all floating-point types use the IEEE single format.
	class TCETargetInfo : public TargetInfo {
	public:
	  TCETargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    TLSSupported = false;

	    // Integer and pointer widths / alignments.
	    IntWidth = 32;
	    LongWidth = LongLongWidth = 32;
	    PointerWidth = 32;
	    IntAlign = 32;
	    LongAlign = LongLongAlign = 32;
	    PointerAlign = 32;
	    SuitableAlign = 32;

	    // Underlying integer types.
	    SizeType = UnsignedInt;
	    IntMaxType = SignedLong;
	    IntPtrType = SignedInt;
	    PtrDiffType = SignedInt;

	    // Floating-point widths, alignments and formats.
	    FloatWidth = 32;
	    FloatAlign = 32;
	    DoubleWidth = 32;
	    DoubleAlign = 32;
	    LongDoubleWidth = 32;
	    LongDoubleAlign = 32;
	    FloatFormat = &llvm::APFloat::IEEEsingle();
	    DoubleFormat = &llvm::APFloat::IEEEsingle();
	    LongDoubleFormat = &llvm::APFloat::IEEEsingle();

	    resetDataLayout("E-p:32:32:32-i1:8:8-i8:8:32-"
	                    "i16:16:32-i32:32:32-i64:32:32-"
	                    "f32:32:32-f64:32:32-v64:32:32-"
	                    "v128:32:32-v256:32:32-v512:32:32-"
	                    "v1024:32:32-a0:0:32-n32");

	    AddrSpaceMap = &TCEOpenCLAddrSpaceMap;
	    UseAddrSpaceMapMangling = true;
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    DefineStd(Builder, "tce", Opts);
	    Builder.defineMacro("__TCE__");
	    Builder.defineMacro("__TCE_V1__");
	  }

	  bool hasFeature(StringRef Feature) const override {
	    return Feature == "tce";
	  }

	  // No target builtins, register names or register aliases are provided.
	  ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }
	  ArrayRef<const char *> getGCCRegNames() const override { return None; }
	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  const char *getClobbers() const override { return ""; }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::VoidPtrBuiltinVaList;
	  }

	  // Every constraint is accepted without inspection.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &info) const override {
	    return true;
	  }
	};

	class TCELETargetInfo : public TCETargetInfo {
	public:
	TCELETargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	: TCETargetInfo(Triple, Opts) {
	BigEndian = false;

	resetDataLayout("e-p:32:32:32-i1:8:8-i8:8:32-"
	"i16:16:32-i32:32:32-i64:32:32-"
	"f32:32:32-f64:32:32-v64:32:32-"
	"v128:32:32-v256:32:32-v512:32:32-"
	"v1024:32:32-a0:0:32-n32");

	}

	virtual void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const {
	DefineStd(Builder, "tcele", Opts);
	Builder.defineMacro("__TCE__");
	Builder.defineMacro("__TCE_V1__");
	Builder.defineMacro("__TCELE__");
	Builder.defineMacro("__TCELE_V1__");
	}

	};

	// Target description for BPF; the data layout's endianness follows the
	// triple's architecture (bpfeb selects big-endian).
	class BPFTargetInfo : public TargetInfo {
	public:
	  BPFTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
	    SizeType = UnsignedLong;
	    PtrDiffType = SignedLong;
	    IntPtrType = SignedLong;
	    IntMaxType = SignedLong;
	    Int64Type = SignedLong;
	    RegParmMax = 5;

	    const bool IsBigEndianArch = Triple.getArch() == llvm::Triple::bpfeb;
	    resetDataLayout(IsBigEndianArch ? "E-m:e-p:64:64-i64:64-n32:64-S128"
	                                    : "e-m:e-p:64:64-i64:64-n32:64-S128");

	    MaxAtomicPromoteWidth = 64;
	    MaxAtomicInlineWidth = 64;
	    TLSSupported = false;
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    DefineStd(Builder, "bpf", Opts);
	    Builder.defineMacro("__BPF__");
	  }

	  bool hasFeature(StringRef Feature) const override {
	    const bool IsBPF = Feature == "bpf";
	    return IsBPF;
	  }

	  // No target builtins, register names or register aliases are provided.
	  ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }
	  ArrayRef<const char *> getGCCRegNames() const override { return None; }
	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  const char *getClobbers() const override { return ""; }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::VoidPtrBuiltinVaList;
	  }

	  // Every constraint is accepted without inspection.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &info) const override {
	    return true;
	  }

	  // Only the C and OpenCL-kernel conventions are accepted; anything else
	  // produces a warning result.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    switch (CC) {
	    case CC_C:
	    case CC_OpenCLKernel:
	      return CCCR_OK;
	    default:
	      return CCCR_Warning;
	    }
	  }
	};

	class Nios2TargetInfo : public TargetInfo {
	void setDataLayout() {
	if (BigEndian)
	resetDataLayout("E-p:32:32:32-i8:8:32-i16:16:32-n32");
	else
	resetDataLayout("e-p:32:32:32-i8:8:32-i16:16:32-n32");
	}

	static const Builtin::Info BuiltinInfo[];
	std::string CPU;
	std::string ABI;

	public:
	Nios2TargetInfo(const llvm::Triple &triple, const TargetOptions &opts)
	: TargetInfo(triple), CPU(opts.CPU), ABI(opts.ABI) {
	SizeType = UnsignedInt;
	PtrDiffType = SignedInt;
	MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 32;
	setDataLayout();
	}

	StringRef getABI() const override { return ABI; }
	bool setABI(const std::string &Name) override {
	if (Name == "o32" \|\| Name == "eabi") {
	ABI = Name;
	return true;
	}
	return false;
	}

	bool setCPU(const std::string &Name) override {
	if (Name == "nios2r1" \|\| Name == "nios2r2") {
	CPU = Name;
	return true;
	}
	return false;
	}

	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	DefineStd(Builder, "nios2", Opts);
	DefineStd(Builder, "NIOS2", Opts);

	Builder.defineMacro("__nios2");
	Builder.defineMacro("__NIOS2");
	Builder.defineMacro("__nios2__");
	Builder.defineMacro("__NIOS2__");
	}

	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	return llvm::makeArrayRef(BuiltinInfo, clang::Nios2::LastTSBuiltin -
	Builtin::FirstTSBuiltin);
	}

	bool isFeatureSupportedByCPU(StringRef Feature, StringRef CPU) const {
	const bool isR2 = CPU == "nios2r2";
	return llvm::StringSwitch<bool>(Feature)
	.Case("nios2r2mandatory", isR2)
	.Case("nios2r2bmx", isR2)
	.Case("nios2r2mpx", isR2)
	.Case("nios2r2cdx", isR2)
	.Default(false);
	}

	bool initFeatureMap(llvm::StringMap<bool> &Features,
	DiagnosticsEngine &Diags, StringRef CPU,
	const std::vector<std::string> &FeatureVec) const override {
	static const char *allFeatures[] = {
	"nios2r2mandatory", "nios2r2bmx", "nios2r2mpx", "nios2r2cdx"
	};
	for (const char *feature : allFeatures) {
	Features[feature] = isFeatureSupportedByCPU(feature, CPU);
	}
	return true;
	}

	bool hasFeature(StringRef Feature) const override {
	return isFeatureSupportedByCPU(Feature, CPU);
	}

	BuiltinVaListKind getBuiltinVaListKind() const override {
	return TargetInfo::VoidPtrBuiltinVaList;
	}

	// Returns the canonical register names: r0-r31 followed by ctl0-ctl15.
	ArrayRef<const char *> getGCCRegNames() const override {
	static const char *const GCCRegNames[] = {
	// CPU register names
	// Must match second column of GCCRegAliases
	"r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", "r9", "r10",
	"r11", "r12", "r13", "r14", "r15", "r16", "r17", "r18", "r19", "r20",
	"r21", "r22", "r23", "r24", "r25", "r26", "r27", "r28", "r29", "r30",
	"r31",
	// Control register names (not floating point: these are the ctlN names that
	// GCCRegAliases maps status, estatus, bstatus, ... onto)
	"ctl0", "ctl1", "ctl2", "ctl3", "ctl4", "ctl5", "ctl6", "ctl7", "ctl8",
	"ctl9", "ctl10", "ctl11", "ctl12", "ctl13", "ctl14", "ctl15"
	};
	return llvm::makeArrayRef(GCCRegNames);
	}

	// Validates the inline-asm constraint letter at *Name. The seven letters
	// listed below mark Info as allowing a register and return true; every other
	// letter is rejected with false.
	// NOTE(review): the per-letter comments mention MIPS16, $25 and the hi/lo
	// registers and match the list in MipsTargetInfo::validateAsmConstraint;
	// they look inherited from there - confirm which letters are meaningful here.
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default:
	return false;

	case 'r': // CPU registers.
	case 'd': // Equivalent to "r" unless generating MIPS16 code.
	case 'y': // Equivalent to "r", backwards compatibility only.
	case 'f': // floating-point registers.
	case 'c': // $25 for indirect jumps
	case 'l': // lo register
	case 'x': // hilo register pair
	Info.setAllowsRegister();
	return true;
	}
	}

	/// No registers are implicitly clobbered by inline assembly on this target.
	const char *getClobbers() const override {
	  return "";
	}

	/// Return the alternative register spellings and the canonical rN / ctlN name
	/// each one maps to.
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	  static const TargetInfo::GCCRegAlias RegAliases[] = {
	      // General-purpose register aliases.
	      {{"zero"}, "r0"},
	      {{"at"}, "r1"},
	      {{"et"}, "r24"},
	      {{"bt"}, "r25"},
	      {{"gp"}, "r26"},
	      {{"sp"}, "r27"},
	      {{"fp"}, "r28"},
	      {{"ea"}, "r29"},
	      {{"ba"}, "r30"},
	      {{"ra"}, "r31"},
	      // Control register aliases (ctl6 and ctl11 have no alias here).
	      {{"status"}, "ctl0"},
	      {{"estatus"}, "ctl1"},
	      {{"bstatus"}, "ctl2"},
	      {{"ienable"}, "ctl3"},
	      {{"ipending"}, "ctl4"},
	      {{"cpuid"}, "ctl5"},
	      {{"exception"}, "ctl7"},
	      {{"pteaddr"}, "ctl8"},
	      {{"tlbacc"}, "ctl9"},
	      {{"tlbmisc"}, "ctl10"},
	      {{"badaddr"}, "ctl12"},
	      {{"config"}, "ctl13"},
	      {{"mpubase"}, "ctl14"},
	      {{"mpuacc"}, "ctl15"},
	  };
	  return llvm::makeArrayRef(RegAliases);
	}
	};

	// Builtin table for Nios2, generated by expanding BuiltinsNios2.def.
	// BUILTIN entries leave the last initializer slot null; TARGET_BUILTIN
	// entries place their FEATURE argument there. Both macros must be defined
	// before the .def file is included.
	const Builtin::Info Nios2TargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr},
	#define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) \
	{#ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, FEATURE},
	#include "clang/Basic/BuiltinsNios2.def"
	};

	class MipsTargetInfo : public TargetInfo {
	/// Rebuild the data layout string from the current ABI and endianness.
	/// Hitting an ABI other than o32/n32/n64 is treated as unreachable.
	void setDataLayout() {
	  StringRef Layout;

	  if (ABI == "n64")
	    Layout = "m:e-i8:8:32-i16:16:32-i64:64-n32:64-S128";
	  else if (ABI == "n32")
	    Layout = "m:e-p:32:32-i8:8:32-i16:16:32-i64:64-n32:64-S128";
	  else if (ABI == "o32")
	    Layout = "m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64";
	  else
	    llvm_unreachable("Invalid ABI");

	  // The endianness marker is prepended to the ABI-specific part.
	  const char *EndianPrefix = BigEndian ? "E-" : "e-";
	  resetDataLayout((EndianPrefix + Layout).str());
	}


	// Target builtin table; the definition follows the class body.
	static const Builtin::Info BuiltinInfo[];
	// Name of the selected CPU (set by the constructor and by setCPU).
	std::string CPU;
	// Feature flags. handleTargetFeatures turns each one on when it sees the
	// matching "+feature" string ("+mips16", "+micromips", "+nan2008",
	// "+single-float", "+noabicalls").
	bool IsMips16;
	bool IsMicromips;
	bool IsNan2008;
	bool IsSingleFloat;
	bool IsNoABICalls;
	// Set by the constructor when the triple's OS is FreeBSD or OpenBSD.
	bool CanUseBSDABICalls;
	// Floating-point ABI; "+soft-float" selects SoftFloat.
	enum MipsFloatABI {
	HardFloat, SoftFloat
	} FloatABI;
	// DSP revision, ordered so that std::max keeps the highest one requested.
	enum DspRevEnum {
	NoDSP, DSP1, DSP2
	} DspRev;
	// Turned on by "+msa".
	bool HasMSA;
	- bool DisableMadd4;

	protected:
	// Turned on/off by "+fp64"/"-fp64"; otherwise defaults via isFP64Default().
	bool HasFP64;
	// Name of the selected ABI ("o32", "n32" or "n64", see setABI).
	std::string ABI;

	public:
	// Constructs the target with every feature flag off, hard-float and no DSP,
	// then derives the default ABI and CPU from the triple.
	MipsTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	: TargetInfo(Triple), IsMips16(false), IsMicromips(false),
	IsNan2008(false), IsSingleFloat(false), IsNoABICalls(false),
	CanUseBSDABICalls(false), FloatABI(HardFloat), DspRev(NoDSP),
	- HasMSA(false), DisableMadd4(false), HasFP64(false) {
	+ HasMSA(false), HasFP64(false) {
	TheCXXABI.set(TargetCXXABI::GenericMIPS);

	// mips/mipsel triples default to o32; every other arch defaults to n64.
	setABI((getTriple().getArch() == llvm::Triple::mips \|\|
	getTriple().getArch() == llvm::Triple::mipsel)
	? "o32"
	: "n64");

	// Default CPU follows the ABI chosen above.
	CPU = ABI == "o32" ? "mips32r2" : "mips64r2";

	CanUseBSDABICalls = Triple.getOS() == llvm::Triple::FreeBSD \|\|
	Triple.getOS() == llvm::Triple::OpenBSD;
	}

	/// True when the selected CPU is mips32r6 or mips64r6; used as the initial
	/// value of IsNan2008 before explicit features are applied.
	bool isNaN2008Default() const {
	  // The logical-or had been mangled into "\|\|", which is not valid C++.
	  return CPU == "mips32r6" || CPU == "mips64r6";
	}

	/// True when the CPU is mips32r6 or the ABI is one of n32 / n64 / "64"; used
	/// as the initial value of HasFP64 before explicit features are applied.
	/// NOTE(review): setABI in this class never stores "64", so that comparison
	/// looks unreachable from here - confirm no subclass assigns it before
	/// removing.
	bool isFP64Default() const {
	  // The logical-ors had been mangled into "\|\|", which is not valid C++.
	  return CPU == "mips32r6" || ABI == "n32" || ABI == "n64" || ABI == "64";
	}

	/// Accessor for the IsNan2008 flag.
	bool isNan2008() const override { return IsNan2008; }

	/// Return true when the selected CPU is one of the names listed below
	/// (mips3-5, the mips64 family and octeon); any other CPU yields false.
	bool processorSupportsGPR64() const {
	  // The StringSwitch result is the function result; the extra
	  // "return false;" that used to follow it could never execute.
	  return llvm::StringSwitch<bool>(CPU)
	      .Case("mips3", true)
	      .Case("mips4", true)
	      .Case("mips5", true)
	      .Case("mips64", true)
	      .Case("mips64r2", true)
	      .Case("mips64r3", true)
	      .Case("mips64r5", true)
	      .Case("mips64r6", true)
	      .Case("octeon", true)
	      .Default(false);
	}

	/// Name of the currently selected ABI.
	StringRef getABI() const override {
	  return ABI;
	}
	/// Select the ABI by name. For "o32", "n32" and "n64" the matching type
	/// setup routine runs, the name is stored and true is returned; any other
	/// name changes nothing and returns false.
	bool setABI(const std::string &Name) override {
	  if (Name == "o32")
	    setO32ABITypes();
	  else if (Name == "n32")
	    setN32ABITypes();
	  else if (Name == "n64")
	    setN64ABITypes();
	  else
	    return false;

	  // Store the name only after the type setup has run.
	  ABI = Name;
	  return true;
	}

	/// Configure type widths, alignments and typedef choices for the o32 ABI.
	void setO32ABITypes() {
	  // 64-bit integers are 'long long'; long and pointers are 32 bits.
	  Int64Type = SignedLongLong;
	  IntMaxType = Int64Type;
	  LongWidth = 32;
	  LongAlign = 32;
	  PointerWidth = 32;
	  PointerAlign = 32;
	  PtrDiffType = SignedInt;
	  SizeType = UnsignedInt;

	  // long double shares the IEEE double format.
	  LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	  LongDoubleWidth = 64;
	  LongDoubleAlign = 64;

	  MaxAtomicPromoteWidth = 32;
	  MaxAtomicInlineWidth = 32;
	  SuitableAlign = 64;
	}

	/// Settings shared by the n32 and n64 ABIs: long double format, atomic
	/// widths and suitable alignment.
	void setN32N64ABITypes() {
	  const bool IsFreeBSD = getTriple().getOS() == llvm::Triple::FreeBSD;
	  if (IsFreeBSD) {
	    // On FreeBSD long double is a 64-bit IEEE double.
	    LongDoubleWidth = 64;
	    LongDoubleAlign = 64;
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	  } else {
	    // Everywhere else it is a 128-bit IEEE quad.
	    LongDoubleWidth = 128;
	    LongDoubleAlign = 128;
	    LongDoubleFormat = &llvm::APFloat::IEEEquad();
	  }
	  MaxAtomicPromoteWidth = 64;
	  MaxAtomicInlineWidth = 64;
	  SuitableAlign = 128;
	}

	/// Configure types for the n64 ABI on top of the shared n32/n64 settings.
	void setN64ABITypes() {
	  setN32N64ABITypes();

	  // OpenBSD spells the 64-bit integer type 'long long'; others use 'long'.
	  const bool IsOpenBSD = getTriple().getOS() == llvm::Triple::OpenBSD;
	  Int64Type = IsOpenBSD ? SignedLongLong : SignedLong;
	  IntMaxType = Int64Type;

	  // long and pointers are 64 bits wide.
	  LongWidth = 64;
	  LongAlign = 64;
	  PointerWidth = 64;
	  PointerAlign = 64;
	  PtrDiffType = SignedLong;
	  SizeType = UnsignedLong;
	}

	/// Configure types for the n32 ABI on top of the shared n32/n64 settings.
	void setN32ABITypes() {
	  setN32N64ABITypes();

	  Int64Type = SignedLongLong;
	  IntMaxType = Int64Type;

	  // long and pointers stay 32 bits wide.
	  LongWidth = 32;
	  LongAlign = 32;
	  PointerWidth = 32;
	  PointerAlign = 32;
	  PtrDiffType = SignedInt;
	  SizeType = UnsignedInt;
	}

	bool setCPU(const std::string &Name) override {
	CPU = Name;
	return llvm::StringSwitch<bool>(Name)
	.Case("mips1", true)
	.Case("mips2", true)
	.Case("mips3", true)
	.Case("mips4", true)
	.Case("mips5", true)
	.Case("mips32", true)
	.Case("mips32r2", true)
	.Case("mips32r3", true)
	.Case("mips32r5", true)
	.Case("mips32r6", true)
	.Case("mips64", true)
	.Case("mips64r2", true)
	.Case("mips64r3", true)
	.Case("mips64r5", true)
	.Case("mips64r6", true)
	.Case("octeon", true)
	.Case("p5600", true)
	.Default(false);
	}
	/// Name of the currently selected CPU.
	const std::string& getCPU() const {
	  return CPU;
	}
	/// Seed \p Features with the feature implied by the CPU, then defer to the
	/// base class. An empty \p CPU falls back to the stored CPU name.
	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	               StringRef CPU,
	               const std::vector<std::string> &FeaturesVec) const override {
	  const StringRef EffectiveCPU = CPU.empty() ? StringRef(getCPU()) : CPU;

	  if (EffectiveCPU == "octeon") {
	    // octeon enables two features rather than one named after the CPU.
	    Features["cnmips"] = true;
	    Features["mips64r2"] = true;
	  } else {
	    Features[EffectiveCPU] = true;
	  }
	  return TargetInfo::initFeatureMap(Features, Diags, EffectiveCPU,
	                                    FeaturesVec);
	}

	// Emits the predefined macros for this target: endianness, ISA width and
	// revision, ABI, float ABI, the optional ASE flags collected by
	// handleTargetFeatures, type sizes, architecture name and the
	// __GCC_HAVE_SYNC_COMPARE_AND_SWAP_N set.
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	// Endianness.
	if (BigEndian) {
	DefineStd(Builder, "MIPSEB", Opts);
	Builder.defineMacro("_MIPSEB");
	} else {
	DefineStd(Builder, "MIPSEL", Opts);
	Builder.defineMacro("_MIPSEL");
	}

	Builder.defineMacro("__mips__");
	Builder.defineMacro("_mips");
	// The unprefixed "mips" spelling is only defined in GNU mode.
	if (Opts.GNUMode)
	Builder.defineMacro("mips");

	// o32 advertises a 32-bit ISA; every other ABI advertises a 64-bit one.
	if (ABI == "o32") {
	Builder.defineMacro("__mips", "32");
	Builder.defineMacro("_MIPS_ISA", "_MIPS_ISA_MIPS32");
	} else {
	Builder.defineMacro("__mips", "64");
	Builder.defineMacro("__mips64");
	Builder.defineMacro("__mips64__");
	Builder.defineMacro("_MIPS_ISA", "_MIPS_ISA_MIPS64");
	}

	// __mips_isa_rev is only defined for the mips32*/mips64* CPU names.
	const std::string ISARev = llvm::StringSwitch<std::string>(getCPU())
	.Cases("mips32", "mips64", "1")
	.Cases("mips32r2", "mips64r2", "2")
	.Cases("mips32r3", "mips64r3", "3")
	.Cases("mips32r5", "mips64r5", "5")
	.Cases("mips32r6", "mips64r6", "6")
	.Default("");
	if (!ISARev.empty())
	Builder.defineMacro("__mips_isa_rev", ISARev);

	// ABI identification macros.
	if (ABI == "o32") {
	Builder.defineMacro("__mips_o32");
	Builder.defineMacro("_ABIO32", "1");
	Builder.defineMacro("_MIPS_SIM", "_ABIO32");
	} else if (ABI == "n32") {
	Builder.defineMacro("__mips_n32");
	Builder.defineMacro("_ABIN32", "2");
	Builder.defineMacro("_MIPS_SIM", "_ABIN32");
	} else if (ABI == "n64") {
	Builder.defineMacro("__mips_n64");
	Builder.defineMacro("_ABI64", "3");
	Builder.defineMacro("_MIPS_SIM", "_ABI64");
	} else
	llvm_unreachable("Invalid ABI.");

	// __ABICALLS__ is additionally defined when CanUseBSDABICalls is set.
	if (!IsNoABICalls) {
	Builder.defineMacro("__mips_abicalls");
	if (CanUseBSDABICalls)
	Builder.defineMacro("__ABICALLS__");
	}

	Builder.defineMacro("__REGISTER_PREFIX__", "");

	switch (FloatABI) {
	case HardFloat:
	Builder.defineMacro("__mips_hard_float", Twine(1));
	break;
	case SoftFloat:
	Builder.defineMacro("__mips_soft_float", Twine(1));
	break;
	}

	if (IsSingleFloat)
	Builder.defineMacro("__mips_single_float", Twine(1));

	// _MIPS_FPSET is 32 when HasFP64 or IsSingleFloat is set, otherwise 16.
	Builder.defineMacro("__mips_fpr", HasFP64 ? Twine(64) : Twine(32));
	Builder.defineMacro("_MIPS_FPSET",
	Twine(32 / (HasFP64 \|\| IsSingleFloat ? 1 : 2)));

	if (IsMips16)
	Builder.defineMacro("__mips16", Twine(1));

	if (IsMicromips)
	Builder.defineMacro("__mips_micromips", Twine(1));

	if (IsNan2008)
	Builder.defineMacro("__mips_nan2008", Twine(1));

	// DSP2 also defines __mips_dsp, in addition to __mips_dspr2.
	switch (DspRev) {
	default:
	break;
	case DSP1:
	Builder.defineMacro("__mips_dsp_rev", Twine(1));
	Builder.defineMacro("__mips_dsp", Twine(1));
	break;
	case DSP2:
	Builder.defineMacro("__mips_dsp_rev", Twine(2));
	Builder.defineMacro("__mips_dspr2", Twine(1));
	Builder.defineMacro("__mips_dsp", Twine(1));
	break;
	}

	if (HasMSA)
	Builder.defineMacro("__mips_msa", Twine(1));

	- if (DisableMadd4)
	- Builder.defineMacro("__mips_no_madd4", Twine(1));
	-
	// Sizes in bits of pointers, int and long.
	Builder.defineMacro("_MIPS_SZPTR", Twine(getPointerWidth(0)));
	Builder.defineMacro("_MIPS_SZINT", Twine(getIntWidth()));
	Builder.defineMacro("_MIPS_SZLONG", Twine(getLongWidth()));

	// _MIPS_ARCH is the quoted CPU name; _MIPS_ARCH_<CPU> uses its upper-cased
	// spelling.
	Builder.defineMacro("_MIPS_ARCH", "\"" + CPU + "\"");
	Builder.defineMacro("_MIPS_ARCH_" + StringRef(CPU).upper());

	// These shouldn't be defined for MIPS-I but there's no need to check
	// for that since MIPS-I isn't supported.
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_2");
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_4");

	// 32-bit MIPS processors don't have the necessary lld/scd instructions
	// found in 64-bit processors. In the case of O32 on a 64-bit processor,
	// the instructions exist but using them violates the ABI since they
	// require 64-bit GPRs and O32 only supports 32-bit GPRs.
	if (ABI == "n32" \|\| ABI == "n64")
	Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_8");
	}

	/// Return the target-specific builtin table as an array reference.
	ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	  // Number of entries between the first target-specific builtin ID and the
	  // last Mips one.
	  const auto NumBuiltins =
	      clang::Mips::LastTSBuiltin - Builtin::FirstTSBuiltin;
	  return llvm::makeArrayRef(BuiltinInfo, NumBuiltins);
	}
	/// "mips" is always reported; "fp64" mirrors HasFP64; anything else is
	/// reported as absent.
	bool hasFeature(StringRef Feature) const override {
	  if (Feature == "mips")
	    return true;
	  if (Feature == "fp64")
	    return HasFP64;
	  return false;
	}
	/// This target uses the void-pointer flavour of __builtin_va_list.
	BuiltinVaListKind getBuiltinVaListKind() const override {
	  const BuiltinVaListKind Kind = TargetInfo::VoidPtrBuiltinVaList;
	  return Kind;
	}
	// Returns the canonical register names. The table is laid out in groups:
	// $0-$31, $f0-$f31, hi/lo with the $fccN and $acNhi/$acNlo names, the MSA
	// $w0-$w31 registers and finally the MSA control registers.
	ArrayRef<const char *> getGCCRegNames() const override {
	static const char *const GCCRegNames[] = {
	// CPU register names
	// Must match second column of GCCRegAliases
	"$0", "$1", "$2", "$3", "$4", "$5", "$6", "$7",
	"$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15",
	"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23",
	"$24", "$25", "$26", "$27", "$28", "$29", "$30", "$31",
	// Floating point register names
	"$f0", "$f1", "$f2", "$f3", "$f4", "$f5", "$f6", "$f7",
	"$f8", "$f9", "$f10", "$f11", "$f12", "$f13", "$f14", "$f15",
	"$f16", "$f17", "$f18", "$f19", "$f20", "$f21", "$f22", "$f23",
	"$f24", "$f25", "$f26", "$f27", "$f28", "$f29", "$f30", "$f31",
	// Hi/lo and condition register names
	// (the third entry is an empty string; the $acN names follow the $fccN ones)
	"hi", "lo", "", "$fcc0","$fcc1","$fcc2","$fcc3","$fcc4",
	"$fcc5","$fcc6","$fcc7","$ac1hi","$ac1lo","$ac2hi","$ac2lo",
	"$ac3hi","$ac3lo",
	// MSA register names
	"$w0", "$w1", "$w2", "$w3", "$w4", "$w5", "$w6", "$w7",
	"$w8", "$w9", "$w10", "$w11", "$w12", "$w13", "$w14", "$w15",
	"$w16", "$w17", "$w18", "$w19", "$w20", "$w21", "$w22", "$w23",
	"$w24", "$w25", "$w26", "$w27", "$w28", "$w29", "$w30", "$w31",
	// MSA control register names
	"$msair", "$msacsr", "$msaaccess", "$msasave", "$msamodify",
	"$msarequest", "$msamap", "$msaunmap"
	};
	return llvm::makeArrayRef(GCCRegNames);
	}
	// Validates the inline-asm constraint starting at *Name and records in Info
	// whether it allows a register or memory operand. Returns false for
	// unrecognised constraints. For the two-character "ZC" constraint, Name is
	// advanced by one so that it points at the 'C'.
	bool validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const override {
	switch (*Name) {
	default:
	return false;
	case 'r': // CPU registers.
	case 'd': // Equivalent to "r" unless generating MIPS16 code.
	case 'y': // Equivalent to "r", backward compatibility only.
	case 'f': // floating-point registers.
	case 'c': // $25 for indirect jumps
	case 'l': // lo register
	case 'x': // hilo register pair
	Info.setAllowsRegister();
	return true;
	// Immediate constraints: accepted without touching Info.
	case 'I': // Signed 16-bit constant
	case 'J': // Integer 0
	case 'K': // Unsigned 16-bit constant
	case 'L': // Signed 32-bit constant, lower 16-bit zeros (for lui)
	case 'M': // Constants not loadable via lui, addiu, or ori
	case 'N': // Constant -1 to -65535
	case 'O': // A signed 15-bit constant
	case 'P': // A constant between 1 to 65535
	return true;
	case 'R': // An address that can be used in a non-macro load or store
	Info.setAllowsMemory();
	return true;
	case 'Z':
	if (Name[1] == 'C') { // An address usable by ll, and sc.
	Info.setAllowsMemory();
	Name++; // Skip over 'Z'.
	return true;
	}
	return false;
	}
	}

	/// Translate a constraint for later parsing. The two-character "ZC"
	/// constraint is returned with a leading "^" and \p Constraint is advanced
	/// by one; everything else is handled by the base class.
	std::string convertConstraint(const char *&Constraint) const override {
	  const bool IsZC = Constraint[0] == 'Z' && Constraint[1] == 'C';
	  if (!IsZC)
	    return TargetInfo::convertConstraint(Constraint);

	  std::string Converted = std::string("^") + std::string(Constraint, 2);
	  Constraint++;
	  return Converted;
	}

	/// Registers implicitly clobbered by every inline assembly statement.
	const char *getClobbers() const override {
	  // GCC rarely uses $1 in generated code, so users seldom list it as a
	  // clobber when their inline assembly touches it.
	  //
	  // LLVM, by contrast, treats $1 as an ordinary allocatable GPR. Inline
	  // assembly that uses $1 without declaring it can therefore collide with
	  // surrounding generated code. LLVM may also pick $1 for an inline assembly
	  // operand, which conflicts with the ".set at" assembler option (used only
	  // for inline assembly, for GCC compatibility) and with the user's own use
	  // of $1.
	  //
	  // Clobbering $1 for all inline assembly is the simplest way to avoid both
	  // problems while keeping $1 allocatable for generated code.
	  //
	  // FIXME: Clobber $1 only for inline assembly that actually uses it, so
	  // LLVM can assign $1 to operands when the user's code does not touch it.
	  return "~{$1}";
	}

	// Resets the feature-derived state to its defaults, applies each entry of
	// Features in order, then recomputes the data layout. Unrecognised feature
	// strings are ignored. Diags is not used. Always returns true.
	// NOTE(review): HasMSA and IsNoABICalls are not reset below, unlike the other
	// flags, so they keep whatever value they had before the call - confirm this
	// is intended.
	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) override {
	IsMips16 = false;
	IsMicromips = false;
	IsNan2008 = isNaN2008Default();
	IsSingleFloat = false;
	FloatABI = HardFloat;
	DspRev = NoDSP;
	HasFP64 = isFP64Default();

	for (const auto &Feature : Features) {
	if (Feature == "+single-float")
	IsSingleFloat = true;
	else if (Feature == "+soft-float")
	FloatABI = SoftFloat;
	else if (Feature == "+mips16")
	IsMips16 = true;
	else if (Feature == "+micromips")
	IsMicromips = true;
	// std::max keeps the highest DSP revision requested so far.
	else if (Feature == "+dsp")
	DspRev = std::max(DspRev, DSP1);
	else if (Feature == "+dspr2")
	DspRev = std::max(DspRev, DSP2);
	else if (Feature == "+msa")
	HasMSA = true;
	- else if (Feature == "+nomadd4")
	- DisableMadd4 = true;
	else if (Feature == "+fp64")
	HasFP64 = true;
	else if (Feature == "-fp64")
	HasFP64 = false;
	else if (Feature == "+nan2008")
	IsNan2008 = true;
	else if (Feature == "-nan2008")
	IsNan2008 = false;
	else if (Feature == "+noabicalls")
	IsNoABICalls = true;
	}

	setDataLayout();

	return true;
	}

	/// Map exception-handling data register index 0 / 1 to register number
	/// 4 / 5; any other index yields -1.
	int getEHDataRegisterNumber(unsigned RegNo) const override {
	  switch (RegNo) {
	  case 0:
	    return 4;
	  case 1:
	    return 5;
	  default:
	    return -1;
	  }
	}

	/// Count-leading-zeros of zero is not treated as undefined on this target.
	bool isCLZForZeroUndef() const override {
	  return false;
	}

	// Returns the symbolic register aliases for the current ABI. The two tables
	// differ only in the names given to $8-$15: o32 calls them t0-t7, while the
	// other ABIs call them a4-a7 followed by t0-t3.
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	static const TargetInfo::GCCRegAlias O32RegAliases[] = {
	{{"at"}, "$1"}, {{"v0"}, "$2"}, {{"v1"}, "$3"},
	{{"a0"}, "$4"}, {{"a1"}, "$5"}, {{"a2"}, "$6"},
	{{"a3"}, "$7"}, {{"t0"}, "$8"}, {{"t1"}, "$9"},
	{{"t2"}, "$10"}, {{"t3"}, "$11"}, {{"t4"}, "$12"},
	{{"t5"}, "$13"}, {{"t6"}, "$14"}, {{"t7"}, "$15"},
	{{"s0"}, "$16"}, {{"s1"}, "$17"}, {{"s2"}, "$18"},
	{{"s3"}, "$19"}, {{"s4"}, "$20"}, {{"s5"}, "$21"},
	{{"s6"}, "$22"}, {{"s7"}, "$23"}, {{"t8"}, "$24"},
	{{"t9"}, "$25"}, {{"k0"}, "$26"}, {{"k1"}, "$27"},
	{{"gp"}, "$28"}, {{"sp", "$sp"}, "$29"}, {{"fp", "$fp"}, "$30"},
	{{"ra"}, "$31"}};
	static const TargetInfo::GCCRegAlias NewABIRegAliases[] = {
	{{"at"}, "$1"}, {{"v0"}, "$2"}, {{"v1"}, "$3"},
	{{"a0"}, "$4"}, {{"a1"}, "$5"}, {{"a2"}, "$6"},
	{{"a3"}, "$7"}, {{"a4"}, "$8"}, {{"a5"}, "$9"},
	{{"a6"}, "$10"}, {{"a7"}, "$11"}, {{"t0"}, "$12"},
	{{"t1"}, "$13"}, {{"t2"}, "$14"}, {{"t3"}, "$15"},
	{{"s0"}, "$16"}, {{"s1"}, "$17"}, {{"s2"}, "$18"},
	{{"s3"}, "$19"}, {{"s4"}, "$20"}, {{"s5"}, "$21"},
	{{"s6"}, "$22"}, {{"s7"}, "$23"}, {{"t8"}, "$24"},
	{{"t9"}, "$25"}, {{"k0"}, "$26"}, {{"k1"}, "$27"},
	{{"gp"}, "$28"}, {{"sp", "$sp"}, "$29"}, {{"fp", "$fp"}, "$30"},
	{{"ra"}, "$31"}};
	// Any ABI other than o32 gets the new-ABI table.
	if (ABI == "o32")
	return llvm::makeArrayRef(O32RegAliases);
	return llvm::makeArrayRef(NewABIRegAliases);
	}

	/// __int128 is available only under the n32 and n64 ABIs.
	bool hasInt128Type() const override {
	  // The logical-or had been mangled into "\|\|", which is not valid C++.
	  return ABI == "n32" || ABI == "n64";
	}

	/// Reject ABI / CPU / triple combinations that are not supported. The first
	/// mismatch found is reported through \p Diags and false is returned; true
	/// means the configuration passed every check.
	///
	/// The logical-or operators in this function had been mangled into "\|\|",
	/// which is not valid C++; they are restored here. The checks and their
	/// order are unchanged.
	bool validateTarget(DiagnosticsEngine &Diags) const override {
	  const bool IsO32 = ABI == "o32";
	  const bool IsN32OrN64 = ABI == "n32" || ABI == "n64";
	  const bool Has64BitGPRs = processorSupportsGPR64();

	  // FIXME: It's valid to use O32 on a 64-bit CPU but the backend can't handle
	  // this yet. It's better to fail here than on the backend assertion.
	  if (Has64BitGPRs && IsO32) {
	    Diags.Report(diag::err_target_unsupported_abi) << ABI << CPU;
	    return false;
	  }

	  // 64-bit ABI's require 64-bit CPU's.
	  if (!Has64BitGPRs && IsN32OrN64) {
	    Diags.Report(diag::err_target_unsupported_abi) << ABI << CPU;
	    return false;
	  }

	  const llvm::Triple::ArchType Arch = getTriple().getArch();
	  const bool Is64BitTriple =
	      Arch == llvm::Triple::mips64 || Arch == llvm::Triple::mips64el;
	  const bool Is32BitTriple =
	      Arch == llvm::Triple::mips || Arch == llvm::Triple::mipsel;

	  // FIXME: It's valid to use O32 on a mips64/mips64el triple but the backend
	  //        can't handle this yet. It's better to fail here than on the
	  //        backend assertion.
	  if (Is64BitTriple && IsO32) {
	    Diags.Report(diag::err_target_unsupported_abi_for_triple)
	        << ABI << getTriple().str();
	    return false;
	  }

	  // FIXME: It's valid to use N32/N64 on a mips/mipsel triple but the backend
	  //        can't handle this yet. It's better to fail here than on the
	  //        backend assertion.
	  if (Is32BitTriple && IsN32OrN64) {
	    Diags.Report(diag::err_target_unsupported_abi_for_triple)
	        << ABI << getTriple().str();
	    return false;
	  }

	  return true;
	}
	};

	// Builtin table for Mips, generated by expanding BuiltinsMips.def.
	// BUILTIN entries leave the fourth initializer slot null; LIBBUILTIN entries
	// place their HEADER argument there. Both macros must be defined before the
	// .def file is included.
	const Builtin::Info MipsTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsMips.def"
	};

	/// Target description for PNaCl. It has no target builtins, no register
	/// names and rejects every inline-asm constraint.
	class PNaClTargetInfo : public TargetInfo {
	public:
	  PNaClTargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : TargetInfo(Triple) {
	    // long and pointers are 32 bits wide.
	    LongWidth = 32;
	    LongAlign = 32;
	    PointerWidth = 32;
	    PointerAlign = 32;

	    // 64-bit integers are spelled 'long long'.
	    Int64Type = TargetInfo::SignedLongLong;
	    IntMaxType = TargetInfo::SignedLongLong;

	    // double and long double are both 64 bits.
	    DoubleAlign = 64;
	    LongDoubleWidth = 64;
	    LongDoubleAlign = 64;

	    SizeType = TargetInfo::UnsignedInt;
	    PtrDiffType = TargetInfo::SignedInt;
	    IntPtrType = TargetInfo::SignedInt;

	    RegParmMax = 0; // Disallow regparm
	  }

	  /// Define the architecture macros.
	  void getArchDefines(const LangOptions &Opts, MacroBuilder &Builder) const {
	    static const char *const ArchMacros[] = {"__le32__", "__pnacl__"};
	    for (const char *Macro : ArchMacros)
	      Builder.defineMacro(Macro);
	  }

	  /// The target macros are exactly the architecture macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    getArchDefines(Opts, Builder);
	  }

	  bool hasFeature(StringRef Feature) const override {
	    const bool IsPNaCl = Feature == "pnacl";
	    return IsPNaCl;
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::PNaClABIBuiltinVaList;
	  }

	  // Defined out of line, after the class body.
	  ArrayRef<const char *> getGCCRegNames() const override;
	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override;

	  /// Every inline-asm constraint is rejected.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &Info) const override {
	    return false;
	  }

	  const char *getClobbers() const override { return ""; }
	};

	/// PNaCl exposes no register names: the returned array is empty.
	ArrayRef<const char *> PNaClTargetInfo::getGCCRegNames() const {
	  return ArrayRef<const char *>();
	}

	/// PNaCl exposes no register aliases: the returned array is empty.
	ArrayRef<TargetInfo::GCCRegAlias> PNaClTargetInfo::getGCCRegAliases() const {
	  return ArrayRef<TargetInfo::GCCRegAlias>();
	}

	// We attempt to use PNaCl (le32) frontend and Mips32EL backend.
	/// MIPS target that differs from MipsTargetInfo only in reporting the PNaCl
	/// ABI flavour of __builtin_va_list.
	class NaClMips32TargetInfo : public MipsTargetInfo {
	public:
	  NaClMips32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : MipsTargetInfo(Triple, Opts) {}

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    const BuiltinVaListKind Kind = TargetInfo::PNaClABIBuiltinVaList;
	    return Kind;
	  }
	};

	/// Target description for le64. It has no register names or aliases and
	/// rejects every inline-asm constraint.
	class Le64TargetInfo : public TargetInfo {
	  // Target builtin table; the definition follows later in the file.
	  static const Builtin::Info BuiltinInfo[];

	public:
	  Le64TargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    NoAsmVariants = true;
	    LongWidth = LongAlign = PointerWidth = PointerAlign = 64;
	    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	    resetDataLayout("e-m:e-v128:32-v16:16-v32:32-v96:32-n8:16:32:64-S128");
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    DefineStd(Builder, "unix", Opts);
	    // The argument-name comment had lost its asterisks ("/Tuning=/false"),
	    // which is not valid C++; it is restored to a proper block comment.
	    defineCPUMacros(Builder, "le64", /*Tuning=*/false);
	    Builder.defineMacro("__ELF__");
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	    return llvm::makeArrayRef(BuiltinInfo,
	                              clang::Le64::LastTSBuiltin - Builtin::FirstTSBuiltin);
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::PNaClABIBuiltinVaList;
	  }

	  const char *getClobbers() const override { return ""; }

	  ArrayRef<const char *> getGCCRegNames() const override {
	    return None;
	  }

	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  /// Every inline-asm constraint is rejected.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &Info) const override {
	    return false;
	  }

	  bool hasProtectedVisibility() const override { return false; }
	};

	class WebAssemblyTargetInfo : public TargetInfo {
	static const Builtin::Info BuiltinInfo[];

	enum SIMDEnum {
	NoSIMD,
	SIMD128,
	} SIMDLevel;

	public:
	explicit WebAssemblyTargetInfo(const llvm::Triple &T, const TargetOptions &)
	: TargetInfo(T), SIMDLevel(NoSIMD) {
	NoAsmVariants = true;
	SuitableAlign = 128;
	LargeArrayMinWidth = 128;
	LargeArrayAlign = 128;
	SimdDefaultAlign = 128;
	SigAtomicType = SignedLong;
	LongDoubleWidth = LongDoubleAlign = 128;
	LongDoubleFormat = &llvm::APFloat::IEEEquad();
	SizeType = UnsignedInt;
	PtrDiffType = SignedInt;
	IntPtrType = SignedInt;
	}

	protected:
	void getTargetDefines(const LangOptions &Opts,
	MacroBuilder &Builder) const override {
	defineCPUMacros(Builder, "wasm", /Tuning=/false);
	if (SIMDLevel >= SIMD128)
	Builder.defineMacro("__wasm_simd128__");
	}

	private:
	bool
	initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
	StringRef CPU,
	const std::vector<std::string> &FeaturesVec) const override {
	if (CPU == "bleeding-edge")
	Features["simd128"] = true;
	return TargetInfo::initFeatureMap(Features, Diags, CPU, FeaturesVec);
	}
	bool hasFeature(StringRef Feature) const final {
	return llvm::StringSwitch<bool>(Feature)
	.Case("simd128", SIMDLevel >= SIMD128)
	.Default(false);
	}
	bool handleTargetFeatures(std::vector<std::string> &Features,
	DiagnosticsEngine &Diags) final {
	for (const auto &Feature : Features) {
	if (Feature == "+simd128") {
	SIMDLevel = std::max(SIMDLevel, SIMD128);
	continue;
	}
	if (Feature == "-simd128") {
	SIMDLevel = std::min(SIMDLevel, SIMDEnum(SIMD128 - 1));
	continue;
	}

	Diags.Report(diag::err_opt_not_valid_with_opt) << Feature
	<< "-target-feature";
	return false;
	}
	return true;
	}
	bool setCPU(const std::string &Name) final {
	return llvm::StringSwitch<bool>(Name)
	.Case("mvp", true)
	.Case("bleeding-edge", true)
	.Case("generic", true)
	.Default(false);
	}
	ArrayRef<Builtin::Info> getTargetBuiltins() const final {
	return llvm::makeArrayRef(BuiltinInfo,
	clang::WebAssembly::LastTSBuiltin - Builtin::FirstTSBuiltin);
	}
	BuiltinVaListKind getBuiltinVaListKind() const final {
	return VoidPtrBuiltinVaList;
	}
	ArrayRef<const char *> getGCCRegNames() const final {
	return None;
	}
	ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const final {
	return None;
	}
	bool
	validateAsmConstraint(const char *&Name,
	TargetInfo::ConstraintInfo &Info) const final {
	return false;
	}
	const char *getClobbers() const final { return ""; }
	bool isCLZForZeroUndef() const final { return false; }
	bool hasInt128Type() const final { return true; }
	IntType getIntTypeByWidth(unsigned BitWidth,
	bool IsSigned) const final {
	// WebAssembly prefers long long for explicitly 64-bit integers.
	return BitWidth == 64 ? (IsSigned ? SignedLongLong : UnsignedLongLong)
	: TargetInfo::getIntTypeByWidth(BitWidth, IsSigned);
	}
	IntType getLeastIntTypeByWidth(unsigned BitWidth,
	bool IsSigned) const final {
	// WebAssembly uses long long for int_least64_t and int_fast64_t.
	return BitWidth == 64
	? (IsSigned ? SignedLongLong : UnsignedLongLong)
	: TargetInfo::getLeastIntTypeByWidth(BitWidth, IsSigned);
	}
	};

	// Builtin table for WebAssembly, generated by expanding
	// BuiltinsWebAssembly.def. BUILTIN entries leave the fourth initializer slot
	// null; LIBBUILTIN entries place their HEADER argument there. Both macros
	// must be defined before the .def file is included.
	const Builtin::Info WebAssemblyTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsWebAssembly.def"
	};

	/// 32-bit WebAssembly target: sets the atomic widths and data layout, and
	/// adds the "wasm32" CPU macros on top of the common ones.
	class WebAssembly32TargetInfo : public WebAssemblyTargetInfo {
	public:
	  explicit WebAssembly32TargetInfo(const llvm::Triple &T,
	                                   const TargetOptions &Opts)
	      : WebAssemblyTargetInfo(T, Opts) {
	    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	    resetDataLayout("e-m:e-p:32:32-i64:64-n32:64-S128");
	  }

	protected:
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WebAssemblyTargetInfo::getTargetDefines(Opts, Builder);
	    // The argument-name comment had lost its asterisks ("/Tuning=/false"),
	    // which is not valid C++; it is restored to a proper block comment.
	    defineCPUMacros(Builder, "wasm32", /*Tuning=*/false);
	  }
	};

	/// 64-bit WebAssembly target: widens long and pointers to 64 bits, switches
	/// the size/ptrdiff/intptr types to the 'long' family, and adds the "wasm64"
	/// CPU macros on top of the common ones.
	class WebAssembly64TargetInfo : public WebAssemblyTargetInfo {
	public:
	  explicit WebAssembly64TargetInfo(const llvm::Triple &T,
	                                   const TargetOptions &Opts)
	      : WebAssemblyTargetInfo(T, Opts) {
	    LongAlign = LongWidth = 64;
	    PointerAlign = PointerWidth = 64;
	    MaxAtomicPromoteWidth = MaxAtomicInlineWidth = 64;
	    SizeType = UnsignedLong;
	    PtrDiffType = SignedLong;
	    IntPtrType = SignedLong;
	    resetDataLayout("e-m:e-p:64:64-i64:64-n32:64-S128");
	  }

	protected:
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    WebAssemblyTargetInfo::getTargetDefines(Opts, Builder);
	    // The argument-name comment had lost its asterisks ("/Tuning=/false"),
	    // which is not valid C++; it is restored to a proper block comment.
	    defineCPUMacros(Builder, "wasm64", /*Tuning=*/false);
	  }
	};

	// Builtin table for Le64, generated by expanding BuiltinsLe64.def with the
	// BUILTIN macro below. BUILTIN must be defined before the .def file is
	// included.
	const Builtin::Info Le64TargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsLe64.def"
	};

	// Address-space numbers used by the SPIR target, one per language address
	// space in the order given by the trailing comments. SPIRTargetInfo installs
	// this table as its AddrSpaceMap. The three CUDA entries all map to 0.
	static const unsigned SPIRAddrSpaceMap[] = {
	0, // Default
	1, // opencl_global
	3, // opencl_local
	2, // opencl_constant
	4, // opencl_generic
	0, // cuda_device
	0, // cuda_constant
	0 // cuda_shared
	};
	/// Common base for the 32- and 64-bit SPIR targets. It has no target
	/// builtins or register names and accepts every inline-asm constraint.
	class SPIRTargetInfo : public TargetInfo {
	public:
	  SPIRTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    assert(getTriple().getOS() == llvm::Triple::UnknownOS &&
	           "SPIR target must use unknown OS");
	    assert(getTriple().getEnvironment() == llvm::Triple::UnknownEnvironment &&
	           "SPIR target must use unknown environment type");
	    TLSSupported = false;
	    LongWidth = LongAlign = 64;
	    AddrSpaceMap = &SPIRAddrSpaceMap;
	    UseAddrSpaceMapMangling = true;
	    NoAsmVariants = true;
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    DefineStd(Builder, "SPIR", Opts);
	  }

	  bool hasFeature(StringRef Feature) const override {
	    return Feature == "spir";
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override { return None; }
	  const char *getClobbers() const override { return ""; }
	  ArrayRef<const char *> getGCCRegNames() const override { return None; }

	  /// Every inline-asm constraint is accepted.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &info) const override {
	    return true;
	  }

	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::VoidPtrBuiltinVaList;
	  }

	  /// CC_SpirFunction and CC_OpenCLKernel are accepted; every other calling
	  /// convention yields a warning result.
	  CallingConvCheckResult checkCallingConvention(CallingConv CC) const override {
	    // The logical-or had been mangled into "\|\|", which is not valid C++.
	    return (CC == CC_SpirFunction || CC == CC_OpenCLKernel) ? CCCR_OK
	                                                            : CCCR_Warning;
	  }

	  CallingConv getDefaultCallingConv(CallingConvMethodType MT) const override {
	    return CC_SpirFunction;
	  }

	  void setSupportedOpenCLOpts() override {
	    // Assume all OpenCL extensions and optional core features are supported
	    // for SPIR since it is a generic target.
	    getSupportedOpenCLOpts().supportAll();
	  }
	};

	/// SPIR target with 32-bit pointers.
	class SPIR32TargetInfo : public SPIRTargetInfo {
	public:
	  SPIR32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : SPIRTargetInfo(Triple, Opts) {
	    PointerWidth = 32;
	    PointerAlign = 32;
	    SizeType = TargetInfo::UnsignedInt;
	    IntPtrType = TargetInfo::SignedInt;
	    PtrDiffType = TargetInfo::SignedInt;
	    resetDataLayout("e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-"
	                    "v96:128-v192:256-v256:256-v512:512-v1024:1024");
	  }

	  /// Only the "SPIR32" standard macros are defined here; the base class
	  /// implementation is not invoked.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    DefineStd(Builder, "SPIR32", Opts);
	  }
	};

	/// SPIR target with 64-bit pointers.
	class SPIR64TargetInfo : public SPIRTargetInfo {
	public:
	  SPIR64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : SPIRTargetInfo(Triple, Opts) {
	    PointerWidth = 64;
	    PointerAlign = 64;
	    SizeType = TargetInfo::UnsignedLong;
	    IntPtrType = TargetInfo::SignedLong;
	    PtrDiffType = TargetInfo::SignedLong;
	    resetDataLayout("e-i64:64-v16:16-v24:32-v32:32-v48:64-"
	                    "v96:128-v192:256-v256:256-v512:512-v1024:1024");
	  }

	  /// Only the "SPIR64" standard macros are defined here; the base class
	  /// implementation is not invoked.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    DefineStd(Builder, "SPIR64", Opts);
	  }
	};

	/// Target description for XCore. It has no register aliases and rejects
	/// every inline-asm constraint.
	class XCoreTargetInfo : public TargetInfo {
	  // Target builtin table; the definition follows the class body.
	  static const Builtin::Info BuiltinInfo[];

	public:
	  XCoreTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    NoAsmVariants = true;
	    UseZeroLengthBitfieldAlignment = true;

	    // Everything is aligned to at most 32 bits.
	    LongLongAlign = 32;
	    SuitableAlign = 32;
	    DoubleAlign = 32;
	    LongDoubleAlign = 32;

	    SizeType = UnsignedInt;
	    PtrDiffType = SignedInt;
	    IntPtrType = SignedInt;
	    WCharType = UnsignedChar;
	    WIntType = UnsignedInt;

	    resetDataLayout("e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:32"
	                    "-f64:32-a:0:32-n32");
	  }

	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Builder.defineMacro("__XS1B__");
	  }

	  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	    const auto NumBuiltins =
	        clang::XCore::LastTSBuiltin - Builtin::FirstTSBuiltin;
	    return llvm::makeArrayRef(BuiltinInfo, NumBuiltins);
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::VoidPtrBuiltinVaList;
	  }

	  const char *getClobbers() const override { return ""; }

	  ArrayRef<const char *> getGCCRegNames() const override {
	    static const char *const RegNames[] = {
	        "r0", "r1", "r2",  "r3",  "r4", "r5", "r6", "r7",
	        "r8", "r9", "r10", "r11", "cp", "dp", "sp", "lr"};
	    return llvm::makeArrayRef(RegNames);
	  }

	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  /// Every inline-asm constraint is rejected.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &Info) const override {
	    return false;
	  }

	  int getEHDataRegisterNumber(unsigned RegNo) const override {
	    // R0=ExceptionPointerRegister R1=ExceptionSelectorRegister
	    if (RegNo < 2)
	      return RegNo;
	    return -1;
	  }

	  bool allowsLargerPreferedTypeAlignment() const override { return false; }
	};

	// Builtin table for XCore, generated by expanding BuiltinsXCore.def.
	// BUILTIN entries leave the fourth initializer slot null; LIBBUILTIN entries
	// place their HEADER argument there. Both macros must be defined before the
	// .def file is included.
	const Builtin::Info XCoreTargetInfo::BuiltinInfo[] = {
	#define BUILTIN(ID, TYPE, ATTRS) \
	{ #ID, TYPE, ATTRS, nullptr, ALL_LANGUAGES, nullptr },
	#define LIBBUILTIN(ID, TYPE, ATTRS, HEADER) \
	{ #ID, TYPE, ATTRS, HEADER, ALL_LANGUAGES, nullptr },
	#include "clang/Basic/BuiltinsXCore.def"
	};

	// x86_32 Android target: the Linux x86_32 description with a 64-bit IEEE
	// double 'long double' and a 32-bit suitable alignment.
	class AndroidX86_32TargetInfo : public LinuxTargetInfo<X86_32TargetInfo> {
	public:
	  AndroidX86_32TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : LinuxTargetInfo<X86_32TargetInfo>(Triple, Opts) {
	    // 'long double' is represented as a plain IEEE double.
	    LongDoubleFormat = &llvm::APFloat::IEEEdouble();
	    LongDoubleWidth = 64;

	    SuitableAlign = 32;
	  }
	};

	// x86_64 Android target: the Linux x86_64 description with an IEEE quad
	// 'long double'.
	class AndroidX86_64TargetInfo : public LinuxTargetInfo<X86_64TargetInfo> {
	public:
	  AndroidX86_64TargetInfo(const llvm::Triple &Triple, const TargetOptions &Opts)
	      : LinuxTargetInfo<X86_64TargetInfo>(Triple, Opts) {
	    // 'long double' uses the IEEE quad format on this target.
	    LongDoubleFormat = &llvm::APFloat::IEEEquad();
	  }

	  /// 'long double' is mangled the way __float128 is.
	  bool useFloat128ManglingForLongDouble() const override { return true; }
	};

	// 32-bit RenderScript is armv7 with width and align of 'long' set to 8-bytes
	class RenderScript32TargetInfo : public ARMleTargetInfo {
	public:
	  RenderScript32TargetInfo(const llvm::Triple &Triple,
	                           const TargetOptions &Opts)
	      // Rebuild the triple with the architecture forced to "armv7"; vendor,
	      // OS and environment are carried over from the requested triple.
	      : ARMleTargetInfo(llvm::Triple("armv7", Triple.getVendorName(),
	                                     Triple.getOSName(),
	                                     Triple.getEnvironmentName()),
	                        Opts) {
	    IsRenderScriptTarget = true;
	    // 'long' is 64 bits wide and 64-bit aligned.
	    LongAlign = 64;
	    LongWidth = 64;
	  }

	  /// Define __RENDERSCRIPT__ and then the regular ARM macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Builder.defineMacro("__RENDERSCRIPT__");
	    ARMleTargetInfo::getTargetDefines(Opts, Builder);
	  }
	};

	// 64-bit RenderScript is aarch64
	class RenderScript64TargetInfo : public AArch64leTargetInfo {
	public:
	  RenderScript64TargetInfo(const llvm::Triple &Triple,
	                           const TargetOptions &Opts)
	      // Rebuild the triple with the architecture forced to "aarch64"; vendor,
	      // OS and environment are carried over from the requested triple.
	      : AArch64leTargetInfo(llvm::Triple("aarch64", Triple.getVendorName(),
	                                         Triple.getOSName(),
	                                         Triple.getEnvironmentName()),
	                            Opts) {
	    IsRenderScriptTarget = true;
	  }

	  /// Define __RENDERSCRIPT__ and then the regular AArch64 macros.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Builder.defineMacro("__RENDERSCRIPT__");
	    AArch64leTargetInfo::getTargetDefines(Opts, Builder);
	  }
	};

	/// Information about a specific microcontroller.
	struct MCUInfo {
	/// Device name; this is the string AVRTargetInfo compares CPU names against.
	const char *Name;
	/// Preprocessor macro AVRTargetInfo defines when this device is the CPU.
	const char *DefineName;
	};

	// This list should be kept up-to-date with AVRDevices.td in LLVM.
	static ArrayRef<MCUInfo> AVRMcus = {
	{ "at90s1200", "__AVR_AT90S1200__" },
	{ "attiny11", "__AVR_ATtiny11__" },
	{ "attiny12", "__AVR_ATtiny12__" },
	{ "attiny15", "__AVR_ATtiny15__" },
	{ "attiny28", "__AVR_ATtiny28__" },
	{ "at90s2313", "__AVR_AT90S2313__" },
	{ "at90s2323", "__AVR_AT90S2323__" },
	{ "at90s2333", "__AVR_AT90S2333__" },
	{ "at90s2343", "__AVR_AT90S2343__" },
	{ "attiny22", "__AVR_ATtiny22__" },
	{ "attiny26", "__AVR_ATtiny26__" },
	{ "at86rf401", "__AVR_AT86RF401__" },
	{ "at90s4414", "__AVR_AT90S4414__" },
	{ "at90s4433", "__AVR_AT90S4433__" },
	{ "at90s4434", "__AVR_AT90S4434__" },
	{ "at90s8515", "__AVR_AT90S8515__" },
	{ "at90c8534", "__AVR_AT90c8534__" },
	{ "at90s8535", "__AVR_AT90S8535__" },
	{ "ata5272", "__AVR_ATA5272__" },
	{ "attiny13", "__AVR_ATtiny13__" },
	{ "attiny13a", "__AVR_ATtiny13A__" },
	{ "attiny2313", "__AVR_ATtiny2313__" },
	{ "attiny2313a", "__AVR_ATtiny2313A__" },
	{ "attiny24", "__AVR_ATtiny24__" },
	{ "attiny24a", "__AVR_ATtiny24A__" },
	{ "attiny4313", "__AVR_ATtiny4313__" },
	{ "attiny44", "__AVR_ATtiny44__" },
	{ "attiny44a", "__AVR_ATtiny44A__" },
	{ "attiny84", "__AVR_ATtiny84__" },
	{ "attiny84a", "__AVR_ATtiny84A__" },
	{ "attiny25", "__AVR_ATtiny25__" },
	{ "attiny45", "__AVR_ATtiny45__" },
	{ "attiny85", "__AVR_ATtiny85__" },
	{ "attiny261", "__AVR_ATtiny261__" },
	{ "attiny261a", "__AVR_ATtiny261A__" },
	{ "attiny461", "__AVR_ATtiny461__" },
	{ "attiny461a", "__AVR_ATtiny461A__" },
	{ "attiny861", "__AVR_ATtiny861__" },
	{ "attiny861a", "__AVR_ATtiny861A__" },
	{ "attiny87", "__AVR_ATtiny87__" },
	{ "attiny43u", "__AVR_ATtiny43U__" },
	{ "attiny48", "__AVR_ATtiny48__" },
	{ "attiny88", "__AVR_ATtiny88__" },
	{ "attiny828", "__AVR_ATtiny828__" },
	{ "at43usb355", "__AVR_AT43USB355__" },
	{ "at76c711", "__AVR_AT76C711__" },
	{ "atmega103", "__AVR_ATmega103__" },
	{ "at43usb320", "__AVR_AT43USB320__" },
	{ "attiny167", "__AVR_ATtiny167__" },
	{ "at90usb82", "__AVR_AT90USB82__" },
	{ "at90usb162", "__AVR_AT90USB162__" },
	{ "ata5505", "__AVR_ATA5505__" },
	{ "atmega8u2", "__AVR_ATmega8U2__" },
	{ "atmega16u2", "__AVR_ATmega16U2__" },
	{ "atmega32u2", "__AVR_ATmega32U2__" },
	{ "attiny1634", "__AVR_ATtiny1634__" },
	{ "atmega8", "__AVR_ATmega8__" },
	{ "ata6289", "__AVR_ATA6289__" },
	{ "atmega8a", "__AVR_ATmega8A__" },
	{ "ata6285", "__AVR_ATA6285__" },
	{ "ata6286", "__AVR_ATA6286__" },
	{ "atmega48", "__AVR_ATmega48__" },
	{ "atmega48a", "__AVR_ATmega48A__" },
	{ "atmega48pa", "__AVR_ATmega48PA__" },
	{ "atmega48p", "__AVR_ATmega48P__" },
	{ "atmega88", "__AVR_ATmega88__" },
	{ "atmega88a", "__AVR_ATmega88A__" },
	{ "atmega88p", "__AVR_ATmega88P__" },
	{ "atmega88pa", "__AVR_ATmega88PA__" },
	{ "atmega8515", "__AVR_ATmega8515__" },
	{ "atmega8535", "__AVR_ATmega8535__" },
	{ "atmega8hva", "__AVR_ATmega8HVA__" },
	{ "at90pwm1", "__AVR_AT90PWM1__" },
	{ "at90pwm2", "__AVR_AT90PWM2__" },
	{ "at90pwm2b", "__AVR_AT90PWM2B__" },
	{ "at90pwm3", "__AVR_AT90PWM3__" },
	{ "at90pwm3b", "__AVR_AT90PWM3B__" },
	{ "at90pwm81", "__AVR_AT90PWM81__" },
	{ "ata5790", "__AVR_ATA5790__" },
	{ "ata5795", "__AVR_ATA5795__" },
	{ "atmega16", "__AVR_ATmega16__" },
	{ "atmega16a", "__AVR_ATmega16A__" },
	{ "atmega161", "__AVR_ATmega161__" },
	{ "atmega162", "__AVR_ATmega162__" },
	{ "atmega163", "__AVR_ATmega163__" },
	{ "atmega164a", "__AVR_ATmega164A__" },
	{ "atmega164p", "__AVR_ATmega164P__" },
	{ "atmega164pa", "__AVR_ATmega164PA__" },
	{ "atmega165", "__AVR_ATmega165__" },
	{ "atmega165a", "__AVR_ATmega165A__" },
	{ "atmega165p", "__AVR_ATmega165P__" },
	{ "atmega165pa", "__AVR_ATmega165PA__" },
	{ "atmega168", "__AVR_ATmega168__" },
	{ "atmega168a", "__AVR_ATmega168A__" },
	{ "atmega168p", "__AVR_ATmega168P__" },
	{ "atmega168pa", "__AVR_ATmega168PA__" },
	{ "atmega169", "__AVR_ATmega169__" },
	{ "atmega169a", "__AVR_ATmega169A__" },
	{ "atmega169p", "__AVR_ATmega169P__" },
	{ "atmega169pa", "__AVR_ATmega169PA__" },
	{ "atmega32", "__AVR_ATmega32__" },
	{ "atmega32a", "__AVR_ATmega32A__" },
	{ "atmega323", "__AVR_ATmega323__" },
	{ "atmega324a", "__AVR_ATmega324A__" },
	{ "atmega324p", "__AVR_ATmega324P__" },
	{ "atmega324pa", "__AVR_ATmega324PA__" },
	{ "atmega325", "__AVR_ATmega325__" },
	{ "atmega325a", "__AVR_ATmega325A__" },
	{ "atmega325p", "__AVR_ATmega325P__" },
	{ "atmega325pa", "__AVR_ATmega325PA__" },
	{ "atmega3250", "__AVR_ATmega3250__" },
	{ "atmega3250a", "__AVR_ATmega3250A__" },
	{ "atmega3250p", "__AVR_ATmega3250P__" },
	{ "atmega3250pa", "__AVR_ATmega3250PA__" },
	{ "atmega328", "__AVR_ATmega328__" },
	{ "atmega328p", "__AVR_ATmega328P__" },
	{ "atmega329", "__AVR_ATmega329__" },
	{ "atmega329a", "__AVR_ATmega329A__" },
	{ "atmega329p", "__AVR_ATmega329P__" },
	{ "atmega329pa", "__AVR_ATmega329PA__" },
	{ "atmega3290", "__AVR_ATmega3290__" },
	{ "atmega3290a", "__AVR_ATmega3290A__" },
	{ "atmega3290p", "__AVR_ATmega3290P__" },
	{ "atmega3290pa", "__AVR_ATmega3290PA__" },
	{ "atmega406", "__AVR_ATmega406__" },
	{ "atmega64", "__AVR_ATmega64__" },
	{ "atmega64a", "__AVR_ATmega64A__" },
	{ "atmega640", "__AVR_ATmega640__" },
	{ "atmega644", "__AVR_ATmega644__" },
	{ "atmega644a", "__AVR_ATmega644A__" },
	{ "atmega644p", "__AVR_ATmega644P__" },
	{ "atmega644pa", "__AVR_ATmega644PA__" },
	{ "atmega645", "__AVR_ATmega645__" },
	{ "atmega645a", "__AVR_ATmega645A__" },
	{ "atmega645p", "__AVR_ATmega645P__" },
	{ "atmega649", "__AVR_ATmega649__" },
	{ "atmega649a", "__AVR_ATmega649A__" },
	{ "atmega649p", "__AVR_ATmega649P__" },
	{ "atmega6450", "__AVR_ATmega6450__" },
	{ "atmega6450a", "__AVR_ATmega6450A__" },
	{ "atmega6450p", "__AVR_ATmega6450P__" },
	{ "atmega6490", "__AVR_ATmega6490__" },
	{ "atmega6490a", "__AVR_ATmega6490A__" },
	{ "atmega6490p", "__AVR_ATmega6490P__" },
	{ "atmega64rfr2", "__AVR_ATmega64RFR2__" },
	{ "atmega644rfr2", "__AVR_ATmega644RFR2__" },
	{ "atmega16hva", "__AVR_ATmega16HVA__" },
	{ "atmega16hva2", "__AVR_ATmega16HVA2__" },
	{ "atmega16hvb", "__AVR_ATmega16HVB__" },
	{ "atmega16hvbrevb", "__AVR_ATmega16HVBREVB__" },
	{ "atmega32hvb", "__AVR_ATmega32HVB__" },
	{ "atmega32hvbrevb", "__AVR_ATmega32HVBREVB__" },
	{ "atmega64hve", "__AVR_ATmega64HVE__" },
	{ "at90can32", "__AVR_AT90CAN32__" },
	{ "at90can64", "__AVR_AT90CAN64__" },
	{ "at90pwm161", "__AVR_AT90PWM161__" },
	{ "at90pwm216", "__AVR_AT90PWM216__" },
	{ "at90pwm316", "__AVR_AT90PWM316__" },
	{ "atmega32c1", "__AVR_ATmega32C1__" },
	{ "atmega64c1", "__AVR_ATmega64C1__" },
	{ "atmega16m1", "__AVR_ATmega16M1__" },
	{ "atmega32m1", "__AVR_ATmega32M1__" },
	{ "atmega64m1", "__AVR_ATmega64M1__" },
	{ "atmega16u4", "__AVR_ATmega16U4__" },
	{ "atmega32u4", "__AVR_ATmega32U4__" },
	{ "atmega32u6", "__AVR_ATmega32U6__" },
	{ "at90usb646", "__AVR_AT90USB646__" },
	{ "at90usb647", "__AVR_AT90USB647__" },
	{ "at90scr100", "__AVR_AT90SCR100__" },
	{ "at94k", "__AVR_AT94K__" },
	{ "m3000", "__AVR_AT000__" },
	{ "atmega128", "__AVR_ATmega128__" },
	{ "atmega128a", "__AVR_ATmega128A__" },
	{ "atmega1280", "__AVR_ATmega1280__" },
	{ "atmega1281", "__AVR_ATmega1281__" },
	{ "atmega1284", "__AVR_ATmega1284__" },
	{ "atmega1284p", "__AVR_ATmega1284P__" },
	{ "atmega128rfa1", "__AVR_ATmega128RFA1__" },
	{ "atmega128rfr2", "__AVR_ATmega128RFR2__" },
	{ "atmega1284rfr2", "__AVR_ATmega1284RFR2__" },
	{ "at90can128", "__AVR_AT90CAN128__" },
	{ "at90usb1286", "__AVR_AT90USB1286__" },
	{ "at90usb1287", "__AVR_AT90USB1287__" },
	{ "atmega2560", "__AVR_ATmega2560__" },
	{ "atmega2561", "__AVR_ATmega2561__" },
	{ "atmega256rfr2", "__AVR_ATmega256RFR2__" },
	{ "atmega2564rfr2", "__AVR_ATmega2564RFR2__" },
	{ "atxmega16a4", "__AVR_ATxmega16A4__" },
	{ "atxmega16a4u", "__AVR_ATxmega16a4U__" },
	{ "atxmega16c4", "__AVR_ATxmega16C4__" },
	{ "atxmega16d4", "__AVR_ATxmega16D4__" },
	{ "atxmega32a4", "__AVR_ATxmega32A4__" },
	{ "atxmega32a4u", "__AVR_ATxmega32A4U__" },
	{ "atxmega32c4", "__AVR_ATxmega32C4__" },
	{ "atxmega32d4", "__AVR_ATxmega32D4__" },
	{ "atxmega32e5", "__AVR_ATxmega32E5__" },
	{ "atxmega16e5", "__AVR_ATxmega16E5__" },
	{ "atxmega8e5", "__AVR_ATxmega8E5__" },
	{ "atxmega32x1", "__AVR_ATxmega32X1__" },
	{ "atxmega64a3", "__AVR_ATxmega64A3__" },
	{ "atxmega64a3u", "__AVR_ATxmega64A3U__" },
	{ "atxmega64a4u", "__AVR_ATxmega64A4U__" },
	{ "atxmega64b1", "__AVR_ATxmega64B1__" },
	{ "atxmega64b3", "__AVR_ATxmega64B3__" },
	{ "atxmega64c3", "__AVR_ATxmega64C3__" },
	{ "atxmega64d3", "__AVR_ATxmega64D3__" },
	{ "atxmega64d4", "__AVR_ATxmega64D4__" },
	{ "atxmega64a1", "__AVR_ATxmega64A1__" },
	{ "atxmega64a1u", "__AVR_ATxmega64A1U__" },
	{ "atxmega128a3", "__AVR_ATxmega128A3__" },
	{ "atxmega128a3u", "__AVR_ATxmega128A3U__" },
	{ "atxmega128b1", "__AVR_ATxmega128B1__" },
	{ "atxmega128b3", "__AVR_ATxmega128B3__" },
	{ "atxmega128c3", "__AVR_ATxmega128C3__" },
	{ "atxmega128d3", "__AVR_ATxmega128D3__" },
	{ "atxmega128d4", "__AVR_ATxmega128D4__" },
	{ "atxmega192a3", "__AVR_ATxmega192A3__" },
	{ "atxmega192a3u", "__AVR_ATxmega192A3U__" },
	{ "atxmega192c3", "__AVR_ATxmega192C3__" },
	{ "atxmega192d3", "__AVR_ATxmega192D3__" },
	{ "atxmega256a3", "__AVR_ATxmega256A3__" },
	{ "atxmega256a3u", "__AVR_ATxmega256A3U__" },
	{ "atxmega256a3b", "__AVR_ATxmega256A3B__" },
	{ "atxmega256a3bu", "__AVR_ATxmega256A3BU__" },
	{ "atxmega256c3", "__AVR_ATxmega256C3__" },
	{ "atxmega256d3", "__AVR_ATxmega256D3__" },
	{ "atxmega384c3", "__AVR_ATxmega384C3__" },
	{ "atxmega384d3", "__AVR_ATxmega384D3__" },
	{ "atxmega128a1", "__AVR_ATxmega128A1__" },
	{ "atxmega128a1u", "__AVR_ATxmega128A1U__" },
	{ "atxmega128a4u", "__AVR_ATxmega128a4U__" },
	{ "attiny4", "__AVR_ATtiny4__" },
	{ "attiny5", "__AVR_ATtiny5__" },
	{ "attiny9", "__AVR_ATtiny9__" },
	{ "attiny10", "__AVR_ATtiny10__" },
	{ "attiny20", "__AVR_ATtiny20__" },
	{ "attiny40", "__AVR_ATtiny40__" },
	{ "attiny102", "__AVR_ATtiny102__" },
	{ "attiny104", "__AVR_ATtiny104__" },
	};

	// AVR Target
	//
	// Describes the AVR target: 16-bit pointers and 'int', byte (8-bit)
	// alignment for every scalar type, and 'double' / 'long double' represented
	// as 32-bit IEEE single precision. The selected CPU may be either an AVR
	// family name or one of the devices listed in AVRMcus.
	class AVRTargetInfo : public TargetInfo {
	public:
	  AVRTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
	      : TargetInfo(Triple) {
	    TLSSupported = false;
	    PointerWidth = 16;
	    PointerAlign = 8;
	    IntWidth = 16;
	    IntAlign = 8;
	    LongWidth = 32;
	    LongAlign = 8;
	    LongLongWidth = 64;
	    LongLongAlign = 8;
	    SuitableAlign = 8;
	    DefaultAlignForAttributeAligned = 8;
	    HalfWidth = 16;
	    HalfAlign = 8;
	    FloatWidth = 32;
	    FloatAlign = 8;
	    DoubleWidth = 32;
	    DoubleAlign = 8;
	    DoubleFormat = &llvm::APFloat::IEEEsingle();
	    LongDoubleWidth = 32;
	    LongDoubleAlign = 8;
	    LongDoubleFormat = &llvm::APFloat::IEEEsingle();
	    SizeType = UnsignedInt;
	    PtrDiffType = SignedInt;
	    IntPtrType = SignedInt;
	    Char16Type = UnsignedInt;
	    WCharType = SignedInt;
	    WIntType = SignedInt;
	    Char32Type = UnsignedLong;
	    SigAtomicType = SignedChar;
	    resetDataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:32:32-i64:64:64"
	                    "-f32:32:32-f64:64:64-n8");
	  }

	  /// Define the generic AVR macros and, when the selected CPU names a
	  /// device listed in AVRMcus, that device's macro as well. A family name
	  /// (e.g. "avr5") has no entry in the table and adds nothing extra.
	  void getTargetDefines(const LangOptions &Opts,
	                        MacroBuilder &Builder) const override {
	    Builder.defineMacro("AVR");
	    Builder.defineMacro("__AVR");
	    Builder.defineMacro("__AVR__");

	    if (!this->CPU.empty()) {
	      auto It = std::find_if(AVRMcus.begin(), AVRMcus.end(),
	          [&](const MCUInfo &Info) { return Info.Name == this->CPU; });

	      if (It != AVRMcus.end())
	        Builder.defineMacro(It->DefineName);
	    }
	  }

	  /// This target reports no target-specific builtins.
	  ArrayRef<Builtin::Info> getTargetBuiltins() const override {
	    return None;
	  }

	  BuiltinVaListKind getBuiltinVaListKind() const override {
	    return TargetInfo::VoidPtrBuiltinVaList;
	  }

	  const char *getClobbers() const override {
	    return "";
	  }

	  /// Register names accepted in GCC-style inline assembly. The position of
	  /// a name in this array is the register number that getGCCAddlRegNames()
	  /// refers to.
	  ArrayRef<const char *> getGCCRegNames() const override {
	    static const char * const GCCRegNames[] = {
	      "r0",  "r1",  "r2",  "r3",  "r4",  "r5",  "r6",  "r7",
	      "r8",  "r9",  "r10", "r11", "r12", "r13", "r14", "r15",
	      "r16", "r17", "r18", "r19", "r20", "r21", "r22", "r23",
	      "r24", "r25", "X",   "Y",   "Z",   "SP"
	    };
	    return llvm::makeArrayRef(GCCRegNames);
	  }

	  ArrayRef<TargetInfo::GCCRegAlias> getGCCRegAliases() const override {
	    return None;
	  }

	  /// Additional spellings that map onto entries of getGCCRegNames():
	  /// indices 26..29 are "X", "Y", "Z" and "SP" in that array.
	  ArrayRef<TargetInfo::AddlRegName> getGCCAddlRegNames() const override {
	    static const TargetInfo::AddlRegName AddlRegNames[] = {
	      { { "r26", "r27"}, 26 },
	      { { "r28", "r29"}, 27 },
	      { { "r30", "r31"}, 28 },
	      { { "SPL", "SPH"}, 29 },
	    };
	    return llvm::makeArrayRef(AddlRegNames);
	  }

	  /// Validate a single-character AVR inline-asm constraint and record in
	  /// Info whether it allows a register or requires a specific immediate.
	  /// Returns false for unknown or multi-character constraints.
	  bool validateAsmConstraint(const char *&Name,
	                             TargetInfo::ConstraintInfo &Info) const override {
	    // There aren't any multi-character AVR specific constraints.
	    if (StringRef(Name).size() > 1) return false;

	    switch (*Name) {
	    default: return false;
	    case 'a': // Simple upper registers
	    case 'b': // Base pointer registers pairs
	    case 'd': // Upper register
	    case 'l': // Lower registers
	    case 'e': // Pointer register pairs
	    case 'q': // Stack pointer register
	    case 'r': // Any register
	    case 'w': // Special upper register pairs
	    case 't': // Temporary register
	    case 'x': case 'X': // Pointer register pair X
	    case 'y': case 'Y': // Pointer register pair Y
	    case 'z': case 'Z': // Pointer register pair Z
	      Info.setAllowsRegister();
	      return true;
	    case 'I': // 6-bit positive integer constant
	      Info.setRequiresImmediate(0, 63);
	      return true;
	    case 'J': // 6-bit negative integer constant
	      Info.setRequiresImmediate(-63, 0);
	      return true;
	    case 'K': // Integer constant (Range: 2)
	      Info.setRequiresImmediate(2);
	      return true;
	    case 'L': // Integer constant (Range: 0)
	      Info.setRequiresImmediate(0);
	      return true;
	    case 'M': // 8-bit integer constant
	      Info.setRequiresImmediate(0, 0xff);
	      return true;
	    case 'N': // Integer constant (Range: -1)
	      Info.setRequiresImmediate(-1);
	      return true;
	    case 'O': // Integer constant (Range: 8, 16, 24)
	      Info.setRequiresImmediate({8, 16, 24});
	      return true;
	    case 'P': // Integer constant (Range: 1)
	      Info.setRequiresImmediate(1);
	      return true;
	    case 'R': // Integer constant (Range: -6 to 5)
	      Info.setRequiresImmediate(-6, 5);
	      return true;
	    case 'G': // Floating point constant
	    case 'Q': // A memory address based on Y or Z pointer with displacement.
	      // Accepted without recording anything in Info.
	      return true;
	    }

	    return false;
	  }

	  IntType getIntTypeByWidth(unsigned BitWidth,
	                            bool IsSigned) const final {
	    // AVR prefers int for 16-bit integers.
	    return BitWidth == 16 ? (IsSigned ? SignedInt : UnsignedInt)
	                          : TargetInfo::getIntTypeByWidth(BitWidth, IsSigned);
	  }

	  IntType getLeastIntTypeByWidth(unsigned BitWidth,
	                                 bool IsSigned) const final {
	    // AVR uses int for int_least16_t and int_fast16_t.
	    return BitWidth == 16
	               ? (IsSigned ? SignedInt : UnsignedInt)
	               : TargetInfo::getLeastIntTypeByWidth(BitWidth, IsSigned);
	  }

	  /// Select the CPU. Name is accepted when it is either a known family
	  /// name or the name of a device listed in AVRMcus; in that case it is
	  /// stored in CPU and true is returned. Otherwise CPU is left untouched
	  /// and false is returned.
	  bool setCPU(const std::string &Name) override {
	    bool IsFamily = llvm::StringSwitch<bool>(Name)
	      .Case("avr1", true)
	      .Case("avr2", true)
	      .Case("avr25", true)
	      .Case("avr3", true)
	      .Case("avr31", true)
	      .Case("avr35", true)
	      .Case("avr4", true)
	      .Case("avr5", true)
	      .Case("avr51", true)
	      .Case("avr6", true)
	      .Case("avrxmega1", true)
	      .Case("avrxmega2", true)
	      .Case("avrxmega3", true)
	      .Case("avrxmega4", true)
	      .Case("avrxmega5", true)
	      .Case("avrxmega6", true)
	      .Case("avrxmega7", true)
	      .Case("avrtiny", true)
	      .Default(false);

	    bool IsMCU = std::find_if(AVRMcus.begin(), AVRMcus.end(),
	        [&](const MCUInfo &Info) { return Info.Name == Name; }) != AVRMcus.end();

	    if (IsFamily || IsMCU)
	      this->CPU = Name;

	    return IsFamily || IsMCU;
	  }

	protected:
	  /// Family or device name accepted by setCPU(); empty until one is set.
	  std::string CPU;
	};

	} // end anonymous namespace

	//===----------------------------------------------------------------------===//
	// Driver code
	//===----------------------------------------------------------------------===//

	/// Instantiate the TargetInfo subclass that matches \p Triple.
	///
	/// The outer switch selects on the architecture; most architectures then
	/// select an OS-specific wrapper (and, for some OSes, an environment-specific
	/// class), falling back to the bare architecture class for other OSes.
	///
	/// \param Triple the target triple to describe.
	/// \param Opts   target options forwarded to the TargetInfo constructor.
	/// \returns a TargetInfo allocated with `new`, or nullptr when the
	///          architecture is not handled here or the triple fails one of the
	///          per-architecture checks (le32, spir, spir64, wasm32, wasm64).
	static TargetInfo *AllocateTarget(const llvm::Triple &Triple,
	                                  const TargetOptions &Opts) {
	  llvm::Triple::OSType os = Triple.getOS();

	  switch (Triple.getArch()) {
	  default:
	    return nullptr;

	  case llvm::Triple::xcore:
	    return new XCoreTargetInfo(Triple, Opts);

	  case llvm::Triple::hexagon:
	    return new HexagonTargetInfo(Triple, Opts);

	  case llvm::Triple::lanai:
	    return new LanaiTargetInfo(Triple, Opts);

	  case llvm::Triple::aarch64:
	    if (Triple.isOSDarwin())
	      return new DarwinAArch64TargetInfo(Triple, Opts);

	    switch (os) {
	    case llvm::Triple::CloudABI:
	      return new CloudABITargetInfo<AArch64leTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	    case llvm::Triple::Fuchsia:
	      return new FuchsiaTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<AArch64leTargetInfo>(Triple, Opts);
	    case llvm::Triple::Win32:
	      return new MicrosoftARM64TargetInfo(Triple, Opts);
	    default:
	      return new AArch64leTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::aarch64_be:
	    switch (os) {
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	    case llvm::Triple::Fuchsia:
	      return new FuchsiaTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<AArch64beTargetInfo>(Triple, Opts);
	    default:
	      return new AArch64beTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::arm:
	  case llvm::Triple::thumb:
	    if (Triple.isOSBinFormatMachO())
	      return new DarwinARMTargetInfo(Triple, Opts);

	    switch (os) {
	    case llvm::Triple::CloudABI:
	      return new CloudABITargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::Bitrig:
	      return new BitrigTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::NaCl:
	      return new NaClTargetInfo<ARMleTargetInfo>(Triple, Opts);
	    case llvm::Triple::Win32:
	      switch (Triple.getEnvironment()) {
	      case llvm::Triple::Cygnus:
	        return new CygwinARMTargetInfo(Triple, Opts);
	      case llvm::Triple::GNU:
	        return new MinGWARMTargetInfo(Triple, Opts);
	      case llvm::Triple::Itanium:
	        return new ItaniumWindowsARMleTargetInfo(Triple, Opts);
	      case llvm::Triple::MSVC:
	      default: // Assume MSVC for unknown environments
	        return new MicrosoftARMleTargetInfo(Triple, Opts);
	      }
	    default:
	      return new ARMleTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::armeb:
	  case llvm::Triple::thumbeb:
	    if (Triple.isOSDarwin())
	      return new DarwinARMTargetInfo(Triple, Opts);

	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    case llvm::Triple::Bitrig:
	      return new BitrigTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    case llvm::Triple::NaCl:
	      return new NaClTargetInfo<ARMbeTargetInfo>(Triple, Opts);
	    default:
	      return new ARMbeTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::avr:
	    return new AVRTargetInfo(Triple, Opts);
	  case llvm::Triple::bpfeb:
	  case llvm::Triple::bpfel:
	    return new BPFTargetInfo(Triple, Opts);

	  case llvm::Triple::msp430:
	    return new MSP430TargetInfo(Triple, Opts);

	  case llvm::Triple::nios2:
	    return new LinuxTargetInfo<Nios2TargetInfo>(Triple, Opts);

	  case llvm::Triple::mips:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    default:
	      return new MipsTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::mipsel:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::NaCl:
	      return new NaClTargetInfo<NaClMips32TargetInfo>(Triple, Opts);
	    default:
	      return new MipsTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::mips64:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    default:
	      return new MipsTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::mips64el:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<MipsTargetInfo>(Triple, Opts);
	    default:
	      return new MipsTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::le32:
	    // le32 is only handled in combination with NaCl.
	    switch (os) {
	    case llvm::Triple::NaCl:
	      return new NaClTargetInfo<PNaClTargetInfo>(Triple, Opts);
	    default:
	      return nullptr;
	    }

	  case llvm::Triple::le64:
	    return new Le64TargetInfo(Triple, Opts);

	  case llvm::Triple::ppc:
	    if (Triple.isOSDarwin())
	      return new DarwinPPC32TargetInfo(Triple, Opts);
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<PPC32TargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<PPC32TargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<PPC32TargetInfo>(Triple, Opts);
	    default:
	      return new PPC32TargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::ppc64:
	    if (Triple.isOSDarwin())
	      return new DarwinPPC64TargetInfo(Triple, Opts);
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<PPC64TargetInfo>(Triple, Opts);
	    case llvm::Triple::Lv2:
	      return new PS3PPUTargetInfo<PPC64TargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	    default:
	      return new PPC64TargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::ppc64le:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<PPC64TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<PPC64TargetInfo>(Triple, Opts);
	    default:
	      return new PPC64TargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::nvptx:
	    return new NVPTXTargetInfo(Triple, Opts, /*TargetPointerWidth=*/32);
	  case llvm::Triple::nvptx64:
	    return new NVPTXTargetInfo(Triple, Opts, /*TargetPointerWidth=*/64);

	  case llvm::Triple::amdgcn:
	  case llvm::Triple::r600:
	    return new AMDGPUTargetInfo(Triple, Opts);

	  case llvm::Triple::sparc:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	    case llvm::Triple::Solaris:
	      return new SolarisTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<SparcV8TargetInfo>(Triple, Opts);
	    default:
	      return new SparcV8TargetInfo(Triple, Opts);
	    }

	  // The 'sparcel' architecture copies all the above cases except for Solaris.
	  case llvm::Triple::sparcel:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSTargetInfo<SparcV8elTargetInfo>(Triple, Opts);
	    default:
	      return new SparcV8elTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::sparcv9:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	    case llvm::Triple::Solaris:
	      return new SolarisTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<SparcV9TargetInfo>(Triple, Opts);
	    default:
	      return new SparcV9TargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::systemz:
	    switch (os) {
	    case llvm::Triple::Linux:
	      return new LinuxTargetInfo<SystemZTargetInfo>(Triple, Opts);
	    default:
	      return new SystemZTargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::tce:
	    return new TCETargetInfo(Triple, Opts);

	  case llvm::Triple::tcele:
	    return new TCELETargetInfo(Triple, Opts);

	  case llvm::Triple::x86:
	    if (Triple.isOSDarwin())
	      return new DarwinI386TargetInfo(Triple, Opts);

	    switch (os) {
	    case llvm::Triple::Ananas:
	      return new AnanasTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::CloudABI:
	      return new CloudABITargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::Linux: {
	      switch (Triple.getEnvironment()) {
	      default:
	        return new LinuxTargetInfo<X86_32TargetInfo>(Triple, Opts);
	      case llvm::Triple::Android:
	        return new AndroidX86_32TargetInfo(Triple, Opts);
	      }
	    }
	    case llvm::Triple::DragonFly:
	      return new DragonFlyBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDI386TargetInfo(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDI386TargetInfo(Triple, Opts);
	    case llvm::Triple::Bitrig:
	      return new BitrigI386TargetInfo(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::KFreeBSD:
	      return new KFreeBSDTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::Minix:
	      return new MinixTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::Solaris:
	      return new SolarisTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::Win32: {
	      switch (Triple.getEnvironment()) {
	      case llvm::Triple::Cygnus:
	        return new CygwinX86_32TargetInfo(Triple, Opts);
	      case llvm::Triple::GNU:
	        return new MinGWX86_32TargetInfo(Triple, Opts);
	      case llvm::Triple::Itanium:
	      case llvm::Triple::MSVC:
	      default: // Assume MSVC for unknown environments
	        return new MicrosoftX86_32TargetInfo(Triple, Opts);
	      }
	    }
	    case llvm::Triple::Haiku:
	      return new HaikuX86_32TargetInfo(Triple, Opts);
	    case llvm::Triple::RTEMS:
	      return new RTEMSX86_32TargetInfo(Triple, Opts);
	    case llvm::Triple::NaCl:
	      return new NaClTargetInfo<X86_32TargetInfo>(Triple, Opts);
	    case llvm::Triple::ELFIAMCU:
	      return new MCUX86_32TargetInfo(Triple, Opts);
	    default:
	      return new X86_32TargetInfo(Triple, Opts);
	    }

	  case llvm::Triple::x86_64:
	    if (Triple.isOSDarwin() || Triple.isOSBinFormatMachO())
	      return new DarwinX86_64TargetInfo(Triple, Opts);

	    switch (os) {
	    case llvm::Triple::Ananas:
	      return new AnanasTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::CloudABI:
	      return new CloudABITargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::Linux: {
	      switch (Triple.getEnvironment()) {
	      default:
	        return new LinuxTargetInfo<X86_64TargetInfo>(Triple, Opts);
	      case llvm::Triple::Android:
	        return new AndroidX86_64TargetInfo(Triple, Opts);
	      }
	    }
	    case llvm::Triple::DragonFly:
	      return new DragonFlyBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::NetBSD:
	      return new NetBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::OpenBSD:
	      return new OpenBSDX86_64TargetInfo(Triple, Opts);
	    case llvm::Triple::Bitrig:
	      return new BitrigX86_64TargetInfo(Triple, Opts);
	    case llvm::Triple::FreeBSD:
	      return new FreeBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::Fuchsia:
	      return new FuchsiaTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::KFreeBSD:
	      return new KFreeBSDTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::Solaris:
	      return new SolarisTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::Win32: {
	      switch (Triple.getEnvironment()) {
	      case llvm::Triple::Cygnus:
	        return new CygwinX86_64TargetInfo(Triple, Opts);
	      case llvm::Triple::GNU:
	        return new MinGWX86_64TargetInfo(Triple, Opts);
	      case llvm::Triple::MSVC:
	      default: // Assume MSVC for unknown environments
	        return new MicrosoftX86_64TargetInfo(Triple, Opts);
	      }
	    }
	    case llvm::Triple::Haiku:
	      return new HaikuTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::NaCl:
	      return new NaClTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    case llvm::Triple::PS4:
	      return new PS4OSTargetInfo<X86_64TargetInfo>(Triple, Opts);
	    default:
	      return new X86_64TargetInfo(Triple, Opts);
	    }

	  // SPIR triples are accepted only with an unknown OS and environment.
	  case llvm::Triple::spir: {
	    if (Triple.getOS() != llvm::Triple::UnknownOS ||
	        Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
	      return nullptr;
	    return new SPIR32TargetInfo(Triple, Opts);
	  }
	  case llvm::Triple::spir64: {
	    if (Triple.getOS() != llvm::Triple::UnknownOS ||
	        Triple.getEnvironment() != llvm::Triple::UnknownEnvironment)
	      return nullptr;
	    return new SPIR64TargetInfo(Triple, Opts);
	  }
	  // WebAssembly triples must have no sub-arch, an unknown vendor, OS and
	  // environment, and an ELF or Wasm object format.
	  case llvm::Triple::wasm32:
	    if (Triple.getSubArch() != llvm::Triple::NoSubArch ||
	        Triple.getVendor() != llvm::Triple::UnknownVendor ||
	        Triple.getOS() != llvm::Triple::UnknownOS ||
	        Triple.getEnvironment() != llvm::Triple::UnknownEnvironment ||
	        !(Triple.isOSBinFormatELF() || Triple.isOSBinFormatWasm()))
	      return nullptr;
	    return new WebAssemblyOSTargetInfo<WebAssembly32TargetInfo>(Triple, Opts);
	  case llvm::Triple::wasm64:
	    if (Triple.getSubArch() != llvm::Triple::NoSubArch ||
	        Triple.getVendor() != llvm::Triple::UnknownVendor ||
	        Triple.getOS() != llvm::Triple::UnknownOS ||
	        Triple.getEnvironment() != llvm::Triple::UnknownEnvironment ||
	        !(Triple.isOSBinFormatELF() || Triple.isOSBinFormatWasm()))
	      return nullptr;
	    return new WebAssemblyOSTargetInfo<WebAssembly64TargetInfo>(Triple, Opts);

	  case llvm::Triple::renderscript32:
	    return new LinuxTargetInfo<RenderScript32TargetInfo>(Triple, Opts);
	  case llvm::Triple::renderscript64:
	    return new LinuxTargetInfo<RenderScript64TargetInfo>(Triple, Opts);
	  }
	}

	/// CreateTargetInfo - Allocate and configure the target info object for the
	/// triple named in \p Opts.
	///
	/// The CPU, ABI and FP math unit requested in the options are applied in that
	/// order, the feature map is computed and written back into Opts->Features,
	/// and the finished target is validated. Every failure path returns nullptr;
	/// an unknown triple, CPU, ABI or fpmath value is reported through \p Diags
	/// here before returning.
	TargetInfo *
	TargetInfo::CreateTargetInfo(DiagnosticsEngine &Diags,
	                             const std::shared_ptr<TargetOptions> &Opts) {
	  llvm::Triple TargetTriple(Opts->Triple);

	  // Own the new target through a unique_ptr so that each early return below
	  // destroys it.
	  std::unique_ptr<TargetInfo> NewTarget(AllocateTarget(TargetTriple, *Opts));
	  if (!NewTarget) {
	    Diags.Report(diag::err_target_unknown_triple) << TargetTriple.str();
	    return nullptr;
	  }
	  NewTarget->TargetOpts = Opts;

	  // An empty option means "not specified"; only non-empty values are applied.
	  const bool CPUAccepted = Opts->CPU.empty() || NewTarget->setCPU(Opts->CPU);
	  if (!CPUAccepted) {
	    Diags.Report(diag::err_target_unknown_cpu) << Opts->CPU;
	    return nullptr;
	  }

	  const bool ABIAccepted = Opts->ABI.empty() || NewTarget->setABI(Opts->ABI);
	  if (!ABIAccepted) {
	    Diags.Report(diag::err_target_unknown_abi) << Opts->ABI;
	    return nullptr;
	  }

	  const bool FPMathAccepted =
	      Opts->FPMath.empty() || NewTarget->setFPMath(Opts->FPMath);
	  if (!FPMathAccepted) {
	    Diags.Report(diag::err_target_unknown_fpmath) << Opts->FPMath;
	    return nullptr;
	  }

	  // The target computes the default feature set itself because features may
	  // depend on one another.
	  llvm::StringMap<bool> FeatureMap;
	  if (!NewTarget->initFeatureMap(FeatureMap, Diags, Opts->CPU,
	                                 Opts->FeaturesAsWritten))
	    return nullptr;

	  // Replace the option's feature list with the computed one, spelled as
	  // "+name" for enabled and "-name" for disabled features.
	  Opts->Features.clear();
	  for (const auto &Entry : FeatureMap) {
	    const char *Sign = Entry.getValue() ? "+" : "-";
	    Opts->Features.push_back(Sign + Entry.getKey().str());
	  }

	  if (!NewTarget->handleTargetFeatures(Opts->Features, Diags))
	    return nullptr;

	  NewTarget->setSupportedOpenCLOpts();
	  NewTarget->setOpenCLExtensionOpts();

	  if (!NewTarget->validateTarget(Diags))
	    return nullptr;

	  // Success: hand ownership to the caller.
	  return NewTarget.release();
	}
	Index: head/contrib/llvm/tools/clang/lib/Driver/ToolChains/Arch/Mips.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Driver/ToolChains/Arch/Mips.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Driver/ToolChains/Arch/Mips.cpp (revision 322320)
	@@ -1,407 +1,405 @@
	//===--- Mips.cpp - Tools Implementations -----------------------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "Mips.h"
	#include "ToolChains/CommonArgs.h"
	#include "clang/Driver/Driver.h"
	#include "clang/Driver/DriverDiagnostic.h"
	#include "clang/Driver/Options.h"
	#include "llvm/ADT/StringSwitch.h"
	#include "llvm/Option/ArgList.h"

	using namespace clang::driver;
	using namespace clang::driver::tools;
	using namespace clang;
	using namespace llvm::opt;

	/// Return true when \p Arch is one of the four MIPS architectures
	/// (32- or 64-bit, big- or little-endian).
	bool tools::isMipsArch(llvm::Triple::ArchType Arch) {
	  switch (Arch) {
	  case llvm::Triple::mips:
	  case llvm::Triple::mipsel:
	  case llvm::Triple::mips64:
	  case llvm::Triple::mips64el:
	    return true;
	  default:
	    return false;
	  }
	}

	// Get CPU and ABI names. They are not independent
	// so we have to calculate them together.
	//
	// CPUName and ABIName are in/out parameters: a value passed in is kept unless
	// -march=/-mcpu= or -mabi= overrides it, and whichever name is still empty
	// afterwards is derived from the other one or from the triple.
	void mips::getMipsCPUAndABI(const ArgList &Args, const llvm::Triple &Triple,
	StringRef &CPUName, StringRef &ABIName) {
	// Fallback CPUs; the vendor/OS checks below may replace them.
	const char *DefMips32CPU = "mips32r2";
	const char *DefMips64CPU = "mips64r2";

	// MIPS32r6 is the default for mips(el)?-img-linux-gnu and MIPS64r6 is the
	// default for mips64(el)?-img-linux-gnu.
	if (Triple.getVendor() == llvm::Triple::ImaginationTechnologies &&
	Triple.getEnvironment() == llvm::Triple::GNU) {
	DefMips32CPU = "mips32r6";
	DefMips64CPU = "mips64r6";
	}

	// On Android, MIPS32 is the default 32-bit CPU and MIPS64r6 is the default
	// for Android MIPS64 (mips64el-linux-android).
	if (Triple.isAndroid()) {
	DefMips32CPU = "mips32";
	DefMips64CPU = "mips64r6";
	}

	// MIPS3 is the default for mips64*-unknown-openbsd.
	if (Triple.getOS() == llvm::Triple::OpenBSD)
	DefMips64CPU = "mips3";

	// The last of -march= / -mcpu= on the command line selects the CPU.
	if (Arg *A = Args.getLastArg(clang::driver::options::OPT_march_EQ,
	options::OPT_mcpu_EQ))
	CPUName = A->getValue();

	if (Arg *A = Args.getLastArg(options::OPT_mabi_EQ)) {
	ABIName = A->getValue();
	// Convert a GNU style Mips ABI name to the name
	// accepted by LLVM Mips backend.
	// Any other spelling is passed through unchanged.
	ABIName = llvm::StringSwitch<llvm::StringRef>(ABIName)
	.Case("32", "o32")
	.Case("64", "n64")
	.Default(ABIName);
	}

	// Setup default CPU and ABI names.
	// With neither name known, the triple's architecture picks the default CPU.
	if (CPUName.empty() && ABIName.empty()) {
	switch (Triple.getArch()) {
	default:
	llvm_unreachable("Unexpected triple arch name");
	case llvm::Triple::mips:
	case llvm::Triple::mipsel:
	CPUName = DefMips32CPU;
	break;
	case llvm::Triple::mips64:
	case llvm::Triple::mips64el:
	CPUName = DefMips64CPU;
	break;
	}
	}

	// For the MipsTechnologies and ImaginationTechnologies vendors, derive the
	// ABI from the CPU name; an unlisted CPU leaves the ABI empty.
	if (ABIName.empty() &&
	(Triple.getVendor() == llvm::Triple::MipsTechnologies \|\|
	Triple.getVendor() == llvm::Triple::ImaginationTechnologies)) {
	ABIName = llvm::StringSwitch<const char *>(CPUName)
	.Case("mips1", "o32")
	.Case("mips2", "o32")
	.Case("mips3", "n64")
	.Case("mips4", "n64")
	.Case("mips5", "n64")
	.Case("mips32", "o32")
	.Case("mips32r2", "o32")
	.Case("mips32r3", "o32")
	.Case("mips32r5", "o32")
	.Case("mips32r6", "o32")
	.Case("mips64", "n64")
	.Case("mips64r2", "n64")
	.Case("mips64r3", "n64")
	.Case("mips64r5", "n64")
	.Case("mips64r6", "n64")
	.Case("octeon", "n64")
	.Case("p5600", "o32")
	.Default("");
	}

	if (ABIName.empty()) {
	// Deduce ABI name from the target triple.
	if (Triple.getArch() == llvm::Triple::mips \|\|
	Triple.getArch() == llvm::Triple::mipsel)
	ABIName = "o32";
	else
	ABIName = "n64";
	}

	if (CPUName.empty()) {
	// Deduce CPU name from ABI name.
	// An ABI other than o32/n32/n64 leaves the CPU name empty.
	CPUName = llvm::StringSwitch<const char *>(ABIName)
	.Case("o32", DefMips32CPU)
	.Cases("n32", "n64", DefMips64CPU)
	.Default("");
	}

	// FIXME: Warn on inconsistent use of -march and -mabi.
	}

	/// Return the library directory suffix for the selected MIPS ABI: "" for
	/// o32, "32" for n32 and "64" for n64.
	///
	/// Any other ABI name (for example an unrecognised -mabi= value, which
	/// getMipsCPUAndABI passes through unchanged) yields "" rather than falling
	/// off the end of the StringSwitch.
	std::string mips::getMipsABILibSuffix(const ArgList &Args,
	                                      const llvm::Triple &Triple) {
	  StringRef CPUName, ABIName;
	  tools::mips::getMipsCPUAndABI(Args, Triple, CPUName, ABIName);
	  return llvm::StringSwitch<std::string>(ABIName)
	      .Case("o32", "")
	      .Case("n32", "32")
	      .Case("n64", "64")
	      .Default("");
	}

	// Map an LLVM MIPS ABI name to the spelling the GNU tools accept:
	// "o32" becomes "32" and "n64" becomes "64"; every other name is returned
	// unchanged.
	StringRef mips::getGnuCompatibleMipsABIName(StringRef ABI) {
	  if (ABI == "o32")
	    return "32";
	  if (ABI == "n64")
	    return "64";
	  return ABI;
	}

	// Pick the MIPS float ABI from the last of -msoft-float, -mhard-float and
	// -mfloat-abi=. A non-empty, unrecognised -mfloat-abi= value is diagnosed
	// through D and treated as hard float; with no usable argument the result
	// is hard float as well.
	mips::FloatABI mips::getMipsFloatABI(const Driver &D, const ArgList &Args) {
	  mips::FloatABI Selected = mips::FloatABI::Invalid;

	  Arg *FloatArg =
	      Args.getLastArg(options::OPT_msoft_float, options::OPT_mhard_float,
	                      options::OPT_mfloat_abi_EQ);
	  if (FloatArg) {
	    const auto &Opt = FloatArg->getOption();
	    if (Opt.matches(options::OPT_msoft_float)) {
	      Selected = mips::FloatABI::Soft;
	    } else if (Opt.matches(options::OPT_mhard_float)) {
	      Selected = mips::FloatABI::Hard;
	    } else {
	      // -mfloat-abi=<value>
	      StringRef Value = FloatArg->getValue();
	      Selected = llvm::StringSwitch<mips::FloatABI>(Value)
	                     .Case("soft", mips::FloatABI::Soft)
	                     .Case("hard", mips::FloatABI::Hard)
	                     .Default(mips::FloatABI::Invalid);
	      if (Selected == mips::FloatABI::Invalid && !Value.empty()) {
	        D.Diag(clang::diag::err_drv_invalid_mfloat_abi)
	            << FloatArg->getAsString(Args);
	        Selected = mips::FloatABI::Hard;
	      }
	    }
	  }

	  // Nothing selected so far: assume "hard", the default value used by gcc.
	  // Once specific target MIPS processors are recognised, the default can be
	  // chosen more precisely.
	  if (Selected == mips::FloatABI::Invalid)
	    Selected = mips::FloatABI::Hard;

	  assert(Selected != mips::FloatABI::Invalid && "must select an ABI");
	  return Selected;
	}

	// Translate the MIPS-related driver options in Args into target feature
	// strings ("+name" / "-name") appended to Features. The CPU and ABI are
	// resolved with getMipsCPUAndABI, and unsupported -mnan= requests are
	// reported through D.
	void mips::getMIPSTargetFeatures(const Driver &D, const llvm::Triple &Triple,
	const ArgList &Args,
	std::vector<StringRef> &Features) {
	StringRef CPUName;
	StringRef ABIName;
	getMipsCPUAndABI(Args, Triple, CPUName, ABIName);
	// From here on ABIName holds the GNU spelling ("32", "64", ...).
	ABIName = getGnuCompatibleMipsABIName(ABIName);

	// Historically, PIC code for MIPS was associated with -mabicalls, a.k.a
	// SVR4 abicalls. Static code does not use SVR4 calling sequences. An ABI
	// extension was developed by Richard Sandiford & Code Sourcery to support
	// static code calling PIC code (CPIC). For O32 and N32 this means we have
	// several combinations of PIC/static and abicalls. Pure static, static
	// with the CPIC extension, and pure PIC code.

	// At final link time, O32 and N32 with CPIC will have another section
	// added to the binary which contains the stub functions to perform
	// any fixups required for PIC code.

	// For N64, the situation is more regular: code can either be static
	// (non-abicalls) or PIC (abicalls). GCC has traditionally picked PIC
	// code for N64. Since Clang has already built the relocation model portion
	// of the commandline, we add the +noabicalls feature in the N64 static
	// case.

	// There is another case to be accounted for: -msym32, which enforces that
	// all symbols have 32 bits in size. In this case, N64 can in theory use CPIC
	// but it is unsupported.

	// The combinations for N64 are:
	// a) Static without abicalls and 64bit symbols.
	// b) Static with abicalls and 32bit symbols.
	// c) PIC with abicalls and 64bit symbols.

	// For case (a) we need to add +noabicalls for N64.

	bool IsN64 = ABIName == "64";
	bool NonPIC = false;

	// NonPIC is set only when the last PIC/PIE option is one of the -fno-* forms.
	Arg *LastPICArg = Args.getLastArg(options::OPT_fPIC, options::OPT_fno_PIC,
	options::OPT_fpic, options::OPT_fno_pic,
	options::OPT_fPIE, options::OPT_fno_PIE,
	options::OPT_fpie, options::OPT_fno_pie);
	if (LastPICArg) {
	Option O = LastPICArg->getOption();
	NonPIC =
	(O.matches(options::OPT_fno_PIC) \|\| O.matches(options::OPT_fno_pic) \|\|
	O.matches(options::OPT_fno_PIE) \|\| O.matches(options::OPT_fno_pie));
	}

	// N64 without PIC forces +noabicalls; otherwise -mno-abicalls/-mabicalls
	// decide.
	if (IsN64 && NonPIC)
	Features.push_back("+noabicalls");
	else
	AddTargetFeature(Args, Features, options::OPT_mno_abicalls,
	options::OPT_mabicalls, "noabicalls");

	mips::FloatABI FloatABI = mips::getMipsFloatABI(D, Args);
	if (FloatABI == mips::FloatABI::Soft) {
	// FIXME: Note, this is a hack. We need to pass the selected float
	// mode to the MipsTargetInfoBase to define appropriate macros there.
	// Now it is the only method.
	Features.push_back("+soft-float");
	}

	// -mnan=2008 / -mnan=legacy: when the CPU does not support the requested
	// encoding, the other encoding is selected and a warning is emitted.
	if (Arg *A = Args.getLastArg(options::OPT_mnan_EQ)) {
	StringRef Val = StringRef(A->getValue());
	if (Val == "2008") {
	if (mips::getSupportedNanEncoding(CPUName) & mips::Nan2008)
	Features.push_back("+nan2008");
	else {
	Features.push_back("-nan2008");
	D.Diag(diag::warn_target_unsupported_nan2008) << CPUName;
	}
	} else if (Val == "legacy") {
	if (mips::getSupportedNanEncoding(CPUName) & mips::NanLegacy)
	Features.push_back("-nan2008");
	else {
	Features.push_back("+nan2008");
	D.Diag(diag::warn_target_unsupported_nanlegacy) << CPUName;
	}
	} else
	D.Diag(diag::err_drv_unsupported_option_argument)
	<< A->getOption().getName() << Val;
	}

	AddTargetFeature(Args, Features, options::OPT_msingle_float,
	options::OPT_mdouble_float, "single-float");
	AddTargetFeature(Args, Features, options::OPT_mips16, options::OPT_mno_mips16,
	"mips16");
	AddTargetFeature(Args, Features, options::OPT_mmicromips,
	options::OPT_mno_micromips, "micromips");
	AddTargetFeature(Args, Features, options::OPT_mdsp, options::OPT_mno_dsp,
	"dsp");
	AddTargetFeature(Args, Features, options::OPT_mdspr2, options::OPT_mno_dspr2,
	"dspr2");
	AddTargetFeature(Args, Features, options::OPT_mmsa, options::OPT_mno_msa,
	"msa");

	// Add the last -mfp32/-mfpxx/-mfp64, if none are given and the ABI is O32
	// pass -mfpxx, or if none are given and fp64a is default, pass fp64 and
	// nooddspreg.
	if (Arg *A = Args.getLastArg(options::OPT_mfp32, options::OPT_mfpxx,
	options::OPT_mfp64)) {
	if (A->getOption().matches(options::OPT_mfp32))
	Features.push_back("-fp64");
	else if (A->getOption().matches(options::OPT_mfpxx)) {
	Features.push_back("+fpxx");
	Features.push_back("+nooddspreg");
	} else
	Features.push_back("+fp64");
	} else if (mips::shouldUseFPXX(Args, Triple, CPUName, ABIName, FloatABI)) {
	Features.push_back("+fpxx");
	Features.push_back("+nooddspreg");
	} else if (mips::isFP64ADefault(Triple, CPUName)) {
	Features.push_back("+fp64");
	Features.push_back("+nooddspreg");
	}

	AddTargetFeature(Args, Features, options::OPT_mno_odd_spreg,
	options::OPT_modd_spreg, "nooddspreg");
	- AddTargetFeature(Args, Features, options::OPT_mno_madd4, options::OPT_mmadd4,
	- "nomadd4");
	AddTargetFeature(Args, Features, options::OPT_mlong_calls,
	options::OPT_mno_long_calls, "long-calls");
	AddTargetFeature(Args, Features, options::OPT_mmt, options::OPT_mno_mt,"mt");
	}

	// Return the set of NaN encodings (a bitmask of NanLegacy and Nan2008) that
	// the named CPU supports. CPUs not listed below are treated as NanLegacy
	// only.
	mips::NanEncoding mips::getSupportedNanEncoding(StringRef &CPU) {
	// Strictly speaking, mips32r2 and mips64r2 are NanLegacy-only since Nan2008
	// was first introduced in Release 3. However, other compilers have
	// traditionally allowed it for Release 2 so we should do the same.
	// The switch works on int so the two flags can be combined, then the result
	// is cast back to the enum.
	return (NanEncoding)llvm::StringSwitch<int>(CPU)
	.Case("mips1", NanLegacy)
	.Case("mips2", NanLegacy)
	.Case("mips3", NanLegacy)
	.Case("mips4", NanLegacy)
	.Case("mips5", NanLegacy)
	.Case("mips32", NanLegacy)
	.Case("mips32r2", NanLegacy \| Nan2008)
	.Case("mips32r3", NanLegacy \| Nan2008)
	.Case("mips32r5", NanLegacy \| Nan2008)
	.Case("mips32r6", Nan2008)
	.Case("mips64", NanLegacy)
	.Case("mips64r2", NanLegacy \| Nan2008)
	.Case("mips64r3", NanLegacy \| Nan2008)
	.Case("mips64r5", NanLegacy \| Nan2008)
	.Case("mips64r6", Nan2008)
	.Default(NanLegacy);
	}

	// Report whether the named CPU has compact branches; only mips32r6 and
	// mips64r6 do.
	bool mips::hasCompactBranches(StringRef &CPU) {
	  return CPU == "mips32r6" || CPU == "mips64r6";
	}

	bool mips::hasMipsAbiArg(const ArgList &Args, const char *Value) {
	Arg *A = Args.getLastArg(options::OPT_mabi_EQ);
	return A && (A->getValue() == StringRef(Value));
	}

	bool mips::isUCLibc(const ArgList &Args) {
	Arg *A = Args.getLastArg(options::OPT_m_libc_Group);
	return A && A->getOption().matches(options::OPT_muclibc);
	}

	/// Decide whether the NaN2008 encoding is in effect.
	///
	/// An explicit -mnan= argument wins: "2008" selects it, while "legacy" and
	/// any unrecognised value do not. Without the argument the answer follows
	/// the selected CPU and is true only for mips32r6 and mips64r6.
	bool mips::isNaN2008(const ArgList &Args, const llvm::Triple &Triple) {
	  if (Arg *NaNArg = Args.getLastArg(options::OPT_mnan_EQ))
	    return llvm::StringSwitch<bool>(NaNArg->getValue())
	        .Case("2008", true)
	        .Case("legacy", false)
	        .Default(false);

	  // NaN2008 is the default for MIPS32r6/MIPS64r6.
	  return llvm::StringSwitch<bool>(getCPUName(Args, Triple))
	      .Cases("mips32r6", "mips64r6", true)
	      .Default(false);
	}

	// FP64A is the default only for Android targets whose CPU is MIPS32R6.
	bool mips::isFP64ADefault(const llvm::Triple &Triple, StringRef CPUName) {
	  return Triple.isAndroid() && CPUName == "mips32r6";
	}

	// Return true when FPXX is the default floating-point mode for the given
	// triple, CPU, ABI (in its GNU spelling) and float ABI.
	bool mips::isFPXXDefault(const llvm::Triple &Triple, StringRef CPUName,
	                         StringRef ABIName, mips::FloatABI FloatABI) {
	  // Only the IMG and MTI vendors and Android default to FPXX at all.
	  const bool PlatformDefaultsToFPXX =
	      Triple.getVendor() == llvm::Triple::ImaginationTechnologies ||
	      Triple.getVendor() == llvm::Triple::MipsTechnologies ||
	      Triple.isAndroid();

	  // Beyond the platform, FPXX requires the "32" ABI and must not be used when
	  // either -msoft-float or -mfloat-abi=soft is present.
	  if (!PlatformDefaultsToFPXX || ABIName != "32" ||
	      FloatABI == mips::FloatABI::Soft)
	    return false;

	  // mips1 and the r6 CPUs are deliberately absent from this list.
	  return llvm::StringSwitch<bool>(CPUName)
	      .Cases("mips2", "mips3", "mips4", "mips5", true)
	      .Cases("mips32", "mips32r2", "mips32r3", "mips32r5", true)
	      .Cases("mips64", "mips64r2", "mips64r3", "mips64r5", true)
	      .Default(false);
	}

	// Return true when FPXX should be used: it must be the default for this
	// configuration (see isFPXXDefault) and the last of -msingle-float /
	// -mdouble-float must not be -msingle-float.
	bool mips::shouldUseFPXX(const ArgList &Args, const llvm::Triple &Triple,
	                         StringRef CPUName, StringRef ABIName,
	                         mips::FloatABI FloatABI) {
	  const bool DefaultsToFPXX =
	      isFPXXDefault(Triple, CPUName, ABIName, FloatABI);

	  Arg *PrecisionArg = Args.getLastArg(options::OPT_msingle_float,
	                                      options::OPT_mdouble_float);
	  const bool SingleFloat =
	      PrecisionArg &&
	      PrecisionArg->getOption().matches(options::OPT_msingle_float);

	  return DefaultsToFPXX && !SingleFloat;
	}
	Index: head/contrib/llvm/tools/clang/lib/Driver/ToolChains/OpenBSD.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Driver/ToolChains/OpenBSD.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Driver/ToolChains/OpenBSD.cpp (revision 322320)
	@@ -1,234 +1,236 @@
	//===--- OpenBSD.cpp - OpenBSD ToolChain Implementations --------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#include "OpenBSD.h"
	#include "Arch/Mips.h"
	#include "Arch/Sparc.h"
	#include "CommonArgs.h"
	#include "clang/Driver/Compilation.h"
	#include "clang/Driver/Options.h"
	#include "llvm/Option/ArgList.h"

	using namespace clang::driver;
	using namespace clang::driver::tools;
	using namespace clang::driver::toolchains;
	using namespace clang;
	using namespace llvm::opt;

	/// Build the as(1) command for one assemble job and add it to \p C.
	///
	/// Architecture-specific flags come first, followed by the values of
	/// -Wa, / -Xassembler, then "-o <output>" and the input file names.
	void openbsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
	                                      const InputInfo &Output,
	                                      const InputInfoList &Inputs,
	                                      const ArgList &Args,
	                                      const char *LinkingOutput) const {
	  claimNoWarnArgs(Args);

	  const auto &TC = getToolChain();
	  const llvm::Triple &TargetTriple = TC.getTriple();
	  const llvm::Triple::ArchType Arch = TC.getArch();
	  ArgStringList AsArgs;

	  switch (Arch) {
	  case llvm::Triple::x86:
	    // When building 32-bit code on OpenBSD/amd64, we have to explicitly
	    // instruct as in the base system to assemble 32-bit code.
	    AsArgs.push_back("--32");
	    break;

	  case llvm::Triple::ppc:
	    AsArgs.push_back("-mppc");
	    AsArgs.push_back("-many");
	    break;

	  case llvm::Triple::sparc:
	  case llvm::Triple::sparcel:
	  case llvm::Triple::sparcv9: {
	    // The three SPARC flavours differ only in the word-size flag.
	    AsArgs.push_back(Arch == llvm::Triple::sparcv9 ? "-64" : "-32");
	    std::string CPU = getCPUName(Args, TargetTriple);
	    AsArgs.push_back(sparc::getSparcAsmModeForCPU(CPU, TargetTriple));
	    AddAssemblerKPIC(TC, Args, AsArgs);
	    break;
	  }

	  case llvm::Triple::mips64:
	  case llvm::Triple::mips64el: {
	    StringRef CPUName;
	    StringRef ABIName;
	    mips::getMipsCPUAndABI(Args, TargetTriple, CPUName, ABIName);

	    AsArgs.push_back("-mabi");
	    AsArgs.push_back(mips::getGnuCompatibleMipsABIName(ABIName).data());

	    // mips64 is assembled big-endian, mips64el little-endian.
	    AsArgs.push_back(Arch == llvm::Triple::mips64 ? "-EB" : "-EL");

	    AddAssemblerKPIC(TC, Args, AsArgs);
	    break;
	  }

	  default:
	    break;
	  }

	  Args.AddAllArgValues(AsArgs, options::OPT_Wa_COMMA, options::OPT_Xassembler);

	  AsArgs.push_back("-o");
	  AsArgs.push_back(Output.getFilename());

	  for (const auto &Input : Inputs)
	    AsArgs.push_back(Input.getFilename());

	  const char *AsPath = Args.MakeArgString(TC.GetProgramPath("as"));
	  C.addCommand(llvm::make_unique<Command>(JA, *this, AsPath, AsArgs, Inputs));
	}

	// Build the linker command for one link job and add it to C. Arguments are
	// emitted in this order: endianness and entry point, static/dynamic mode,
	// PIE flags, output file, startup objects, library search paths and
	// pass-through options, linker inputs, default libraries, and the closing
	// crtend object.
	void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA,
	const InputInfo &Output,
	const InputInfoList &Inputs,
	const ArgList &Args,
	const char *LinkingOutput) const {
	const Driver &D = getToolChain().getDriver();
	ArgStringList CmdArgs;

	// Silence warning for "clang -g foo.o -o foo"
	Args.ClaimAllArgs(options::OPT_g_Group);
	// and "clang -emit-llvm foo.o -o foo"
	Args.ClaimAllArgs(options::OPT_emit_llvm);
	// and for "clang -w foo.o -o foo". Other warning options are already
	// handled somewhere else.
	Args.ClaimAllArgs(options::OPT_w);

	// Only the two mips64 variants get an explicit endianness flag.
	if (getToolChain().getArch() == llvm::Triple::mips64)
	CmdArgs.push_back("-EB");
	else if (getToolChain().getArch() == llvm::Triple::mips64el)
	CmdArgs.push_back("-EL");

	// Set the entry point to __start unless -nostdlib or -shared is given.
	if (!Args.hasArg(options::OPT_nostdlib, options::OPT_shared)) {
	CmdArgs.push_back("-e");
	CmdArgs.push_back("__start");
	}

	CmdArgs.push_back("--eh-frame-hdr");
	if (Args.hasArg(options::OPT_static)) {
	CmdArgs.push_back("-Bstatic");
	} else {
	if (Args.hasArg(options::OPT_rdynamic))
	CmdArgs.push_back("-export-dynamic");
	CmdArgs.push_back("-Bdynamic");
	if (Args.hasArg(options::OPT_shared)) {
	CmdArgs.push_back("-shared");
	} else {
	// Dynamically linked executables name the run-time linker explicitly.
	CmdArgs.push_back("-dynamic-linker");
	CmdArgs.push_back("/usr/libexec/ld.so");
	}
	}

	+ if (Args.hasArg(options::OPT_pie))
	+ CmdArgs.push_back("-pie");
	if (Args.hasArg(options::OPT_nopie))
	CmdArgs.push_back("-nopie");

	if (Output.isFilename()) {
	CmdArgs.push_back("-o");
	CmdArgs.push_back(Output.getFilename());
	} else {
	assert(Output.isNothing() && "Invalid output.");
	}

	// Startup objects: executables get gcrt0.o with -pg, rcrt0.o when linking
	// statically without -nopie, and crt0.o otherwise, followed by crtbegin.o;
	// shared objects get crtbeginS.o only.
	if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
	if (!Args.hasArg(options::OPT_shared)) {
	if (Args.hasArg(options::OPT_pg))
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("gcrt0.o")));
	else if (Args.hasArg(options::OPT_static) &&
	!Args.hasArg(options::OPT_nopie))
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("rcrt0.o")));
	else
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("crt0.o")));
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("crtbegin.o")));
	} else {
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("crtbeginS.o")));
	}
	}

	// The gcc-lib search directory uses "amd64" where the triple string starts
	// with "x86_64".
	std::string Triple = getToolChain().getTripleString();
	if (Triple.substr(0, 6) == "x86_64")
	Triple.replace(0, 6, "amd64");
	CmdArgs.push_back(
	Args.MakeArgString("-L/usr/lib/gcc-lib/" + Triple + "/4.2.1"));

	// Forward the user's -L, -T*, -e, -s, -t, -Z and -r options.
	Args.AddAllArgs(CmdArgs, {options::OPT_L, options::OPT_T_Group,
	options::OPT_e, options::OPT_s, options::OPT_t,
	options::OPT_Z_Flag, options::OPT_r});

	AddLinkerInputs(getToolChain(), Inputs, Args, CmdArgs, JA);

	// Default libraries; with -pg the profiling variants (-lm_p, -lpthread_p,
	// -lc_p) are used instead.
	if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nodefaultlibs)) {
	if (D.CCCIsCXX()) {
	getToolChain().AddCXXStdlibLibArgs(Args, CmdArgs);
	if (Args.hasArg(options::OPT_pg))
	CmdArgs.push_back("-lm_p");
	else
	CmdArgs.push_back("-lm");
	}

	// FIXME: For some reason GCC passes -lgcc before adding
	// the default system libraries. Just mimic this for now.
	CmdArgs.push_back("-lgcc");

	if (Args.hasArg(options::OPT_pthread)) {
	if (!Args.hasArg(options::OPT_shared) && Args.hasArg(options::OPT_pg))
	CmdArgs.push_back("-lpthread_p");
	else
	CmdArgs.push_back("-lpthread");
	}

	// libc is added only when not building a shared object.
	if (!Args.hasArg(options::OPT_shared)) {
	if (Args.hasArg(options::OPT_pg))
	CmdArgs.push_back("-lc_p");
	else
	CmdArgs.push_back("-lc");
	}

	// -lgcc is passed a second time, after the system libraries.
	CmdArgs.push_back("-lgcc");
	}

	// Closing startup object, matching the crtbegin variant chosen above.
	if (!Args.hasArg(options::OPT_nostdlib, options::OPT_nostartfiles)) {
	if (!Args.hasArg(options::OPT_shared))
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("crtend.o")));
	else
	CmdArgs.push_back(
	Args.MakeArgString(getToolChain().GetFilePath("crtendS.o")));
	}

	const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath());
	C.addCommand(llvm::make_unique<Command>(JA, *this, Exec, CmdArgs, Inputs));
	}

	/// OpenBSD - OpenBSD tool chain which can call as(1) and ld(1) directly.

	OpenBSD::OpenBSD(const Driver &D, const llvm::Triple &Triple,
	                 const ArgList &Args)
	    : Generic_ELF(D, Triple, Args) {
	  // Register "<driver Dir>/../lib" first and then "/usr/lib" as file paths.
	  auto &FilePaths = getFilePaths();
	  FilePaths.push_back(getDriver().Dir + "/../lib");
	  FilePaths.push_back("/usr/lib");
	}

	// Create the openbsd::Assembler tool bound to this toolchain; the caller
	// receives a newly allocated object.
	Tool *OpenBSD::buildAssembler() const {
	  Tool *AssemblerTool = new tools::openbsd::Assembler(*this);
	  return AssemblerTool;
	}

	// Create the openbsd::Linker tool bound to this toolchain; the caller
	// receives a newly allocated object, as with buildAssembler().
	Tool *OpenBSD::buildLinker() const { return new tools::openbsd::Linker(*this); }
	Index: head/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Headers/avx2intrin.h (revision 322320)
	@@ -1,1299 +1,1300 @@
	/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------===
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	*===-----------------------------------------------------------------------===
	*/

	#ifndef __IMMINTRIN_H
	#error "Never use <avx2intrin.h> directly; include <immintrin.h> instead."
	#endif

	#ifndef __AVX2INTRIN_H
	#define __AVX2INTRIN_H

	/* Define the default attributes for the functions in this file. */
	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx2")))

	/* SSE4 Multiple Packed Sums of Absolute Difference. */
	/* Expands to __builtin_ia32_mpsadbw256 applied to X and Y (each converted to
	   __m256i and viewed as __v32qi) with M converted to int; the result is cast
	   to __m256i. */
	#define _mm256_mpsadbw_epu8(X, Y, M) \
	(__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
	(__v32qi)(__m256i)(Y), (int)(M))

	/* _mm256_abs_epi8: passes __a, viewed as __v32qi, to __builtin_ia32_pabsb256
	   and returns the result as __m256i. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_abs_epi8(__m256i __a)
	{
	return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a);
	}

	/* _mm256_abs_epi16: same as above for __v16hi via
	   __builtin_ia32_pabsw256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_abs_epi16(__m256i __a)
	{
	return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a);
	}

	/* _mm256_abs_epi32: same as above for __v8si via
	   __builtin_ia32_pabsd256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_abs_epi32(__m256i __a)
	{
	return (__m256i)__builtin_ia32_pabsd256((__v8si)__a);
	}

	/* _mm256_packs_epi16: forwards both operands, viewed as __v16hi, to
	   __builtin_ia32_packsswb256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_packs_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
	}

	/* _mm256_packs_epi32: forwards both operands, viewed as __v8si, to
	   __builtin_ia32_packssdw256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_packs_epi32(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
	}

	/* _mm256_packus_epi16: forwards both operands, viewed as __v16hi, to
	   __builtin_ia32_packuswb256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_packus_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
	}

	/* _mm256_packus_epi32: forwards both operands, viewed as __v8si, to
	   __builtin_ia32_packusdw256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_packus_epi32(__m256i __V1, __m256i __V2)
	{
	return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
	}

	/* The _mm256_add_epi* functions use the vector '+' operator directly rather
	   than a builtin; the operands are viewed through the __v*u vector types
	   (presumably the unsigned element types -- confirm against their typedefs). */

	/* _mm256_add_epi8: element-wise sum of the operands viewed as __v32qu. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_add_epi8(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v32qu)__a + (__v32qu)__b);
	}

	/* _mm256_add_epi16: element-wise sum of the operands viewed as __v16hu. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_add_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v16hu)__a + (__v16hu)__b);
	}

	/* _mm256_add_epi32: element-wise sum of the operands viewed as __v8su. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_add_epi32(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v8su)__a + (__v8su)__b);
	}

	/* _mm256_add_epi64: element-wise sum of the operands viewed as __v4du. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_add_epi64(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v4du)__a + (__v4du)__b);
	}

	/* _mm256_adds_epi8: forwards both operands, viewed as __v32qi, to
	   __builtin_ia32_paddsb256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_adds_epi8(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b);
	}

	/* _mm256_adds_epi16: forwards both operands, viewed as __v16hi, to
	   __builtin_ia32_paddsw256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_adds_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b);
	}

	/* _mm256_adds_epu8: forwards both operands, viewed as __v32qi, to
	   __builtin_ia32_paddusb256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_adds_epu8(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b);
	}

	/* _mm256_adds_epu16: forwards both operands, viewed as __v16hi, to
	   __builtin_ia32_paddusw256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_adds_epu16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b);
	}

	/* Expands to __builtin_ia32_palignr256 applied to a and b (each converted to
	   __m256i and viewed as __v32qi) and n; written as a statement expression
	   whose value is the __m256i result. */
	#define _mm256_alignr_epi8(a, b, n) __extension__ ({ \
	(__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
	(__v32qi)(__m256i)(b), (n)); })

	/* _mm256_and_si256: bitwise AND of the two operands, computed on their
	   __v4du views. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_and_si256(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v4du)__a & (__v4du)__b);
	}

	/* _mm256_andnot_si256: bitwise AND of the complement of __a with __b; only
	   the first operand is inverted. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_andnot_si256(__m256i __a, __m256i __b)
	{
	return (__m256i)(~(__v4du)__a & (__v4du)__b);
	}

	/* _mm256_avg_epu8: forwards both operands, viewed as __v32qi, to
	   __builtin_ia32_pavgb256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_avg_epu8(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
	}

	/* _mm256_avg_epu16: forwards both operands, viewed as __v16hi, to
	   __builtin_ia32_pavgw256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_avg_epu16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
	}

	/* _mm256_blendv_epi8: forwards __V1, __V2 and the mask __M, all viewed as
	   __v32qi, to __builtin_ia32_pblendvb256. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
	{
	return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
	(__v32qi)__M);
	}

	/* Blend V1 and V2, viewed as __v16hi, with __builtin_shufflevector. Shuffle
	   indices 0-15 refer to V1 and 16-31 to V2; for result element i, bit
	   (i mod 8) of M selects the V2 element when set and the V1 element when
	   clear, so the same 8 mask bits are applied to elements 0-7 and 8-15. */
	#define _mm256_blend_epi16(V1, V2, M) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v16hi)(__m256i)(V1), \
	(__v16hi)(__m256i)(V2), \
	(((M) & 0x01) ? 16 : 0), \
	(((M) & 0x02) ? 17 : 1), \
	(((M) & 0x04) ? 18 : 2), \
	(((M) & 0x08) ? 19 : 3), \
	(((M) & 0x10) ? 20 : 4), \
	(((M) & 0x20) ? 21 : 5), \
	(((M) & 0x40) ? 22 : 6), \
	(((M) & 0x80) ? 23 : 7), \
	(((M) & 0x01) ? 24 : 8), \
	(((M) & 0x02) ? 25 : 9), \
	(((M) & 0x04) ? 26 : 10), \
	(((M) & 0x08) ? 27 : 11), \
	(((M) & 0x10) ? 28 : 12), \
	(((M) & 0x20) ? 29 : 13), \
	(((M) & 0x40) ? 30 : 14), \
	(((M) & 0x80) ? 31 : 15)); })

	/* The _mm256_cmpeq_epi* functions apply the vector '==' operator to the
	   operands viewed with the element width named in the intrinsic, and return
	   the comparison result cast to __m256i. */

	/* _mm256_cmpeq_epi8: compares the operands as __v32qi. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v32qi)__a == (__v32qi)__b);
	}

	/* _mm256_cmpeq_epi16: compares the operands as __v16hi. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v16hi)__a == (__v16hi)__b);
	}

	/* _mm256_cmpeq_epi32: compares the operands as __v8si. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v8si)__a == (__v8si)__b);
	}

	/* _mm256_cmpeq_epi64: compares the operands as __v4di. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v4di)__a == (__v4di)__b);
	}

	/* The _mm256_cmpgt_epi* functions apply the vector '>' operator (__a > __b)
	   to the operands viewed with the element width named in the intrinsic, and
	   return the comparison result cast to __m256i. */

	/* _mm256_cmpgt_epi8: compares the operands as __v32qs (see the note in the
	   body on why __v32qi is not used). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
	{
	/* This function always performs a signed comparison, but __v32qi is a char
	which may be signed or unsigned, so use __v32qs. */
	return (__m256i)((__v32qs)__a > (__v32qs)__b);
	}

	/* _mm256_cmpgt_epi16: compares the operands as __v16hi. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v16hi)__a > (__v16hi)__b);
	}

	/* _mm256_cmpgt_epi32: compares the operands as __v8si. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v8si)__a > (__v8si)__b);
	}

	/* _mm256_cmpgt_epi64: compares the operands as __v4di. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
	{
	return (__m256i)((__v4di)__a > (__v4di)__b);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_hadd_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_hadd_epi32(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_hadds_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_hsub_epi16(__m256i __a, __m256i __b)
	{
	return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_hsub_epi32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_phsubd256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_phsubd256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_hsubs_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_phsubsw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_phsubsw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_maddubs_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_pmaddubsw256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_pmaddubsw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_madd_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pmaddwd256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pmaddwd256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_max_epi8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_pmaxsb256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_pmaxsb256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_max_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pmaxsw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pmaxsw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_max_epi32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_pmaxsd256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_pmaxsd256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_max_epu8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_pmaxub256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_pmaxub256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_max_epu16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pmaxuw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pmaxuw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_max_epu32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_pmaxud256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_pmaxud256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_min_epi8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_pminsb256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_pminsb256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_min_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pminsw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pminsw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_min_epi32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_pminsd256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_pminsd256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_min_epu8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_pminub256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_pminub256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_min_epu16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pminuw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pminuw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_min_epu32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_pminud256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_pminud256(__lhs, __rhs);
	}

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm256_movemask_epi8(__m256i __a)
	{
	  /* Returns the int produced by __builtin_ia32_pmovmskb256 for the operand
	     viewed as __v32qi. */
	  __v32qi __bytes = (__v32qi)__a;
	  return __builtin_ia32_pmovmskb256(__bytes);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepi8_epi16(__m128i __V)
	{
	  /* The extension must always be signed. __v16qi is built on plain char,
	     whose signedness varies, so the source is viewed as __v16qs. */
	  __v16qs __src = (__v16qs)__V;
	  return (__m256i)__builtin_convertvector(__src, __v16hi);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepi8_epi32(__m128i __V)
	{
	  /* The extension must always be signed. __v16qi is built on plain char,
	     whose signedness varies, so the source is viewed as __v16qs. Only
	     elements 0-7 are converted. */
	  __v16qs __src = (__v16qs)__V;
	  return (__m256i)__builtin_convertvector(
	      __builtin_shufflevector(__src, __src, 0, 1, 2, 3, 4, 5, 6, 7),
	      __v8si);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepi8_epi64(__m128i __V)
	{
	  /* The extension must always be signed. __v16qi is built on plain char,
	     whose signedness varies, so the source is viewed as __v16qs. Only
	     elements 0-3 are converted. */
	  __v16qs __src = (__v16qs)__V;
	  return (__m256i)__builtin_convertvector(
	      __builtin_shufflevector(__src, __src, 0, 1, 2, 3), __v4di);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepi16_epi32(__m128i __V)
	{
	  /* Converts the source, viewed as __v8hi, element by element to __v8si. */
	  __v8hi __src = (__v8hi)__V;
	  return (__m256i)__builtin_convertvector(__src, __v8si);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepi16_epi64(__m128i __V)
	{
	  /* Takes elements 0-3 of the source viewed as __v8hi and converts them
	     to __v4di. */
	  __v8hi __src = (__v8hi)__V;
	  return (__m256i)__builtin_convertvector(
	      __builtin_shufflevector(__src, __src, 0, 1, 2, 3), __v4di);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepi32_epi64(__m128i __V)
	{
	  /* Converts the source, viewed as __v4si, element by element to __v4di. */
	  __v4si __src = (__v4si)__V;
	  return (__m256i)__builtin_convertvector(__src, __v4di);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepu8_epi16(__m128i __V)
	{
	  /* Converts the source, viewed as __v16qu, element by element to
	     __v16hi. */
	  __v16qu __src = (__v16qu)__V;
	  return (__m256i)__builtin_convertvector(__src, __v16hi);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepu8_epi32(__m128i __V)
	{
	  /* Takes elements 0-7 of the source viewed as __v16qu and converts them
	     to __v8si. */
	  __v16qu __src = (__v16qu)__V;
	  return (__m256i)__builtin_convertvector(
	      __builtin_shufflevector(__src, __src, 0, 1, 2, 3, 4, 5, 6, 7),
	      __v8si);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepu8_epi64(__m128i __V)
	{
	  /* Takes elements 0-3 of the source viewed as __v16qu and converts them
	     to __v4di. */
	  __v16qu __src = (__v16qu)__V;
	  return (__m256i)__builtin_convertvector(
	      __builtin_shufflevector(__src, __src, 0, 1, 2, 3), __v4di);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepu16_epi32(__m128i __V)
	{
	  /* Converts the source, viewed as __v8hu, element by element to __v8si. */
	  __v8hu __src = (__v8hu)__V;
	  return (__m256i)__builtin_convertvector(__src, __v8si);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepu16_epi64(__m128i __V)
	{
	  /* Takes elements 0-3 of the source viewed as __v8hu and converts them
	     to __v4di. */
	  __v8hu __src = (__v8hu)__V;
	  return (__m256i)__builtin_convertvector(
	      __builtin_shufflevector(__src, __src, 0, 1, 2, 3), __v4di);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtepu32_epi64(__m128i __V)
	{
	  /* Converts the source, viewed as __v4su, element by element to __v4di. */
	  __v4su __src = (__v4su)__V;
	  return (__m256i)__builtin_convertvector(__src, __v4di);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mul_epi32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_pmuldq256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_pmuldq256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pmulhrsw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pmulhrsw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mulhi_epu16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pmulhuw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pmulhuw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mulhi_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_pmulhw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_pmulhw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mullo_epi16(__m256i __a, __m256i __b)
	{
	  /* Element-wise * with both operands viewed as __v16hu; the product
	     vector is reinterpreted as __m256i. */
	  __v16hu __lhs = (__v16hu)__a;
	  __v16hu __rhs = (__v16hu)__b;
	  return (__m256i)(__lhs * __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mullo_epi32(__m256i __a, __m256i __b)
	{
	  /* Element-wise * with both operands viewed as __v8su; the product
	     vector is reinterpreted as __m256i. */
	  __v8su __lhs = (__v8su)__a;
	  __v8su __rhs = (__v8su)__b;
	  return (__m256i)(__lhs * __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_mul_epu32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_pmuludq256, whose result is returned without a cast. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return __builtin_ia32_pmuludq256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_or_si256(__m256i __a, __m256i __b)
	{
	  /* Bitwise OR of the two operands viewed as __v4du; the result is
	     reinterpreted as __m256i. */
	  return (__m256i)((__v4du)__a | (__v4du)__b);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sad_epu8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_psadbw256, whose result is returned without a cast. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return __builtin_ia32_psadbw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_shuffle_epi8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_pshufb256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_pshufb256(__lhs, __rhs);
	}

	/* Expands to a __builtin_shufflevector over (a) viewed as __v8si. The four
	   2-bit fields of imm each pick one of elements 0-3 for result elements
	   0-3; the same four picks, offset by 4, fill result elements 4-7. The
	   second shuffle operand is _mm256_undefined_si256() and is never indexed
	   (all indices stay within 0-7). */
	#define _mm256_shuffle_epi32(a, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v8si)(__m256i)(a), \
	(__v8si)_mm256_undefined_si256(), \
	0 + (((imm) >> 0) & 0x3), \
	0 + (((imm) >> 2) & 0x3), \
	0 + (((imm) >> 4) & 0x3), \
	0 + (((imm) >> 6) & 0x3), \
	4 + (((imm) >> 0) & 0x3), \
	4 + (((imm) >> 2) & 0x3), \
	4 + (((imm) >> 4) & 0x3), \
	4 + (((imm) >> 6) & 0x3)); })

	/* Expands to a __builtin_shufflevector over (a) viewed as __v16hi. Elements
	   0-3 and 8-11 pass through unchanged; result elements 4-7 and 12-15 are
	   picked by the four 2-bit fields of imm, offset by 4 and 12
	   respectively. */
	#define _mm256_shufflehi_epi16(a, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
	(__v16hi)_mm256_undefined_si256(), \
	0, 1, 2, 3, \
	4 + (((imm) >> 0) & 0x3), \
	4 + (((imm) >> 2) & 0x3), \
	4 + (((imm) >> 4) & 0x3), \
	4 + (((imm) >> 6) & 0x3), \
	8, 9, 10, 11, \
	12 + (((imm) >> 0) & 0x3), \
	12 + (((imm) >> 2) & 0x3), \
	12 + (((imm) >> 4) & 0x3), \
	12 + (((imm) >> 6) & 0x3)); })

	/* Expands to a __builtin_shufflevector over (a) viewed as __v16hi. Elements
	   4-7 and 12-15 pass through unchanged; result elements 0-3 and 8-11 are
	   picked by the four 2-bit fields of imm, offset by 0 and 8
	   respectively. */
	#define _mm256_shufflelo_epi16(a, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v16hi)(__m256i)(a), \
	(__v16hi)_mm256_undefined_si256(), \
	0 + (((imm) >> 0) & 0x3), \
	0 + (((imm) >> 2) & 0x3), \
	0 + (((imm) >> 4) & 0x3), \
	0 + (((imm) >> 6) & 0x3), \
	4, 5, 6, 7, \
	8 + (((imm) >> 0) & 0x3), \
	8 + (((imm) >> 2) & 0x3), \
	8 + (((imm) >> 4) & 0x3), \
	8 + (((imm) >> 6) & 0x3), \
	12, 13, 14, 15); })

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sign_epi8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_psignb256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_psignb256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sign_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_psignw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_psignw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sign_epi32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_psignd256. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_ia32_psignd256(__lhs, __rhs);
	}

	/* Expands to a 32-index __builtin_shufflevector whose first operand is
	   _mm256_setzero_si256() and whose second operand is (a), both viewed as
	   __v32qi (so indices 0-31 select from the zero vector and 32-63 from (a)).
	   If (char)(imm) has any bit of 0xF0 set, every index is its own position
	   (0-31), i.e. the whole result comes from the zero vector. Otherwise each
	   index is a per-position base minus (char)(imm); the base is chosen by
	   comparing (char)(imm) with the position's offset inside its 16-element
	   half, and the two 16-element halves use parallel tables.
	   NOTE(review): imm is expected to be a compile-time constant, since
	   shuffle indices must be constant expressions. */
	#define _mm256_slli_si256(a, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector( \
	(__v32qi)_mm256_setzero_si256(), \
	(__v32qi)(__m256i)(a), \
	((char)(imm)&0xF0) ? 0 : ((char)(imm)>0x0 ? 16 : 32) - (char)(imm), \
	((char)(imm)&0xF0) ? 1 : ((char)(imm)>0x1 ? 17 : 33) - (char)(imm), \
	((char)(imm)&0xF0) ? 2 : ((char)(imm)>0x2 ? 18 : 34) - (char)(imm), \
	((char)(imm)&0xF0) ? 3 : ((char)(imm)>0x3 ? 19 : 35) - (char)(imm), \
	((char)(imm)&0xF0) ? 4 : ((char)(imm)>0x4 ? 20 : 36) - (char)(imm), \
	((char)(imm)&0xF0) ? 5 : ((char)(imm)>0x5 ? 21 : 37) - (char)(imm), \
	((char)(imm)&0xF0) ? 6 : ((char)(imm)>0x6 ? 22 : 38) - (char)(imm), \
	((char)(imm)&0xF0) ? 7 : ((char)(imm)>0x7 ? 23 : 39) - (char)(imm), \
	((char)(imm)&0xF0) ? 8 : ((char)(imm)>0x8 ? 24 : 40) - (char)(imm), \
	((char)(imm)&0xF0) ? 9 : ((char)(imm)>0x9 ? 25 : 41) - (char)(imm), \
	((char)(imm)&0xF0) ? 10 : ((char)(imm)>0xA ? 26 : 42) - (char)(imm), \
	((char)(imm)&0xF0) ? 11 : ((char)(imm)>0xB ? 27 : 43) - (char)(imm), \
	((char)(imm)&0xF0) ? 12 : ((char)(imm)>0xC ? 28 : 44) - (char)(imm), \
	((char)(imm)&0xF0) ? 13 : ((char)(imm)>0xD ? 29 : 45) - (char)(imm), \
	((char)(imm)&0xF0) ? 14 : ((char)(imm)>0xE ? 30 : 46) - (char)(imm), \
	((char)(imm)&0xF0) ? 15 : ((char)(imm)>0xF ? 31 : 47) - (char)(imm), \
	((char)(imm)&0xF0) ? 16 : ((char)(imm)>0x0 ? 32 : 48) - (char)(imm), \
	((char)(imm)&0xF0) ? 17 : ((char)(imm)>0x1 ? 33 : 49) - (char)(imm), \
	((char)(imm)&0xF0) ? 18 : ((char)(imm)>0x2 ? 34 : 50) - (char)(imm), \
	((char)(imm)&0xF0) ? 19 : ((char)(imm)>0x3 ? 35 : 51) - (char)(imm), \
	((char)(imm)&0xF0) ? 20 : ((char)(imm)>0x4 ? 36 : 52) - (char)(imm), \
	((char)(imm)&0xF0) ? 21 : ((char)(imm)>0x5 ? 37 : 53) - (char)(imm), \
	((char)(imm)&0xF0) ? 22 : ((char)(imm)>0x6 ? 38 : 54) - (char)(imm), \
	((char)(imm)&0xF0) ? 23 : ((char)(imm)>0x7 ? 39 : 55) - (char)(imm), \
	((char)(imm)&0xF0) ? 24 : ((char)(imm)>0x8 ? 40 : 56) - (char)(imm), \
	((char)(imm)&0xF0) ? 25 : ((char)(imm)>0x9 ? 41 : 57) - (char)(imm), \
	((char)(imm)&0xF0) ? 26 : ((char)(imm)>0xA ? 42 : 58) - (char)(imm), \
	((char)(imm)&0xF0) ? 27 : ((char)(imm)>0xB ? 43 : 59) - (char)(imm), \
	((char)(imm)&0xF0) ? 28 : ((char)(imm)>0xC ? 44 : 60) - (char)(imm), \
	((char)(imm)&0xF0) ? 29 : ((char)(imm)>0xD ? 45 : 61) - (char)(imm), \
	((char)(imm)&0xF0) ? 30 : ((char)(imm)>0xE ? 46 : 62) - (char)(imm), \
	((char)(imm)&0xF0) ? 31 : ((char)(imm)>0xF ? 47 : 63) - (char)(imm)); })

	/* Alias: expands to _mm256_slli_si256 with the same two arguments. */
	#define _mm256_bslli_epi128(a, count) _mm256_slli_si256((a), (count))

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_slli_epi16(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v16hi, and the integer count to
	     __builtin_ia32_psllwi256. */
	  __v16hi __src = (__v16hi)__a;
	  return (__m256i)__builtin_ia32_psllwi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sll_epi16(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v16hi) and the vector count (as __v8hi) to
	     __builtin_ia32_psllw256. */
	  __v16hi __src = (__v16hi)__a;
	  __v8hi __amt = (__v8hi)__count;
	  return (__m256i)__builtin_ia32_psllw256(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_slli_epi32(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v8si, and the integer count to
	     __builtin_ia32_pslldi256. */
	  __v8si __src = (__v8si)__a;
	  return (__m256i)__builtin_ia32_pslldi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sll_epi32(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v8si) and the vector count (as __v4si) to
	     __builtin_ia32_pslld256. */
	  __v8si __src = (__v8si)__a;
	  __v4si __amt = (__v4si)__count;
	  return (__m256i)__builtin_ia32_pslld256(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_slli_epi64(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v4di, and the integer count to
	     __builtin_ia32_psllqi256; the result is returned without a cast. */
	  __v4di __src = (__v4di)__a;
	  return __builtin_ia32_psllqi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sll_epi64(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v4di) and the vector count (uncast) to
	     __builtin_ia32_psllq256; the result is returned without a cast. */
	  __v4di __src = (__v4di)__a;
	  return __builtin_ia32_psllq256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srai_epi16(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v16hi, and the integer count to
	     __builtin_ia32_psrawi256. */
	  __v16hi __src = (__v16hi)__a;
	  return (__m256i)__builtin_ia32_psrawi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sra_epi16(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v16hi) and the vector count (as __v8hi) to
	     __builtin_ia32_psraw256. */
	  __v16hi __src = (__v16hi)__a;
	  __v8hi __amt = (__v8hi)__count;
	  return (__m256i)__builtin_ia32_psraw256(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srai_epi32(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v8si, and the integer count to
	     __builtin_ia32_psradi256. */
	  __v8si __src = (__v8si)__a;
	  return (__m256i)__builtin_ia32_psradi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sra_epi32(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v8si) and the vector count (as __v4si) to
	     __builtin_ia32_psrad256. */
	  __v8si __src = (__v8si)__a;
	  __v4si __amt = (__v4si)__count;
	  return (__m256i)__builtin_ia32_psrad256(__src, __amt);
	}

	/* Expands to a 32-index __builtin_shufflevector whose first operand is (a)
	   and whose second operand is _mm256_setzero_si256(), both viewed as
	   __v32qi (so indices 0-31 select from (a) and 32-63 from the zero vector).
	   If (char)(imm) has any bit of 0xF0 set, every index is 32 plus its own
	   position, i.e. the whole result comes from the zero vector. Otherwise
	   each index is (char)(imm) plus a per-position base; the base is chosen by
	   comparing (char)(imm) with a per-position threshold, and the two
	   16-element halves use parallel tables.
	   NOTE(review): imm is expected to be a compile-time constant, since
	   shuffle indices must be constant expressions. */
	#define _mm256_srli_si256(a, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector( \
	(__v32qi)(__m256i)(a), \
	(__v32qi)_mm256_setzero_si256(), \
	((char)(imm)&0xF0) ? 32 : (char)(imm) + ((char)(imm)>0xF ? 16 : 0), \
	((char)(imm)&0xF0) ? 33 : (char)(imm) + ((char)(imm)>0xE ? 17 : 1), \
	((char)(imm)&0xF0) ? 34 : (char)(imm) + ((char)(imm)>0xD ? 18 : 2), \
	((char)(imm)&0xF0) ? 35 : (char)(imm) + ((char)(imm)>0xC ? 19 : 3), \
	((char)(imm)&0xF0) ? 36 : (char)(imm) + ((char)(imm)>0xB ? 20 : 4), \
	((char)(imm)&0xF0) ? 37 : (char)(imm) + ((char)(imm)>0xA ? 21 : 5), \
	((char)(imm)&0xF0) ? 38 : (char)(imm) + ((char)(imm)>0x9 ? 22 : 6), \
	((char)(imm)&0xF0) ? 39 : (char)(imm) + ((char)(imm)>0x8 ? 23 : 7), \
	((char)(imm)&0xF0) ? 40 : (char)(imm) + ((char)(imm)>0x7 ? 24 : 8), \
	((char)(imm)&0xF0) ? 41 : (char)(imm) + ((char)(imm)>0x6 ? 25 : 9), \
	((char)(imm)&0xF0) ? 42 : (char)(imm) + ((char)(imm)>0x5 ? 26 : 10), \
	((char)(imm)&0xF0) ? 43 : (char)(imm) + ((char)(imm)>0x4 ? 27 : 11), \
	((char)(imm)&0xF0) ? 44 : (char)(imm) + ((char)(imm)>0x3 ? 28 : 12), \
	((char)(imm)&0xF0) ? 45 : (char)(imm) + ((char)(imm)>0x2 ? 29 : 13), \
	((char)(imm)&0xF0) ? 46 : (char)(imm) + ((char)(imm)>0x1 ? 30 : 14), \
	((char)(imm)&0xF0) ? 47 : (char)(imm) + ((char)(imm)>0x0 ? 31 : 15), \
	((char)(imm)&0xF0) ? 48 : (char)(imm) + ((char)(imm)>0xF ? 32 : 16), \
	((char)(imm)&0xF0) ? 49 : (char)(imm) + ((char)(imm)>0xE ? 33 : 17), \
	((char)(imm)&0xF0) ? 50 : (char)(imm) + ((char)(imm)>0xD ? 34 : 18), \
	((char)(imm)&0xF0) ? 51 : (char)(imm) + ((char)(imm)>0xC ? 35 : 19), \
	((char)(imm)&0xF0) ? 52 : (char)(imm) + ((char)(imm)>0xB ? 36 : 20), \
	((char)(imm)&0xF0) ? 53 : (char)(imm) + ((char)(imm)>0xA ? 37 : 21), \
	((char)(imm)&0xF0) ? 54 : (char)(imm) + ((char)(imm)>0x9 ? 38 : 22), \
	((char)(imm)&0xF0) ? 55 : (char)(imm) + ((char)(imm)>0x8 ? 39 : 23), \
	((char)(imm)&0xF0) ? 56 : (char)(imm) + ((char)(imm)>0x7 ? 40 : 24), \
	((char)(imm)&0xF0) ? 57 : (char)(imm) + ((char)(imm)>0x6 ? 41 : 25), \
	((char)(imm)&0xF0) ? 58 : (char)(imm) + ((char)(imm)>0x5 ? 42 : 26), \
	((char)(imm)&0xF0) ? 59 : (char)(imm) + ((char)(imm)>0x4 ? 43 : 27), \
	((char)(imm)&0xF0) ? 60 : (char)(imm) + ((char)(imm)>0x3 ? 44 : 28), \
	((char)(imm)&0xF0) ? 61 : (char)(imm) + ((char)(imm)>0x2 ? 45 : 29), \
	((char)(imm)&0xF0) ? 62 : (char)(imm) + ((char)(imm)>0x1 ? 46 : 30), \
	((char)(imm)&0xF0) ? 63 : (char)(imm) + ((char)(imm)>0x0 ? 47 : 31)); })

	/* Alias: expands to _mm256_srli_si256 with the same two arguments. */
	#define _mm256_bsrli_epi128(a, count) _mm256_srli_si256((a), (count))

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srli_epi16(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v16hi, and the integer count to
	     __builtin_ia32_psrlwi256. */
	  __v16hi __src = (__v16hi)__a;
	  return (__m256i)__builtin_ia32_psrlwi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srl_epi16(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v16hi) and the vector count (as __v8hi) to
	     __builtin_ia32_psrlw256. */
	  __v16hi __src = (__v16hi)__a;
	  __v8hi __amt = (__v8hi)__count;
	  return (__m256i)__builtin_ia32_psrlw256(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srli_epi32(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v8si, and the integer count to
	     __builtin_ia32_psrldi256. */
	  __v8si __src = (__v8si)__a;
	  return (__m256i)__builtin_ia32_psrldi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srl_epi32(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v8si) and the vector count (as __v4si) to
	     __builtin_ia32_psrld256. */
	  __v8si __src = (__v8si)__a;
	  __v4si __amt = (__v4si)__count;
	  return (__m256i)__builtin_ia32_psrld256(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srli_epi64(__m256i __a, int __count)
	{
	  /* Passes the operand, viewed as __v4di, and the integer count to
	     __builtin_ia32_psrlqi256; the result is returned without a cast. */
	  __v4di __src = (__v4di)__a;
	  return __builtin_ia32_psrlqi256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srl_epi64(__m256i __a, __m128i __count)
	{
	  /* Passes the operand (as __v4di) and the vector count (uncast) to
	     __builtin_ia32_psrlq256; the result is returned without a cast. */
	  __v4di __src = (__v4di)__a;
	  return __builtin_ia32_psrlq256(__src, __count);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sub_epi8(__m256i __a, __m256i __b)
	{
	  /* Element-wise __a - __b with both operands viewed as __v32qu; the
	     difference is reinterpreted as __m256i. */
	  __v32qu __lhs = (__v32qu)__a;
	  __v32qu __rhs = (__v32qu)__b;
	  return (__m256i)(__lhs - __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sub_epi16(__m256i __a, __m256i __b)
	{
	  /* Element-wise __a - __b with both operands viewed as __v16hu; the
	     difference is reinterpreted as __m256i. */
	  __v16hu __lhs = (__v16hu)__a;
	  __v16hu __rhs = (__v16hu)__b;
	  return (__m256i)(__lhs - __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sub_epi32(__m256i __a, __m256i __b)
	{
	  /* Element-wise __a - __b with both operands viewed as __v8su; the
	     difference is reinterpreted as __m256i. */
	  __v8su __lhs = (__v8su)__a;
	  __v8su __rhs = (__v8su)__b;
	  return (__m256i)(__lhs - __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sub_epi64(__m256i __a, __m256i __b)
	{
	  /* Element-wise __a - __b with both operands viewed as __v4du; the
	     difference is reinterpreted as __m256i. */
	  __v4du __lhs = (__v4du)__a;
	  __v4du __rhs = (__v4du)__b;
	  return (__m256i)(__lhs - __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_subs_epi8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_psubsb256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_psubsb256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_subs_epi16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_psubsw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_psubsw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_subs_epu8(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v32qi, go to
	     __builtin_ia32_psubusb256. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_ia32_psubusb256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_subs_epu16(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v16hi, go to
	     __builtin_ia32_psubusw256. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_ia32_psubusw256(__lhs, __rhs);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpackhi_epi8(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 8-15 and 24-31 of __a with the same-numbered
	     elements of __b; an index of the form 32+k names element k of the
	     second shuffle operand. Both operands are viewed as __v32qi. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs,
	      8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11,
	      12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15,
	      24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27,
	      28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpackhi_epi16(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 4-7 and 12-15 of __a with the same-numbered
	     elements of __b; an index of the form 16+k names element k of the
	     second shuffle operand. Both operands are viewed as __v16hi. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs,
	      4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7,
	      12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpackhi_epi32(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 2-3 and 6-7 of __a with the same-numbered
	     elements of __b; an index of the form 8+k names element k of the
	     second shuffle operand. Both operands are viewed as __v8si. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs,
	      2, 8+2, 3, 8+3,
	      6, 8+6, 7, 8+7);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpackhi_epi64(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 1 and 3 of __a with the same-numbered elements
	     of __b; an index of the form 4+k names element k of the second
	     shuffle operand. Both operands are viewed as __v4di. */
	  __v4di __lhs = (__v4di)__a;
	  __v4di __rhs = (__v4di)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs, 1, 4+1, 3, 4+3);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpacklo_epi8(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 0-7 and 16-23 of __a with the same-numbered
	     elements of __b; an index of the form 32+k names element k of the
	     second shuffle operand. Both operands are viewed as __v32qi. */
	  __v32qi __lhs = (__v32qi)__a;
	  __v32qi __rhs = (__v32qi)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs,
	      0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3,
	      4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7,
	      16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19,
	      20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpacklo_epi16(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 0-3 and 8-11 of __a with the same-numbered
	     elements of __b; an index of the form 16+k names element k of the
	     second shuffle operand. Both operands are viewed as __v16hi. */
	  __v16hi __lhs = (__v16hi)__a;
	  __v16hi __rhs = (__v16hi)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs,
	      0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3,
	      8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpacklo_epi32(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 0-1 and 4-5 of __a with the same-numbered
	     elements of __b; an index of the form 8+k names element k of the
	     second shuffle operand. Both operands are viewed as __v8si. */
	  __v8si __lhs = (__v8si)__a;
	  __v8si __rhs = (__v8si)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs,
	      0, 8+0, 1, 8+1,
	      4, 8+4, 5, 8+5);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_unpacklo_epi64(__m256i __a, __m256i __b)
	{
	  /* Interleaves elements 0 and 2 of __a with the same-numbered elements
	     of __b; an index of the form 4+k names element k of the second
	     shuffle operand. Both operands are viewed as __v4di. */
	  __v4di __lhs = (__v4di)__a;
	  __v4di __rhs = (__v4di)__b;
	  return (__m256i)__builtin_shufflevector(__lhs, __rhs, 0, 4+0, 2, 4+2);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_xor_si256(__m256i __a, __m256i __b)
	{
	  /* Bitwise XOR of the two operands viewed as __v4du; the result is
	     reinterpreted as __m256i. */
	  __v4du __lhs = (__v4du)__a;
	  __v4du __rhs = (__v4du)__b;
	  return (__m256i)(__lhs ^ __rhs);
	}

	/* Loads a 256-bit value through __builtin_nontemporal_load. The hunk below
	   replaces the plain `const __v4di *` pointer cast with a cast to a local
	   typedef of __v4di declared with aligned(32), so the load is performed
	   through a pointer type that carries 32-byte alignment. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_stream_load_si256(__m256i const *__V)
	{
	- return (__m256i)__builtin_nontemporal_load((const __v4di *)__V);
	+ typedef __v4di __v4di_aligned __attribute__((aligned(32)));
	+ return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_broadcastss_ps(__m128 __X)
	{
	  /* Every one of the four result elements is element 0 of the source
	     viewed as __v4sf. */
	  __v4sf __src = (__v4sf)__X;
	  return (__m128)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_broadcastsd_pd(__m128d __a)
	{
	  /* Both result elements are element 0 of the source viewed as __v2df;
	     the shuffle result is returned without a cast. */
	  __v2df __src = (__v2df)__a;
	  return __builtin_shufflevector(__src, __src, 0, 0);
	}

	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm256_broadcastss_ps(__m128 __X)
	{
	  /* Every one of the eight result elements is element 0 of the source
	     viewed as __v4sf. */
	  __v4sf __src = (__v4sf)__X;
	  return (__m256)__builtin_shufflevector(__src, __src,
	      0, 0, 0, 0, 0, 0, 0, 0);
	}

	static __inline__ __m256d __DEFAULT_FN_ATTRS
	_mm256_broadcastsd_pd(__m128d __X)
	{
	  /* Every one of the four result elements is element 0 of the source
	     viewed as __v2df. */
	  __v2df __src = (__v2df)__X;
	  return (__m256d)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_broadcastsi128_si256(__m128i __X)
	{
	  /* Repeats elements 0 and 1 of the source (viewed as __v2di) twice, so
	     the result holds the pair 0,1 followed by 0,1 again. */
	  __v2di __src = (__v2di)__X;
	  return (__m256i)__builtin_shufflevector(__src, __src, 0, 1, 0, 1);
	}

	/* Expands to a 4-element __builtin_shufflevector over V1 and V2 viewed as
	   __v4si. For result element k, bit k of M set picks index 4+k (element k
	   of V2); bit clear picks index k (element k of V1). */
	#define _mm_blend_epi32(V1, V2, M) __extension__ ({ \
	(__m128i)__builtin_shufflevector((__v4si)(__m128i)(V1), \
	(__v4si)(__m128i)(V2), \
	(((M) & 0x01) ? 4 : 0), \
	(((M) & 0x02) ? 5 : 1), \
	(((M) & 0x04) ? 6 : 2), \
	(((M) & 0x08) ? 7 : 3)); })

	/* Expands to an 8-element __builtin_shufflevector over V1 and V2 viewed as
	   __v8si. For result element k, bit k of M set picks index 8+k (element k
	   of V2); bit clear picks index k (element k of V1). */
	#define _mm256_blend_epi32(V1, V2, M) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v8si)(__m256i)(V1), \
	(__v8si)(__m256i)(V2), \
	(((M) & 0x01) ? 8 : 0), \
	(((M) & 0x02) ? 9 : 1), \
	(((M) & 0x04) ? 10 : 2), \
	(((M) & 0x08) ? 11 : 3), \
	(((M) & 0x10) ? 12 : 4), \
	(((M) & 0x20) ? 13 : 5), \
	(((M) & 0x40) ? 14 : 6), \
	(((M) & 0x80) ? 15 : 7)); })

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_broadcastb_epi8(__m128i __X)
	{
	  /* Every one of the 32 result elements is element 0 of the source
	     viewed as __v16qi. */
	  __v16qi __src = (__v16qi)__X;
	  return (__m256i)__builtin_shufflevector(__src, __src,
	      0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_broadcastw_epi16(__m128i __X)
	{
	  /* Every one of the 16 result elements is element 0 of the source
	     viewed as __v8hi. */
	  __v8hi __src = (__v8hi)__X;
	  return (__m256i)__builtin_shufflevector(__src, __src,
	      0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_broadcastd_epi32(__m128i __X)
	{
	  /* Every one of the eight result elements is element 0 of the source
	     viewed as __v4si. */
	  __v4si __src = (__v4si)__X;
	  return (__m256i)__builtin_shufflevector(__src, __src,
	      0, 0, 0, 0, 0, 0, 0, 0);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_broadcastq_epi64(__m128i __X)
	{
	  /* Every one of the four result elements is element 0 of the source
	     viewed as __v2di. */
	  __v2di __src = (__v2di)__X;
	  return (__m256i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_broadcastb_epi8(__m128i __X)
	{
	  /* Every one of the 16 result elements is element 0 of the source
	     viewed as __v16qi. */
	  __v16qi __src = (__v16qi)__X;
	  return (__m128i)__builtin_shufflevector(__src, __src,
	      0, 0, 0, 0, 0, 0, 0, 0,
	      0, 0, 0, 0, 0, 0, 0, 0);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_broadcastw_epi16(__m128i __X)
	{
	  /* Every one of the eight result elements is element 0 of the source
	     viewed as __v8hi. */
	  __v8hi __src = (__v8hi)__X;
	  return (__m128i)__builtin_shufflevector(__src, __src,
	      0, 0, 0, 0, 0, 0, 0, 0);
	}


	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_broadcastd_epi32(__m128i __X)
	{
	  /* Every one of the four result elements is element 0 of the source
	     viewed as __v4si. */
	  __v4si __src = (__v4si)__X;
	  return (__m128i)__builtin_shufflevector(__src, __src, 0, 0, 0, 0);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_broadcastq_epi64(__m128i __X)
	{
	  /* Both result elements are element 0 of the source viewed as __v2di. */
	  __v2di __src = (__v2di)__X;
	  return (__m128i)__builtin_shufflevector(__src, __src, 0, 0);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_permvarsi256. */
	  __v8si __data = (__v8si)__a;
	  __v8si __sel = (__v8si)__b;
	  return (__m256i)__builtin_ia32_permvarsi256(__data, __sel);
	}

	/* Expands to a 4-element __builtin_shufflevector over (V) viewed as __v4df.
	   Each 2-bit field of M is the source element index for the corresponding
	   result element. The second shuffle operand is _mm256_undefined_pd() and
	   is never indexed (all indices are 0-3). */
	#define _mm256_permute4x64_pd(V, M) __extension__ ({ \
	(__m256d)__builtin_shufflevector((__v4df)(__m256d)(V), \
	(__v4df)_mm256_undefined_pd(), \
	((M) >> 0) & 0x3, \
	((M) >> 2) & 0x3, \
	((M) >> 4) & 0x3, \
	((M) >> 6) & 0x3); })

	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
	{
	  /* Thin wrapper: __a (as __v8sf) and __b (as __v8si) go to
	     __builtin_ia32_permvarsf256. */
	  __v8sf __data = (__v8sf)__a;
	  __v8si __sel = (__v8si)__b;
	  return (__m256)__builtin_ia32_permvarsf256(__data, __sel);
	}

	/* Expands to a 4-element __builtin_shufflevector over (V) viewed as __v4di.
	   Each 2-bit field of M is the source element index for the corresponding
	   result element. The second shuffle operand is _mm256_undefined_si256()
	   and is never indexed (all indices are 0-3). */
	#define _mm256_permute4x64_epi64(V, M) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v4di)(__m256i)(V), \
	(__v4di)_mm256_undefined_si256(), \
	((M) >> 0) & 0x3, \
	((M) >> 2) & 0x3, \
	((M) >> 4) & 0x3, \
	((M) >> 6) & 0x3); })

	/* Passes V1 and V2 (each cast to __m256i) and M straight through to
	   __builtin_ia32_permti256; the result is cast to __m256i. */
	#define _mm256_permute2x128_si256(V1, V2, M) __extension__ ({ \
	(__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (M)); })

	/* Expands to a 2-element __builtin_shufflevector over (V) viewed as __v4di.
	   Bit 0 of M set selects elements 2 and 3; clear selects elements 0 and 1.
	   The result is cast to __m128i. Only bit 0 of M is examined. */
	#define _mm256_extracti128_si256(V, M) __extension__ ({ \
	(__m128i)__builtin_shufflevector((__v4di)(__m256i)(V), \
	(__v4di)_mm256_undefined_si256(), \
	(((M) & 1) ? 2 : 0), \
	(((M) & 1) ? 3 : 1) ); })

	/* Expands to a 4-element __builtin_shufflevector whose first operand is V1
	   and whose second operand is V2 widened by _mm256_castsi128_si256, both
	   viewed as __v4di (indices 4 and 5 name elements 0 and 1 of the widened
	   V2). Bit 0 of M clear: result elements 0-1 come from V2 and 2-3 from V1.
	   Bit 0 set: result elements 0-1 come from V1 and 2-3 from V2. */
	#define _mm256_inserti128_si256(V1, V2, M) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v4di)(__m256i)(V1), \
	(__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
	(((M) & 1) ? 0 : 4), \
	(((M) & 1) ? 1 : 5), \
	(((M) & 1) ? 4 : 2), \
	(((M) & 1) ? 5 : 3) ); })

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_maskload_epi32(int const *__X, __m256i __M)
	{
	  /* Hands the pointer (as const __v8si *) and the mask (as __v8si) to
	     __builtin_ia32_maskloadd256. */
	  const __v8si *__src = (const __v8si *)__X;
	  __v8si __mask = (__v8si)__M;
	  return (__m256i)__builtin_ia32_maskloadd256(__src, __mask);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_maskload_epi64(long long const *__X, __m256i __M)
	{
	  /* Hands the pointer (as const __v4di *) and the mask (as __v4di) to
	     __builtin_ia32_maskloadq256. */
	  const __v4di *__src = (const __v4di *)__X;
	  __v4di __mask = (__v4di)__M;
	  return (__m256i)__builtin_ia32_maskloadq256(__src, __mask);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_maskload_epi32(int const *__X, __m128i __M)
	{
	  /* Hands the pointer (as const __v4si *) and the mask (as __v4si) to
	     __builtin_ia32_maskloadd. */
	  const __v4si *__src = (const __v4si *)__X;
	  __v4si __mask = (__v4si)__M;
	  return (__m128i)__builtin_ia32_maskloadd(__src, __mask);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_maskload_epi64(long long const *__X, __m128i __M)
	{
	  /* Hands the pointer (as const __v2di *) and the mask (as __v2di) to
	     __builtin_ia32_maskloadq. */
	  const __v2di *__src = (const __v2di *)__X;
	  __v2di __mask = (__v2di)__M;
	  return (__m128i)__builtin_ia32_maskloadq(__src, __mask);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
	{
	  /* Hands the destination (as __v8si *), the mask and the data (both as
	     __v8si) to __builtin_ia32_maskstored256. */
	  __v8si *__dst = (__v8si *)__X;
	  __v8si __mask = (__v8si)__M;
	  __v8si __data = (__v8si)__Y;
	  __builtin_ia32_maskstored256(__dst, __mask, __data);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
	{
	  /* Hands the destination (as __v4di *), the mask and the data (both as
	     __v4di) to __builtin_ia32_maskstoreq256. */
	  __v4di *__dst = (__v4di *)__X;
	  __v4di __mask = (__v4di)__M;
	  __v4di __data = (__v4di)__Y;
	  __builtin_ia32_maskstoreq256(__dst, __mask, __data);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
	{
	  /* Hands the destination (as __v4si *), the mask and the data (both as
	     __v4si) to __builtin_ia32_maskstored. */
	  __v4si *__dst = (__v4si *)__X;
	  __v4si __mask = (__v4si)__M;
	  __v4si __data = (__v4si)__Y;
	  __builtin_ia32_maskstored(__dst, __mask, __data);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
	{
	  /* Hands the destination (as __v2di *), the mask and the data (both as
	     __v2di) to __builtin_ia32_maskstoreq. */
	  __v2di *__dst = (__v2di *)__X;
	  __v2di __mask = (__v2di)__M;
	  __v2di __data = (__v2di)__Y;
	  __builtin_ia32_maskstoreq(__dst, __mask, __data);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sllv_epi32(__m256i __X, __m256i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_psllv8si. */
	  __v8si __src = (__v8si)__X;
	  __v8si __amt = (__v8si)__Y;
	  return (__m256i)__builtin_ia32_psllv8si(__src, __amt);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_sllv_epi32(__m128i __X, __m128i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v4si, go to
	     __builtin_ia32_psllv4si. */
	  __v4si __src = (__v4si)__X;
	  __v4si __amt = (__v4si)__Y;
	  return (__m128i)__builtin_ia32_psllv4si(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_sllv_epi64(__m256i __X, __m256i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v4di, go to
	     __builtin_ia32_psllv4di. */
	  __v4di __src = (__v4di)__X;
	  __v4di __amt = (__v4di)__Y;
	  return (__m256i)__builtin_ia32_psllv4di(__src, __amt);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_sllv_epi64(__m128i __X, __m128i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v2di, go to
	     __builtin_ia32_psllv2di. */
	  __v2di __src = (__v2di)__X;
	  __v2di __amt = (__v2di)__Y;
	  return (__m128i)__builtin_ia32_psllv2di(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srav_epi32(__m256i __X, __m256i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_psrav8si. */
	  __v8si __src = (__v8si)__X;
	  __v8si __amt = (__v8si)__Y;
	  return (__m256i)__builtin_ia32_psrav8si(__src, __amt);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_srav_epi32(__m128i __X, __m128i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v4si, go to
	     __builtin_ia32_psrav4si. */
	  __v4si __src = (__v4si)__X;
	  __v4si __amt = (__v4si)__Y;
	  return (__m128i)__builtin_ia32_psrav4si(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srlv_epi32(__m256i __X, __m256i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v8si, go to
	     __builtin_ia32_psrlv8si. */
	  __v8si __src = (__v8si)__X;
	  __v8si __amt = (__v8si)__Y;
	  return (__m256i)__builtin_ia32_psrlv8si(__src, __amt);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_srlv_epi32(__m128i __X, __m128i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v4si, go to
	     __builtin_ia32_psrlv4si. */
	  __v4si __src = (__v4si)__X;
	  __v4si __amt = (__v4si)__Y;
	  return (__m128i)__builtin_ia32_psrlv4si(__src, __amt);
	}

	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_srlv_epi64(__m256i __X, __m256i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v4di, go to
	     __builtin_ia32_psrlv4di. */
	  __v4di __src = (__v4di)__X;
	  __v4di __amt = (__v4di)__Y;
	  return (__m256i)__builtin_ia32_psrlv4di(__src, __amt);
	}

	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm_srlv_epi64(__m128i __X, __m128i __Y)
	{
	  /* Thin wrapper: both operands, viewed as __v2di, go to
	     __builtin_ia32_psrlv2di. */
	  __v2di __src = (__v2di)__X;
	  __v2di __amt = (__v2di)__Y;
	  return (__m128i)__builtin_ia32_psrlv2di(__src, __amt);
	}

	/* Masked gather wrapper around __builtin_ia32_gatherd_pd. Arguments are
	   forwarded in order: source vector a (as __v2df), base pointer m (as
	   double const *), index vector i (as __v4si), mask (as __v2df) and
	   scale s. The source operand is cast through __m128d, matching the mask
	   operand and the other floating-point gather macros. */
	#define _mm_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
	(__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128d)(a), \
	(double const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v2df)(__m128d)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_pd256. Arguments are
	   forwarded in order: source a (as __v4df), base pointer m (as
	   double const *), index i (as __v4si, a 128-bit __m128i), mask (as
	   __v4df) and scale s. */
	#define _mm256_mask_i32gather_pd(a, m, i, mask, s) __extension__ ({ \
	(__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
	(double const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4df)(__m256d)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_pd. Arguments are
	   forwarded in order: source a (as __v2df), base pointer m (as
	   double const *), index i (as __v2di), mask (as __v2df) and scale s. */
	#define _mm_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
	(__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
	(double const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v2df)(__m128d)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_pd256. Arguments are
	   forwarded in order: source a (as __v4df), base pointer m (as
	   double const *), index i (as __v4di), mask (as __v4df) and scale s. */
	#define _mm256_mask_i64gather_pd(a, m, i, mask, s) __extension__ ({ \
	(__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
	(double const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4df)(__m256d)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_ps. Arguments are
	   forwarded in order: source a (as __v4sf), base pointer m (as
	   float const *), index i (as __v4si), mask (as __v4sf) and scale s. */
	#define _mm_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
	(__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
	(float const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4sf)(__m128)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_ps256. Arguments are
	   forwarded in order: source a (as __v8sf), base pointer m (as
	   float const *), index i (as __v8si), mask (as __v8sf) and scale s. */
	#define _mm256_mask_i32gather_ps(a, m, i, mask, s) __extension__ ({ \
	(__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
	(float const *)(m), \
	(__v8si)(__m256i)(i), \
	(__v8sf)(__m256)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_ps. Arguments are
	   forwarded in order: source a (as __v4sf), base pointer m (as
	   float const *), index i (as __v2di), mask (as __v4sf) and scale s. */
	#define _mm_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
	(__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
	(float const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v4sf)(__m128)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_ps256. The index i
	   is 256-bit (__v4di), but the source a, the mask and the result are all
	   128-bit (__m128 / __v4sf). Base pointer m is passed as float const *,
	   followed by scale s. */
	#define _mm256_mask_i64gather_ps(a, m, i, mask, s) __extension__ ({ \
	(__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
	(float const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4sf)(__m128)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_d. Arguments are
	   forwarded in order: source a (as __v4si), base pointer m (as
	   int const *), index i (as __v4si), mask (as __v4si) and scale s. */
	#define _mm_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
	(int const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4si)(__m128i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_d256. Arguments are
	   forwarded in order: source a (as __v8si), base pointer m (as
	   int const *), index i (as __v8si), mask (as __v8si) and scale s. */
	#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) __extension__ ({ \
	(__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
	(int const *)(m), \
	(__v8si)(__m256i)(i), \
	(__v8si)(__m256i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_d. Arguments are
	   forwarded in order: source a (as __v4si), base pointer m (as
	   int const *), index i (as __v2di), mask (as __v4si) and scale s. */
	#define _mm_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
	(int const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v4si)(__m128i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_d256. The index i is
	   256-bit (__v4di), but the source a, the mask and the result are all
	   128-bit (__m128i / __v4si). Base pointer m is passed as int const *,
	   followed by scale s. */
	#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
	(int const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4si)(__m128i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_q. Arguments are
	   forwarded in order: source a (as __v2di), base pointer m (as
	   long long const *), index i (as __v4si), mask (as __v2di) and scale s. */
	#define _mm_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
	(long long const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v2di)(__m128i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherd_q256. Arguments are
	   forwarded in order: source a (as __v4di), base pointer m (as
	   long long const *), index i (as __v4si, a 128-bit __m128i), mask (as
	   __v4di) and scale s. */
	#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) __extension__ ({ \
	(__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
	(long long const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4di)(__m256i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_q. Arguments are
	   forwarded in order: source a (as __v2di), base pointer m (as
	   long long const *), index i (as __v2di), mask (as __v2di) and scale s. */
	#define _mm_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
	(long long const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v2di)(__m128i)(mask), (s)); })

	/* Masked gather wrapper around __builtin_ia32_gatherq_q256. Arguments are
	   forwarded in order: source a (as __v4di), base pointer m (as
	   long long const *), index i (as __v4di), mask (as __v4di) and scale s. */
	#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) __extension__ ({ \
	(__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
	(long long const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4di)(__m256i)(mask), (s)); })

	/* Unmasked form of the gather: calls __builtin_ia32_gatherd_pd with
	   _mm_undefined_pd() as the source operand and a mask built by comparing
	   _mm_setzero_pd() with itself via _mm_cmpeq_pd (presumably an all-ones
	   mask, so every element is gathered; confirm against _mm_cmpeq_pd).
	   m is passed as double const *, i as __v4si, followed by scale s. */
	#define _mm_i32gather_pd(m, i, s) __extension__ ({ \
	(__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
	(double const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
	_mm_setzero_pd()), \
	(s)); })

	/* Unmasked form of the gather: calls __builtin_ia32_gatherd_pd256 with
	   _mm256_undefined_pd() as the source operand and a mask built by
	   comparing _mm256_setzero_pd() with itself via _mm256_cmp_pd and
	   _CMP_EQ_OQ (presumably an all-ones mask; confirm against _mm256_cmp_pd).
	   m is passed as double const *, i as __v4si, followed by scale s. */
	#define _mm256_i32gather_pd(m, i, s) __extension__ ({ \
	(__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
	(double const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
	_mm256_setzero_pd(), \
	_CMP_EQ_OQ), \
	(s)); })

	/* Unmasked form of the gather: calls __builtin_ia32_gatherq_pd with
	   _mm_undefined_pd() as the source operand and a mask built by comparing
	   _mm_setzero_pd() with itself via _mm_cmpeq_pd (presumably an all-ones
	   mask; confirm against _mm_cmpeq_pd). m is passed as double const *,
	   i as __v2di, followed by scale s. */
	#define _mm_i64gather_pd(m, i, s) __extension__ ({ \
	(__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
	(double const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
	_mm_setzero_pd()), \
	(s)); })

	/* Unmasked form of the gather: calls __builtin_ia32_gatherq_pd256 with
	   _mm256_undefined_pd() as the source operand and a mask built by
	   comparing _mm256_setzero_pd() with itself via _mm256_cmp_pd and
	   _CMP_EQ_OQ (presumably an all-ones mask; confirm against _mm256_cmp_pd).
	   m is passed as double const *, i as __v4di, followed by scale s. */
	#define _mm256_i64gather_pd(m, i, s) __extension__ ({ \
	(__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
	(double const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
	_mm256_setzero_pd(), \
	_CMP_EQ_OQ), \
	(s)); })

	/* Unmasked form of the gather: calls __builtin_ia32_gatherd_ps with
	   _mm_undefined_ps() as the source operand and a mask built by comparing
	   _mm_setzero_ps() with itself via _mm_cmpeq_ps (presumably an all-ones
	   mask; confirm against _mm_cmpeq_ps). m is passed as float const *,
	   i as __v4si, followed by scale s. */
	#define _mm_i32gather_ps(m, i, s) __extension__ ({ \
	(__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
	(float const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
	_mm_setzero_ps()), \
	(s)); })

	/* Unmasked form of the gatherd_ps256 builtin: undefined source vector and
	 * an _CMP_EQ_OQ zero-vs-zero compare result as the mask (presumably
	 * all-ones). i is viewed as __v8si, m as (float const *). */
	#define _mm256_i32gather_ps(m, i, s) __extension__ ({ \
	(__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
	(float const *)(m), \
	(__v8si)(__m256i)(i), \
	(__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
	_mm256_setzero_ps(), \
	_CMP_EQ_OQ), \
	(s)); })

	/* Unmasked form of the gatherq_ps builtin: undefined source vector and a
	 * zero==zero compare result as the mask (presumably all-ones). i is viewed
	 * as __v2di, m as (float const *). */
	#define _mm_i64gather_ps(m, i, s) __extension__ ({ \
	(__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
	(float const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
	_mm_setzero_ps()), \
	(s)); })

	/* Unmasked form of the gatherq_ps256 builtin. Although the index operand
	 * is a 256-bit vector (__v4di), the source, mask and result are all
	 * 128-bit float vectors (__v4sf / __m128). The mask is a zero==zero
	 * compare result (presumably all-ones). */
	#define _mm256_i64gather_ps(m, i, s) __extension__ ({ \
	(__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
	(float const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
	_mm_setzero_ps()), \
	(s)); })

	/* Unmasked form of the gatherd_d builtin: undefined source vector and
	 * _mm_set1_epi32(-1) as the mask. i is viewed as __v4si, m as
	 * (int const *). */
	#define _mm_i32gather_epi32(m, i, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
	(int const *)(m), (__v4si)(__m128i)(i), \
	(__v4si)_mm_set1_epi32(-1), (s)); })

	/* Unmasked form of the gatherd_d256 builtin: undefined source vector and
	 * _mm256_set1_epi32(-1) as the mask. i is viewed as __v8si, m as
	 * (int const *). */
	#define _mm256_i32gather_epi32(m, i, s) __extension__ ({ \
	(__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
	(int const *)(m), (__v8si)(__m256i)(i), \
	(__v8si)_mm256_set1_epi32(-1), (s)); })

	/* Unmasked form of the gatherq_d builtin: undefined source vector and
	 * _mm_set1_epi32(-1) as the mask. i is viewed as __v2di, m as
	 * (int const *). */
	#define _mm_i64gather_epi32(m, i, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
	(int const *)(m), (__v2di)(__m128i)(i), \
	(__v4si)_mm_set1_epi32(-1), (s)); })

	/* Unmasked form of the gatherq_d256 builtin. The index operand is a
	 * 256-bit vector (__v4di) while the source, mask and result are 128-bit
	 * (__v4si / __m128i). The mask is _mm_set1_epi32(-1). */
	#define _mm256_i64gather_epi32(m, i, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
	(int const *)(m), (__v4di)(__m256i)(i), \
	(__v4si)_mm_set1_epi32(-1), (s)); })

	/* Unmasked form of the gatherd_q builtin: undefined source vector and
	 * _mm_set1_epi64x(-1) as the mask. i is viewed as __v4si, m as
	 * (long long const *). */
	#define _mm_i32gather_epi64(m, i, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
	(long long const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v2di)_mm_set1_epi64x(-1), (s)); })

	/* Unmasked form of the gatherd_q256 builtin: undefined source vector and
	 * _mm256_set1_epi64x(-1) as the mask. The index operand i is a 128-bit
	 * vector viewed as __v4si; m is (long long const *). */
	#define _mm256_i32gather_epi64(m, i, s) __extension__ ({ \
	(__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
	(long long const *)(m), \
	(__v4si)(__m128i)(i), \
	(__v4di)_mm256_set1_epi64x(-1), (s)); })

	/* Unmasked form of the gatherq_q builtin: undefined source vector and
	 * _mm_set1_epi64x(-1) as the mask. i is viewed as __v2di, m as
	 * (long long const *). */
	#define _mm_i64gather_epi64(m, i, s) __extension__ ({ \
	(__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
	(long long const *)(m), \
	(__v2di)(__m128i)(i), \
	(__v2di)_mm_set1_epi64x(-1), (s)); })

	/* Unmasked form of the gatherq_q256 builtin: undefined source vector and
	 * _mm256_set1_epi64x(-1) as the mask. i is viewed as __v4di, m as
	 * (long long const *). */
	#define _mm256_i64gather_epi64(m, i, s) __extension__ ({ \
	(__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
	(long long const *)(m), \
	(__v4di)(__m256i)(i), \
	(__v4di)_mm256_set1_epi64x(-1), (s)); })

	#undef __DEFAULT_FN_ATTRS

	#endif /* __AVX2INTRIN_H */
	Index: head/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Headers/avx512fintrin.h (revision 322320)
	@@ -1,10399 +1,10403 @@
	/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------===
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	*===-----------------------------------------------------------------------===
	*/
	#ifndef __IMMINTRIN_H
	#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead."
	#endif

	#ifndef __AVX512FINTRIN_H
	#define __AVX512FINTRIN_H

	/* Element-typed 64-byte (512-bit) vectors; the intrinsics in this header
	 * cast to these to select the lane type of an operation. */
	typedef char __v64qi __attribute__((__vector_size__(64)));
	typedef short __v32hi __attribute__((__vector_size__(64)));
	typedef double __v8df __attribute__((__vector_size__(64)));
	typedef float __v16sf __attribute__((__vector_size__(64)));
	typedef long long __v8di __attribute__((__vector_size__(64)));
	typedef int __v16si __attribute__((__vector_size__(64)));

	/* Unsigned counterparts of the integer vector types above. */
	typedef unsigned char __v64qu __attribute__((__vector_size__(64)));
	typedef unsigned short __v32hu __attribute__((__vector_size__(64)));
	typedef unsigned long long __v8du __attribute__((__vector_size__(64)));
	typedef unsigned int __v16su __attribute__((__vector_size__(64)));

	/* 64-byte vector types used in intrinsic signatures: float, double and
	 * long long lanes respectively. */
	typedef float __m512 __attribute__((__vector_size__(64)));
	typedef double __m512d __attribute__((__vector_size__(64)));
	typedef long long __m512i __attribute__((__vector_size__(64)));

	/* Mask types passed to the masked intrinsics.
	 * NOTE(review): presumably one bit per vector lane -- confirm. */
	typedef unsigned char __mmask8;
	typedef unsigned short __mmask16;

	/* Rounding mode macros: values 0x00-0x03 name an explicit rounding
	 * direction, 0x04 names "current direction".
	 * NOTE(review): presumably these are passed as the rounding immediate of
	 * the *_round_* intrinsics -- confirm against their users. */
	#define _MM_FROUND_TO_NEAREST_INT 0x00
	#define _MM_FROUND_TO_NEG_INF 0x01
	#define _MM_FROUND_TO_POS_INF 0x02
	#define _MM_FROUND_TO_ZERO 0x03
	#define _MM_FROUND_CUR_DIRECTION 0x04

	/* Constants for integer comparison predicates.
	 * The enumerators have no explicit initializers, so they take the values
	 * 0 (EQ) through 6 (NLE); value 3 is the _MM_CMPINT_UNUSED placeholder.
	 * _MM_CMPINT_GE and _MM_CMPINT_GT are preprocessor aliases (defined inside
	 * the enum body) for _MM_CMPINT_NLT and _MM_CMPINT_NLE. */
	typedef enum {
	_MM_CMPINT_EQ, /* Equal */
	_MM_CMPINT_LT, /* Less than */
	_MM_CMPINT_LE, /* Less than or Equal */
	_MM_CMPINT_UNUSED,
	_MM_CMPINT_NE, /* Not Equal */
	_MM_CMPINT_NLT, /* Not Less than */
	#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */
	_MM_CMPINT_NLE /* Not Less than or Equal */
	#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */
	} _MM_CMPINT_ENUM;

	/* Permutation selector constants. Each _MM_PERM_wxyz value packs four
	 * 2-bit fields (A = 0, B = 1, C = 2, D = 3): the first letter occupies the
	 * two most significant bits of the byte and the last letter the two least
	 * significant bits, e.g. _MM_PERM_BAAA = 0x40 and _MM_PERM_AAAB = 0x01.
	 * The table enumerates all 256 combinations in ascending order. */
	typedef enum
	{
	_MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02,
	_MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05,
	_MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08,
	_MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B,
	_MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E,
	_MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11,
	_MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14,
	_MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17,
	_MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A,
	_MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D,
	_MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20,
	_MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23,
	_MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26,
	_MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29,
	_MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C,
	_MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F,
	_MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32,
	_MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35,
	_MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38,
	_MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B,
	_MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E,
	_MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41,
	_MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44,
	_MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47,
	_MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A,
	_MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D,
	_MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50,
	_MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53,
	_MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56,
	_MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59,
	_MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C,
	_MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F,
	_MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62,
	_MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65,
	_MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68,
	_MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B,
	_MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E,
	_MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71,
	_MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74,
	_MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77,
	_MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A,
	_MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D,
	_MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80,
	_MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83,
	_MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86,
	_MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89,
	_MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C,
	_MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F,
	_MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92,
	_MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95,
	_MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98,
	_MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B,
	_MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E,
	_MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1,
	_MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4,
	_MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7,
	_MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA,
	_MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD,
	_MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0,
	_MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3,
	_MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6,
	_MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9,
	_MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC,
	_MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF,
	_MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2,
	_MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5,
	_MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8,
	_MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB,
	_MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE,
	_MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1,
	_MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4,
	_MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7,
	_MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA,
	_MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD,
	_MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0,
	_MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3,
	_MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6,
	_MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9,
	_MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC,
	_MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF,
	_MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2,
	_MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5,
	_MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8,
	_MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB,
	_MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE,
	_MM_PERM_DDDD = 0xFF
	} _MM_PERM_ENUM;

	/* Mantissa normalization interval selectors; the enumerators take the
	 * implicit values 0 through 3 in the order listed. */
	typedef enum
	{
	_MM_MANT_NORM_1_2, /* interval [1, 2) */
	_MM_MANT_NORM_p5_2, /* interval [0.5, 2) */
	_MM_MANT_NORM_p5_1, /* interval [0.5, 1) */
	_MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */
	} _MM_MANTISSA_NORM_ENUM;

	/* Mantissa sign-control selectors; the enumerators take the implicit
	 * values 0 through 2 in the order listed. */
	typedef enum
	{
	_MM_MANT_SIGN_src, /* sign = sign(SRC) */
	_MM_MANT_SIGN_zero, /* sign = 0 */
	_MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */
	} _MM_MANTISSA_SIGN_ENUM;

	/* Define the default attributes for the functions in this file: each one
	 * is marked __always_inline__ and __nodebug__, and is compiled with the
	 * "avx512f" target feature enabled via __target__. */
	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f")))

	/* Create vectors with repeated elements */

	/* Returns a 512-bit integer vector whose eight 64-bit lanes are all zero. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_setzero_si512(void)
	{
	  const __v8di __zero = { 0, 0, 0, 0, 0, 0, 0, 0 };
	  return (__m512i)__zero;
	}

	/* Alias: _mm512_setzero_epi32 expands to _mm512_setzero_si512. */
	#define _mm512_setzero_epi32 _mm512_setzero_si512

	/* Returns the value of __builtin_ia32_undef512() typed as __m512d; callers
	 * must not rely on its contents. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_undefined_pd(void)
	{
	  __m512d __unspecified = (__m512d)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Returns the value of __builtin_ia32_undef512() typed as __m512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_undefined(void)
	{
	  __m512 __unspecified = (__m512)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Returns the value of __builtin_ia32_undef512() typed as __m512 (same
	 * body as _mm512_undefined). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_undefined_ps(void)
	{
	  __m512 __unspecified = (__m512)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Returns the value of __builtin_ia32_undef512() typed as __m512i. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_undefined_epi32(void)
	{
	  __m512i __unspecified = (__m512i)__builtin_ia32_undef512();
	  return __unspecified;
	}

	/* Replicates 32-bit element 0 of __A into all sixteen lanes of the result.
	 * The second shuffle operand is never selected (every index is 0). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcastd_epi32 (__m128i __A)
	{
	  __v4si __src = (__v4si)__A;
	  __v4si __unused = (__v4si)_mm_undefined_si128();
	  return (__m512i)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Merge-masked broadcast: passes __M, the broadcast of __A and __O to
	 * __builtin_ia32_selectd_512 (presumably set mask bits choose the
	 * broadcast lane and clear bits keep the lane from __O). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A)
	{
	  __v16si __bcast = (__v16si)_mm512_broadcastd_epi32(__A);
	  __v16si __passthru = (__v16si)__O;
	  return (__m512i)__builtin_ia32_selectd_512(__M, __bcast, __passthru);
	}

	/* Zero-masked broadcast: like the merge-masked form, but the fallback
	 * operand handed to __builtin_ia32_selectd_512 is an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A)
	{
	  __v16si __bcast = (__v16si)_mm512_broadcastd_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512(__M, __bcast, __zero);
	}

	/* Replicates 64-bit element 0 of __A into all eight lanes of the result.
	 * The second shuffle operand is never selected (every index is 0). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcastq_epi64 (__m128i __A)
	{
	  __v2di __src = (__v2di)__A;
	  __v2di __unused = (__v2di)_mm_undefined_si128();
	  return (__m512i)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Merge-masked 64-bit broadcast: passes __M, the broadcast of __A and __O
	 * to __builtin_ia32_selectq_512 (presumably set mask bits choose the
	 * broadcast lane and clear bits keep the lane from __O). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A)
	{
	  __v8di __bcast = (__v8di)_mm512_broadcastq_epi64(__A);
	  __v8di __passthru = (__v8di)__O;
	  return (__m512i)__builtin_ia32_selectq_512(__M, __bcast, __passthru);
	}

	/* Zero-masked 64-bit broadcast: the fallback operand handed to
	 * __builtin_ia32_selectq_512 is an all-zero vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
	{
	  __v8di __bcast = (__v8di)_mm512_broadcastq_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512(__M, __bcast, __zero);
	}

	/* Zero-masked scalar broadcast: forwards __A, an all-zero vector and __M to
	 * __builtin_ia32_pbroadcastd512_gpr_mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_set1_epi32(__mmask16 __M, int __A)
	{
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pbroadcastd512_gpr_mask(__A, __zero, __M);
	}

	/* Zero-masked 64-bit scalar broadcast. On __x86_64__ the "gpr" builtin is
	 * used; otherwise the "mem" builtin is used. Both receive __A, an all-zero
	 * vector and __M. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_set1_epi64(__mmask8 __M, long long __A)
	{
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	#ifdef __x86_64__
	  return (__m512i)__builtin_ia32_pbroadcastq512_gpr_mask(__A, __zero, __M);
	#else
	  return (__m512i)__builtin_ia32_pbroadcastq512_mem_mask(__A, __zero, __M);
	#endif
	}

	/* Returns a vector of sixteen floats, each initialised to 0.0. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_setzero_ps(void)
	{
	  const __m512 __zero = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
	                          0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
	  return __zero;
	}

	/* Alias: _mm512_setzero expands to _mm512_setzero_ps. */
	#define _mm512_setzero _mm512_setzero_ps

	/* Returns a vector of eight doubles, each initialised to 0.0. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_setzero_pd(void)
	{
	  const __m512d __zero = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
	  return __zero;
	}

	/* Returns a vector whose sixteen float lanes all hold __w. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_set1_ps(float __w)
	{
	  const __m512 __splat = { __w, __w, __w, __w, __w, __w, __w, __w,
	                           __w, __w, __w, __w, __w, __w, __w, __w };
	  return __splat;
	}

	/* Returns a vector whose eight double lanes all hold __w. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_set1_pd(double __w)
	{
	  const __m512d __splat = { __w, __w, __w, __w, __w, __w, __w, __w };
	  return __splat;
	}

	/* Returns a 512-bit integer vector whose sixty-four char lanes all hold
	 * __w. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi8(char __w)
	{
	  const __v64qi __splat = {
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w
	  };
	  return (__m512i)__splat;
	}

	/* Returns a 512-bit integer vector whose thirty-two short lanes all hold
	 * __w. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi16(short __w)
	{
	  const __v32hi __splat = {
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w
	  };
	  return (__m512i)__splat;
	}

	/* Returns a 512-bit integer vector whose sixteen int lanes all hold __s. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi32(int __s)
	{
	  const __v16si __splat = { __s, __s, __s, __s, __s, __s, __s, __s,
	                            __s, __s, __s, __s, __s, __s, __s, __s };
	  return (__m512i)__splat;
	}

	/* Returns a 512-bit integer vector whose eight long long lanes all hold
	 * __d. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set1_epi64(long long __d)
	{
	  const __v8di __splat = { __d, __d, __d, __d, __d, __d, __d, __d };
	  return (__m512i)__splat;
	}

	/* Replicates float element 0 of __A into all sixteen lanes of the result.
	 * The second shuffle operand is never selected (every index is 0). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_broadcastss_ps(__m128 __A)
	{
	  __v4sf __src = (__v4sf)__A;
	  __v4sf __unused = (__v4sf)_mm_undefined_ps();
	  return (__m512)__builtin_shufflevector(__src, __unused,
	                                         0, 0, 0, 0, 0, 0, 0, 0,
	                                         0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Builds a vector of sixteen ints by repeating the group
	 * { __D, __C, __B, __A } four times, so __D lands in element 0. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set4_epi32 (int __A, int __B, int __C, int __D)
	{
	  const __v16si __tiled = { __D, __C, __B, __A, __D, __C, __B, __A,
	                            __D, __C, __B, __A, __D, __C, __B, __A };
	  return (__m512i)__tiled;
	}

	/* Builds a vector of eight long longs by repeating the group
	 * { __D, __C, __B, __A } twice, so __D lands in element 0. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set4_epi64 (long long __A, long long __B, long long __C,
	long long __D)
	{
	  const __v8di __tiled = { __D, __C, __B, __A, __D, __C, __B, __A };
	  return (__m512i)__tiled;
	}

	/* Builds a vector of eight doubles by repeating the group
	 * { __D, __C, __B, __A } twice, so __D lands in element 0. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_set4_pd (double __A, double __B, double __C, double __D)
	{
	  const __m512d __tiled = { __D, __C, __B, __A, __D, __C, __B, __A };
	  return __tiled;
	}

	/* Builds a vector of sixteen floats by repeating the group
	 * { __D, __C, __B, __A } four times, so __D lands in element 0. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_set4_ps (float __A, float __B, float __C, float __D)
	{
	  const __m512 __tiled = { __D, __C, __B, __A, __D, __C, __B, __A,
	                           __D, __C, __B, __A, __D, __C, __B, __A };
	  return __tiled;
	}

	/* Reversed-argument form: forwards to _mm512_set4_epi32 with the four
	 * arguments in the opposite order. */
	#define _mm512_setr4_epi32(e0,e1,e2,e3) \
	_mm512_set4_epi32((e3),(e2),(e1),(e0))

	/* Reversed-argument form: forwards to _mm512_set4_epi64 with the four
	 * arguments in the opposite order. */
	#define _mm512_setr4_epi64(e0,e1,e2,e3) \
	_mm512_set4_epi64((e3),(e2),(e1),(e0))

	/* Reversed-argument form: forwards to _mm512_set4_pd with the four
	 * arguments in the opposite order. */
	#define _mm512_setr4_pd(e0,e1,e2,e3) \
	_mm512_set4_pd((e3),(e2),(e1),(e0))

	/* Reversed-argument form: forwards to _mm512_set4_ps with the four
	 * arguments in the opposite order. */
	#define _mm512_setr4_ps(e0,e1,e2,e3) \
	_mm512_set4_ps((e3),(e2),(e1),(e0))

	/* Replicates double element 0 of __A into all eight lanes of the result.
	 * The second shuffle operand is never selected (every index is 0). */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_broadcastsd_pd(__m128d __A)
	{
	  __v2df __src = (__v2df)__A;
	  __v2df __unused = (__v2df)_mm_undefined_pd();
	  return (__m512d)__builtin_shufflevector(__src, __unused,
	                                          0, 0, 0, 0, 0, 0, 0, 0);
	}

	/* Cast between vector types */

	/* Widens a 4 x double vector to 8 x double: elements 0-3 come from __a,
	 * elements 4-7 use shuffle index -1 (no defined source lane). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_castpd256_pd512(__m256d __a)
	{
	  __m512d __wide = __builtin_shufflevector(__a, __a,
	                                           0, 1, 2, 3, -1, -1, -1, -1);
	  return __wide;
	}

	/* Widens an 8 x float vector to 16 x float: elements 0-7 come from __a,
	 * elements 8-15 use shuffle index -1 (no defined source lane). */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_castps256_ps512(__m256 __a)
	{
	  __m512 __wide = __builtin_shufflevector(__a, __a,
	                                          0, 1, 2, 3, 4, 5, 6, 7,
	                                          -1, -1, -1, -1, -1, -1, -1, -1);
	  return __wide;
	}

	/* Returns elements 0-1 of the 8 x double vector __a. */
	static __inline __m128d __DEFAULT_FN_ATTRS
	_mm512_castpd512_pd128(__m512d __a)
	{
	  __m128d __low = __builtin_shufflevector(__a, __a, 0, 1);
	  return __low;
	}

	/* Returns elements 0-3 of the 8 x double vector __A. */
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm512_castpd512_pd256 (__m512d __A)
	{
	  __m256d __low = __builtin_shufflevector(__A, __A, 0, 1, 2, 3);
	  return __low;
	}

	/* Returns elements 0-3 of the 16 x float vector __a. */
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm512_castps512_ps128(__m512 __a)
	{
	  __m128 __low = __builtin_shufflevector(__a, __a, 0, 1, 2, 3);
	  return __low;
	}

	/* Returns elements 0-7 of the 16 x float vector __A. */
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm512_castps512_ps256 (__m512 __A)
	{
	  __m256 __low = __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7);
	  return __low;
	}

	/* Reinterprets the bits of a __m512d as a __m512 via a vector cast. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_castpd_ps (__m512d __A)
	{
	  __m512 __bits = (__m512)__A;
	  return __bits;
	}

	/* Reinterprets the bits of a __m512d as a __m512i via a vector cast. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_castpd_si512 (__m512d __A)
	{
	  __m512i __bits = (__m512i)__A;
	  return __bits;
	}

	/* Widens a 2 x double vector to 8 x double: elements 0-1 come from __A,
	 * elements 2-7 use shuffle index -1 (no defined source lane). */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_castpd128_pd512 (__m128d __A)
	{
	  __m512d __wide = __builtin_shufflevector(__A, __A,
	                                           0, 1, -1, -1, -1, -1, -1, -1);
	  return __wide;
	}

	/* Reinterprets the bits of a __m512 as a __m512d via a vector cast. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_castps_pd (__m512 __A)
	{
	  __m512d __bits = (__m512d)__A;
	  return __bits;
	}

	/* Reinterprets the bits of a __m512 as a __m512i via a vector cast. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_castps_si512 (__m512 __A)
	{
	  __m512i __bits = (__m512i)__A;
	  return __bits;
	}

	/* Widens a 4 x float vector to 16 x float: elements 0-3 come from __A,
	 * elements 4-15 use shuffle index -1 (no defined source lane). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_castps128_ps512 (__m128 __A)
	{
	  __m512 __wide = __builtin_shufflevector(__A, __A,
	                                          0, 1, 2, 3,
	                                          -1, -1, -1, -1, -1, -1,
	                                          -1, -1, -1, -1, -1, -1);
	  return __wide;
	}

	/* Widens a 128-bit integer vector to 512 bits: the two 64-bit elements of
	 * __A fill positions 0-1, positions 2-7 use shuffle index -1 (no defined
	 * source lane). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_castsi128_si512 (__m128i __A)
	{
	  __m512i __wide = __builtin_shufflevector(__A, __A,
	                                           0, 1, -1, -1, -1, -1, -1, -1);
	  return __wide;
	}

	/* Widens a 256-bit integer vector to 512 bits: the four 64-bit elements
	 * of __A fill positions 0-3, positions 4-7 use shuffle index -1 (no
	 * defined source lane). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_castsi256_si512 (__m256i __A)
	{
	  __m512i __wide = __builtin_shufflevector(__A, __A,
	                                           0, 1, 2, 3, -1, -1, -1, -1);
	  return __wide;
	}

	/* Reinterprets the bits of a __m512i as a __m512 via a vector cast. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_castsi512_ps (__m512i __A)
	{
	  __m512 __bits = (__m512)__A;
	  return __bits;
	}

	/* Reinterprets the bits of a __m512i as a __m512d via a vector cast. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_castsi512_pd (__m512i __A)
	{
	  __m512d __bits = (__m512d)__A;
	  return __bits;
	}

	/* Returns 64-bit elements 0-1 of __A as a 128-bit integer vector. */
	static __inline __m128i __DEFAULT_FN_ATTRS
	_mm512_castsi512_si128 (__m512i __A)
	{
	  __m128i __low = (__m128i)__builtin_shufflevector(__A, __A, 0, 1);
	  return __low;
	}

	/* Returns 64-bit elements 0-3 of __A as a 256-bit integer vector. */
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm512_castsi512_si256 (__m512i __A)
	{
	  __m256i __low = (__m256i)__builtin_shufflevector(__A, __A, 0, 1, 2, 3);
	  return __low;
	}

	/* Converts an int to a __mmask16 with a plain cast (the value is narrowed
	 * to unsigned short). */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_int2mask(int __a)
	{
	  const __mmask16 __mask = (__mmask16)__a;
	  return __mask;
	}

	/* Converts a __mmask16 to an int with a plain cast. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask2int(__mmask16 __a)
	{
	  const int __value = (int)__a;
	  return __value;
	}

	/// \brief Zero-extends a 128-bit vector of [2 x double] into a 512-bit
	/// vector of [8 x double].
	///
	/// The source elements occupy the low 128 bits of the result; the upper
	/// 384 bits are taken from a zero vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \returns A 512-bit vector of [8 x double] whose low 128 bits hold the
	/// parameter and whose upper 384 bits are zero.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_zextpd128_pd512(__m128d __a)
	{
	  __v2df __lo = (__v2df)__a;
	  __v2df __zero = (__v2df)_mm_setzero_pd();
	  /* Indices 0-1 select __lo; indices 2-3 select lanes of __zero. */
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3, 2, 3, 2, 3);
	}

	/// \brief Zero-extends a 256-bit vector of [4 x double] into a 512-bit
	/// vector of [8 x double].
	///
	/// The source elements occupy the low 256 bits of the result; the upper
	/// 256 bits are taken from a zero vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \returns A 512-bit vector of [8 x double] whose low 256 bits hold the
	/// parameter and whose upper 256 bits are zero.
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_zextpd256_pd512(__m256d __a)
	{
	  __v4df __lo = (__v4df)__a;
	  __v4df __zero = (__v4df)_mm256_setzero_pd();
	  /* Indices 0-3 select __lo; indices 4-7 select lanes of __zero. */
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3, 4, 5, 6, 7);
	}

	/// \brief Zero-extends a 128-bit vector of [4 x float] into a 512-bit
	/// vector of [16 x float].
	///
	/// The source elements occupy the low 128 bits of the result; the upper
	/// 384 bits are taken from a zero vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \returns A 512-bit vector of [16 x float] whose low 128 bits hold the
	/// parameter and whose upper 384 bits are zero.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_zextps128_ps512(__m128 __a)
	{
	  __v4sf __lo = (__v4sf)__a;
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  /* Indices 0-3 select __lo; indices 4-7 select lanes of __zero. */
	  return __builtin_shufflevector(__lo, __zero,
	                                 0, 1, 2, 3, 4, 5, 6, 7,
	                                 4, 5, 6, 7, 4, 5, 6, 7);
	}

	/// \brief Zero-extends a 256-bit vector of [8 x float] into a 512-bit
	/// vector of [16 x float].
	///
	/// The source elements occupy the low 256 bits of the result; the upper
	/// 256 bits are taken from a zero vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \returns A 512-bit vector of [16 x float] whose low 256 bits hold the
	/// parameter and whose upper 256 bits are zero.
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_zextps256_ps512(__m256 __a)
	{
	  __v8sf __lo = (__v8sf)__a;
	  __v8sf __zero = (__v8sf)_mm256_setzero_ps();
	  /* Indices 0-7 select __lo; indices 8-15 select lanes of __zero. */
	  return __builtin_shufflevector(__lo, __zero,
	                                 0, 1, 2, 3, 4, 5, 6, 7,
	                                 8, 9, 10, 11, 12, 13, 14, 15);
	}

	/// \brief Zero-extends a 128-bit integer vector into a 512-bit integer
	/// vector.
	///
	/// The source occupies the low 128 bits of the result; the upper 384 bits
	/// are taken from a zero vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit integer vector.
	/// \returns A 512-bit integer vector whose low 128 bits hold the parameter
	/// and whose upper 384 bits are zero.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_zextsi128_si512(__m128i __a)
	{
	  __v2di __lo = (__v2di)__a;
	  __v2di __zero = (__v2di)_mm_setzero_si128();
	  /* Indices 0-1 select __lo; indices 2-3 select lanes of __zero. */
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3, 2, 3, 2, 3);
	}

	/// \brief Zero-extends a 256-bit integer vector into a 512-bit integer
	/// vector.
	///
	/// The source occupies the low 256 bits of the result; the upper 256 bits
	/// are taken from a zero vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \returns A 512-bit integer vector whose low 256 bits hold the parameter
	/// and whose upper 256 bits are zero.
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_zextsi256_si512(__m256i __a)
	{
	  __v4di __lo = (__v4di)__a;
	  __v4di __zero = (__v4di)_mm256_setzero_si256();
	  /* Indices 0-3 select __lo; indices 4-7 select lanes of __zero. */
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3, 4, 5, 6, 7);
	}

	/* Bitwise operators */
	/* Bitwise AND of __a and __b, computed on sixteen unsigned 32-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_and_epi32(__m512i __a, __m512i __b)
	{
	  __v16su __lhs = (__v16su)__a;
	  __v16su __rhs = (__v16su)__b;
	  return (__m512i)(__lhs & __rhs);
	}

	/* Merge-masked AND: hands __k, (__a & __b) and __src to
	 * __builtin_ia32_selectd_512 (presumably set mask bits choose the AND
	 * result and clear bits keep the lane from __src). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
	{
	  __v16si __anded = (__v16si)_mm512_and_epi32(__a, __b);
	  __v16si __passthru = (__v16si)__src;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
	                                             __anded, __passthru);
	}

	/* Zero-masked AND: delegates to _mm512_mask_and_epi32 with an all-zero
	 * vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_and_epi32(__zero, __k, __a, __b);
	}

	/* Bitwise AND of __a and __b, computed on eight unsigned 64-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_and_epi64(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs & __rhs);
	}

	/* Merge-masked 64-bit AND: hands __k, (__a & __b) and __src to
	 * __builtin_ia32_selectq_512 (presumably set mask bits choose the AND
	 * result and clear bits keep the lane from __src). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
	{
	  __v8di __anded = (__v8di)_mm512_and_epi64(__a, __b);
	  __v8di __passthru = (__v8di)__src;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
	                                             __anded, __passthru);
	}

	/* Zero-masked 64-bit AND: delegates to _mm512_mask_and_epi64 with an
	 * all-zero vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_and_epi64(__zero, __k, __a, __b);
	}

	/* Computes (~__A) & __B on eight unsigned 64-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_andnot_si512 (__m512i __A, __m512i __B)
	{
	  __v8du __inverted = ~(__v8du)__A;
	  __v8du __rhs = (__v8du)__B;
	  return (__m512i)(__inverted & __rhs);
	}

	/* Computes (~__A) & __B on sixteen unsigned 32-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_andnot_epi32 (__m512i __A, __m512i __B)
	{
	  __v16su __inverted = ~(__v16su)__A;
	  __v16su __rhs = (__v16su)__B;
	  return (__m512i)(__inverted & __rhs);
	}

	/* Merge-masked AND-NOT: hands __U, ((~__A) & __B) and __W to
	 * __builtin_ia32_selectd_512 (presumably set mask bits choose the computed
	 * lane and clear bits keep the lane from __W). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __computed = (__v16si)_mm512_andnot_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
	                                             __computed, __passthru);
	}

	/* Zero-masked AND-NOT: delegates to _mm512_mask_andnot_epi32 with an
	 * all-zero vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_andnot_epi32(__zero, __U, __A, __B);
	}

	/* Computes (~__A) & __B on eight unsigned 64-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_andnot_epi64(__m512i __A, __m512i __B)
	{
	  __v8du __inverted = ~(__v8du)__A;
	  __v8du __rhs = (__v8du)__B;
	  return (__m512i)(__inverted & __rhs);
	}

	/* Merge-masked 64-bit AND-NOT: hands __U, ((~__A) & __B) and __W to
	 * __builtin_ia32_selectq_512 (presumably set mask bits choose the computed
	 * lane and clear bits keep the lane from __W). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __computed = (__v8di)_mm512_andnot_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
	                                             __computed, __passthru);
	}

	/* Zero-masked 64-bit AND-NOT: delegates to _mm512_mask_andnot_epi64 with
	 * an all-zero vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_andnot_epi64(__zero, __U, __A, __B);
	}

	/* Bitwise OR of __a and __b, computed on sixteen unsigned 32-bit lanes.
	 * The operator is a plain '|'; the stray backslash that preceded it was a
	 * text-escaping artifact and is not valid C. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_or_epi32(__m512i __a, __m512i __b)
	{
	return (__m512i)((__v16su)__a | (__v16su)__b);
	}

	/* Merge-masked OR: hands __k, the result of _mm512_or_epi32(__a, __b) and
	 * __src to __builtin_ia32_selectd_512 (presumably set mask bits choose the
	 * OR result and clear bits keep the lane from __src). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
	{
	  __v16si __ored = (__v16si)_mm512_or_epi32(__a, __b);
	  __v16si __passthru = (__v16si)__src;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
	                                             __ored, __passthru);
	}

	/* Zero-masked OR: delegates to _mm512_mask_or_epi32 with an all-zero
	 * vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_or_epi32(__zero, __k, __a, __b);
	}

	/* Bitwise OR of __a and __b, computed on eight unsigned 64-bit lanes.
	 * The operator is a plain '|'; the stray backslash that preceded it was a
	 * text-escaping artifact and is not valid C. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_or_epi64(__m512i __a, __m512i __b)
	{
	return (__m512i)((__v8du)__a | (__v8du)__b);
	}

	/* Merge-masked 64-bit OR: hands __k, the result of
	 * _mm512_or_epi64(__a, __b) and __src to __builtin_ia32_selectq_512
	 * (presumably set mask bits choose the OR result and clear bits keep the
	 * lane from __src). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
	{
	  __v8di __ored = (__v8di)_mm512_or_epi64(__a, __b);
	  __v8di __passthru = (__v8di)__src;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
	                                             __ored, __passthru);
	}

	/* Zero-masked 64-bit OR: delegates to _mm512_mask_or_epi64 with an
	 * all-zero vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_or_epi64(__zero, __k, __a, __b);
	}

	/* Bitwise XOR of __a and __b, computed on sixteen unsigned 32-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_xor_epi32(__m512i __a, __m512i __b)
	{
	  __v16su __lhs = (__v16su)__a;
	  __v16su __rhs = (__v16su)__b;
	  return (__m512i)(__lhs ^ __rhs);
	}

	/* Merge-masked XOR: hands __k, (__a ^ __b) and __src to
	 * __builtin_ia32_selectd_512 (presumably set mask bits choose the XOR
	 * result and clear bits keep the lane from __src). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b)
	{
	  __v16si __xored = (__v16si)_mm512_xor_epi32(__a, __b);
	  __v16si __passthru = (__v16si)__src;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k,
	                                             __xored, __passthru);
	}

	/* Zero-masked XOR: delegates to _mm512_mask_xor_epi32 with an all-zero
	 * vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_xor_epi32(__zero, __k, __a, __b);
	}

	/* Bitwise XOR of __a and __b, computed on eight unsigned 64-bit lanes. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_xor_epi64(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs ^ __rhs);
	}

	/* Merge-masked 64-bit XOR: hands __k, (__a ^ __b) and __src to
	 * __builtin_ia32_selectq_512 (presumably set mask bits choose the XOR
	 * result and clear bits keep the lane from __src). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b)
	{
	  __v8di __xored = (__v8di)_mm512_xor_epi64(__a, __b);
	  __v8di __passthru = (__v8di)__src;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k,
	                                             __xored, __passthru);
	}

	/* Zero-masked 64-bit XOR: delegates to _mm512_mask_xor_epi64 with an
	 * all-zero vector as the pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b)
	{
	  __m512i __zero = _mm512_setzero_si512();
	  return (__m512i)_mm512_mask_xor_epi64(__zero, __k, __a, __b);
	}

	/* Bitwise AND of two 512-bit integer vectors (evaluated on eight unsigned
	 * 64-bit lanes). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_and_si512(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs & __rhs);
	}

	/* Bitwise OR of two 512-bit integer vectors (evaluated on eight unsigned
	 * 64-bit lanes). The operator is a plain '|'; the stray backslash that
	 * preceded it was a text-escaping artifact and is not valid C. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_or_si512(__m512i __a, __m512i __b)
	{
	return (__m512i)((__v8du)__a | (__v8du)__b);
	}

	/* Bitwise XOR of two 512-bit integer vectors (evaluated on eight unsigned
	 * 64-bit lanes). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_xor_si512(__m512i __a, __m512i __b)
	{
	  __v8du __lhs = (__v8du)__a;
	  __v8du __rhs = (__v8du)__b;
	  return (__m512i)(__lhs ^ __rhs);
	}

	/* Arithmetic */

	/* Element-wise sum of two vectors of eight doubles. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_add_pd(__m512d __a, __m512d __b)
	{
	  __v8df __lhs = (__v8df)__a;
	  __v8df __rhs = (__v8df)__b;
	  return (__m512d)(__lhs + __rhs);
	}

	/* Element-wise sum of two vectors of sixteen floats. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_add_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __lhs = (__v16sf)__a;
	  __v16sf __rhs = (__v16sf)__b;
	  return (__m512)(__lhs + __rhs);
	}

	/* Element-wise product of two vectors of eight doubles. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mul_pd(__m512d __a, __m512d __b)
	{
	  __v8df __lhs = (__v8df)__a;
	  __v8df __rhs = (__v8df)__b;
	  return (__m512d)(__lhs * __rhs);
	}

	/* Element-wise product of two vectors of sixteen floats. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mul_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __lhs = (__v16sf)__a;
	  __v16sf __rhs = (__v16sf)__b;
	  return (__m512)(__lhs * __rhs);
	}

	/* Element-wise difference (__a - __b) of two vectors of eight doubles. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_sub_pd(__m512d __a, __m512d __b)
	{
	  __v8df __minuend = (__v8df)__a;
	  __v8df __subtrahend = (__v8df)__b;
	  return (__m512d)(__minuend - __subtrahend);
	}

	/* Element-wise difference (__a - __b) of two vectors of sixteen floats. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_sub_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __minuend = (__v16sf)__a;
	  __v16sf __subtrahend = (__v16sf)__b;
	  return (__m512)(__minuend - __subtrahend);
	}

	/* Element-wise sum on eight 64-bit lanes; the addition is carried out on
	 * the unsigned vector type __v8du. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_add_epi64 (__m512i __A, __m512i __B)
	{
	  __v8du __lhs = (__v8du)__A;
	  __v8du __rhs = (__v8du)__B;
	  return (__m512i)(__lhs + __rhs);
	}

	/* Merge-masked 64-bit add: hands __U, (__A + __B) and __W to
	 * __builtin_ia32_selectq_512 (presumably set mask bits choose the sum and
	 * clear bits keep the lane from __W). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __sum = (__v8di)_mm512_add_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
	                                             __sum, __passthru);
	}

	/* Zero-masked 64-bit add: hands __U, (__A + __B) and an all-zero vector to
	 * __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __sum = (__v8di)_mm512_add_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U,
	                                             __sum, __zero);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sub_epi64(__m512i __A, __m512i __B)
	{
	  /* Subtraction (__A minus __B) on the unsigned __v8du views. */
	  __v8du __lhs = (__v8du)__A;
	  __v8du __rhs = (__v8du)__B;
	  return (__m512i)(__lhs - __rhs);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  /* Unmasked difference first; select builtin gets __U, the difference, __W. */
	  __v8di __diff = (__v8di)_mm512_sub_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __diff,
	                                             (__v8di)__W);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  /* Select builtin receives __U, the unmasked difference, and a zero vector. */
	  __v8di __diff = (__v8di)_mm512_sub_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __diff, __zero);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_add_epi32(__m512i __A, __m512i __B)
	{
	  /* Addition is carried out on the unsigned __v16su views of the operands. */
	  __v16su __lhs = (__v16su)__A;
	  __v16su __rhs = (__v16su)__B;
	  return (__m512i)(__lhs + __rhs);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  /* Unmasked sum first; the select builtin then receives __U, the sum, __W. */
	  __v16si __sum = (__v16si)_mm512_add_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __sum,
	                                             (__v16si)__W);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_add_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  /* Select builtin receives __U, the unmasked sum, and an all-zero vector. */
	  __v16si __sum = (__v16si)_mm512_add_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __sum, __zero);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sub_epi32(__m512i __A, __m512i __B)
	{
	  /* Subtraction (__A minus __B) on the unsigned __v16su views. */
	  __v16su __lhs = (__v16su)__A;
	  __v16su __rhs = (__v16su)__B;
	  return (__m512i)(__lhs - __rhs);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  /* Unmasked difference first; select builtin gets __U, the difference, __W. */
	  __v16si __diff = (__v16si)_mm512_sub_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __diff,
	                                             (__v16si)__W);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  /* Select builtin receives __U, the unmasked difference, and a zero vector. */
	  __v16si __diff = (__v16si)_mm512_sub_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __diff, __zero);
	}

	/* Expands to a __builtin_ia32_maxpd512_mask call: A and B are cast to
	 * __v8df, W (also as __v8df) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int.
	 * NOTE(review): R is presumably a rounding-control immediate -- confirm. */
	#define _mm512_mask_max_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_maxpd512_mask call: A and B are cast to
	 * __v8df, the third operand is _mm512_setzero_pd(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm512_maskz_max_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_maxpd512_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v8df, the third operand is
	 * _mm512_undefined_pd(), and R is forwarded as an int. */
	#define _mm512_max_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_maxpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_max_pd(__m512d __A, __m512d __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __fill, (__mmask8)-1,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_max_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __fill, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_max_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_maxpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __fill, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_maxps512_mask call: A and B are cast to
	 * __v16sf, W (also as __v16sf) is the third operand, U is cast to
	 * __mmask16 and R is forwarded as an int. */
	#define _mm512_mask_max_round_ps(W, U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(W), (__mmask16)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_maxps512_mask call: A and B are cast to
	 * __v16sf, the third operand is _mm512_setzero_ps(), U is cast to
	 * __mmask16 and R is forwarded as an int. */
	#define _mm512_maskz_max_round_ps(U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_maxps512_mask call with the mask fixed to
	 * (__mmask16)-1: A and B are cast to __v16sf, the third operand is
	 * _mm512_undefined_ps(), and R is forwarded as an int. */
	#define _mm512_max_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_maxps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_max_ps(__m512 __A, __m512 __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __fill, (__mmask16)-1,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_max_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __fill, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_max_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_maxps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __fill, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v4sf __fill = (__v4sf)__W;
	  return (__m128)__builtin_ia32_maxss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __fill, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_maxss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __fill, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_maxss_round_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v4sf, the third operand is
	 * _mm_setzero_ps(), and R is forwarded as an int. */
	#define _mm_max_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_maxss_round_mask call: A and B are cast to
	 * __v4sf, W (also as __v4sf) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_mask_max_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_maxss_round_mask call: A and B are cast to
	 * __v4sf, the third operand is _mm_setzero_ps(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_maskz_max_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v2df __fill = (__v2df)__W;
	  return (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __fill, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v2df __fill = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_maxsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __fill, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_maxsd_round_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v2df, the third operand is
	 * _mm_setzero_pd(), and R is forwarded as an int. */
	#define _mm_max_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_maxsd_round_mask call: A and B are cast to
	 * __v2df, W (also as __v2df) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_mask_max_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_maxsd_round_mask call: A and B are cast to
	 * __v2df, the third operand is _mm_setzero_pd(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_maskz_max_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epi32(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, (__mmask16)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v16si __fill = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epi32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epu32(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, (__mmask16)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v16si __fill = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epu32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxud512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epi64(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, (__mmask8)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v8di __fill = (__v8di)__W;
	  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epi64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_max_epu64(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, (__mmask8)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_max_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v8di __fill = (__v8di)__W;
	  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_max_epu64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pmaxuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	/* Expands to a __builtin_ia32_minpd512_mask call: A and B are cast to
	 * __v8df, W (also as __v8df) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int.
	 * NOTE(review): R is presumably a rounding-control immediate -- confirm. */
	#define _mm512_mask_min_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_minpd512_mask call: A and B are cast to
	 * __v8df, the third operand is _mm512_setzero_pd(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm512_maskz_min_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_minpd512_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v8df, the third operand is
	 * _mm512_undefined_pd(), and R is forwarded as an int. */
	#define _mm512_min_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_minpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_min_pd(__m512d __A, __m512d __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __fill, (__mmask8)-1,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_min_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __fill, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_minps512_mask call: A and B are cast to
	 * __v16sf, W (also as __v16sf) is the third operand, U is cast to
	 * __mmask16 and R is forwarded as an int. */
	#define _mm512_mask_min_round_ps(W, U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(W), (__mmask16)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_minps512_mask call: A and B are cast to
	 * __v16sf, the third operand is _mm512_setzero_ps(), U is cast to
	 * __mmask16 and R is forwarded as an int. */
	#define _mm512_maskz_min_round_ps(U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_minps512_mask call with the mask fixed to
	 * (__mmask16)-1: A and B are cast to __v16sf, the third operand is
	 * _mm512_undefined_ps(), and R is forwarded as an int. */
	#define _mm512_min_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_minps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_min_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_minpd512_mask((__v8df)__A, (__v8df)__B,
	                                               __fill, (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_min_ps(__m512 __A, __m512 __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __fill, (__mmask16)-1,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_min_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __fill, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_min_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_minps512_mask((__v16sf)__A, (__v16sf)__B,
	                                              __fill, (__mmask16)__U,
	                                              _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v4sf __fill = (__v4sf)__W;
	  return (__m128)__builtin_ia32_minss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __fill, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_minss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __fill, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_minss_round_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v4sf, the third operand is
	 * _mm_setzero_ps(), and R is forwarded as an int. */
	#define _mm_min_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_minss_round_mask call: A and B are cast to
	 * __v4sf, W (also as __v4sf) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_mask_min_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_minss_round_mask call: A and B are cast to
	 * __v4sf, the third operand is _mm_setzero_ps(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_maskz_min_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v2df __fill = (__v2df)__W;
	  return (__m128d)__builtin_ia32_minsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __fill, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v2df __fill = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_minsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __fill, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_minsd_round_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v2df, the third operand is
	 * _mm_setzero_pd(), and R is forwarded as an int. */
	#define _mm_min_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_minsd_round_mask call: A and B are cast to
	 * __v2df, W (also as __v2df) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_mask_min_round_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_minsd_round_mask call: A and B are cast to
	 * __v2df, the third operand is _mm_setzero_pd(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_maskz_min_round_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epi32(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, (__mmask16)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v16si __fill = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epi32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsd512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epu32(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, (__mmask16)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epu32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v16si __fill = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epu32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminud512_mask((__v16si)__A, (__v16si)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epi64(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, (__mmask8)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epi64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v8di __fill = (__v8di)__W;
	  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epi64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminsq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_min_epu64(__m512i __A, __m512i __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, (__mmask8)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_min_epu64(__m512i __W, __mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* __W is the builtin's third operand and __M its mask. */
	  __v8di __fill = (__v8di)__W;
	  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_min_epu64(__mmask8 __M, __m512i __A, __m512i __B)
	{
	  /* A zero vector is the builtin's third operand; __M is its mask. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pminuq512_mask((__v8di)__A, (__v8di)__B,
	                                                __fill, __M);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mul_epi32(__m512i __X, __m512i __Y)
	{
	  /* Both operands go to the pmuldq512 builtin as __v16si vectors. */
	  __v16si __lhs = (__v16si)__X;
	  __v16si __rhs = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_pmuldq512(__lhs, __rhs);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
	{
	  /* Unmasked product first; select builtin then gets __M, the product, __W. */
	  __v8di __prod = (__v8di)_mm512_mul_epi32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod,
	                                             (__v8di)__W);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y)
	{
	  /* Select builtin receives __M, the unmasked product, and a zero vector. */
	  __v8di __prod = (__v8di)_mm512_mul_epi32(__X, __Y);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod, __zero);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mul_epu32(__m512i __X, __m512i __Y)
	{
	  /* Both operands go to the pmuludq512 builtin as __v16si vectors. */
	  __v16si __lhs = (__v16si)__X;
	  __v16si __rhs = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_pmuludq512(__lhs, __rhs);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y)
	{
	  /* Unmasked product first; select builtin then gets __M, the product, __W. */
	  __v8di __prod = (__v8di)_mm512_mul_epu32(__X, __Y);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod,
	                                             (__v8di)__W);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y)
	{
	  /* Select builtin receives __M, the unmasked product, and a zero vector. */
	  __v8di __prod = (__v8di)_mm512_mul_epu32(__X, __Y);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __prod, __zero);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mullo_epi32(__m512i __A, __m512i __B)
	{
	  /* Vector '*' on the unsigned __v16su views of the operands. */
	  __v16su __lhs = (__v16su)__A;
	  __v16su __rhs = (__v16su)__B;
	  return (__m512i)(__lhs * __rhs);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* Select builtin receives __M, the unmasked product, and a zero vector. */
	  __v16si __prod = (__v16si)_mm512_mullo_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __prod, __zero);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B)
	{
	  /* Unmasked product first; select builtin then gets __M, the product, __W. */
	  __v16si __prod = (__v16si)_mm512_mullo_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __prod,
	                                             (__v16si)__W);
	}

	/* Expands to a __builtin_ia32_sqrtpd512_mask call: A is cast to __v8df,
	 * W (also as __v8df) is the second operand, U is cast to __mmask8 and R is
	 * forwarded as an int.
	 * NOTE(review): R is presumably a rounding-control immediate -- confirm. */
	#define _mm512_mask_sqrt_round_pd(W, U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_sqrtpd512_mask call: A is cast to __v8df,
	 * the second operand is _mm512_setzero_pd(), U is cast to __mmask8 and R
	 * is forwarded as an int. */
	#define _mm512_maskz_sqrt_round_pd(U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_sqrtpd512_mask call with the mask fixed to
	 * (__mmask8)-1: A is cast to __v8df, the second operand is
	 * _mm512_undefined_pd(), and R is forwarded as an int. */
	#define _mm512_sqrt_round_pd(A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_sqrt_pd(__m512d __a)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__a, __fill,
	                                                (__mmask8)-1,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_sqrt_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, __fill,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_sqrt_pd(__mmask8 __U, __m512d __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_sqrtpd512_mask((__v8df)__A, __fill,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_sqrtps512_mask call: A is cast to __v16sf,
	 * W (also as __v16sf) is the second operand, U is cast to __mmask16 and R
	 * is forwarded as an int. */
	#define _mm512_mask_sqrt_round_ps(W, U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(W), (__mmask16)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_sqrtps512_mask call: A is cast to __v16sf,
	 * the second operand is _mm512_setzero_ps(), U is cast to __mmask16 and R
	 * is forwarded as an int. */
	#define _mm512_maskz_sqrt_round_ps(U, A, R) __extension__ ({ \
	(__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_sqrtps512_mask call with the mask fixed to
	 * (__mmask16)-1: A is cast to __v16sf, the second operand is
	 * _mm512_undefined_ps(), and R is forwarded as an int. */
	#define _mm512_sqrt_round_ps(A, R) __extension__ ({ \
	(__m512)__builtin_ia32_sqrtps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_sqrt_ps(__m512 __a)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__a, __fill,
	                                               (__mmask16)-1,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, __fill,
	                                               (__mmask16)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_sqrtps512_mask((__v16sf)__A, __fill,
	                                               (__mmask16)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_rsqrt14_pd(__m512d __A)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, __fill,
	                                                   (__mmask8)-1);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_rsqrt14_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, __fill,
	                                                   (__mmask8)__U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_rsqrt14_pd(__mmask8 __U, __m512d __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rsqrt14pd512_mask((__v8df)__A, __fill,
	                                                   (__mmask8)__U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_rsqrt14_ps(__m512 __A)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, __fill,
	                                                  (__mmask16)-1);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_rsqrt14_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, __fill,
	                                                  (__mmask16)__U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_rsqrt14_ps(__mmask16 __U, __m512 __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rsqrt14ps512_mask((__v16sf)__A, __fill,
	                                                  (__mmask16)__U);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_rsqrt14_ss(__m128 __A, __m128 __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                               __fill, (__mmask8)-1);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_rsqrt14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v4sf __fill = (__v4sf)__W;
	  return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                               __fill, (__mmask8)__U);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_rsqrt14_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rsqrt14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                               __fill, (__mmask8)__U);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_rsqrt14_sd(__m128d __A, __m128d __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v2df __fill = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__A, (__v2df)__B,
	                                                __fill, (__mmask8)-1);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_rsqrt14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v2df __fill = (__v2df)__W;
	  return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__A, (__v2df)__B,
	                                                __fill, (__mmask8)__U);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_rsqrt14_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v2df __fill = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rsqrt14sd_mask((__v2df)__A, (__v2df)__B,
	                                                __fill, (__mmask8)__U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_rcp14_pd(__m512d __A)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, __fill,
	                                                 (__mmask8)-1);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_rcp14_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, __fill,
	                                                 (__mmask8)__U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_rcp14_pd(__mmask8 __U, __m512d __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v8df __fill = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_rcp14pd512_mask((__v8df)__A, __fill,
	                                                 (__mmask8)__U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_rcp14_ps(__m512 __A)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, __fill,
	                                                (__mmask16)-1);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_rcp14_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, __fill,
	                                                (__mmask16)__U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_rcp14_ps(__mmask16 __U, __m512 __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v16sf __fill = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_rcp14ps512_mask((__v16sf)__A, __fill,
	                                                (__mmask16)__U);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_rcp14_ss(__m128 __A, __m128 __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                             __fill, (__mmask8)-1);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_rcp14_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v4sf __fill = (__v4sf)__W;
	  return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                             __fill, (__mmask8)__U);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_rcp14_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_rcp14ss_mask((__v4sf)__A, (__v4sf)__B,
	                                             __fill, (__mmask8)__U);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_rcp14_sd(__m128d __A, __m128d __B)
	{
	  /* Mask is all ones; a zero vector fills the builtin's third operand. */
	  __v2df __fill = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__A, (__v2df)__B,
	                                              __fill, (__mmask8)-1);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_rcp14_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v2df __fill = (__v2df)__W;
	  return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__A, (__v2df)__B,
	                                              __fill, (__mmask8)__U);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_rcp14_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v2df __fill = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_rcp14sd_mask((__v2df)__A, (__v2df)__B,
	                                              __fill, (__mmask8)__U);
	}

	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_floor_ps(__m512 __A)
	{
	  /* __A is both the input and the builtin's third operand; the mask is -1
	   * and the immediate is _MM_FROUND_FLOOR. */
	  __v16sf __in = (__v16sf)__A;
	  return (__m512)__builtin_ia32_rndscaleps_mask(__in, _MM_FROUND_FLOOR,
	                                                __in, -1,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_floor_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  /* __W is the builtin's third operand, __U its mask; the immediate is
	   * _MM_FROUND_FLOOR. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A,
	                                                _MM_FROUND_FLOOR,
	                                                __fill, __U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_floor_pd(__m512d __A)
	{
	  /* __A is both the input and the builtin's third operand; the mask is -1
	   * and the immediate is _MM_FROUND_FLOOR. */
	  __v8df __in = (__v8df)__A;
	  return (__m512d)__builtin_ia32_rndscalepd_mask(__in, _MM_FROUND_FLOOR,
	                                                 __in, -1,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_floor_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  /* __W is the builtin's third operand, __U its mask; the immediate is
	   * _MM_FROUND_FLOOR. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A,
	                                                 _MM_FROUND_FLOOR,
	                                                 __fill, __U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  /* __W is the builtin's third operand, __U its mask; the immediate is
	   * _MM_FROUND_CEIL. */
	  __v16sf __fill = (__v16sf)__W;
	  return (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)__A,
	                                                _MM_FROUND_CEIL,
	                                                __fill, __U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_ceil_ps(__m512 __A)
	{
	  /* __A is both the input and the builtin's third operand; the mask is -1
	   * and the immediate is _MM_FROUND_CEIL. */
	  __v16sf __in = (__v16sf)__A;
	  return (__m512)__builtin_ia32_rndscaleps_mask(__in, _MM_FROUND_CEIL,
	                                                __in, -1,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_ceil_pd(__m512d __A)
	{
	  /* __A is both the input and the builtin's third operand; the mask is -1
	   * and the immediate is _MM_FROUND_CEIL. */
	  __v8df __in = (__v8df)__A;
	  return (__m512d)__builtin_ia32_rndscalepd_mask(__in, _MM_FROUND_CEIL,
	                                                 __in, -1,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_ceil_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  /* __W is the builtin's third operand, __U its mask; the immediate is
	   * _MM_FROUND_CEIL. */
	  __v8df __fill = (__v8df)__W;
	  return (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)__A,
	                                                 _MM_FROUND_CEIL,
	                                                 __fill, __U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_abs_epi64(__m512i __A)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, __fill,
	                                               (__mmask8)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_abs_epi64(__m512i __W, __mmask8 __U, __m512i __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v8di __fill = (__v8di)__W;
	  return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, __fill,
	                                               (__mmask8)__U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_abs_epi64(__mmask8 __U, __m512i __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v8di __fill = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsq512_mask((__v8di)__A, __fill,
	                                               (__mmask8)__U);
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_abs_epi32(__m512i __A)
	{
	  /* Mask is all ones; a zero vector fills the builtin's second operand. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, __fill,
	                                               (__mmask16)-1);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_abs_epi32(__m512i __W, __mmask16 __U, __m512i __A)
	{
	  /* __W is the builtin's second operand and __U its mask. */
	  __v16si __fill = (__v16si)__W;
	  return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, __fill,
	                                               (__mmask16)__U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_abs_epi32(__mmask16 __U, __m512i __A)
	{
	  /* A zero vector is the builtin's second operand; __U is its mask. */
	  __v16si __fill = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_pabsd512_mask((__v16si)__A, __fill,
	                                               (__mmask16)__U);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v4sf __fill = (__v4sf)__W;
	  return (__m128)__builtin_ia32_addss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __fill, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B)
	{
	  /* A zero vector is the builtin's third operand; __U is its mask. */
	  __v4sf __fill = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_addss_round_mask((__v4sf)__A, (__v4sf)__B,
	                                                 __fill, (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_addss_round_mask call with the mask fixed to
	 * (__mmask8)-1: A and B are cast to __v4sf, the third operand is
	 * _mm_setzero_ps(), and R is forwarded as an int.
	 * NOTE(review): R is presumably a rounding-control immediate -- confirm. */
	#define _mm_add_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_addss_round_mask call: A and B are cast to
	 * __v4sf, W (also as __v4sf) is the third operand, U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_mask_add_round_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), (__mmask8)(U), \
	(int)(R)); })

	/* Expands to a __builtin_ia32_addss_round_mask call: A and B are cast to
	 * __v4sf, the third operand is _mm_setzero_ps(), U is cast to __mmask8
	 * and R is forwarded as an int. */
	#define _mm_maskz_add_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B)
	{
	  /* __W is the builtin's third operand and __U its mask. */
	  __v2df __fill = (__v2df)__W;
	  return (__m128d)__builtin_ia32_addsd_round_mask((__v2df)__A, (__v2df)__B,
	                                                  __fill, (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_addsd_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_add_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __zero = (__v2df)_mm_setzero_pd();

	  return (__m128d)__builtin_ia32_addsd_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}
	/* _mm_add_round_sd family: each macro forwards A, B and R to
	 * __builtin_ia32_addsd_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_add_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_addsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_add_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_addsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_add_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_addsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Passes __U, the result of _mm512_add_pd(__A, __B) and __W to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __sum = (__v8df)_mm512_add_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __sum,
	                                              (__v8df)__W);
	}

	/* Passes __U, the result of _mm512_add_pd(__A, __B) and a zero vector to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __sum = (__v8df)_mm512_add_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __sum,
	                                              (__v8df)_mm512_setzero_pd());
	}

	/* Passes __U, the result of _mm512_add_ps(__A, __B) and __W to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __sum = (__v16sf)_mm512_add_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __sum,
	                                             (__v16sf)__W);
	}

	/* Passes __U, the result of _mm512_add_ps(__A, __B) and a zero vector to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __sum = (__v16sf)_mm512_add_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __sum,
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/* _mm512_add_round_pd family: each macro forwards A, B and R to
	 * __builtin_ia32_addpd512_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm512_add_round_pd(A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_addpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_add_round_pd(W, U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_addpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_add_round_pd(U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_addpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* _mm512_add_round_ps family: each macro forwards A, B and R to
	 * __builtin_ia32_addps512_mask.  The plain form passes a zero vector and
	 * mask (__mmask16)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm512_add_round_ps(A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_addps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, (int)(R)); })

	#define _mm512_mask_add_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_addps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(W), (__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_add_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_addps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R)); })

	/* Forwards (__A, __B, __W, __U) to __builtin_ia32_subss_round_mask with
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_sub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  const __v4sf __lhs = (__v4sf)__A;
	  const __v4sf __rhs = (__v4sf)__B;
	  const __v4sf __prev = (__v4sf)__W;

	  return (__m128)__builtin_ia32_subss_round_mask(
	      __lhs, __rhs, __prev, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_subss_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_sub_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  const __v4sf __lhs = (__v4sf)__A;
	  const __v4sf __rhs = (__v4sf)__B;
	  const __v4sf __zero = (__v4sf)_mm_setzero_ps();

	  return (__m128)__builtin_ia32_subss_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}
	/* _mm_sub_round_ss family: each macro forwards A, B and R to
	 * __builtin_ia32_subss_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_sub_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_subss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_sub_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_subss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_sub_round_ss(U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_subss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })

	/* Forwards (__A, __B, __W, __U) to __builtin_ia32_subsd_round_mask with
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_sub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __prev = (__v2df)__W;

	  return (__m128d)__builtin_ia32_subsd_round_mask(
	      __lhs, __rhs, __prev, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_subsd_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_sub_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __zero = (__v2df)_mm_setzero_pd();

	  return (__m128d)__builtin_ia32_subsd_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* _mm_sub_round_sd family: each macro forwards A, B and R to
	 * __builtin_ia32_subsd_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_sub_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_subsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_sub_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_subsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_sub_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_subsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Passes __U, the result of _mm512_sub_pd(__A, __B) and __W to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __diff = (__v8df)_mm512_sub_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __diff,
	                                              (__v8df)__W);
	}

	/* Passes __U, the result of _mm512_sub_pd(__A, __B) and a zero vector to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __diff = (__v8df)_mm512_sub_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __diff,
	                                              (__v8df)_mm512_setzero_pd());
	}

	/* Passes __U, the result of _mm512_sub_ps(__A, __B) and __W to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __diff = (__v16sf)_mm512_sub_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __diff,
	                                             (__v16sf)__W);
	}

	/* Passes __U, the result of _mm512_sub_ps(__A, __B) and a zero vector to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __diff = (__v16sf)_mm512_sub_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __diff,
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/* _mm512_sub_round_pd family: each macro forwards A, B and R to
	 * __builtin_ia32_subpd512_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm512_sub_round_pd(A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_subpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_sub_round_pd(W, U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_subpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_sub_round_pd(U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_subpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Forwards A, B and R to __builtin_ia32_subps512_mask together with a zero
	 * vector and the mask (__mmask16)-1. */
	#define _mm512_sub_round_ps(A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_subps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, (int)(R)); })

	/* Forwards A, B, W, U and R to __builtin_ia32_subps512_mask.
	 * The expansion must not end in ';': the macro is an expression and has to
	 * be usable as an operand, a function argument or the body of an
	 * unbraced if/else. */
	#define _mm512_mask_sub_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
	                                       (int)(R)); })

	/* Forwards A, B, a zero vector, U and R to __builtin_ia32_subps512_mask.
	 * The expansion must not end in ';': the macro is an expression and has to
	 * be usable as an operand, a function argument or the body of an
	 * unbraced if/else. */
	#define _mm512_maskz_sub_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_subps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)_mm512_setzero_ps(), \
	                                       (__mmask16)(U), (int)(R)); })

	/* Forwards (__A, __B, __W, __U) to __builtin_ia32_mulss_round_mask with
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_mul_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  const __v4sf __lhs = (__v4sf)__A;
	  const __v4sf __rhs = (__v4sf)__B;
	  const __v4sf __prev = (__v4sf)__W;

	  return (__m128)__builtin_ia32_mulss_round_mask(
	      __lhs, __rhs, __prev, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_mulss_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_mul_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  const __v4sf __lhs = (__v4sf)__A;
	  const __v4sf __rhs = (__v4sf)__B;
	  const __v4sf __zero = (__v4sf)_mm_setzero_ps();

	  return (__m128)__builtin_ia32_mulss_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}
	/* _mm_mul_round_ss family: each macro forwards A, B and R to
	 * __builtin_ia32_mulss_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_mul_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_mulss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_mul_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_mulss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_mul_round_ss(U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_mulss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })

	/* Forwards (__A, __B, __W, __U) to __builtin_ia32_mulsd_round_mask with
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_mul_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __prev = (__v2df)__W;

	  return (__m128d)__builtin_ia32_mulsd_round_mask(
	      __lhs, __rhs, __prev, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_mulsd_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_mul_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __zero = (__v2df)_mm_setzero_pd();

	  return (__m128d)__builtin_ia32_mulsd_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* _mm_mul_round_sd family: each macro forwards A, B and R to
	 * __builtin_ia32_mulsd_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_mul_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_mulsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_mul_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_mulsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_mul_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_mulsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Passes __U, the result of _mm512_mul_pd(__A, __B) and __W to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __prod = (__v8df)_mm512_mul_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __prod,
	                                              (__v8df)__W);
	}

	/* Passes __U, the result of _mm512_mul_pd(__A, __B) and a zero vector to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __prod = (__v8df)_mm512_mul_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __prod,
	                                              (__v8df)_mm512_setzero_pd());
	}

	/* Passes __U, the result of _mm512_mul_ps(__A, __B) and __W to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __prod = (__v16sf)_mm512_mul_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __prod,
	                                             (__v16sf)__W);
	}

	/* Passes __U, the result of _mm512_mul_ps(__A, __B) and a zero vector to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __prod = (__v16sf)_mm512_mul_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __prod,
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/* _mm512_mul_round_pd family: each macro forwards A, B and R to
	 * __builtin_ia32_mulpd512_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm512_mul_round_pd(A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_mulpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_mul_round_pd(W, U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_mulpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_mul_round_pd(U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_mulpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Forwards A, B and R to __builtin_ia32_mulps512_mask together with a zero
	 * vector and the mask (__mmask16)-1. */
	#define _mm512_mul_round_ps(A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_mulps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, (int)(R)); })

	/* Forwards A, B, W, U and R to __builtin_ia32_mulps512_mask.
	 * The expansion must not end in ';': the macro is an expression and has to
	 * be usable as an operand, a function argument or the body of an
	 * unbraced if/else. */
	#define _mm512_mask_mul_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
	                                       (int)(R)); })

	/* Forwards A, B, a zero vector, U and R to __builtin_ia32_mulps512_mask.
	 * The expansion must not end in ';': the macro is an expression and has to
	 * be usable as an operand, a function argument or the body of an
	 * unbraced if/else. */
	#define _mm512_maskz_mul_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_mulps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)_mm512_setzero_ps(), \
	                                       (__mmask16)(U), (int)(R)); })

	/* Forwards (__A, __B, __W, __U) to __builtin_ia32_divss_round_mask with
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_div_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  const __v4sf __lhs = (__v4sf)__A;
	  const __v4sf __rhs = (__v4sf)__B;
	  const __v4sf __prev = (__v4sf)__W;

	  return (__m128)__builtin_ia32_divss_round_mask(
	      __lhs, __rhs, __prev, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_divss_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_div_ss(__mmask8 __U, __m128 __A, __m128 __B)
	{
	  const __v4sf __lhs = (__v4sf)__A;
	  const __v4sf __rhs = (__v4sf)__B;
	  const __v4sf __zero = (__v4sf)_mm_setzero_ps();

	  return (__m128)__builtin_ia32_divss_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* _mm_div_round_ss family: each macro forwards A, B and R to
	 * __builtin_ia32_divss_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_div_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_divss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_div_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_divss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_div_round_ss(U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_divss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })

	/* Forwards (__A, __B, __W, __U) to __builtin_ia32_divsd_round_mask with
	 * _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_div_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __prev = (__v2df)__W;

	  return (__m128d)__builtin_ia32_divsd_round_mask(
	      __lhs, __rhs, __prev, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, zero vector, __U) to __builtin_ia32_divsd_round_mask
	 * with _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_div_sd(__mmask8 __U, __m128d __A, __m128d __B)
	{
	  const __v2df __lhs = (__v2df)__A;
	  const __v2df __rhs = (__v2df)__B;
	  const __v2df __zero = (__v2df)_mm_setzero_pd();

	  return (__m128d)__builtin_ia32_divsd_round_mask(
	      __lhs, __rhs, __zero, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* _mm_div_round_sd family: each macro forwards A, B and R to
	 * __builtin_ia32_divsd_round_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm_div_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_divsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm_mask_div_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_divsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_div_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_divsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Divides __a by __b with the vector `/` operator applied to their
	 * __v8df views. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_div_pd(__m512d __a, __m512d __b)
	{
	  const __v8df __num = (__v8df)__a;
	  const __v8df __den = (__v8df)__b;

	  return (__m512d)(__num / __den);
	}

	/* Passes __U, the result of _mm512_div_pd(__A, __B) and __W to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __quot = (__v8df)_mm512_div_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __quot,
	                                              (__v8df)__W);
	}

	/* Passes __U, the result of _mm512_div_pd(__A, __B) and a zero vector to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  const __v8df __quot = (__v8df)_mm512_div_pd(__A, __B);

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __quot,
	                                              (__v8df)_mm512_setzero_pd());
	}

	/* Divides __a by __b with the vector `/` operator applied to their
	 * __v16sf views. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_div_ps(__m512 __a, __m512 __b)
	{
	  const __v16sf __num = (__v16sf)__a;
	  const __v16sf __den = (__v16sf)__b;

	  return (__m512)(__num / __den);
	}

	/* Passes __U, the result of _mm512_div_ps(__A, __B) and __W to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __quot = (__v16sf)_mm512_div_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __quot,
	                                             (__v16sf)__W);
	}

	/* Passes __U, the result of _mm512_div_ps(__A, __B) and a zero vector to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B)
	{
	  const __v16sf __quot = (__v16sf)_mm512_div_ps(__A, __B);

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __quot,
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/* _mm512_div_round_pd family: each macro forwards A, B and R to
	 * __builtin_ia32_divpd512_mask.  The plain form passes a zero vector and
	 * mask (__mmask8)-1, the mask form passes W and U, the maskz form passes a
	 * zero vector and U. */
	#define _mm512_div_round_pd(A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_divpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_div_round_pd(W, U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_divpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(W), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_div_round_pd(U, A, B, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_divpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Forwards A, B and R to __builtin_ia32_divps512_mask together with a zero
	 * vector and the mask (__mmask16)-1. */
	#define _mm512_div_round_ps(A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_divps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, (int)(R)); })

	/* Forwards A, B, W, U and R to __builtin_ia32_divps512_mask.
	 * The expansion must not end in ';': the macro is an expression and has to
	 * be usable as an operand, a function argument or the body of an
	 * unbraced if/else. */
	#define _mm512_mask_div_round_ps(W, U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)(__m512)(W), (__mmask16)(U), \
	                                       (int)(R)); })

	/* Forwards A, B, a zero vector, U and R to __builtin_ia32_divps512_mask.
	 * The expansion must not end in ';': the macro is an expression and has to
	 * be usable as an operand, a function argument or the body of an
	 * unbraced if/else. */
	#define _mm512_maskz_div_round_ps(U, A, B, R) __extension__ ({ \
	  (__m512)__builtin_ia32_divps512_mask((__v16sf)(__m512)(A), \
	                                       (__v16sf)(__m512)(B), \
	                                       (__v16sf)_mm512_setzero_ps(), \
	                                       (__mmask16)(U), (int)(R)); })

	/* Forwards A and the immediate B to __builtin_ia32_rndscaleps_mask with the
	 * mask (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION as the last argument.
	 * The third operand is _mm512_undefined_ps(), matching
	 * _mm512_roundscale_round_ps, so that A is expanded -- and therefore
	 * evaluated -- exactly once. */
	#define _mm512_roundscale_ps(A, B) __extension__ ({ \
	  (__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \
	                                         (__v16sf)_mm512_undefined_ps(), \
	                                         (__mmask16)-1, \
	                                         _MM_FROUND_CUR_DIRECTION); })

	/* Masked and explicit-R forms of the single-precision roundscale macros.
	 * All of them call __builtin_ia32_rndscaleps_mask(source, immediate,
	 * third operand, mask, last argument):
	 *   mask forms  : source C, third operand A, mask B;
	 *   maskz forms : source B, third operand zero vector, mask A;
	 *   round form  : source A, third operand _mm512_undefined_ps(), mask -1.
	 * The last argument is R where the macro takes one, otherwise
	 * _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_mask_roundscale_ps(A, B, C, imm) __extension__ ({ \
	  (__m512)__builtin_ia32_rndscaleps_mask( \
	      (__v16sf)(__m512)(C), (int)(imm), \
	      (__v16sf)(__m512)(A), (__mmask16)(B), \
	      _MM_FROUND_CUR_DIRECTION); })

	#define _mm512_maskz_roundscale_ps(A, B, imm) __extension__ ({ \
	  (__m512)__builtin_ia32_rndscaleps_mask( \
	      (__v16sf)(__m512)(B), (int)(imm), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)(A), \
	      _MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) __extension__ ({ \
	  (__m512)__builtin_ia32_rndscaleps_mask( \
	      (__v16sf)(__m512)(C), (int)(imm), \
	      (__v16sf)(__m512)(A), (__mmask16)(B), \
	      (int)(R)); })

	#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) __extension__ ({ \
	  (__m512)__builtin_ia32_rndscaleps_mask( \
	      (__v16sf)(__m512)(B), (int)(imm), \
	      (__v16sf)_mm512_setzero_ps(), (__mmask16)(A), \
	      (int)(R)); })

	#define _mm512_roundscale_round_ps(A, imm, R) __extension__ ({ \
	  (__m512)__builtin_ia32_rndscaleps_mask( \
	      (__v16sf)(__m512)(A), (int)(imm), \
	      (__v16sf)_mm512_undefined_ps(), (__mmask16)-1, \
	      (int)(R)); })

	/* Forwards A and the immediate B to __builtin_ia32_rndscalepd_mask with the
	 * mask (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION as the last argument.
	 * The third operand is _mm512_undefined_pd(), matching
	 * _mm512_roundscale_round_pd, so that A is expanded -- and therefore
	 * evaluated -- exactly once. */
	#define _mm512_roundscale_pd(A, B) __extension__ ({ \
	  (__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \
	                                          (__v8df)_mm512_undefined_pd(), \
	                                          (__mmask8)-1, \
	                                          _MM_FROUND_CUR_DIRECTION); })

	/* Masked and explicit-R forms of the double-precision roundscale macros.
	 * All of them call __builtin_ia32_rndscalepd_mask(source, immediate,
	 * third operand, mask, last argument):
	 *   mask forms  : source C, third operand A, mask B;
	 *   maskz forms : source B, third operand zero vector, mask A;
	 *   round form  : source A, third operand _mm512_undefined_pd(), mask -1.
	 * The last argument is R where the macro takes one, otherwise
	 * _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_mask_roundscale_pd(A, B, C, imm) __extension__ ({ \
	  (__m512d)__builtin_ia32_rndscalepd_mask( \
	      (__v8df)(__m512d)(C), (int)(imm), \
	      (__v8df)(__m512d)(A), (__mmask8)(B), \
	      _MM_FROUND_CUR_DIRECTION); })

	#define _mm512_maskz_roundscale_pd(A, B, imm) __extension__ ({ \
	  (__m512d)__builtin_ia32_rndscalepd_mask( \
	      (__v8df)(__m512d)(B), (int)(imm), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(A), \
	      _MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_rndscalepd_mask( \
	      (__v8df)(__m512d)(C), (int)(imm), \
	      (__v8df)(__m512d)(A), (__mmask8)(B), \
	      (int)(R)); })

	#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_rndscalepd_mask( \
	      (__v8df)(__m512d)(B), (int)(imm), \
	      (__v8df)_mm512_setzero_pd(), (__mmask8)(A), \
	      (int)(R)); })

	#define _mm512_roundscale_round_pd(A, imm, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_rndscalepd_mask( \
	      (__v8df)(__m512d)(A), (int)(imm), \
	      (__v8df)_mm512_undefined_pd(), (__mmask8)-1, \
	      (int)(R)); })

	/* _mm512_fmadd_round_pd family: A, B, C and R are forwarded unchanged.
	 * The plain form uses __builtin_ia32_vfmaddpd512_mask with mask
	 * (__mmask8)-1; the mask, mask3 and maskz forms pass U to the _mask,
	 * _mask3 and _maskz builtins respectively. */
	#define _mm512_fmadd_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })

	#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask3( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_maskz( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })


	/* _mm512_fmsub_round_pd family: same builtins as the fmadd macros, but the
	 * third operand is the negation of C.  The plain form uses mask
	 * (__mmask8)-1; the mask and maskz forms pass U to the _mask and _maskz
	 * builtins. */
	#define _mm512_fmsub_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      -(__v8df)(__m512d)(C), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      -(__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_maskz( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      -(__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })


	/* _mm512_fnmadd_round_pd family: same builtins as the fmadd macros, but the
	 * first operand is the negation of A.  The plain form uses mask
	 * (__mmask8)-1; the mask3 and maskz forms pass U to the _mask3 and _maskz
	 * builtins. */
	#define _mm512_fnmadd_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask( \
	      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask3( \
	      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_maskz( \
	      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })


	/* _mm512_fnmsub_round_pd family: same builtins as the fmadd macros, with
	 * both A and C negated.  The plain form uses mask (__mmask8)-1; the maskz
	 * form passes U to the _maskz builtin. */
	#define _mm512_fnmsub_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_mask( \
	      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      -(__v8df)(__m512d)(C), (__mmask8)-1, (int)(R)); })

	#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddpd512_maskz( \
	      -(__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      -(__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })


	/* Forwards (__A, __B, __C) to __builtin_ia32_vfmaddpd512_mask with mask
	 * (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(
	      __x, __y, __z, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_mask with _MM_FROUND_CUR_DIRECTION as the last
	 * argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(
	      __x, __y, __z, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_mask3 with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask3(
	      __x, __y, __z, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_maskz(
	      __x, __y, __z, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, -__C) to __builtin_ia32_vfmaddpd512_mask with mask
	 * (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __negz = -(__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(
	      __x, __y, __negz, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, -__C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_mask with _MM_FROUND_CUR_DIRECTION as the last
	 * argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __negz = -(__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(
	      __x, __y, __negz, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, -__C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __x = (__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __negz = -(__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_maskz(
	      __x, __y, __negz, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, __C) to __builtin_ia32_vfmaddpd512_mask with mask
	 * (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __negx = -(__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(
	      __negx, __y, __z, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_mask3 with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  const __v8df __negx = -(__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask3(
	      __negx, __y, __z, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __negx = -(__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __z = (__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_maskz(
	      __negx, __y, __z, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, -__C) to __builtin_ia32_vfmaddpd512_mask with mask
	 * (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __negx = -(__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __negz = -(__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_mask(
	      __negx, __y, __negz, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, -__C) and the mask __U to
	 * __builtin_ia32_vfmaddpd512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  const __v8df __negx = -(__v8df)__A;
	  const __v8df __y = (__v8df)__B;
	  const __v8df __negz = -(__v8df)__C;

	  return (__m512d)__builtin_ia32_vfmaddpd512_maskz(
	      __negx, __y, __negz, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* _mm512_fmadd_round_ps family: A, B, C and R are forwarded unchanged.
	 * The plain form uses __builtin_ia32_vfmaddps512_mask with mask
	 * (__mmask16)-1; the mask, mask3 and maskz forms pass U to the _mask,
	 * _mask3 and _maskz builtins respectively. */
	#define _mm512_fmadd_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)-1, (int)(R)); })

	#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })

	#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask3( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_maskz( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })


	/* _mm512_fmsub_round_ps family: same builtins as the fmadd macros, but the
	 * third operand is the negation of C.  The plain form uses mask
	 * (__mmask16)-1; the mask and maskz forms pass U to the _mask and _maskz
	 * builtins. */
	#define _mm512_fmsub_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      -(__v16sf)(__m512)(C), (__mmask16)-1, (int)(R)); })

	#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      -(__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_maskz( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      -(__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })


	/* _mm512_fnmadd_round_ps family: same builtins as the fmadd macros, but the
	 * first operand is the negation of A.  The plain form uses mask
	 * (__mmask16)-1; the mask3 and maskz forms pass U to the _mask3 and _maskz
	 * builtins. */
	#define _mm512_fnmadd_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask( \
	      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)-1, (int)(R)); })

	#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask3( \
	      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_maskz( \
	      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      (__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })


	/* _mm512_fnmsub_round_ps family: same builtins as the fmadd macros, with
	 * both A and C negated.  The plain form uses mask (__mmask16)-1; the maskz
	 * form passes U to the _maskz builtin. */
	#define _mm512_fnmsub_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_mask( \
	      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      -(__v16sf)(__m512)(C), (__mmask16)-1, (int)(R)); })

	#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddps512_maskz( \
	      -(__v16sf)(__m512)(A), (__v16sf)(__m512)(B), \
	      -(__v16sf)(__m512)(C), (__mmask16)(U), (int)(R)); })


	/* Forwards (__A, __B, __C) to __builtin_ia32_vfmaddps512_mask with mask
	 * (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask(
	      __x, __y, __z, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_mask with _MM_FROUND_CUR_DIRECTION as the last
	 * argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask(
	      __x, __y, __z, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_mask3 with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask3(
	      __x, __y, __z, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_maskz(
	      __x, __y, __z, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, -__C) to __builtin_ia32_vfmaddps512_mask with mask
	 * (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __negz = -(__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask(
	      __x, __y, __negz, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, -__C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_mask with _MM_FROUND_CUR_DIRECTION as the last
	 * argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __negz = -(__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask(
	      __x, __y, __negz, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (__A, __B, -__C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __x = (__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __negz = -(__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_maskz(
	      __x, __y, __negz, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, __C) to __builtin_ia32_vfmaddps512_mask with mask
	 * (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __negx = -(__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask(
	      __negx, __y, __z, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_mask3 with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	  const __v16sf __negx = -(__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask3(
	      __negx, __y, __z, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, __C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __negx = -(__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __z = (__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_maskz(
	      __negx, __y, __z, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, -__C) to __builtin_ia32_vfmaddps512_mask with mask
	 * (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION as the last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __negx = -(__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __negz = -(__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_mask(
	      __negx, __y, __negz, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Forwards (-__A, __B, -__C) and the mask __U to
	 * __builtin_ia32_vfmaddps512_maskz with _MM_FROUND_CUR_DIRECTION as the
	 * last argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	  const __v16sf __negx = -(__v16sf)__A;
	  const __v16sf __y = (__v16sf)__B;
	  const __v16sf __negz = -(__v16sf)__C;

	  return (__m512)__builtin_ia32_vfmaddps512_maskz(
	      __negx, __y, __negz, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION);
	}

	/* _mm512_fmaddsub_round_pd family: A, B, C and R are forwarded unchanged.
	 * The plain form uses __builtin_ia32_vfmaddsubpd512_mask with mask
	 * (__mmask8)-1; the mask, mask3 and maskz forms pass U to the _mask,
	 * _mask3 and _maskz builtins respectively. */
	#define _mm512_fmaddsub_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })

	#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_mask3( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), \
	      (__v8df)(__m512d)(C), (__mmask8)(U), (int)(R)); })


	/* fmsubadd with explicit rounding argument R, 8 x double.  Implemented on
	 * top of the fmaddsub builtin by negating operand C.  Unmasked form. */
	#define _mm512_fmsubadd_round_pd(A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
	      (__mmask8)-1, (int)(R)); })

	/* Masked form: same call with caller-supplied mask U. */
	#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })

	/* maskz form: uses the maskz builtin with mask U. */
	#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmaddsubpd512_maskz( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), -(__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })


	/* fmaddsub, 8 x double, unmasked (all-ones mask); passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, 8 x double, mask form with write-mask __U; passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, 8 x double, mask3 form with write-mask __U; passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, 8 x double, maskz form with mask __U; passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, 8 x double, unmasked: calls the fmaddsub mask builtin with __C
	 * negated, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __negC = -(__v8df) __C;
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask(
	      (__v8df) __A, (__v8df) __B, __negC,
	      (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, 8 x double, mask form: calls the fmaddsub mask builtin with __C
	 * negated, write-mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  __v8df __negC = -(__v8df) __C;
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_mask(
	      (__v8df) __A, (__v8df) __B, __negC,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, 8 x double, maskz form: calls the fmaddsub maskz builtin with
	 * __C negated, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C)
	{
	  __v8df __negC = -(__v8df) __C;
	  return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz(
	      (__v8df) __A, (__v8df) __B, __negC,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub with explicit rounding argument R, 16 x float.
	 * Unmasked form: the mask operand is all ones. */
	#define _mm512_fmaddsub_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)-1, (int)(R)); })

	/* Same as above, but with caller-supplied mask U (mask builtin). */
	#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })

	/* mask3 variant: mask U comes after the three vector operands. */
	#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_mask3( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })

	/* maskz variant: mask U is the first macro argument. */
	#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_maskz( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })


	/* fmsubadd with explicit rounding argument R, 16 x float.  Implemented on
	 * top of the fmaddsub builtin by negating operand C.  Unmasked form. */
	#define _mm512_fmsubadd_round_ps(A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
	      (__mmask16)-1, (int)(R)); })

	/* Masked form: same call with caller-supplied mask U. */
	#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })

	/* maskz form: uses the maskz builtin with mask U. */
	#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmaddsubps512_maskz( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), -(__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })


	/* fmaddsub, 16 x float, unmasked (all-ones mask); passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	  return (__m512) __builtin_ia32_vfmaddsubps512_mask(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, 16 x float, mask form with write-mask __U; passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	  return (__m512) __builtin_ia32_vfmaddsubps512_mask(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, 16 x float, mask3 form with write-mask __U; passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	  return (__m512) __builtin_ia32_vfmaddsubps512_mask3(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmaddsub, 16 x float, maskz form with mask __U; passes
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	  return (__m512) __builtin_ia32_vfmaddsubps512_maskz(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, 16 x float, unmasked: calls the fmaddsub mask builtin with __C
	 * negated, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C)
	{
	  __v16sf __negC = -(__v16sf) __C;
	  return (__m512) __builtin_ia32_vfmaddsubps512_mask(
	      (__v16sf) __A, (__v16sf) __B, __negC,
	      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, 16 x float, mask form: calls the fmaddsub mask builtin with __C
	 * negated, write-mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	  __v16sf __negC = -(__v16sf) __C;
	  return (__m512) __builtin_ia32_vfmaddsubps512_mask(
	      (__v16sf) __A, (__v16sf) __B, __negC,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, 16 x float, maskz form: calls the fmaddsub maskz builtin with
	 * __C negated, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C)
	{
	  __v16sf __negC = -(__v16sf) __C;
	  return (__m512) __builtin_ia32_vfmaddsubps512_maskz(
	      (__v16sf) __A, (__v16sf) __B, __negC,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsub, mask3 form, 8 x double, with explicit rounding argument R.
	 * Uses the dedicated vfmsubpd512 mask3 builtin (no operand negation). */
	#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmsubpd512_mask3( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })


	/* fmsub, mask3 form, 8 x double: forwards the operands unchanged to the
	 * vfmsubpd512 mask3 builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  return (__m512d) __builtin_ia32_vfmsubpd512_mask3(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsub, mask3 form, 16 x float, with explicit rounding argument R.
	 * Uses the dedicated vfmsubps512 mask3 builtin (no operand negation). */
	#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmsubps512_mask3( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })


	/* fmsub, mask3 form, 16 x float: forwards the operands unchanged to the
	 * vfmsubps512 mask3 builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	  return (__m512) __builtin_ia32_vfmsubps512_mask3(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, mask3 form, 8 x double, with explicit rounding argument R.
	 * Uses the dedicated vfmsubaddpd512 mask3 builtin. */
	#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfmsubaddpd512_mask3( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })


	/* fmsubadd, mask3 form, 8 x double: forwards the operands unchanged to the
	 * vfmsubaddpd512 mask3 builtin with mask __U and
	 * _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  return (__m512d) __builtin_ia32_vfmsubaddpd512_mask3(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fmsubadd, mask3 form, 16 x float, with explicit rounding argument R.
	 * Uses the dedicated vfmsubaddps512 mask3 builtin. */
	#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfmsubaddps512_mask3( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })


	/* fmsubadd, mask3 form, 16 x float: forwards the operands unchanged to the
	 * vfmsubaddps512 mask3 builtin with mask __U and
	 * _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	  return (__m512) __builtin_ia32_vfmsubaddps512_mask3(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fnmadd, mask form, 8 x double, with explicit rounding argument R.
	 * Uses the dedicated vfnmaddpd512 mask builtin (no operand negation). */
	#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfnmaddpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })


	/* fnmadd, mask form, 8 x double: forwards the operands unchanged to the
	 * vfnmaddpd512 mask builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  return (__m512d) __builtin_ia32_vfnmaddpd512_mask(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fnmadd, mask form, 16 x float, with explicit rounding argument R.
	 * Uses the dedicated vfnmaddps512 mask builtin (no operand negation). */
	#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfnmaddps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })


	/* fnmadd, mask form, 16 x float: forwards the operands unchanged to the
	 * vfnmaddps512 mask builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	  return (__m512) __builtin_ia32_vfnmaddps512_mask(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fnmsub, mask form, 8 x double, with explicit rounding argument R.
	 * Uses the dedicated vfnmsubpd512 mask builtin. */
	#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfnmsubpd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })

	/* fnmsub, mask3 form, 8 x double, with explicit rounding argument R.
	 * Uses the dedicated vfnmsubpd512 mask3 builtin. */
	#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_vfnmsubpd512_mask3( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (__v8df)(__m512d)(C), \
	      (__mmask8)(U), (int)(R)); })


	/* fnmsub, mask form, 8 x double: forwards the operands unchanged to the
	 * vfnmsubpd512 mask builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C)
	{
	  return (__m512d) __builtin_ia32_vfnmsubpd512_mask(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fnmsub, mask3 form, 8 x double: forwards the operands unchanged to the
	 * vfnmsubpd512 mask3 builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U)
	{
	  return (__m512d) __builtin_ia32_vfnmsubpd512_mask3(
	      (__v8df) __A, (__v8df) __B, (__v8df) __C,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fnmsub, mask form, 16 x float, with explicit rounding argument R.
	 * Uses the dedicated vfnmsubps512 mask builtin. */
	#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfnmsubps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })

	/* fnmsub, mask3 form, 16 x float, with explicit rounding argument R.
	 * Uses the dedicated vfnmsubps512 mask3 builtin. */
	#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vfnmsubps512_mask3( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (__v16sf)(__m512)(C), \
	      (__mmask16)(U), (int)(R)); })


	/* fnmsub, mask form, 16 x float: forwards the operands unchanged to the
	 * vfnmsubps512 mask builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C)
	{
	  return (__m512) __builtin_ia32_vfnmsubps512_mask(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* fnmsub, mask3 form, 16 x float: forwards the operands unchanged to the
	 * vfnmsubps512 mask3 builtin with mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U)
	{
	  return (__m512) __builtin_ia32_vfnmsubps512_mask3(
	      (__v16sf) __A, (__v16sf) __B, (__v16sf) __C,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}



	/* Vector permutations */

	/* Two-source permute of 32-bit lanes, unmasked.  Note the builtin takes the
	 * index vector __I first, followed by the two data vectors __A and __B. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B)
	{
	  return (__m512i) __builtin_ia32_vpermt2vard512_mask(
	      (__v16si) __I /* idx */,
	      (__v16si) __A, (__v16si) __B,
	      (__mmask16) -1);
	}

	/* Two-source permute of 32-bit lanes, mask form with write-mask __U.
	 * The builtin takes the index vector __I first, then __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_epi32 (__m512i __A, __mmask16 __U,
	                                __m512i __I, __m512i __B)
	{
	  return (__m512i) __builtin_ia32_vpermt2vard512_mask(
	      (__v16si) __I /* idx */,
	      (__v16si) __A, (__v16si) __B,
	      (__mmask16) __U);
	}

	/* Two-source permute of 32-bit lanes, maskz form with mask __U.
	 * The builtin takes the index vector __I first, then __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_epi32 (__mmask16 __U, __m512i __A,
	                                 __m512i __I, __m512i __B)
	{
	  return (__m512i) __builtin_ia32_vpermt2vard512_maskz(
	      (__v16si) __I /* idx */,
	      (__v16si) __A, (__v16si) __B,
	      (__mmask16) __U);
	}

	/* Two-source permute of 64-bit lanes, unmasked.  Note the builtin takes the
	 * index vector __I first, followed by the two data vectors __A and __B. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B)
	{
	  return (__m512i) __builtin_ia32_vpermt2varq512_mask(
	      (__v8di) __I /* idx */,
	      (__v8di) __A, (__v8di) __B,
	      (__mmask8) -1);
	}

	/* Two-source permute of 64-bit lanes, mask form with write-mask __U.
	 * The builtin takes the index vector __I first, then __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_epi64 (__m512i __A, __mmask8 __U, __m512i __I,
	                                __m512i __B)
	{
	  return (__m512i) __builtin_ia32_vpermt2varq512_mask(
	      (__v8di) __I /* idx */,
	      (__v8di) __A, (__v8di) __B,
	      (__mmask8) __U);
	}


	/* Two-source permute of 64-bit lanes, maskz form with mask __U.
	 * The builtin takes the index vector __I first, then __A and __B. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_epi64 (__mmask8 __U, __m512i __A,
	                                 __m512i __I, __m512i __B)
	{
	  return (__m512i) __builtin_ia32_vpermt2varq512_maskz(
	      (__v8di) __I /* idx */,
	      (__v8di) __A, (__v8di) __B,
	      (__mmask8) __U);
	}

	/* Picks eight consecutive 64-bit lanes out of the concatenation of B
	 * (shuffle indices 0-7) and A (indices 8-15), starting at lane (I & 7).
	 * Note that B is the first shuffle operand. */
	#define _mm512_alignr_epi64(A, B, I) __extension__ ({ \
	  (__m512i)__builtin_shufflevector( \
	      (__v8di)(__m512i)(B), (__v8di)(__m512i)(A), \
	      ((int)(I) & 0x7) + 0, ((int)(I) & 0x7) + 1, \
	      ((int)(I) & 0x7) + 2, ((int)(I) & 0x7) + 3, \
	      ((int)(I) & 0x7) + 4, ((int)(I) & 0x7) + 5, \
	      ((int)(I) & 0x7) + 6, ((int)(I) & 0x7) + 7); })

	/* Mask form: mask-select (mask U) between the alignr result and W. */
	#define _mm512_mask_alignr_epi64(W, U, A, B, imm) __extension__ ({ \
	  (__m512i)__builtin_ia32_selectq_512( \
	      (__mmask8)(U), \
	      (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
	      (__v8di)(__m512i)(W)); })

	/* maskz form: mask-select (mask U) between the alignr result and zero. */
	#define _mm512_maskz_alignr_epi64(U, A, B, imm) __extension__ ({ \
	  (__m512i)__builtin_ia32_selectq_512( \
	      (__mmask8)(U), \
	      (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \
	      (__v8di)_mm512_setzero_si512()); })

	/* Picks sixteen consecutive 32-bit lanes out of the concatenation of B
	 * (shuffle indices 0-15) and A (indices 16-31), starting at lane (I & 0xf).
	 * Note that B is the first shuffle operand. */
	#define _mm512_alignr_epi32(A, B, I) __extension__ ({ \
	  (__m512i)__builtin_shufflevector( \
	      (__v16si)(__m512i)(B), (__v16si)(__m512i)(A), \
	      ((int)(I) & 0xf) + 0,  ((int)(I) & 0xf) + 1, \
	      ((int)(I) & 0xf) + 2,  ((int)(I) & 0xf) + 3, \
	      ((int)(I) & 0xf) + 4,  ((int)(I) & 0xf) + 5, \
	      ((int)(I) & 0xf) + 6,  ((int)(I) & 0xf) + 7, \
	      ((int)(I) & 0xf) + 8,  ((int)(I) & 0xf) + 9, \
	      ((int)(I) & 0xf) + 10, ((int)(I) & 0xf) + 11, \
	      ((int)(I) & 0xf) + 12, ((int)(I) & 0xf) + 13, \
	      ((int)(I) & 0xf) + 14, ((int)(I) & 0xf) + 15); })

	/* Mask form: mask-select (mask U) between the alignr result and W. */
	#define _mm512_mask_alignr_epi32(W, U, A, B, imm) __extension__ ({ \
	  (__m512i)__builtin_ia32_selectd_512( \
	      (__mmask16)(U), \
	      (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
	      (__v16si)(__m512i)(W)); })

	/* maskz form: mask-select (mask U) between the alignr result and zero. */
	#define _mm512_maskz_alignr_epi32(U, A, B, imm) __extension__ ({ \
	  (__m512i)__builtin_ia32_selectd_512( \
	      (__mmask16)(U), \
	      (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \
	      (__v16si)_mm512_setzero_si512()); })
	/* Vector Extract */

	/* Extracts four doubles from A: lanes 4-7 when bit 0 of I is set,
	 * lanes 0-3 otherwise.  The second shuffle operand is never indexed. */
	#define _mm512_extractf64x4_pd(A, I) __extension__ ({ \
	  (__m256d)__builtin_shufflevector( \
	      (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), \
	      ((I) & 1) ? 4 : 0, ((I) & 1) ? 5 : 1, \
	      ((I) & 1) ? 6 : 2, ((I) & 1) ? 7 : 3); })

	/* Mask form: mask-select (mask U) between the extracted lanes and W. */
	#define _mm512_mask_extractf64x4_pd(W, U, A, imm) __extension__ ({ \
	  (__m256d)__builtin_ia32_selectpd_256( \
	      (__mmask8)(U), \
	      (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
	      (__v4df)(W)); })

	/* maskz form: mask-select (mask U) between the extracted lanes and zero. */
	#define _mm512_maskz_extractf64x4_pd(U, A, imm) __extension__ ({ \
	  (__m256d)__builtin_ia32_selectpd_256( \
	      (__mmask8)(U), \
	      (__v4df)_mm512_extractf64x4_pd((A), (imm)), \
	      (__v4df)_mm256_setzero_pd()); })

	/* Extracts four floats from A, starting at lane 4 * (I & 3).
	 * The second shuffle operand is never indexed. */
	#define _mm512_extractf32x4_ps(A, I) __extension__ ({ \
	  (__m128)__builtin_shufflevector( \
	      (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), \
	      0 + ((I) & 0x3) * 4, 1 + ((I) & 0x3) * 4, \
	      2 + ((I) & 0x3) * 4, 3 + ((I) & 0x3) * 4); })

	/* Mask form: mask-select (mask U) between the extracted lanes and W. */
	#define _mm512_mask_extractf32x4_ps(W, U, A, imm) __extension__ ({ \
	  (__m128)__builtin_ia32_selectps_128( \
	      (__mmask8)(U), \
	      (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
	      (__v4sf)(W)); })

	/* maskz form: mask-select (mask U) between the extracted lanes and zero. */
	#define _mm512_maskz_extractf32x4_ps(U, A, imm) __extension__ ({ \
	  (__m128)__builtin_ia32_selectps_128( \
	      (__mmask8)(U), \
	      (__v4sf)_mm512_extractf32x4_ps((A), (imm)), \
	      (__v4sf)(W)); })

	/* Vector Blend */

	/* Per-lane blend of 8 doubles under mask __U.  Note the operand order
	 * passed to the select builtin: __W comes before __A. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W)
	{
	  return (__m512d) __builtin_ia32_selectpd_512(
	      (__mmask8) __U, (__v8df) __W, (__v8df) __A);
	}

	/* Per-lane blend of 16 floats under mask __U.  Note the operand order
	 * passed to the select builtin: __W comes before __A. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W)
	{
	  return (__m512) __builtin_ia32_selectps_512(
	      (__mmask16) __U, (__v16sf) __W, (__v16sf) __A);
	}

	/* Per-lane blend of 8 x 64-bit integers under mask __U.  Note the operand
	 * order passed to the select builtin: __W comes before __A. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W)
	{
	  return (__m512i) __builtin_ia32_selectq_512(
	      (__mmask8) __U, (__v8di) __W, (__v8di) __A);
	}

	/* Per-lane blend of 16 x 32-bit integers under mask __U.  Note the operand
	 * order passed to the select builtin: __W comes before __A. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W)
	{
	  return (__m512i) __builtin_ia32_selectd_512(
	      (__mmask16) __U, (__v16si) __W, (__v16si) __A);
	}

	/* Compare */

	/* Compares A and B (16 x float) with predicate P and rounding argument R,
	 * producing a 16-bit mask.  Unmasked form: input mask is all ones. */
	#define _mm512_cmp_round_ps_mask(A, B, P, R) __extension__ ({ \
	  (__mmask16)__builtin_ia32_cmpps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(P), \
	      (__mmask16)-1, (int)(R)); })

	/* Same comparison with caller-supplied input mask U. */
	#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) __extension__ ({ \
	  (__mmask16)__builtin_ia32_cmpps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(B), (int)(P), \
	      (__mmask16)(U), (int)(R)); })

	/* Non-"round" forms: fix the rounding argument to
	 * _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_cmp_ps_mask(A, B, P) \
	  _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
	#define _mm512_mask_cmp_ps_mask(U, A, B, P) \
	  _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

	/* Named-predicate shorthands for _mm512_cmp_ps_mask and
	 * _mm512_mask_cmp_ps_mask; each pair fixes the predicate argument. */

	/* Predicate: _CMP_EQ_OQ */
	#define _mm512_cmpeq_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ)
	#define _mm512_mask_cmpeq_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_EQ_OQ)

	/* Predicate: _CMP_LT_OS */
	#define _mm512_cmplt_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS)
	#define _mm512_mask_cmplt_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_LT_OS)

	/* Predicate: _CMP_LE_OS */
	#define _mm512_cmple_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS)
	#define _mm512_mask_cmple_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_LE_OS)

	/* Predicate: _CMP_UNORD_Q */
	#define _mm512_cmpunord_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q)
	#define _mm512_mask_cmpunord_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_UNORD_Q)

	/* Predicate: _CMP_NEQ_UQ */
	#define _mm512_cmpneq_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ)
	#define _mm512_mask_cmpneq_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_NEQ_UQ)

	/* Predicate: _CMP_NLT_US */
	#define _mm512_cmpnlt_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US)
	#define _mm512_mask_cmpnlt_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_NLT_US)

	/* Predicate: _CMP_NLE_US */
	#define _mm512_cmpnle_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US)
	#define _mm512_mask_cmpnle_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_NLE_US)

	/* Predicate: _CMP_ORD_Q */
	#define _mm512_cmpord_ps_mask(A, B) \
	  _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q)
	#define _mm512_mask_cmpord_ps_mask(U, A, B) \
	  _mm512_mask_cmp_ps_mask((U), (A), (B), _CMP_ORD_Q)

	/* Compares A and B (8 x double) with predicate P and rounding argument R,
	 * producing an 8-bit mask.  Unmasked form: input mask is all ones. */
	#define _mm512_cmp_round_pd_mask(A, B, P, R) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmppd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(P), \
	      (__mmask8)-1, (int)(R)); })

	/* Same comparison with caller-supplied input mask U. */
	#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmppd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(B), (int)(P), \
	      (__mmask8)(U), (int)(R)); })

	/* Non-"round" forms: fix the rounding argument to
	 * _MM_FROUND_CUR_DIRECTION. */
	#define _mm512_cmp_pd_mask(A, B, P) \
	  _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION)
	#define _mm512_mask_cmp_pd_mask(U, A, B, P) \
	  _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION)

	/* Named-predicate shorthands for _mm512_cmp_pd_mask and
	 * _mm512_mask_cmp_pd_mask; each pair fixes the predicate argument. */

	/* Predicate: _CMP_EQ_OQ */
	#define _mm512_cmpeq_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ)
	#define _mm512_mask_cmpeq_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_EQ_OQ)

	/* Predicate: _CMP_LT_OS */
	#define _mm512_cmplt_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS)
	#define _mm512_mask_cmplt_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_LT_OS)

	/* Predicate: _CMP_LE_OS */
	#define _mm512_cmple_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS)
	#define _mm512_mask_cmple_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_LE_OS)

	/* Predicate: _CMP_UNORD_Q */
	#define _mm512_cmpunord_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q)
	#define _mm512_mask_cmpunord_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_UNORD_Q)

	/* Predicate: _CMP_NEQ_UQ */
	#define _mm512_cmpneq_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ)
	#define _mm512_mask_cmpneq_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_NEQ_UQ)

	/* Predicate: _CMP_NLT_US */
	#define _mm512_cmpnlt_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US)
	#define _mm512_mask_cmpnlt_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_NLT_US)

	/* Predicate: _CMP_NLE_US */
	#define _mm512_cmpnle_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US)
	#define _mm512_mask_cmpnle_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_NLE_US)

	/* Predicate: _CMP_ORD_Q */
	#define _mm512_cmpord_pd_mask(A, B) \
	  _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q)
	#define _mm512_mask_cmpord_pd_mask(U, A, B) \
	  _mm512_mask_cmp_pd_mask((U), (A), (B), _CMP_ORD_Q)

	/* Conversion */

	/* float -> unsigned 32-bit conversion via the cvttps2udq512 builtin, with
	 * explicit argument R.  Unmasked: all-ones mask, undefined pass-through. */
	#define _mm512_cvtt_roundps_epu32(A, R) __extension__ ({ \
	  (__m512i)__builtin_ia32_cvttps2udq512_mask( \
	      (__v16sf)(__m512)(A), \
	      (__v16si)_mm512_undefined_epi32(), \
	      (__mmask16)-1, (int)(R)); })

	/* Mask form: pass-through vector W, mask U. */
	#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) __extension__ ({ \
	  (__m512i)__builtin_ia32_cvttps2udq512_mask( \
	      (__v16sf)(__m512)(A), \
	      (__v16si)(__m512i)(W), \
	      (__mmask16)(U), (int)(R)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) __extension__ ({ \
	  (__m512i)__builtin_ia32_cvttps2udq512_mask( \
	      (__v16sf)(__m512)(A), \
	      (__v16si)_mm512_setzero_si512(), \
	      (__mmask16)(U), (int)(R)); })


	/* float -> unsigned 32-bit conversion via the cvttps2udq512 builtin.
	 * Unmasked: all-ones mask with a zero pass-through vector;
	 * _MM_FROUND_CUR_DIRECTION is the last argument. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_cvttps_epu32(__m512 __A)
	{
	  return (__m512i) __builtin_ia32_cvttps2udq512_mask(
	      (__v16sf) __A,
	      (__v16si) _mm512_setzero_si512 (),
	      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* float -> unsigned 32-bit conversion via the cvttps2udq512 builtin.
	 * Mask form: pass-through vector __W, mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  return (__m512i) __builtin_ia32_cvttps2udq512_mask(
	      (__v16sf) __A, (__v16si) __W,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* float -> unsigned 32-bit conversion via the cvttps2udq512 builtin.
	 * maskz form: zero pass-through vector, mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A)
	{
	  return (__m512i) __builtin_ia32_cvttps2udq512_mask(
	      (__v16sf) __A,
	      (__v16si) _mm512_setzero_si512 (),
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* 32-bit integer -> float conversion via the cvtdq2ps512 builtin, with
	 * rounding argument R.  Unmasked: all-ones mask, zero pass-through. */
	#define _mm512_cvt_roundepi32_ps(A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_cvtdq2ps512_mask( \
	      (__v16si)(__m512i)(A), \
	      (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)-1, (int)(R)); })

	/* Mask form: pass-through vector W, mask U. */
	#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_cvtdq2ps512_mask( \
	      (__v16si)(__m512i)(A), \
	      (__v16sf)(__m512)(W), \
	      (__mmask16)(U), (int)(R)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_cvtdq2ps512_mask( \
	      (__v16si)(__m512i)(A), \
	      (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)(U), (int)(R)); })

	/* Unsigned 32-bit integer -> float conversion via the cvtudq2ps512
	 * builtin, with rounding argument R.  Unmasked: all-ones mask, zero
	 * pass-through. */
	#define _mm512_cvt_roundepu32_ps(A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_cvtudq2ps512_mask( \
	      (__v16si)(__m512i)(A), \
	      (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)-1, (int)(R)); })

	/* Mask form: pass-through vector W, mask U. */
	#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_cvtudq2ps512_mask( \
	      (__v16si)(__m512i)(A), \
	      (__v16sf)(__m512)(W), \
	      (__mmask16)(U), (int)(R)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_cvtudq2ps512_mask( \
	      (__v16si)(__m512i)(A), \
	      (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)(U), (int)(R)); })

	/* Unsigned 32-bit integer -> float conversion via the cvtudq2ps512 builtin.
	 * Unmasked: all-ones mask with an undefined pass-through vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtepu32_ps (__m512i __A)
	{
	  return (__m512) __builtin_ia32_cvtudq2ps512_mask(
	      (__v16si) __A,
	      (__v16sf) _mm512_undefined_ps (),
	      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Unsigned 32-bit integer -> float conversion via the cvtudq2ps512 builtin.
	 * Mask form: pass-through vector __W, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A)
	{
	  return (__m512) __builtin_ia32_cvtudq2ps512_mask(
	      (__v16si) __A, (__v16sf) __W,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Unsigned 32-bit integer -> float conversion via the cvtudq2ps512 builtin.
	 * maskz form: zero pass-through vector, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A)
	{
	  return (__m512) __builtin_ia32_cvtudq2ps512_mask(
	      (__v16si) __A,
	      (__v16sf) _mm512_setzero_ps (),
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Converts eight signed 32-bit integers to eight doubles using the generic
	 * __builtin_convertvector (source viewed as __v8si). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_pd(__m256i __A)
	{
	  __v8si __src = (__v8si)__A;
	  return (__m512d)__builtin_convertvector(__src, __v8df);
	}

	/* Mask form of _mm512_cvtepi32_pd: mask-select (mask __U) between the
	 * converted vector and __W. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A)
	{
	  return (__m512d)__builtin_ia32_selectpd_512(
	      (__mmask8) __U,
	      (__v8df)_mm512_cvtepi32_pd(__A),
	      (__v8df)__W);
	}

	/* maskz form of _mm512_cvtepi32_pd: mask-select (mask __U) between the
	 * converted vector and an all-zero vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A)
	{
	  return (__m512d)__builtin_ia32_selectpd_512(
	      (__mmask8) __U,
	      (__v8df)_mm512_cvtepi32_pd(__A),
	      (__v8df)_mm512_setzero_pd());
	}

	/* Applies _mm512_cvtepi32_pd to the 256-bit value obtained from __A via
	 * _mm512_castsi512_si256. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepi32lo_pd(__m512i __A)
	{
	  __m256i __lo = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_cvtepi32_pd(__lo);
	}

	/* Mask form: applies _mm512_mask_cvtepi32_pd (pass-through __W, mask __U)
	 * to the 256-bit value obtained from __A via _mm512_castsi512_si256. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
	{
	  __m256i __lo = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, __lo);
	}

	/* 32-bit integer -> float conversion via the cvtdq2ps512 builtin.
	 * Unmasked: all-ones mask with an undefined pass-through vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_ps (__m512i __A)
	{
	  return (__m512) __builtin_ia32_cvtdq2ps512_mask(
	      (__v16si) __A,
	      (__v16sf) _mm512_undefined_ps (),
	      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* 32-bit integer -> float conversion via the cvtdq2ps512 builtin.
	 * Mask form: pass-through vector __W, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A)
	{
	  return (__m512) __builtin_ia32_cvtdq2ps512_mask(
	      (__v16si) __A, (__v16sf) __W,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* 32-bit integer -> float conversion via the cvtdq2ps512 builtin.
	 * maskz form: zero pass-through vector, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A)
	{
	  return (__m512) __builtin_ia32_cvtdq2ps512_mask(
	      (__v16si) __A,
	      (__v16sf) _mm512_setzero_ps (),
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Converts eight unsigned 32-bit integers to eight doubles using the
	 * generic __builtin_convertvector (source viewed as __v8su). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepu32_pd(__m256i __A)
	{
	  __v8su __src = (__v8su)__A;
	  return (__m512d)__builtin_convertvector(__src, __v8df);
	}

	/* Mask form of _mm512_cvtepu32_pd: mask-select (mask __U) between the
	 * converted vector and __W. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A)
	{
	  return (__m512d)__builtin_ia32_selectpd_512(
	      (__mmask8) __U,
	      (__v8df)_mm512_cvtepu32_pd(__A),
	      (__v8df)__W);
	}

	/* maskz form of _mm512_cvtepu32_pd: mask-select (mask __U) between the
	 * converted vector and an all-zero vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A)
	{
	  return (__m512d)__builtin_ia32_selectpd_512(
	      (__mmask8) __U,
	      (__v8df)_mm512_cvtepu32_pd(__A),
	      (__v8df)_mm512_setzero_pd());
	}

	/* Applies _mm512_cvtepu32_pd to the 256-bit value obtained from __A via
	 * _mm512_castsi512_si256. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtepu32lo_pd(__m512i __A)
	{
	  __m256i __lo = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_cvtepu32_pd(__lo);
	}

	/* Mask form: applies _mm512_mask_cvtepu32_pd (pass-through __W, mask __U)
	 * to the 256-bit value obtained from __A via _mm512_castsi512_si256. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A)
	{
	  __m256i __lo = _mm512_castsi512_si256(__A);
	  return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, __lo);
	}

	/* double -> float narrowing (8 lanes, 512-bit in, 256-bit out) via the
	 * cvtpd2ps512 builtin with rounding argument R.  Unmasked: all-ones mask,
	 * zero pass-through. */
	#define _mm512_cvt_roundpd_ps(A, R) __extension__ ({ \
	  (__m256)__builtin_ia32_cvtpd2ps512_mask( \
	      (__v8df)(__m512d)(A), \
	      (__v8sf)_mm256_setzero_ps(), \
	      (__mmask8)-1, (int)(R)); })

	/* Mask form: pass-through vector W, mask U. */
	#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) __extension__ ({ \
	  (__m256)__builtin_ia32_cvtpd2ps512_mask( \
	      (__v8df)(__m512d)(A), \
	      (__v8sf)(__m256)(W), \
	      (__mmask8)(U), (int)(R)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvt_roundpd_ps(U, A, R) __extension__ ({ \
	  (__m256)__builtin_ia32_cvtpd2ps512_mask( \
	      (__v8df)(__m512d)(A), \
	      (__v8sf)_mm256_setzero_ps(), \
	      (__mmask8)(U), (int)(R)); })

	/* double -> float narrowing via the cvtpd2ps512 builtin.
	 * Unmasked: all-ones mask with an undefined pass-through vector. */
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm512_cvtpd_ps (__m512d __A)
	{
	  return (__m256) __builtin_ia32_cvtpd2ps512_mask(
	      (__v8df) __A,
	      (__v8sf) _mm256_undefined_ps (),
	      (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* double -> float narrowing via the cvtpd2ps512 builtin.
	 * Mask form: pass-through vector __W, mask __U. */
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A)
	{
	  return (__m256) __builtin_ia32_cvtpd2ps512_mask(
	      (__v8df) __A, (__v8sf) __W,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* double -> float narrowing via the cvtpd2ps512 builtin.
	 * maskz form: zero pass-through vector, mask __U. */
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A)
	{
	  return (__m256) __builtin_ia32_cvtpd2ps512_mask(
	      (__v8df) __A,
	      (__v8sf) _mm256_setzero_ps (),
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Builds a 16-float vector whose lanes 0-7 are _mm512_cvtpd_ps(__A) and
	 * whose lanes 8-15 come from an all-zero 256-bit vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtpd_pslo (__m512d __A)
	{
	  __v8sf __lo = (__v8sf) _mm512_cvtpd_ps(__A);
	  __v8sf __hi = (__v8sf) _mm256_setzero_ps ();
	  return (__m512) __builtin_shufflevector(
	      __lo, __hi,
	      0, 1, 2, 3, 4, 5, 6, 7,
	      8, 9, 10, 11, 12, 13, 14, 15);
	}

	/* Mask form: lanes 0-7 are _mm512_mask_cvtpd_ps applied with the 256-bit
	 * cast of __W as pass-through and mask __U; lanes 8-15 come from an
	 * all-zero 256-bit vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A)
	{
	  __v8sf __lo = (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W),
	                                               __U, __A);
	  __v8sf __hi = (__v8sf) _mm256_setzero_ps ();
	  return (__m512) __builtin_shufflevector(
	      __lo, __hi,
	      0, 1, 2, 3, 4, 5, 6, 7,
	      8, 9, 10, 11, 12, 13, 14, 15);
	}

	/* float -> half conversion via the vcvtps2ph512 builtin; I is passed
	 * straight through as the builtin's int argument.  Unmasked: all-ones
	 * mask, undefined pass-through. */
	#define _mm512_cvt_roundps_ph(A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
	      (__v16sf)(__m512)(A), (int)(I), \
	      (__v16hi)_mm256_undefined_si256(), \
	      (__mmask16)-1); })

	/* Mask form: first argument W is the pass-through vector, second
	 * argument U is the mask. */
	#define _mm512_mask_cvt_roundps_ph(W, U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
	      (__v16sf)(__m512)(A), (int)(I), \
	      (__v16hi)(__m256i)(W), \
	      (__mmask16)(U)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvt_roundps_ph(U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
	      (__v16sf)(__m512)(A), (int)(I), \
	      (__v16hi)_mm256_setzero_si256(), \
	      (__mmask16)(U)); })

	/* float -> half conversion via the vcvtps2ph512 builtin; I is passed
	 * straight through as the builtin's int argument.  Unmasked: all-ones
	 * mask, zero pass-through. */
	#define _mm512_cvtps_ph(A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
	      (__v16sf)(__m512)(A), (int)(I), \
	      (__v16hi)_mm256_setzero_si256(), \
	      (__mmask16)-1); })

	/* Mask form: first argument W is the pass-through vector, second
	 * argument U is the mask. */
	#define _mm512_mask_cvtps_ph(W, U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
	      (__v16sf)(__m512)(A), (int)(I), \
	      (__v16hi)(__m256i)(W), \
	      (__mmask16)(U)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvtps_ph(U, A, I) __extension__ ({ \
	  (__m256i)__builtin_ia32_vcvtps2ph512_mask( \
	      (__v16sf)(__m512)(A), (int)(I), \
	      (__v16hi)_mm256_setzero_si256(), \
	      (__mmask16)(U)); })

	/* half -> float conversion via the vcvtph2ps512 builtin with explicit
	 * argument R.  Unmasked: all-ones mask, undefined pass-through. */
	#define _mm512_cvt_roundph_ps(A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vcvtph2ps512_mask( \
	      (__v16hi)(__m256i)(A), \
	      (__v16sf)_mm512_undefined_ps(), \
	      (__mmask16)-1, (int)(R)); })

	/* Mask form: pass-through vector W, mask U. */
	#define _mm512_mask_cvt_roundph_ps(W, U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vcvtph2ps512_mask( \
	      (__v16hi)(__m256i)(A), \
	      (__v16sf)(__m512)(W), \
	      (__mmask16)(U), (int)(R)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvt_roundph_ps(U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_vcvtph2ps512_mask( \
	      (__v16hi)(__m256i)(A), \
	      (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)(U), (int)(R)); })


	/* half -> float conversion via the vcvtph2ps512 builtin.
	 * Unmasked: all-ones mask with a zero pass-through vector. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtph_ps(__m256i __A)
	{
	  return (__m512) __builtin_ia32_vcvtph2ps512_mask(
	      (__v16hi) __A,
	      (__v16sf) _mm512_setzero_ps (),
	      (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* half -> float conversion via the vcvtph2ps512 builtin.
	 * Mask form: pass-through vector __W, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A)
	{
	  return (__m512) __builtin_ia32_vcvtph2ps512_mask(
	      (__v16hi) __A, (__v16sf) __W,
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* half -> float conversion via the vcvtph2ps512 builtin.
	 * maskz form: zero pass-through vector, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A)
	{
	  return (__m512) __builtin_ia32_vcvtph2ps512_mask(
	      (__v16hi) __A,
	      (__v16sf) _mm512_setzero_ps (),
	      (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* double -> signed 32-bit conversion via the cvttpd2dq512 builtin with
	 * explicit argument R.  Unmasked: all-ones mask, zero pass-through. */
	#define _mm512_cvtt_roundpd_epi32(A, R) __extension__ ({ \
	  (__m256i)__builtin_ia32_cvttpd2dq512_mask( \
	      (__v8df)(__m512d)(A), \
	      (__v8si)_mm256_setzero_si256(), \
	      (__mmask8)-1, (int)(R)); })

	/* Mask form: pass-through vector W, mask U. */
	#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) __extension__ ({ \
	  (__m256i)__builtin_ia32_cvttpd2dq512_mask( \
	      (__v8df)(__m512d)(A), \
	      (__v8si)(__m256i)(W), \
	      (__mmask8)(U), (int)(R)); })

	/* maskz form: zero pass-through vector, mask U. */
	#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) __extension__ ({ \
	  (__m256i)__builtin_ia32_cvttpd2dq512_mask( \
	      (__v8df)(__m512d)(A), \
	      (__v8si)_mm256_setzero_si256(), \
	      (__mmask8)(U), (int)(R)); })

	/* double -> signed 32-bit conversion via the cvttpd2dq512 builtin.
	 * Unmasked: all-ones mask with a zero pass-through vector. */
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm512_cvttpd_epi32(__m512d __a)
	{
	  return (__m256i)__builtin_ia32_cvttpd2dq512_mask(
	      (__v8df) __a,
	      (__v8si)_mm256_setzero_si256(),
	      (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* double -> signed 32-bit conversion via the cvttpd2dq512 builtin.
	 * Mask form: pass-through vector __W, mask __U. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  return (__m256i) __builtin_ia32_cvttpd2dq512_mask(
	      (__v8df) __A, (__v8si) __W,
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* double -> signed 32-bit conversion via the cvttpd2dq512 builtin.
	 * maskz form: zero pass-through vector, mask __U. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A)
	{
	  return (__m256i) __builtin_ia32_cvttpd2dq512_mask(
	      (__v8df) __A,
	      (__v8si) _mm256_setzero_si256 (),
	      (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_cvttps2dq512_mask call on A with a zero
	   __v16si second operand, the mask (__mmask16)-1 and R cast to int. */
	#define _mm512_cvtt_roundps_epi32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_cvttps2dq512_mask call on A with W as the
	   second operand, U as the mask and R cast to int. */
	#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_cvttps2dq512_mask call on A with a zero
	   __v16si second operand, U as the mask and R cast to int. */
	#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })

	/* Unmasked form: calls __builtin_ia32_cvttps2dq512_mask on __a with a zero
	   vector, the mask (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_cvttps_epi32(__m512 __a)
	{
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvttps2dq512_mask(
	      (__v16sf) __a, __zero, (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: forwards __A, the vector __W and the mask __U to
	   __builtin_ia32_cvttps2dq512_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16si __old = (__v16si) __W;
	  return (__m512i) __builtin_ia32_cvttps2dq512_mask (
	      __src, __old, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form with a zero second operand: forwards __A, a zero vector and
	   the mask __U to __builtin_ia32_cvttps2dq512_mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvttps2dq512_mask (
	      __src, __zero, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_cvtps2dq512_mask call on A with a zero
	   __v16si second operand, the mask (__mmask16)-1 and R cast to int. */
	#define _mm512_cvt_roundps_epi32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_cvtps2dq512_mask call on A with W as the
	   second operand, U as the mask and R cast to int. */
	#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_cvtps2dq512_mask call on A with a zero
	   __v16si second operand, U as the mask and R cast to int. */
	#define _mm512_maskz_cvt_roundps_epi32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })

	/* Unmasked form: calls __builtin_ia32_cvtps2dq512_mask on __A with the
	   result of _mm512_undefined_epi32() as second operand, the mask
	   (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtps_epi32 (__m512 __A)
	{
	  __v16si __undef = (__v16si) _mm512_undefined_epi32 ();
	  return (__m512i) __builtin_ia32_cvtps2dq512_mask (
	      (__v16sf) __A, __undef, (__mmask16) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: forwards __A, the vector __W and the mask __U to
	   __builtin_ia32_cvtps2dq512_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16si __old = (__v16si) __W;
	  return (__m512i) __builtin_ia32_cvtps2dq512_mask (
	      __src, __old, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form with a zero second operand: forwards __A, a zero vector and
	   the mask __U to __builtin_ia32_cvtps2dq512_mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_cvtps2dq512_mask (
	      __src, __zero, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_cvtpd2dq512_mask call on A with a zero
	   __v8si second operand, the mask (__mmask8)-1 and R cast to int. */
	#define _mm512_cvt_roundpd_epi32(A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_cvtpd2dq512_mask call on A with W as the
	   second operand, U as the mask and R cast to int. */
	#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)(__m256i)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_cvtpd2dq512_mask call on A with a zero
	   __v8si second operand, U as the mask and R cast to int. */
	#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)(U), (int)(R)); })

	/* Unmasked form: calls __builtin_ia32_cvtpd2dq512_mask on __A with the
	   result of _mm256_undefined_si256() as second operand, the mask
	   (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtpd_epi32 (__m512d __A)
	{
	  __v8si __undef = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2dq512_mask (
	      (__v8df) __A, __undef, (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: forwards __A, the vector __W and the mask __U to
	   __builtin_ia32_cvtpd2dq512_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8si __old = (__v8si) __W;
	  return (__m256i) __builtin_ia32_cvtpd2dq512_mask (
	      __src, __old, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form with a zero second operand: forwards __A, a zero vector and
	   the mask __U to __builtin_ia32_cvtpd2dq512_mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2dq512_mask (
	      __src, __zero, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_cvtps2udq512_mask call on A with a zero
	   __v16si second operand, the mask (__mmask16)-1 and R cast to int. */
	#define _mm512_cvt_roundps_epu32(A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_cvtps2udq512_mask call on A with W as the
	   second operand, U as the mask and R cast to int. */
	#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_cvtps2udq512_mask call on A with a zero
	   __v16si second operand, U as the mask and R cast to int. */
	#define _mm512_maskz_cvt_roundps_epu32(U, A, R) __extension__ ({ \
	(__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U), (int)(R)); })

	/* Unmasked form: calls __builtin_ia32_cvtps2udq512_mask on __A with the
	   result of _mm512_undefined_epi32() as second operand, the mask
	   (__mmask16)-1 and _MM_FROUND_CUR_DIRECTION.
	   This is an ordinary function body, not a macro, so the lines carry no
	   backslash continuations. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtps_epu32 ( __m512 __A)
	{
	  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
	                                                     (__v16si)
	                                                     _mm512_undefined_epi32 (),
	                                                     (__mmask16) -1,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: forwards __A, the vector __W and the mask __U to
	   __builtin_ia32_cvtps2udq512_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __A;
	  __v16si __old = (__v16si) __W;
	  return (__m512i) __builtin_ia32_cvtps2udq512_mask (
	      __src, __old, (__mmask16) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form with a zero second operand: forwards __A, a zero vector and
	   the mask __U to __builtin_ia32_cvtps2udq512_mask with
	   _MM_FROUND_CUR_DIRECTION.  The second operand is a plain cast of
	   _mm512_setzero_si512(); no unary +/- is applied to it. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A)
	{
	  return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,
	                                                     (__v16si)
	                                                     _mm512_setzero_si512 (),
	                                                     (__mmask16) __U ,
	                                                     _MM_FROUND_CUR_DIRECTION);
	}

	/* Expands to a __builtin_ia32_cvtpd2udq512_mask call on A with a zero
	   __v8si second operand, the mask (__mmask8)-1 and R cast to int. */
	#define _mm512_cvt_roundpd_epu32(A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)-1, (int)(R)); })

	/* Expands to a __builtin_ia32_cvtpd2udq512_mask call on A with W as the
	   second operand, U as the mask and R cast to int.  W goes through
	   (__m256i) before (__v8si), matching _mm512_mask_cvt_roundpd_epi32. */
	#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)(__m256i)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Expands to a __builtin_ia32_cvtpd2udq512_mask call on A with a zero
	   __v8si second operand, U as the mask and R cast to int. */
	#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)(U), (int)(R)); })

	/* Unmasked form: calls __builtin_ia32_cvtpd2udq512_mask on __A with the
	   result of _mm256_undefined_si256() as second operand, the mask
	   (__mmask8)-1 and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtpd_epu32 (__m512d __A)
	{
	  __v8si __undef = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2udq512_mask (
	      (__v8df) __A, __undef, (__mmask8) -1, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: forwards __A, the vector __W and the mask __U to
	   __builtin_ia32_cvtpd2udq512_mask with _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8si __old = (__v8si) __W;
	  return (__m256i) __builtin_ia32_cvtpd2udq512_mask (
	      __src, __old, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form with a zero second operand: forwards __A, a zero vector and
	   the mask __U to __builtin_ia32_cvtpd2udq512_mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __A;
	  __v8si __zero = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_cvtpd2udq512_mask (
	      __src, __zero, (__mmask8) __U, _MM_FROUND_CUR_DIRECTION);
	}

	/* Returns element 0 of __a as a double. */
	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_cvtsd_f64(__m512d __a)
	{
	  double __first = __a[0];
	  return __first;
	}

	/* Returns element 0 of __a as a float. */
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_cvtss_f32(__m512 __a)
	{
	  float __first = __a[0];
	  return __first;
	}

	/* Unpack and Interleave */

	/* Shuffles __a and __b with __builtin_shufflevector using indices
	   1, 9, 3, 11, 5, 13, 7, 15 (indices 8..15 select from __b). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_unpackhi_pd(__m512d __a, __m512d __b)
	{
	  __v8df __first = (__v8df)__a;
	  __v8df __second = (__v8df)__b;
	  return (__m512d)__builtin_shufflevector(__first, __second,
	                                          1, 9, 3, 11, 5, 13, 7, 15);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_pd(__A, __B) and __W
	   to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __res = (__v8df)_mm512_unpackhi_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __res,
	                                              (__v8df)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_pd(__A, __B) and a
	   zero vector to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __res = (__v8df)_mm512_unpackhi_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __res, __zero);
	}

	/* Shuffles __a and __b with __builtin_shufflevector using indices
	   0, 8, 2, 10, 4, 12, 6, 14 (indices 8..15 select from __b). */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_unpacklo_pd(__m512d __a, __m512d __b)
	{
	  __v8df __first = (__v8df)__a;
	  __v8df __second = (__v8df)__b;
	  return (__m512d)__builtin_shufflevector(__first, __second,
	                                          0, 8, 2, 10, 4, 12, 6, 14);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_pd(__A, __B) and __W
	   to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __res = (__v8df)_mm512_unpacklo_pd(__A, __B);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __res,
	                                              (__v8df)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_pd(__A, __B) and a
	   zero vector to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __res = (__v8df)_mm512_unpacklo_pd(__A, __B);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, __res, __zero);
	}

	/* Shuffles __a and __b with __builtin_shufflevector; each group of four
	   indices is (2, 18, 3, 19) offset by 0, 4, 8 and 12 (indices 16..31
	   select from __b). */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_unpackhi_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __first = (__v16sf)__a;
	  __v16sf __second = (__v16sf)__b;
	  return (__m512)__builtin_shufflevector(__first, __second,
	                                         2, 18, 3, 19,
	                                         6, 22, 7, 23,
	                                         10, 26, 11, 27,
	                                         14, 30, 15, 31);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_ps(__A, __B) and __W
	   to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __res = (__v16sf)_mm512_unpackhi_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __res,
	                                             (__v16sf)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_ps(__A, __B) and a
	   zero vector to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __res = (__v16sf)_mm512_unpackhi_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __res, __zero);
	}

	/* Shuffles __a and __b with __builtin_shufflevector; each group of four
	   indices is (0, 16, 1, 17) offset by 0, 4, 8 and 12 (indices 16..31
	   select from __b). */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_unpacklo_ps(__m512 __a, __m512 __b)
	{
	  __v16sf __first = (__v16sf)__a;
	  __v16sf __second = (__v16sf)__b;
	  return (__m512)__builtin_shufflevector(__first, __second,
	                                         0, 16, 1, 17,
	                                         4, 20, 5, 21,
	                                         8, 24, 9, 25,
	                                         12, 28, 13, 29);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_ps(__A, __B) and __W
	   to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __res = (__v16sf)_mm512_unpacklo_ps(__A, __B);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __res,
	                                             (__v16sf)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_ps(__A, __B) and a
	   zero vector to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __res = (__v16sf)_mm512_unpacklo_ps(__A, __B);
	  __v16sf __zero = (__v16sf)_mm512_setzero_ps();
	  return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, __res, __zero);
	}

	/* Shuffles __A and __B as __v16si with __builtin_shufflevector; each group
	   of four indices is (2, 18, 3, 19) offset by 0, 4, 8 and 12 (indices
	   16..31 select from __B). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpackhi_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __first = (__v16si)__A;
	  __v16si __second = (__v16si)__B;
	  return (__m512i)__builtin_shufflevector(__first, __second,
	                                          2, 18, 3, 19,
	                                          6, 22, 7, 23,
	                                          10, 26, 11, 27,
	                                          14, 30, 15, 31);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_epi32(__A, __B) and
	   __W to __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __res = (__v16si)_mm512_unpackhi_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __res,
	                                             (__v16si)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_epi32(__A, __B) and a
	   zero vector to __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __res = (__v16si)_mm512_unpackhi_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __res, __zero);
	}

	/* Shuffles __A and __B as __v16si with __builtin_shufflevector; each group
	   of four indices is (0, 16, 1, 17) offset by 0, 4, 8 and 12 (indices
	   16..31 select from __B). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpacklo_epi32(__m512i __A, __m512i __B)
	{
	  __v16si __first = (__v16si)__A;
	  __v16si __second = (__v16si)__B;
	  return (__m512i)__builtin_shufflevector(__first, __second,
	                                          0, 16, 1, 17,
	                                          4, 20, 5, 21,
	                                          8, 24, 9, 25,
	                                          12, 28, 13, 29);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_epi32(__A, __B) and
	   __W to __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __res = (__v16si)_mm512_unpacklo_epi32(__A, __B);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __res,
	                                             (__v16si)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_epi32(__A, __B) and a
	   zero vector to __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __res = (__v16si)_mm512_unpacklo_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, __res, __zero);
	}

	/* Shuffles __A and __B as __v8di with __builtin_shufflevector using
	   indices 1, 9, 3, 11, 5, 13, 7, 15 (indices 8..15 select from __B). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpackhi_epi64(__m512i __A, __m512i __B)
	{
	  __v8di __first = (__v8di)__A;
	  __v8di __second = (__v8di)__B;
	  return (__m512i)__builtin_shufflevector(__first, __second,
	                                          1, 9, 3, 11, 5, 13, 7, 15);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_epi64(__A, __B) and
	   __W to __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __res = (__v8di)_mm512_unpackhi_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __res,
	                                             (__v8di)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpackhi_epi64(__A, __B) and a
	   zero vector to __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __res = (__v8di)_mm512_unpackhi_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __res, __zero);
	}

	/* Shuffles __A and __B as __v8di with __builtin_shufflevector using
	   indices 0, 8, 2, 10, 4, 12, 6, 14 (indices 8..15 select from __B). */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_unpacklo_epi64 (__m512i __A, __m512i __B)
	{
	  __v8di __first = (__v8di)__A;
	  __v8di __second = (__v8di)__B;
	  return (__m512i)__builtin_shufflevector(__first, __second,
	                                          0, 8, 2, 10, 4, 12, 6, 14);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_epi64(__A, __B) and
	   __W to __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __res = (__v8di)_mm512_unpacklo_epi64(__A, __B);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __res,
	                                             (__v8di)__W);
	}

	/* Feeds the mask __U, the result of _mm512_unpacklo_epi64(__A, __B) and a
	   zero vector to __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __res = (__v8di)_mm512_unpacklo_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, __res, __zero);
	}

	/* Bit Test */

	/* Calls __builtin_ia32_ptestmd512 on __A and __B (as __v16si) with the
	   mask (__mmask16)-1 and returns its result as a __mmask16. */
	static __inline __mmask16 __DEFAULT_FN_ATTRS
	_mm512_test_epi32_mask(__m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si) __A;
	  __v16si __rhs = (__v16si) __B;
	  return (__mmask16) __builtin_ia32_ptestmd512 (__lhs, __rhs, (__mmask16) -1);
	}

	/* Calls __builtin_ia32_ptestmd512 on __A and __B (as __v16si) with the
	   caller's mask __U and returns its result as a __mmask16. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si) __A;
	  __v16si __rhs = (__v16si) __B;
	  return (__mmask16) __builtin_ia32_ptestmd512 (__lhs, __rhs, __U);
	}

	/* Calls __builtin_ia32_ptestmq512 on __A and __B (as __v8di) with the
	   mask (__mmask8)-1 and returns its result as a __mmask8. */
	static __inline __mmask8 __DEFAULT_FN_ATTRS
	_mm512_test_epi64_mask(__m512i __A, __m512i __B)
	{
	  __v8di __lhs = (__v8di) __A;
	  __v8di __rhs = (__v8di) __B;
	  return (__mmask8) __builtin_ia32_ptestmq512 (__lhs, __rhs, (__mmask8) -1);
	}

	/* Calls __builtin_ia32_ptestmq512 on __A and __B (as __v8di) with the
	   caller's mask __U and returns its result as a __mmask8. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __lhs = (__v8di) __A;
	  __v8di __rhs = (__v8di) __B;
	  return (__mmask8) __builtin_ia32_ptestmq512 (__lhs, __rhs, __U);
	}


	/* SIMD load ops */

	/* Loads through __builtin_ia32_loaddqusi512_mask from __P (viewed as
	   const int *) with a zero vector and the mask (__mmask16)-1. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_loadu_si512 (void const *__P)
	{
	  const int *__mem = (const int *) __P;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_loaddqusi512_mask (__mem, __zero,
	                                                     (__mmask16) -1);
	}

	/* Loads through __builtin_ia32_loaddqusi512_mask from __P (viewed as
	   const int *) with __W as the vector operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P)
	{
	  const int *__mem = (const int *) __P;
	  return (__m512i) __builtin_ia32_loaddqusi512_mask (__mem, (__v16si) __W,
	                                                     (__mmask16) __U);
	}


	/* Loads through __builtin_ia32_loaddqusi512_mask from __P (viewed as
	   const int *) with a zero vector operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P)
	{
	  const int *__mem = (const int *)__P;
	  __v16si __zero = (__v16si) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_loaddqusi512_mask (__mem, __zero,
	                                                     (__mmask16) __U);
	}

	/* Loads through __builtin_ia32_loaddqudi512_mask from __P (viewed as
	   const long long *) with __W as the vector operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P)
	{
	  const long long *__mem = (const long long *) __P;
	  return (__m512i) __builtin_ia32_loaddqudi512_mask (__mem, (__v8di) __W,
	                                                     (__mmask8) __U);
	}

	/* Loads through __builtin_ia32_loaddqudi512_mask from __P (viewed as
	   const long long *) with a zero vector operand and __U as the mask. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P)
	{
	  const long long *__mem = (const long long *)__P;
	  __v8di __zero = (__v8di) _mm512_setzero_si512 ();
	  return (__m512i) __builtin_ia32_loaddqudi512_mask (__mem, __zero,
	                                                     (__mmask8) __U);
	}

	/* Loads through __builtin_ia32_loadups512_mask from __P (viewed as
	   const float *) with __W as the vector operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P)
	{
	  const float *__mem = (const float *) __P;
	  return (__m512) __builtin_ia32_loadups512_mask (__mem, (__v16sf) __W,
	                                                  (__mmask16) __U);
	}

	/* Loads through __builtin_ia32_loadups512_mask from __P (viewed as
	   const float *) with a zero vector operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P)
	{
	  const float *__mem = (const float *)__P;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_loadups512_mask (__mem, __zero,
	                                                  (__mmask16) __U);
	}

	/* Loads through __builtin_ia32_loadupd512_mask from __P (viewed as
	   const double *) with __W as the vector operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P)
	{
	  const double *__mem = (const double *) __P;
	  return (__m512d) __builtin_ia32_loadupd512_mask (__mem, (__v8df) __W,
	                                                   (__mmask8) __U);
	}

	/* Loads through __builtin_ia32_loadupd512_mask from __P (viewed as
	   const double *) with a zero vector operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P)
	{
	  const double *__mem = (const double *)__P;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_loadupd512_mask (__mem, __zero,
	                                                   (__mmask8) __U);
	}

	/* Reads a __m512d from __p through a packed, may_alias wrapper struct, so
	   the access carries no alignment or aliasing assumption about __p.
	   The pointer is only read, so the cast keeps the const qualifier of the
	   parameter instead of silently dropping it. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_loadu_pd(void const *__p)
	{
	  struct __loadu_pd {
	    __m512d __v;
	  } __attribute__((__packed__, __may_alias__));
	  return ((const struct __loadu_pd*)__p)->__v;
	}

	/* Reads a __m512 from __p through a packed, may_alias wrapper struct, so
	   the access carries no alignment or aliasing assumption about __p.
	   The pointer is only read, so the cast keeps the const qualifier of the
	   parameter instead of silently dropping it. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_loadu_ps(void const *__p)
	{
	  struct __loadu_ps {
	    __m512 __v;
	  } __attribute__((__packed__, __may_alias__));
	  return ((const struct __loadu_ps*)__p)->__v;
	}

	/* Loads through __builtin_ia32_loadaps512_mask from __p (viewed as
	   const __v16sf *) with a zero vector and the mask (__mmask16)-1. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_load_ps(void const *__p)
	{
	  const __v16sf *__mem = (const __v16sf *)__p;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_loadaps512_mask (__mem, __zero,
	                                                  (__mmask16) -1);
	}

	/* Loads through __builtin_ia32_loadaps512_mask from __P (viewed as
	   const __v16sf *) with __W as the vector operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P)
	{
	  const __v16sf *__mem = (const __v16sf *) __P;
	  return (__m512) __builtin_ia32_loadaps512_mask (__mem, (__v16sf) __W,
	                                                  (__mmask16) __U);
	}

	/* Loads through __builtin_ia32_loadaps512_mask from __P (viewed as
	   const __v16sf *) with a zero vector operand and __U as the mask. */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_load_ps(__mmask16 __U, void const *__P)
	{
	  const __v16sf *__mem = (const __v16sf *)__P;
	  __v16sf __zero = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_loadaps512_mask (__mem, __zero,
	                                                  (__mmask16) __U);
	}

	/* Loads through __builtin_ia32_loadapd512_mask from __p (viewed as
	   const __v8df *) with a zero vector and the mask (__mmask8)-1. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_load_pd(void const *__p)
	{
	  const __v8df *__mem = (const __v8df *)__p;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_loadapd512_mask (__mem, __zero,
	                                                   (__mmask8) -1);
	}

	/* Loads through __builtin_ia32_loadapd512_mask from __P (viewed as
	   const __v8df *) with __W as the vector operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P)
	{
	  const __v8df *__mem = (const __v8df *) __P;
	  return (__m512d) __builtin_ia32_loadapd512_mask (__mem, (__v8df) __W,
	                                                   (__mmask8) __U);
	}

	/* Loads through __builtin_ia32_loadapd512_mask from __P (viewed as
	   const __v8df *) with a zero vector operand and __U as the mask. */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_load_pd(__mmask8 __U, void const *__P)
	{
	  const __v8df *__mem = (const __v8df *)__P;
	  __v8df __zero = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_loadapd512_mask (__mem, __zero,
	                                                   (__mmask8) __U);
	}

	/* Returns the __m512i stored at __P by dereferencing it as a
	   const __m512i pointer (the pointer itself is never converted to a
	   vector).  NOTE(review): a plain __m512i dereference presumably requires
	   __P to be suitably aligned for __m512i - confirm against the type's
	   declaration. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_load_si512 (void const *__P)
	{
	  return *(const __m512i *) __P;
	}

	/* Returns the __m512i stored at __P by dereferencing it as a
	   const __m512i pointer (the pointer itself is never converted to a
	   vector).  NOTE(review): a plain __m512i dereference presumably requires
	   __P to be suitably aligned for __m512i - confirm against the type's
	   declaration. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_load_epi32 (void const *__P)
	{
	  return *(const __m512i *) __P;
	}

	/* Returns the __m512i stored at __P by dereferencing it as a
	   const __m512i pointer (the pointer itself is never converted to a
	   vector).  NOTE(review): a plain __m512i dereference presumably requires
	   __P to be suitably aligned for __m512i - confirm against the type's
	   declaration. */
	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_load_epi64 (void const *__P)
	{
	  return *(const __m512i *) __P;
	}

	/* SIMD store ops */

	/* Stores __A (as __v8di) to __P (viewed as long long *) through
	   __builtin_ia32_storedqudi512_mask with __U as the mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A)
	{
	  long long *__dst = (long long *)__P;
	  __builtin_ia32_storedqudi512_mask (__dst, (__v8di) __A, (__mmask8) __U);
	}

	/* Stores __A (as __v16si) to __P (viewed as int *) through
	   __builtin_ia32_storedqusi512_mask with the mask (__mmask16)-1. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_storeu_si512 (void *__P, __m512i __A)
	{
	  int *__dst = (int *) __P;
	  __builtin_ia32_storedqusi512_mask (__dst, (__v16si) __A, (__mmask16) -1);
	}

	/* Stores __A (as __v16si) to __P (viewed as int *) through
	   __builtin_ia32_storedqusi512_mask with __U as the mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A)
	{
	  int *__dst = (int *)__P;
	  __builtin_ia32_storedqusi512_mask (__dst, (__v16si) __A, (__mmask16) __U);
	}

	/* Stores __A (as __v8df) to __P (viewed as double *) through
	   __builtin_ia32_storeupd512_mask with __U as the mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A)
	{
	  double *__dst = (double *)__P;
	  __builtin_ia32_storeupd512_mask (__dst, (__v8df) __A, (__mmask8) __U);
	}

	/* Stores __A (as __v8df) to __P (viewed as double *) through
	   __builtin_ia32_storeupd512_mask with the mask (__mmask8)-1. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_storeu_pd(void *__P, __m512d __A)
	{
	  double *__dst = (double *)__P;
	  __builtin_ia32_storeupd512_mask(__dst, (__v8df)__A, (__mmask8)-1);
	}

	/* Stores __A (as __v16sf) to __P (viewed as float *) through
	   __builtin_ia32_storeups512_mask with __U as the mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A)
	{
	  float *__dst = (float *)__P;
	  __builtin_ia32_storeups512_mask (__dst, (__v16sf) __A, (__mmask16) __U);
	}

	/* Stores __A (as __v16sf) to __P (viewed as float *) through
	   __builtin_ia32_storeups512_mask with the mask (__mmask16)-1. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_storeu_ps(void *__P, __m512 __A)
	{
	  float *__dst = (float *)__P;
	  __builtin_ia32_storeups512_mask(__dst, (__v16sf)__A, (__mmask16)-1);
	}

	/* Stores __A (as __v8df) to __P (viewed as __v8df *) through
	   __builtin_ia32_storeapd512_mask with __U as the mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A)
	{
	  __v8df *__dst = (__v8df *)__P;
	  __builtin_ia32_storeapd512_mask (__dst, (__v8df) __A, (__mmask8) __U);
	}

	/* Writes __A to the memory __P points at by assigning through a
	   __m512d pointer; a cast of the pointer value itself is not an lvalue
	   and cannot be assigned to.  NOTE(review): a plain __m512d store
	   presumably requires __P to be suitably aligned for __m512d - confirm
	   against the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_pd(void *__P, __m512d __A)
	{
	  *(__m512d *)__P = __A;
	}

	/* Stores __A (as __v16sf) to __P (viewed as __v16sf *) through
	   __builtin_ia32_storeaps512_mask with __U as the mask. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A)
	{
	  __v16sf *__dst = (__v16sf *)__P;
	  __builtin_ia32_storeaps512_mask (__dst, (__v16sf) __A, (__mmask16) __U);
	}

	/* Writes __A to the memory __P points at by assigning through a
	   __m512 pointer; a cast of the pointer value itself is not an lvalue
	   and cannot be assigned to.  NOTE(review): a plain __m512 store
	   presumably requires __P to be suitably aligned for __m512 - confirm
	   against the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_ps(void *__P, __m512 __A)
	{
	  *(__m512 *)__P = __A;
	}

	/* Writes __A to the memory __P points at by assigning through a
	   __m512i pointer; a cast of the pointer value itself is not an lvalue
	   and cannot be assigned to.  NOTE(review): a plain __m512i store
	   presumably requires __P to be suitably aligned for __m512i - confirm
	   against the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_si512 (void *__P, __m512i __A)
	{
	  *(__m512i *) __P = __A;
	}

	/* Writes __A to the memory __P points at by assigning through a
	   __m512i pointer; a cast of the pointer value itself is not an lvalue
	   and cannot be assigned to.  NOTE(review): a plain __m512i store
	   presumably requires __P to be suitably aligned for __m512i - confirm
	   against the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_epi32 (void *__P, __m512i __A)
	{
	  *(__m512i *) __P = __A;
	}

	/* Writes __A to the memory __P points at by assigning through a
	   __m512i pointer; a cast of the pointer value itself is not an lvalue
	   and cannot be assigned to.  NOTE(review): a plain __m512i store
	   presumably requires __P to be suitably aligned for __m512i - confirm
	   against the type's declaration. */
	static __inline void __DEFAULT_FN_ATTRS
	_mm512_store_epi64 (void *__P, __m512i __A)
	{
	  *(__m512i *) __P = __A;
	}

	/* Mask ops */

	/* Returns the result of __builtin_ia32_knothi applied to the mask __M. */
	static __inline __mmask16 __DEFAULT_FN_ATTRS
	_mm512_knot(__mmask16 __M)
	{
	  __mmask16 __res = __builtin_ia32_knothi(__M);
	  return __res;
	}

	/* Integer compare */

	/* Calls __builtin_ia32_pcmpeqd512_mask on __a and __b (as __v16si) with
	   the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpeq_epi32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_pcmpeqd512_mask(__lhs, __rhs, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_pcmpeqd512_mask on __a and __b (as __v16si) with
	   the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpeq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_pcmpeqd512_mask(__lhs, __rhs, __u);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 0 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpeq_epu32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 0, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 0 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpeq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 0, __u);
	}

	/* Calls __builtin_ia32_pcmpeqq512_mask on __a and __b (as __v8di) with
	   the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpeq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_pcmpeqq512_mask(__lhs, __rhs, __u);
	}

	/* Calls __builtin_ia32_pcmpeqq512_mask on __a and __b (as __v8di) with
	   the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpeq_epi64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_pcmpeqq512_mask(__lhs, __rhs, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 0 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpeq_epu64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 0, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 0 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpeq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 0, __u);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 5 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpge_epi32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 5, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 5 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpge_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 5, __u);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 5 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpge_epu32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 5, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 5 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpge_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 5, __u);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 5 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpge_epi64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 5, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 5 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpge_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 5, __u);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 5 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpge_epu64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 5, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 5 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpge_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 5, __u);
	}

	/* Calls __builtin_ia32_pcmpgtd512_mask on __a and __b (as __v16si) with
	   the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpgt_epi32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_pcmpgtd512_mask(__lhs, __rhs, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_pcmpgtd512_mask on __a and __b (as __v16si) with
	   the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpgt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_pcmpgtd512_mask(__lhs, __rhs, __u);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 6 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpgt_epu32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 6, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 6 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpgt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 6, __u);
	}

	/* Calls __builtin_ia32_pcmpgtq512_mask on __a and __b (as __v8di) with
	   the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpgt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_pcmpgtq512_mask(__lhs, __rhs, __u);
	}

	/* Calls __builtin_ia32_pcmpgtq512_mask on __a and __b (as __v8di) with
	   the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpgt_epi64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_pcmpgtq512_mask(__lhs, __rhs, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 6 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpgt_epu64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 6, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 6 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpgt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 6, __u);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 2 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmple_epi32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 2, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 2 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmple_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 2, __u);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 2 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmple_epu32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 2, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 2 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmple_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 2, __u);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 2 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmple_epi64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 2, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 2 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmple_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 2, __u);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 2 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmple_epu64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 2, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 2 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmple_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 2, __u);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 1 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmplt_epi32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 1, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 1 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmplt_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 1, __u);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 1 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmplt_epu32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 1, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 1 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmplt_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 1, __u);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 1 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmplt_epi64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 1, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 1 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmplt_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 1, __u);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 1 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmplt_epu64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 1, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 1 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmplt_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 1, __u);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 4 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpneq_epi32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 4, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_cmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 4 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpneq_epi32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_cmpd512_mask(__lhs, __rhs, 4, __u);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 4 and the mask (__mmask16)-1. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_cmpneq_epu32_mask(__m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 4, (__mmask16)-1);
	}

	/* Calls __builtin_ia32_ucmpd512_mask on __a and __b (as __v16si) with
	   comparison immediate 4 and the caller's mask __u. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpneq_epu32_mask(__mmask16 __u, __m512i __a, __m512i __b) {
	  __v16si __lhs = (__v16si)__a;
	  __v16si __rhs = (__v16si)__b;
	  return (__mmask16)__builtin_ia32_ucmpd512_mask(__lhs, __rhs, 4, __u);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 4 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpneq_epi64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 4, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_cmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 4 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpneq_epi64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_cmpq512_mask(__lhs, __rhs, 4, __u);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 4 and the mask (__mmask8)-1. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_cmpneq_epu64_mask(__m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 4, (__mmask8)-1);
	}

	/* Calls __builtin_ia32_ucmpq512_mask on __a and __b (as __v8di) with
	   comparison immediate 4 and the caller's mask __u. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_cmpneq_epu64_mask(__mmask8 __u, __m512i __a, __m512i __b) {
	  __v8di __lhs = (__v8di)__a;
	  __v8di __rhs = (__v8di)__b;
	  return (__mmask8)__builtin_ia32_ucmpq512_mask(__lhs, __rhs, 4, __u);
	}

	/* Widens the 16 byte elements of __A to __v16si with
	   __builtin_convertvector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi8_epi32(__m128i __A)
	{
	  /* The extension must always be signed; __v16qi is built on plain char,
	     whose signedness varies, so the source is viewed as __v16qs. */
	  __v16qs __bytes = (__v16qs)__A;
	  return (__m512i)__builtin_convertvector(__bytes, __v16si);
	}

	/* Feeds the mask __U, the result of _mm512_cvtepi8_epi32(__A) and __W to
	   __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
	{
	  __v16si __wide = (__v16si)_mm512_cvtepi8_epi32(__A);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __wide,
	                                             (__v16si)__W);
	}

	/* Feeds the mask __U, the result of _mm512_cvtepi8_epi32(__A) and a zero
	   vector to __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
	{
	  __v16si __wide = (__v16si)_mm512_cvtepi8_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __wide, __zero);
	}

	/* Takes elements 0..7 of __A (viewed as __v16qs) with
	   __builtin_shufflevector and widens them to __v8di with
	   __builtin_convertvector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi8_epi64(__m128i __A)
	{
	  /* The extension must always be signed; __v16qi is built on plain char,
	     whose signedness varies, so the source is viewed as __v16qs. */
	  __v16qs __bytes = (__v16qs)__A;
	  return (__m512i)__builtin_convertvector(
	      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
	      __v8di);
	}

	/* Feeds the mask __U, the result of _mm512_cvtepi8_epi64(__A) and __W to
	   __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __wide = (__v8di)_mm512_cvtepi8_epi64(__A);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __wide,
	                                             (__v8di)__W);
	}

	/* Feeds the mask __U, the result of _mm512_cvtepi8_epi64(__A) and a zero
	   vector to __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __wide = (__v8di)_mm512_cvtepi8_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512 ();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __wide, __zero);
	}

	/* Widens __X (viewed as __v8si) to __v8di with __builtin_convertvector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_epi64(__m256i __X)
	{
	  __v8si __narrow = (__v8si)__X;
	  return (__m512i)__builtin_convertvector(__narrow, __v8di);
	}

	/* Feeds the mask __U, the result of _mm512_cvtepi32_epi64(__X) and __W to
	   __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
	{
	  __v8di __wide = (__v8di)_mm512_cvtepi32_epi64(__X);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __wide,
	                                             (__v8di)__W);
	}

	/* Feeds the mask __U, the result of _mm512_cvtepi32_epi64(__X) and a zero
	   vector to __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X)
	{
	  __v8di __wide = (__v8di)_mm512_cvtepi32_epi64(__X);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __wide, __zero);
	}

	/* Convert __A, viewed as __v16hi, element-wise to a __v16si vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi16_epi32(__m256i __A)
	{
	  __v16hi __words = (__v16hi)__A;
	  return (__m512i)__builtin_convertvector(__words, __v16si);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
	{
	  __v16si __widened = (__v16si)_mm512_cvtepi16_epi32(__A);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A)
	{
	  __v16si __widened = (__v16si)_mm512_cvtepi16_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __widened,
	                                             __zero);
	}

	/* Convert __A, viewed as __v8hi, element-wise to a __v8di vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepi16_epi64(__m128i __A)
	{
	  __v8hi __words = (__v8hi)__A;
	  return (__m512i)__builtin_convertvector(__words, __v8di);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepi16_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepi16_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __zero);
	}

	/* Convert __A, viewed through the __v16qu type (presumably unsigned char
	   elements, which would make this a zero extension -- confirm against the
	   typedef), element-wise to a __v16si vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu8_epi32(__m128i __A)
	{
	  __v16qu __bytes = (__v16qu)__A;
	  return (__m512i)__builtin_convertvector(__bytes, __v16si);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
	{
	  __v16si __widened = (__v16si)_mm512_cvtepu8_epi32(__A);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A)
	{
	  __v16si __widened = (__v16si)_mm512_cvtepu8_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __widened,
	                                             __zero);
	}

	/* Convert elements 0..7 of __A, viewed through the __v16qu type, to a
	   __v8di vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu8_epi64(__m128i __A)
	{
	  __v16qu __bytes = (__v16qu)__A;
	  /* Pick out the first eight elements before converting. */
	  return (__m512i)__builtin_convertvector(
	      __builtin_shufflevector(__bytes, __bytes, 0, 1, 2, 3, 4, 5, 6, 7),
	      __v8di);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepu8_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepu8_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __zero);
	}

	/* Convert __X, viewed through the __v8su type, element-wise to a __v8di
	   vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu32_epi64(__m256i __X)
	{
	  __v8su __dwords = (__v8su)__X;
	  return (__m512i)__builtin_convertvector(__dwords, __v8di);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepu32_epi64(__X);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepu32_epi64(__X);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __zero);
	}

	/* Convert __A, viewed through the __v16hu type, element-wise to a __v16si
	   vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu16_epi32(__m256i __A)
	{
	  __v16hu __words = (__v16hu)__A;
	  return (__m512i)__builtin_convertvector(__words, __v16si);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A)
	{
	  __v16si __widened = (__v16si)_mm512_cvtepu16_epi32(__A);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A)
	{
	  __v16si __widened = (__v16si)_mm512_cvtepu16_epi32(__A);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __widened,
	                                             __zero);
	}

	/* Convert __A, viewed through the __v8hu type, element-wise to a __v8di
	   vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_cvtepu16_epi64(__m128i __A)
	{
	  __v8hu __words = (__v8hu)__A;
	  return (__m512i)__builtin_convertvector(__words, __v8di);
	}

	/* Masked form: the converted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepu16_epi64(__A);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A)
	{
	  __v8di __widened = (__v8di)_mm512_cvtepu16_epi64(__A);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __widened,
	                                             __zero);
	}

	/* Unmasked form: forwards __A and __B to __builtin_ia32_prorvd512_mask with
	   an all-zero pass-through vector and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rorv_epi32 (__m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B,
	                                                __passthru, (__mmask16)-1);
	}

	/* Masked form: __W is the pass-through operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B,
	                                                __passthru, (__mmask16)__U);
	}

	/* Zero-masked form: all-zero pass-through operand with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prorvd512_mask((__v16si)__A, (__v16si)__B,
	                                                __passthru, (__mmask16)__U);
	}

	/* Unmasked form: forwards __A and __B to __builtin_ia32_prorvq512_mask with
	   an all-zero pass-through vector and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rorv_epi64 (__m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prorvq512_mask((__v8di)__A, (__v8di)__B,
	                                                __passthru, (__mmask8)-1);
	}

	/* Masked form: __W is the pass-through operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_prorvq512_mask((__v8di)__A, (__v8di)__B,
	                                                __passthru, (__mmask8)__U);
	}

	/* Zero-masked form: all-zero pass-through operand with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prorvq512_mask((__v8di)__A, (__v8di)__B,
	                                                __passthru, (__mmask8)__U);
	}



	/* Integer comparison macros. Each expands to a statement expression that
	   calls a cmp/ucmp builtin with operands a and b, the predicate p cast to
	   int, and a mask; the result is cast to the matching __mmask type. These are
	   macros rather than functions, presumably because p must reach the builtin
	   as a constant expression -- confirm against the builtin definitions. */

	/* Unmasked forms: the mask argument is all ones. */
	#define _mm512_cmp_epi32_mask(a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)-1); })

	#define _mm512_cmp_epu32_mask(a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)-1); })

	#define _mm512_cmp_epi64_mask(a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)-1); })

	#define _mm512_cmp_epu64_mask(a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)-1); })

	/* Masked forms: the caller-supplied mask m replaces the all-ones mask. */
	#define _mm512_mask_cmp_epi32_mask(m, a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)(m)); })

	#define _mm512_mask_cmp_epu32_mask(m, a, b, p) __extension__ ({ \
	(__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \
	(__v16si)(__m512i)(b), (int)(p), \
	(__mmask16)(m)); })

	#define _mm512_mask_cmp_epi64_mask(m, a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)(m)); })

	#define _mm512_mask_cmp_epu64_mask(m, a, b, p) __extension__ ({ \
	(__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \
	(__v8di)(__m512i)(b), (int)(p), \
	(__mmask8)(m)); })

	/* Rotate-by-count macros built on the prold/prolq builtins. Each passes the
	   vector a, the count b cast to int, a pass-through vector and a mask.
	   Unmasked forms use a zero vector and an all-ones mask; _mask_ forms use W
	   and U; _maskz_ forms use a zero vector and U. */
	#define _mm512_rol_epi32(a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1); })

	#define _mm512_mask_rol_epi32(W, U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U)); })

	#define _mm512_maskz_rol_epi32(U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prold512_mask((__v16si)(__m512i)(a), (int)(b), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U)); })

	/* Same three forms over __v8di via the prolq builtin. */
	#define _mm512_rol_epi64(a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)-1); })

	#define _mm512_mask_rol_epi64(W, U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
	(__v8di)(__m512i)(W), (__mmask8)(U)); })

	#define _mm512_maskz_rol_epi64(U, a, b) __extension__ ({ \
	(__m512i)__builtin_ia32_prolq512_mask((__v8di)(__m512i)(a), (int)(b), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)(U)); })
	/* Unmasked form: forwards __A and __B to __builtin_ia32_prolvd512_mask with
	   an all-zero pass-through vector and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rolv_epi32 (__m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B,
	                                                __passthru, (__mmask16)-1);
	}

	/* Masked form: __W is the pass-through operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B,
	                                                __passthru, (__mmask16)__U);
	}

	/* Zero-masked form: all-zero pass-through operand with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __passthru = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prolvd512_mask((__v16si)__A, (__v16si)__B,
	                                                __passthru, (__mmask16)__U);
	}

	/* Unmasked form: forwards __A and __B to __builtin_ia32_prolvq512_mask with
	   an all-zero pass-through vector and an all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_rolv_epi64 (__m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prolvq512_mask((__v8di)__A, (__v8di)__B,
	                                                __passthru, (__mmask8)-1);
	}

	/* Masked form: __W is the pass-through operand and __U the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_prolvq512_mask((__v8di)__A, (__v8di)__B,
	                                                __passthru, (__mmask8)__U);
	}

	/* Zero-masked form: all-zero pass-through operand with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __passthru = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_prolvq512_mask((__v8di)__A, (__v8di)__B,
	                                                __passthru, (__mmask8)__U);
	}

	/* Rotate-by-count macros built on the prord/prorq builtins. Each passes the
	   vector A, the count B cast to int, a pass-through vector and a mask.
	   Unmasked forms use a zero vector and an all-ones mask; _mask_ forms use W
	   and U; _maskz_ forms use a zero vector and U. */
	#define _mm512_ror_epi32(A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1); })

	#define _mm512_mask_ror_epi32(W, U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U)); })

	#define _mm512_maskz_ror_epi32(U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prord512_mask((__v16si)(__m512i)(A), (int)(B), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U)); })

	/* Same three forms over __v8di via the prorq builtin. */
	#define _mm512_ror_epi64(A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)-1); })

	#define _mm512_mask_ror_epi64(W, U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
	(__v8di)(__m512i)(W), (__mmask8)(U)); })

	#define _mm512_maskz_ror_epi64(U, A, B) __extension__ ({ \
	(__m512i)__builtin_ia32_prorq512_mask((__v8di)(__m512i)(A), (int)(B), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)(U)); })

	/* Forwards __A (as __v16si) and the count __B to __builtin_ia32_pslldi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_slli_epi32(__m512i __A, int __B)
	{
	  __v16si __src = (__v16si)__A;
	  return (__m512i)__builtin_ia32_pslldi512(__src, __B);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_slli_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_slli_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __A (as __v8di) and the count __B to __builtin_ia32_psllqi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_slli_epi64(__m512i __A, int __B)
	{
	  __v8di __src = (__v8di)__A;
	  return (__m512i)__builtin_ia32_psllqi512(__src, __B);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_slli_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_slli_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __A (as __v16si) and the count __B to __builtin_ia32_psrldi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srli_epi32(__m512i __A, int __B)
	{
	  __v16si __src = (__v16si)__A;
	  return (__m512i)__builtin_ia32_psrldi512(__src, __B);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_srli_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __shifted = (__v16si)_mm512_srli_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __A (as __v8di) and the count __B to __builtin_ia32_psrlqi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srli_epi64(__m512i __A, int __B)
	{
	  __v8di __src = (__v8di)__A;
	  return (__m512i)__builtin_ia32_psrlqi512(__src, __B);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_srli_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __shifted = (__v8di)_mm512_srli_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Masked load: calls __builtin_ia32_movdqa32load512_mask on __P (treated as
	   a pointer to const __v16si) with __W as the pass-through operand and __U
	   as the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P)
	{
	  const __v16si *__src = (const __v16si *)__P;
	  return (__m512i)__builtin_ia32_movdqa32load512_mask(__src, (__v16si)__W,
	                                                      (__mmask16)__U);
	}

	/* Zero-masked load: same builtin, with an all-zero pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P)
	{
	  const __v16si *__src = (const __v16si *)__P;
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_movdqa32load512_mask(__src, __zero,
	                                                      (__mmask16)__U);
	}

	/* Masked store: calls __builtin_ia32_movdqa32store512_mask with destination
	   __P (treated as a pointer to __v16si), value __A and mask __U. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A)
	{
	  __v16si *__dst = (__v16si *)__P;
	  __builtin_ia32_movdqa32store512_mask(__dst, (__v16si)__A, (__mmask16)__U);
	}

	/* Masked move: __A and __W (both as __v16si) are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
	{
	  __v16si __value = (__v16si)__A;
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __value,
	                                             __passthru);
	}

	/* Zero-masked move: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A)
	{
	  __v16si __value = (__v16si)__A;
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __value, __zero);
	}

	/* Masked move over __v8di using __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
	{
	  __v8di __value = (__v8di)__A;
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __value,
	                                             __passthru);
	}

	/* Zero-masked move over __v8di: all-zero vector in place of __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A)
	{
	  __v8di __value = (__v8di)__A;
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __value, __zero);
	}

	/* Masked load: calls __builtin_ia32_movdqa64load512_mask on __P (treated as
	   a pointer to const __v8di) with __W as the pass-through operand and __U as
	   the mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P)
	{
	  const __v8di *__src = (const __v8di *)__P;
	  return (__m512i)__builtin_ia32_movdqa64load512_mask(__src, (__v8di)__W,
	                                                      (__mmask8)__U);
	}

	/* Zero-masked load: same builtin, with an all-zero pass-through operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P)
	{
	  const __v8di *__src = (const __v8di *)__P;
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_movdqa64load512_mask(__src, __zero,
	                                                      (__mmask8)__U);
	}

	/* Masked store: calls __builtin_ia32_movdqa64store512_mask with destination
	   __P (treated as a pointer to __v8di), value __A and mask __U. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A)
	{
	  __v8di *__dst = (__v8di *)__P;
	  __builtin_ia32_movdqa64store512_mask(__dst, (__v8di)__A, (__mmask8)__U);
	}

	/* Builds a vector from elements 0, 0, 2, 2, 4, 4, 6, 6 of __A, i.e. each
	   even-indexed element is repeated into the following odd slot. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_movedup_pd (__m512d __A)
	{
	  __v8df __src = (__v8df)__A;
	  return (__m512d)__builtin_shufflevector(__src, __src,
	                                          0, 0, 2, 2, 4, 4, 6, 6);
	}

	/* Masked form: the shuffled vector and __W are handed to
	   __builtin_ia32_selectpd_512 together with mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __dup = (__v8df)_mm512_movedup_pd(__A);
	  __v8df __passthru = (__v8df)__W;
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __dup,
	                                              __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __dup = (__v8df)_mm512_movedup_pd(__A);
	  __v8df __zero = (__v8df)_mm512_setzero_pd();
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __dup, __zero);
	}

	/* Packed double-precision fixupimm macros. Each forwards A and B (as __v8df),
	   C (as __v8di), the immediate imm, a mask and a final int argument to a
	   fixupimmpd512 builtin. The _round_ forms pass R as that final argument;
	   the others pass _MM_FROUND_CUR_DIRECTION. Unmasked forms use an all-ones
	   mask, masked forms pass U. */
	#define _mm512_fixupimm_round_pd(A, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)-1, (int)(R)); })

	#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm512_fixupimm_pd(A, B, C, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* The _maskz_ forms call the separate ..._maskz builtin; note that U comes
	   first in their parameter lists. */
	#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), \
	(int)(imm), (__mmask8)(U), \
	(int)(R)); })

	#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8di)(__m512i)(C), \
	(int)(imm), (__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Packed single-precision fixupimm macros. Each forwards A and B (as
	   __v16sf), C (as __v16si), the immediate imm, a mask and a final int
	   argument to a fixupimmps512 builtin. The _round_ forms pass R as that
	   final argument; the others pass _MM_FROUND_CUR_DIRECTION. Unmasked forms
	   use an all-ones mask, masked forms pass U. */
	#define _mm512_fixupimm_round_ps(A, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)-1, (int)(R)); })

	#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)(U), (int)(R)); })

	#define _mm512_fixupimm_ps(A, B, C, imm) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* The _maskz_ forms call the separate ..._maskz builtin; note that U comes
	   first in their parameter lists. */
	#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), \
	(int)(imm), (__mmask16)(U), \
	(int)(R)); })

	#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) __extension__ ({ \
	(__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16si)(__m512i)(C), \
	(int)(imm), (__mmask16)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Scalar double-precision fixupimm macros. Each forwards A and B (as
	   __v2df), C (as __v2di), the immediate imm, a mask and a final int argument
	   to a fixupimmsd builtin. The _round_ forms pass R as that final argument;
	   the others pass _MM_FROUND_CUR_DIRECTION. Unmasked forms use an all-ones
	   mask, masked forms pass U. */
	#define _mm_fixupimm_round_sd(A, B, C, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_fixupimm_sd(A, B, C, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_fixupimm_sd(A, U, B, C, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* The _maskz_ forms call the separate ..._maskz builtin; note that U comes
	   first in their parameter lists. */
	#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2di)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Scalar single-precision fixupimm macros. Each forwards A and B (as
	   __v4sf), C (as __v4si), the immediate imm, a mask and a final int argument
	   to a fixupimmss builtin. The _round_ forms pass R as that final argument;
	   the others pass _MM_FROUND_CUR_DIRECTION. Unmasked forms use an all-ones
	   mask, masked forms pass U. */
	#define _mm_fixupimm_round_ss(A, B, C, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_fixupimm_ss(A, B, C, imm) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_fixupimm_ss(A, U, B, C, imm) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* The _maskz_ forms call the separate ..._maskz builtin; note that U comes
	   first in their parameter lists. */
	#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) __extension__ ({ \
	(__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4si)(__m128i)(C), (int)(imm), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Unmasked scalar double getexp with explicit rounding argument R: calls the
	   getexpsd128 builtin with a zero pass-through vector and an all-ones
	   mask. */
	#define _mm_getexp_round_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(R)); })


	/* Unmasked form: calls __builtin_ia32_getexpsd128_round_mask with a zero
	   pass-through vector, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_getexp_sd (__m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_getexpsd128_round_mask(
	      (__v2df)__A, (__v2df)__B, __passthru, (__mmask8)-1,
	      _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the pass-through operand and __U the mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df)__W;
	  return (__m128d)__builtin_ia32_getexpsd128_round_mask(
	      (__v2df)__A, (__v2df)__B, __passthru, (__mmask8)__U,
	      _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked scalar double getexp with explicit rounding argument R: W is the
	   pass-through operand and U the mask. */
	#define _mm_mask_getexp_round_sd(W, U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked form: calls __builtin_ia32_getexpsd128_round_mask with a zero
	   pass-through vector, mask __U and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df)_mm_setzero_pd();
	  return (__m128d)__builtin_ia32_getexpsd128_round_mask(
	      (__v2df)__A, (__v2df)__B, __passthru, (__mmask8)__U,
	      _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar double getexp with explicit rounding argument R: zero
	   pass-through vector with mask U. */
	#define _mm_maskz_getexp_round_sd(U, A, B, R) __extension__ ({\
	(__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Unmasked scalar single getexp with explicit rounding argument R: calls the
	   getexpss128 builtin with a zero pass-through vector and an all-ones
	   mask. */
	#define _mm_getexp_round_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(R)); })

	/* Unmasked form: calls __builtin_ia32_getexpss128_round_mask with a zero
	   pass-through vector, an all-ones mask and _MM_FROUND_CUR_DIRECTION. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_getexp_ss (__m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf)_mm_setzero_ps();
	  return (__m128)__builtin_ia32_getexpss128_round_mask(
	      (__v4sf)__A, (__v4sf)__B, __passthru, (__mmask8)-1,
	      _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the pass-through operand and __U the mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf)__W;
	  return (__m128)__builtin_ia32_getexpss128_round_mask(
	      (__v4sf)__A, (__v4sf)__B, __passthru, (__mmask8)__U,
	      _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked scalar single getexp with explicit rounding argument R: W is the
	   pass-through operand and U the mask. */
	#define _mm_mask_getexp_round_ss(W, U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked form: calls __builtin_ia32_getexpss128_round_mask with a zero
	   pass-through vector, mask __U and _MM_FROUND_CUR_DIRECTION.
	   The zero vector is built with _mm_setzero_ps(), the single-precision
	   helper used by the other getexp_ss variants, rather than reinterpreting
	   the result of _mm_setzero_pd(). */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A,
	                                                         (__v4sf) __B,
	                                                         (__v4sf) _mm_setzero_ps (),
	                                                         (__mmask8) __U,
	                                                         _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar single getexp with explicit rounding argument R: zero
	   pass-through vector with mask U. */
	#define _mm_maskz_getexp_round_ss(U, A, B, R) __extension__ ({\
	(__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	/* Scalar double-precision getmant macros. Every macro folds its two selector
	   arguments into one immediate, ((D) << 2) | (C): C sits in the low bits and
	   D is shifted above it. The immediate is passed to the getmantsd builtin
	   with A and B (as __v2df), a pass-through vector, a mask and a final int
	   argument. The _round_ forms pass R there; the others pass
	   _MM_FROUND_CUR_DIRECTION. Unmasked forms use a zero pass-through and an
	   all-ones mask, _mask_ forms use W and U, _maskz_ forms use zero and U.
	   All six are statement expressions and carry the __extension__ prefix. */
	#define _mm_getmant_round_sd(A, B, C, D, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	                                               (__v2df)(__m128d)(B), \
	                                               (int)(((D)<<2) | (C)), \
	                                               (__v2df)_mm_setzero_pd(), \
	                                               (__mmask8)-1, (int)(R)); })

	#define _mm_getmant_sd(A, B, C, D) __extension__ ({ \
	  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	                                               (__v2df)(__m128d)(B), \
	                                               (int)(((D)<<2) | (C)), \
	                                               (__v2df)_mm_setzero_pd(), \
	                                               (__mmask8)-1, \
	                                               _MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_getmant_sd(W, U, A, B, C, D) __extension__ ({\
	  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	                                               (__v2df)(__m128d)(B), \
	                                               (int)(((D)<<2) | (C)), \
	                                               (__v2df)(__m128d)(W), \
	                                               (__mmask8)(U), \
	                                               _MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) __extension__ ({\
	  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	                                               (__v2df)(__m128d)(B), \
	                                               (int)(((D)<<2) | (C)), \
	                                               (__v2df)(__m128d)(W), \
	                                               (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_getmant_sd(U, A, B, C, D) __extension__ ({\
	  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	                                               (__v2df)(__m128d)(B), \
	                                               (int)(((D)<<2) | (C)), \
	                                               (__v2df)_mm_setzero_pd(), \
	                                               (__mmask8)(U), \
	                                               _MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) __extension__ ({\
	  (__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \
	                                               (__v2df)(__m128d)(B), \
	                                               (int)(((D)<<2) | (C)), \
	                                               (__v2df)_mm_setzero_pd(), \
	                                               (__mmask8)(U), (int)(R)); })

	/* Scalar single-precision getmant macros. Every macro folds its two selector
	   arguments into one immediate, ((D) << 2) | (C): C sits in the low bits and
	   D is shifted above it. The immediate is passed to the getmantss builtin
	   with A and B (as __v4sf), a pass-through vector, a mask and a final int
	   argument. The _round_ forms pass R there; the others pass
	   _MM_FROUND_CUR_DIRECTION. Unmasked forms use a zero pass-through and an
	   all-ones mask, _mask_ forms use W and U, _maskz_ forms use zero and U.
	   All six are statement expressions and carry the __extension__ prefix, and
	   every zero pass-through comes from _mm_setzero_ps(). */
	#define _mm_getmant_round_ss(A, B, C, D, R) __extension__ ({ \
	  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	                                              (__v4sf)(__m128)(B), \
	                                              (int)(((D)<<2) | (C)), \
	                                              (__v4sf)_mm_setzero_ps(), \
	                                              (__mmask8)-1, (int)(R)); })

	#define _mm_getmant_ss(A, B, C, D) __extension__ ({ \
	  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	                                              (__v4sf)(__m128)(B), \
	                                              (int)(((D)<<2) | (C)), \
	                                              (__v4sf)_mm_setzero_ps(), \
	                                              (__mmask8)-1, \
	                                              _MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_getmant_ss(W, U, A, B, C, D) __extension__ ({\
	  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	                                              (__v4sf)(__m128)(B), \
	                                              (int)(((D)<<2) | (C)), \
	                                              (__v4sf)(__m128)(W), \
	                                              (__mmask8)(U), \
	                                              _MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) __extension__ ({\
	  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	                                              (__v4sf)(__m128)(B), \
	                                              (int)(((D)<<2) | (C)), \
	                                              (__v4sf)(__m128)(W), \
	                                              (__mmask8)(U), (int)(R)); })

	#define _mm_maskz_getmant_ss(U, A, B, C, D) __extension__ ({\
	  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	                                              (__v4sf)(__m128)(B), \
	                                              (int)(((D)<<2) | (C)), \
	                                              (__v4sf)_mm_setzero_ps(), \
	                                              (__mmask8)(U), \
	                                              _MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) __extension__ ({\
	  (__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \
	                                              (__v4sf)(__m128)(B), \
	                                              (int)(((D)<<2) | (C)), \
	                                              (__v4sf)_mm_setzero_ps(), \
	                                              (__mmask8)(U), (int)(R)); })

	/* Returns the mask __A unchanged. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kmov (__mmask16 __A)
	{
	  __mmask16 __copy = __A;
	  return __copy;
	}

	/* Scalar compare macros: forward A and B, the predicate P and the trailing
	   int argument R to the vcomisd / vcomiss builtins and cast the result to
	   int. */
	#define _mm_comi_round_sd(A, B, P, R) __extension__ ({\
	(int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	(int)(P), (int)(R)); })

	#define _mm_comi_round_ss(A, B, P, R) __extension__ ({\
	(int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	(int)(P), (int)(R)); })

	/* Only defined when __x86_64__ is defined: forwards A (as __v2df) and R to
	   __builtin_ia32_vcvtsd2si64 and casts the result to long long. */
	#ifdef __x86_64__
	#define _mm_cvt_roundsd_si64(A, R) __extension__ ({ \
	(long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
	#endif

	/* Forwards __A, the index vector __I, __B and the mask __U, in that order,
	   to __builtin_ia32_vpermi2vard512_mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_epi32 (__m512i __A, __m512i __I,
	__mmask16 __U, __m512i __B)
	{
	  __v16si __first = (__v16si)__A;
	  __v16si __idx = (__v16si)__I;
	  __v16si __second = (__v16si)__B;
	  return (__m512i)__builtin_ia32_vpermi2vard512_mask(__first, __idx, __second,
	                                                     (__mmask16)__U);
	}

	/* Forwards __A (as __v16si) and the count vector __B (as __v4si) to
	   __builtin_ia32_pslld512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sll_epi32(__m512i __A, __m128i __B)
	{
	  __v16si __src = (__v16si)__A;
	  __v4si __count = (__v4si)__B;
	  return (__m512i)__builtin_ia32_pslld512(__src, __count);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __shifted = (__v16si)_mm512_sll_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __shifted = (__v16si)_mm512_sll_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __A (as __v8di) and the count vector __B (as __v2di) to
	   __builtin_ia32_psllq512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sll_epi64(__m512i __A, __m128i __B)
	{
	  __v8di __src = (__v8di)__A;
	  __v2di __count = (__v2di)__B;
	  return (__m512i)__builtin_ia32_psllq512(__src, __count);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __shifted = (__v8di)_mm512_sll_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __shifted = (__v8di)_mm512_sll_epi64(__A, __B);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __X and the per-element count vector __Y (both as __v16si) to
	   __builtin_ia32_psllv16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sllv_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __src = (__v16si)__X;
	  __v16si __counts = (__v16si)__Y;
	  return (__m512i)__builtin_ia32_psllv16si(__src, __counts);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __v16si __shifted = (__v16si)_mm512_sllv_epi32(__X, __Y);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __v16si __shifted = (__v16si)_mm512_sllv_epi32(__X, __Y);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __X and the per-element count vector __Y (both as __v8di) to
	   __builtin_ia32_psllv8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sllv_epi64(__m512i __X, __m512i __Y)
	{
	  __v8di __src = (__v8di)__X;
	  __v8di __counts = (__v8di)__Y;
	  return (__m512i)__builtin_ia32_psllv8di(__src, __counts);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectq_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __v8di __shifted = (__v8di)_mm512_sllv_epi64(__X, __Y);
	  __v8di __passthru = (__v8di)__W;
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __v8di __shifted = (__v8di)_mm512_sllv_epi64(__X, __Y);
	  __v8di __zero = (__v8di)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __A (as __v16si) and the count vector __B (as __v4si) to
	   __builtin_ia32_psrad512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sra_epi32(__m512i __A, __m128i __B)
	{
	  __v16si __src = (__v16si)__A;
	  __v4si __count = (__v4si)__B;
	  return (__m512i)__builtin_ia32_psrad512(__src, __count);
	}

	/* Masked form: the shifted vector and __W are handed to
	   __builtin_ia32_selectd_512 together with mask __U. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __shifted = (__v16si)_mm512_sra_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __passthru);
	}

	/* Zero-masked form: same selection, with an all-zero vector in place of
	   __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __shifted = (__v16si)_mm512_sra_epi32(__A, __B);
	  __v16si __zero = (__v16si)_mm512_setzero_si512();
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __shifted,
	                                             __zero);
	}

	/* Forwards __A (as eight 64-bit lanes) and the 128-bit operand __B
	 * (as __v2di) to __builtin_ia32_psraq512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_sra_epi64(__m512i __A, __m128i __B)
	{
	  __v8di __vals = (__v8di)__A;
	  __v2di __count = (__v2di)__B;

	  return (__m512i)__builtin_ia32_psraq512(__vals, __count);
	}

	/* Masked form: select, under __U, between the _mm512_sra_epi64 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __res = (__v8di)_mm512_sra_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __res = (__v8di)_mm512_sra_epi64(__A, __B);
	  __v8di __zeros = (__v8di)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __zeros);
	}

	/* Forwards __X and __Y, viewed as sixteen 32-bit lanes, to
	 * __builtin_ia32_psrav16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srav_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __vals = (__v16si)__X;
	  __v16si __counts = (__v16si)__Y;

	  return (__m512i)__builtin_ia32_psrav16si(__vals, __counts);
	}

	/* Masked form: select, under __U, between the _mm512_srav_epi32 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __v16si __res = (__v16si)_mm512_srav_epi32(__X, __Y);
	  __v16si __passthru = (__v16si)__W;

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __v16si __res = (__v16si)_mm512_srav_epi32(__X, __Y);
	  __v16si __zeros = (__v16si)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __zeros);
	}

	/* Forwards __X and __Y, viewed as eight 64-bit lanes, to
	 * __builtin_ia32_psrav8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srav_epi64(__m512i __X, __m512i __Y)
	{
	  __v8di __vals = (__v8di)__X;
	  __v8di __counts = (__v8di)__Y;

	  return (__m512i)__builtin_ia32_psrav8di(__vals, __counts);
	}

	/* Masked form: select, under __U, between the _mm512_srav_epi64 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __v8di __res = (__v8di)_mm512_srav_epi64(__X, __Y);
	  __v8di __passthru = (__v8di)__W;

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __v8di __res = (__v8di)_mm512_srav_epi64(__X, __Y);
	  __v8di __zeros = (__v8di)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __zeros);
	}

	/* Forwards __A (as sixteen 32-bit lanes) and the 128-bit operand __B
	 * (as __v4si) to __builtin_ia32_psrld512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srl_epi32(__m512i __A, __m128i __B)
	{
	  __v16si __vals = (__v16si)__A;
	  __v4si __count = (__v4si)__B;

	  return (__m512i)__builtin_ia32_psrld512(__vals, __count);
	}

	/* Masked form: select, under __U, between the _mm512_srl_epi32 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __res = (__v16si)_mm512_srl_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B)
	{
	  __v16si __res = (__v16si)_mm512_srl_epi32(__A, __B);
	  __v16si __zeros = (__v16si)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __zeros);
	}

	/* Forwards __A (as eight 64-bit lanes) and the 128-bit operand __B
	 * (as __v2di) to __builtin_ia32_psrlq512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srl_epi64(__m512i __A, __m128i __B)
	{
	  __v8di __vals = (__v8di)__A;
	  __v2di __count = (__v2di)__B;

	  return (__m512i)__builtin_ia32_psrlq512(__vals, __count);
	}

	/* Masked form: select, under __U, between the _mm512_srl_epi64 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srl_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __res = (__v8di)_mm512_srl_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B)
	{
	  __v8di __res = (__v8di)_mm512_srl_epi64(__A, __B);
	  __v8di __zeros = (__v8di)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __zeros);
	}

	/* Forwards __X and __Y, viewed as sixteen 32-bit lanes, to
	 * __builtin_ia32_psrlv16si. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srlv_epi32(__m512i __X, __m512i __Y)
	{
	  __v16si __vals = (__v16si)__X;
	  __v16si __counts = (__v16si)__Y;

	  return (__m512i)__builtin_ia32_psrlv16si(__vals, __counts);
	}

	/* Masked form: select, under __U, between the _mm512_srlv_epi32 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __v16si __res = (__v16si)_mm512_srlv_epi32(__X, __Y);
	  __v16si __passthru = (__v16si)__W;

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y)
	{
	  __v16si __res = (__v16si)_mm512_srlv_epi32(__X, __Y);
	  __v16si __zeros = (__v16si)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __zeros);
	}

	/* Forwards __X and __Y, viewed as eight 64-bit lanes, to
	 * __builtin_ia32_psrlv8di. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srlv_epi64 (__m512i __X, __m512i __Y)
	{
	  __v8di __vals = (__v8di)__X;
	  __v8di __counts = (__v8di)__Y;

	  return (__m512i)__builtin_ia32_psrlv8di(__vals, __counts);
	}

	/* Masked form: select, under __U, between the _mm512_srlv_epi64 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __v8di __res = (__v8di)_mm512_srlv_epi64(__X, __Y);
	  __v8di __passthru = (__v8di)__W;

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y)
	{
	  __v8di __res = (__v8di)_mm512_srlv_epi64(__X, __Y);
	  __v8di __zeros = (__v8di)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __zeros);
	}

	/* Three-operand bitwise logic on 32-bit lanes. A, B and C are cast to
	 * __v16si and imm to int before being passed to the pternlogd builtin;
	 * this unmasked form supplies an all-ones mask. */
	#define _mm512_ternarylogic_epi32(A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)-1); })

	/* Masked form: same builtin with the caller's mask U. Note the argument
	 * order (A, U, B, C, imm): A is the first data operand of the builtin. */
	#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), (int)(imm), \
	(__mmask16)(U)); })

	/* Zero-masked form: uses the separate _maskz builtin with mask U. */
	#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), \
	(__v16si)(__m512i)(C), \
	(int)(imm), (__mmask16)(U)); })

	/* Three-operand bitwise logic on 64-bit lanes. A, B and C are cast to
	 * __v8di and imm to int before being passed to the pternlogq builtin;
	 * this unmasked form supplies an all-ones mask. */
	#define _mm512_ternarylogic_epi64(A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)-1); })

	/* Masked form: same builtin with the caller's mask U. Note the argument
	 * order (A, U, B, C, imm): A is the first data operand of the builtin. */
	#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U)); })

	/* Zero-masked form: uses the separate _maskz builtin with mask U. */
	#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), \
	(__v8di)(__m512i)(C), (int)(imm), \
	(__mmask8)(U)); })

	/*
	 * Scalar double -> integer conversions with an explicit rounding argument.
	 * Each macro casts A to __v2df and R to int, calls the named vcvtsd2*
	 * builtin and casts the result to the advertised integer type. The 64-bit
	 * results are only provided when __x86_64__ is defined.
	 */
	#ifdef __x86_64__
	#define _mm_cvt_roundsd_i64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R)); })
	#endif

	#define _mm_cvt_roundsd_si32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	/* Same expansion as _mm_cvt_roundsd_si32 under a second name. */
	#define _mm_cvt_roundsd_i32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	#define _mm_cvt_roundsd_u32(A, R) __extension__ ({ \
	  (unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R)); })

	/* Function form of the unsigned 32-bit conversion; the rounding argument
	 * is fixed to _MM_FROUND_CUR_DIRECTION. */
	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvtsd_u32 (__m128d __A)
	{
	  __v2df __src = (__v2df) __A;

	  return (unsigned) __builtin_ia32_vcvtsd2usi32 (__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvt_roundsd_u64(A, R) __extension__ ({ \
	  (unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \
	                                                  (int)(R)); })

	/* Function form of the unsigned 64-bit conversion; the rounding argument
	 * is fixed to _MM_FROUND_CUR_DIRECTION. */
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvtsd_u64 (__m128d __A)
	{
	  __v2df __src = (__v2df) __A;

	  return (unsigned long long) __builtin_ia32_vcvtsd2usi64 (__src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/*
	 * Scalar float -> integer conversions with an explicit rounding argument.
	 * Each macro casts A to __v4sf and R to int, calls the named vcvtss2*
	 * builtin and casts the result to the advertised integer type. The 64-bit
	 * results are only provided when __x86_64__ is defined.
	 */
	#define _mm_cvt_roundss_si32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })

	/* Same expansion as _mm_cvt_roundss_si32 under a second name. */
	#define _mm_cvt_roundss_i32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R)); })

	#ifdef __x86_64__
	#define _mm_cvt_roundss_si64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })

	/* Same expansion as _mm_cvt_roundss_si64 under a second name. */
	#define _mm_cvt_roundss_i64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R)); })
	#endif

	#define _mm_cvt_roundss_u32(A, R) __extension__ ({ \
	  (unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R)); })

	/* Function form of the unsigned 32-bit conversion; the rounding argument
	 * is fixed to _MM_FROUND_CUR_DIRECTION. */
	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvtss_u32 (__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;

	  return (unsigned) __builtin_ia32_vcvtss2usi32 (__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvt_roundss_u64(A, R) __extension__ ({ \
	  (unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \
	                                                  (int)(R)); })

	/* Function form of the unsigned 64-bit conversion; the rounding argument
	 * is fixed to _MM_FROUND_CUR_DIRECTION. */
	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvtss_u64 (__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;

	  return (unsigned long long) __builtin_ia32_vcvtss2usi64 (__src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/*
	 * Scalar double -> integer conversions built on the vcvttsd2* builtins.
	 * Macros cast A to __v2df and forward R as an int; the function forms pass
	 * _MM_FROUND_CUR_DIRECTION instead. The 64-bit results are only provided
	 * when __x86_64__ is defined.
	 */
	#define _mm_cvtt_roundsd_i32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	/* Same expansion as _mm_cvtt_roundsd_i32 under a second name. */
	#define _mm_cvtt_roundsd_si32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R)); })

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm_cvttsd_i32 (__m128d __A)
	{
	  __v2df __src = (__v2df) __A;

	  return (int) __builtin_ia32_vcvttsd2si32 (__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvtt_roundsd_si64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })

	/* Same expansion as _mm_cvtt_roundsd_si64 under a second name. */
	#define _mm_cvtt_roundsd_i64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R)); })

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm_cvttsd_i64 (__m128d __A)
	{
	  __v2df __src = (__v2df) __A;

	  return (long long) __builtin_ia32_vcvttsd2si64 (__src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	#define _mm_cvtt_roundsd_u32(A, R) __extension__ ({ \
	  (unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R)); })

	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvttsd_u32 (__m128d __A)
	{
	  __v2df __src = (__v2df) __A;

	  return (unsigned) __builtin_ia32_vcvttsd2usi32 (__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvtt_roundsd_u64(A, R) __extension__ ({ \
	  (unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \
	                                                   (int)(R)); })

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvttsd_u64 (__m128d __A)
	{
	  __v2df __src = (__v2df) __A;

	  return (unsigned long long) __builtin_ia32_vcvttsd2usi64 (__src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/*
	 * Scalar float -> integer conversions built on the vcvttss2* builtins.
	 * Macros cast A to __v4sf and forward R as an int; the function forms pass
	 * _MM_FROUND_CUR_DIRECTION instead. The 64-bit results are only provided
	 * when __x86_64__ is defined.
	 */
	#define _mm_cvtt_roundss_i32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })

	/* Same expansion as _mm_cvtt_roundss_i32 under a second name. */
	#define _mm_cvtt_roundss_si32(A, R) __extension__ ({ \
	  (int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R)); })

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm_cvttss_i32 (__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;

	  return (int) __builtin_ia32_vcvttss2si32 (__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvtt_roundss_i64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })

	/* Same expansion as _mm_cvtt_roundss_i64 under a second name. */
	#define _mm_cvtt_roundss_si64(A, R) __extension__ ({ \
	  (long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R)); })

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm_cvttss_i64 (__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;

	  return (long long) __builtin_ia32_vcvttss2si64 (__src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	#define _mm_cvtt_roundss_u32(A, R) __extension__ ({ \
	  (unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R)); })

	static __inline__ unsigned __DEFAULT_FN_ATTRS
	_mm_cvttss_u32 (__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;

	  return (unsigned) __builtin_ia32_vcvttss2usi32 (__src, _MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvtt_roundss_u64(A, R) __extension__ ({ \
	  (unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \
	                                                   (int)(R)); })

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm_cvttss_u64 (__m128 __A)
	{
	  __v4sf __src = (__v4sf) __A;

	  return (unsigned long long) __builtin_ia32_vcvttss2usi64 (__src, _MM_FROUND_CUR_DIRECTION);
	}
	#endif

	/*
	 * "mask2" two-source permutes. Each wrapper passes (__A, __I, __B, __U) to
	 * the matching vpermi2var builtin, with __I as the second (index) operand.
	 */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_pd (__m512d __A, __m512i __I, __mmask8 __U,
	                              __m512d __B)
	{
	  __v8df __first = (__v8df) __A;
	  __v8di __idx = (__v8di) __I;
	  __v8df __second = (__v8df) __B;

	  return (__m512d) __builtin_ia32_vpermi2varpd512_mask (__first, __idx,
	                                                        __second,
	                                                        (__mmask8) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_ps (__m512 __A, __m512i __I, __mmask16 __U,
	                              __m512 __B)
	{
	  __v16sf __first = (__v16sf) __A;
	  __v16si __idx = (__v16si) __I;
	  __v16sf __second = (__v16sf) __B;

	  return (__m512) __builtin_ia32_vpermi2varps512_mask (__first, __idx,
	                                                       __second,
	                                                       (__mmask16) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask2_permutex2var_epi64 (__m512i __A, __m512i __I,
	                                 __mmask8 __U, __m512i __B)
	{
	  __v8di __first = (__v8di) __A;
	  __v8di __idx = (__v8di) __I;
	  __v8di __second = (__v8di) __B;

	  return (__m512i) __builtin_ia32_vpermi2varq512_mask (__first, __idx,
	                                                       __second,
	                                                       (__mmask8) __U);
	}

	/* Permutes doubles within each pair of X using __builtin_shufflevector.
	 * Result element i is taken from index (i & ~1) + bit i of C, so each
	 * element comes from its own 2-element pair. The second shuffle operand is
	 * an undefined vector; no index (max 7) ever selects from it. C must be a
	 * compile-time constant because it forms the shuffle indices. */
	#define _mm512_permute_pd(X, C) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
	(__v8df)_mm512_undefined_pd(), \
	0 + (((C) >> 0) & 0x1), \
	0 + (((C) >> 1) & 0x1), \
	2 + (((C) >> 2) & 0x1), \
	2 + (((C) >> 3) & 0x1), \
	4 + (((C) >> 4) & 0x1), \
	4 + (((C) >> 5) & 0x1), \
	6 + (((C) >> 6) & 0x1), \
	6 + (((C) >> 7) & 0x1)); })

	/* Masked form: select, under U, between the permuted vector and W. */
	#define _mm512_mask_permute_pd(W, U, X, C) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_permute_pd((X), (C)), \
	(__v8df)(__m512d)(W)); })

	/* Zero-masked form: an all-zero vector replaces W. */
	#define _mm512_maskz_permute_pd(U, X, C) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_permute_pd((X), (C)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Permutes floats within each group of four in X using
	 * __builtin_shufflevector. The same four 2-bit fields of C (bits 1:0, 3:2,
	 * 5:4, 7:6) are applied to every group, offset by the group base 0, 4, 8
	 * or 12. The second shuffle operand is an undefined vector; no index
	 * (max 15) ever selects from it. C must be a compile-time constant. */
	#define _mm512_permute_ps(X, C) __extension__ ({ \
	(__m512)__builtin_shufflevector((__v16sf)(__m512)(X), \
	(__v16sf)_mm512_undefined_ps(), \
	0 + (((C) >> 0) & 0x3), \
	0 + (((C) >> 2) & 0x3), \
	0 + (((C) >> 4) & 0x3), \
	0 + (((C) >> 6) & 0x3), \
	4 + (((C) >> 0) & 0x3), \
	4 + (((C) >> 2) & 0x3), \
	4 + (((C) >> 4) & 0x3), \
	4 + (((C) >> 6) & 0x3), \
	8 + (((C) >> 0) & 0x3), \
	8 + (((C) >> 2) & 0x3), \
	8 + (((C) >> 4) & 0x3), \
	8 + (((C) >> 6) & 0x3), \
	12 + (((C) >> 0) & 0x3), \
	12 + (((C) >> 2) & 0x3), \
	12 + (((C) >> 4) & 0x3), \
	12 + (((C) >> 6) & 0x3)); })

	/* Masked form: select, under U, between the permuted vector and W. */
	#define _mm512_mask_permute_ps(W, U, X, C) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_permute_ps((X), (C)), \
	(__v16sf)(__m512)(W)); })

	/* Zero-masked form: an all-zero vector replaces W. */
	#define _mm512_maskz_permute_ps(U, X, C) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_permute_ps((X), (C)), \
	(__v16sf)_mm512_setzero_ps()); })

	/* Forwards __A (eight doubles) and the control vector __C (eight 64-bit
	 * lanes) to __builtin_ia32_vpermilvarpd512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_permutevar_pd(__m512d __A, __m512i __C)
	{
	  __v8df __src = (__v8df)__A;
	  __v8di __ctrl = (__v8di)__C;

	  return (__m512d)__builtin_ia32_vpermilvarpd512(__src, __ctrl);
	}

	/* Masked form: select, under __U, between the permuted vector and __W. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C)
	{
	  __v8df __res = (__v8df)_mm512_permutevar_pd(__A, __C);
	  __v8df __passthru = (__v8df)__W;

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C)
	{
	  __v8df __res = (__v8df)_mm512_permutevar_pd(__A, __C);
	  __v8df __zeros = (__v8df)_mm512_setzero_pd();

	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, __res, __zeros);
	}

	/* Forwards __A (sixteen floats) and the control vector __C (sixteen 32-bit
	 * lanes) to __builtin_ia32_vpermilvarps512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_permutevar_ps(__m512 __A, __m512i __C)
	{
	  __v16sf __src = (__v16sf)__A;
	  __v16si __ctrl = (__v16si)__C;

	  return (__m512)__builtin_ia32_vpermilvarps512(__src, __ctrl);
	}

	/* Masked form: select, under __U, between the permuted vector and __W. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C)
	{
	  __v16sf __res = (__v16sf)_mm512_permutevar_ps(__A, __C);
	  __v16sf __passthru = (__v16sf)__W;

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C)
	{
	  __v16sf __res = (__v16sf)_mm512_permutevar_ps(__A, __C);
	  __v16sf __zeros = (__v16sf)_mm512_setzero_ps();

	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, __res, __zeros);
	}

	/*
	 * Two-source permutes of doubles. All three wrappers hand the index vector
	 * __I to the vpermt2varpd builtin as its FIRST operand, followed by __A,
	 * __B and the mask; the unmasked form supplies an all-ones mask.
	 */
	static __inline __m512d __DEFAULT_FN_ATTRS
	_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B)
	{
	  __v8di __idx = (__v8di) __I;
	  __v8df __first = (__v8df) __A;
	  __v8df __second = (__v8df) __B;

	  return (__m512d) __builtin_ia32_vpermt2varpd512_mask (__idx, __first,
	                                                        __second,
	                                                        (__mmask8) -1);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_pd (__m512d __A, __mmask8 __U, __m512i __I, __m512d __B)
	{
	  __v8di __idx = (__v8di) __I;
	  __v8df __first = (__v8df) __A;
	  __v8df __second = (__v8df) __B;

	  return (__m512d) __builtin_ia32_vpermt2varpd512_mask (__idx, __first,
	                                                        __second,
	                                                        (__mmask8) __U);
	}

	/* Zero-masked form: uses the separate _maskz builtin. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_pd (__mmask8 __U, __m512d __A, __m512i __I,
	                              __m512d __B)
	{
	  __v8di __idx = (__v8di) __I;
	  __v8df __first = (__v8df) __A;
	  __v8df __second = (__v8df) __B;

	  return (__m512d) __builtin_ia32_vpermt2varpd512_maskz (__idx, __first,
	                                                         __second,
	                                                         (__mmask8) __U);
	}

	/*
	 * Two-source permutes of floats. All three wrappers hand the index vector
	 * __I to the vpermt2varps builtin as its FIRST operand, followed by __A,
	 * __B and the mask; the unmasked form supplies an all-ones mask.
	 */
	static __inline __m512 __DEFAULT_FN_ATTRS
	_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B)
	{
	  __v16si __idx = (__v16si) __I;
	  __v16sf __first = (__v16sf) __A;
	  __v16sf __second = (__v16sf) __B;

	  return (__m512) __builtin_ia32_vpermt2varps512_mask (__idx, __first,
	                                                       __second,
	                                                       (__mmask16) -1);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_permutex2var_ps (__m512 __A, __mmask16 __U, __m512i __I, __m512 __B)
	{
	  __v16si __idx = (__v16si) __I;
	  __v16sf __first = (__v16sf) __A;
	  __v16sf __second = (__v16sf) __B;

	  return (__m512) __builtin_ia32_vpermt2varps512_mask (__idx, __first,
	                                                       __second,
	                                                       (__mmask16) __U);
	}

	/* Zero-masked form: uses the separate _maskz builtin. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_permutex2var_ps (__mmask16 __U, __m512 __A, __m512i __I,
	                              __m512 __B)
	{
	  __v16si __idx = (__v16si) __I;
	  __v16sf __first = (__v16sf) __A;
	  __v16sf __second = (__v16sf) __B;

	  return (__m512) __builtin_ia32_vpermt2varps512_maskz (__idx, __first,
	                                                        __second,
	                                                        (__mmask16) __U);
	}

	/* Produces a 16-bit mask from __builtin_ia32_ptestnmd512 applied to __A and
	 * __B as 32-bit lanes; the unmasked form passes an all-ones input mask. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_testn_epi32_mask (__m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si) __A;
	  __v16si __rhs = (__v16si) __B;

	  return (__mmask16) __builtin_ia32_ptestnmd512 (__lhs, __rhs, (__mmask16) -1);
	}

	/* Same builtin with the caller-provided input mask __U. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B)
	{
	  __v16si __lhs = (__v16si) __A;
	  __v16si __rhs = (__v16si) __B;

	  return (__mmask16) __builtin_ia32_ptestnmd512 (__lhs, __rhs, __U);
	}

	/* Produces an 8-bit mask from __builtin_ia32_ptestnmq512 applied to __A and
	 * __B as 64-bit lanes; the unmasked form passes an all-ones input mask. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_testn_epi64_mask (__m512i __A, __m512i __B)
	{
	  __v8di __lhs = (__v8di) __A;
	  __v8di __rhs = (__v8di) __B;

	  return (__mmask8) __builtin_ia32_ptestnmq512 (__lhs, __rhs, (__mmask8) -1);
	}

	/* Same builtin with the caller-provided input mask __U. */
	static __inline__ __mmask8 __DEFAULT_FN_ATTRS
	_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B)
	{
	  __v8di __lhs = (__v8di) __A;
	  __v8di __rhs = (__v8di) __B;

	  return (__mmask8) __builtin_ia32_ptestnmq512 (__lhs, __rhs, __U);
	}

	/* Converts the eight doubles of A via the cvttpd2udq builtin, yielding a
	 * 256-bit integer vector. R is forwarded as the int rounding/exception
	 * argument. This unmasked form passes an undefined pass-through vector and
	 * an all-ones mask. */
	#define _mm512_cvtt_roundpd_epu32(A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_undefined_si256(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: W is the pass-through vector, U the mask. */
	#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)(__m256i)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) __extension__ ({ \
	(__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \
	(__v8si)_mm256_setzero_si256(), \
	(__mmask8)(U), (int)(R)); })

	/*
	 * Function forms of the cvttpd2udq conversion: eight doubles in, a 256-bit
	 * integer vector out, with the last builtin argument fixed to
	 * _MM_FROUND_CUR_DIRECTION. The unmasked form passes an undefined
	 * pass-through vector and an all-ones mask.
	 */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvttpd_epu32 (__m512d __A)
	{
	  __v8si __passthru = (__v8si) _mm256_undefined_si256 ();

	  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
	                                                      __passthru,
	                                                      (__mmask8) -1,
	                                                      _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the pass-through vector, __U the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A)
	{
	  __v8si __passthru = (__v8si) __W;

	  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
	                                                      __passthru,
	                                                      (__mmask8) __U,
	                                                      _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A)
	{
	  __v8si __zeros = (__v8si) _mm256_setzero_si256 ();

	  return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A,
	                                                      __zeros,
	                                                      (__mmask8) __U,
	                                                      _MM_FROUND_CUR_DIRECTION);
	}

	/* Scalar-double roundscale family. Every macro expands to
	 * __builtin_ia32_rndscalesd_round_mask(A, B, passthru, mask, imm, rounding):
	 *   - unmasked forms pass a zero vector and an all-ones mask;
	 *   - _mask_ forms pass W and U;
	 *   - _maskz_ forms pass a zero vector and U;
	 *   - forms without "_round" pass _MM_FROUND_CUR_DIRECTION as the last
	 *     argument, the others forward R. */
	#define _mm_roundscale_round_sd(A, B, imm, R) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(imm), \
	(int)(R)); })

	#define _mm_roundscale_sd(A, B, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)-1, (int)(imm), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_roundscale_sd(W, U, A, B, imm) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(imm), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	#define _mm_maskz_roundscale_sd(U, A, B, I) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(I), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) __extension__ ({ \
	(__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \
	(__v2df)(__m128d)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	/* Scalar-float roundscale family. Every macro expands to
	 * __builtin_ia32_rndscaless_round_mask(A, B, passthru, mask, imm, rounding):
	 *   - unmasked forms pass a zero vector and an all-ones mask;
	 *   - _mask_ forms pass W and U;
	 *   - _maskz_ forms pass a zero vector and U;
	 *   - forms without "_round" pass _MM_FROUND_CUR_DIRECTION as the last
	 *     argument, the others forward R. */
	#define _mm_roundscale_round_ss(A, B, imm, R) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(imm), \
	(int)(R)); })

	#define _mm_roundscale_ss(A, B, imm) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)-1, (int)(imm), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_roundscale_ss(W, U, A, B, I) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(I), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	#define _mm_maskz_roundscale_ss(U, A, B, I) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(I), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) __extension__ ({ \
	(__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(I), \
	(int)(R)); })

	/* Packed-double scalef with an explicit rounding argument R. Expands to
	 * __builtin_ia32_scalefpd512_mask(A, B, passthru, mask, R); the unmasked
	 * form passes an undefined pass-through vector and an all-ones mask. */
	#define _mm512_scalef_round_pd(A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	/* Masked form: W is the pass-through vector, U the mask. */
	#define _mm512_mask_scalef_round_pd(W, U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), (int)(R)); })

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	#define _mm512_maskz_scalef_round_pd(U, A, B, R) __extension__ ({ \
	(__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/*
	 * Function forms of packed-double scalef. Each calls
	 * __builtin_ia32_scalefpd512_mask with _MM_FROUND_CUR_DIRECTION as the
	 * rounding argument; only the pass-through vector and mask differ.
	 */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_scalef_pd (__m512d __A, __m512d __B)
	{
	  __v8df __passthru = (__v8df) _mm512_undefined_pd ();

	  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
	                                                    (__v8df) __B,
	                                                    __passthru,
	                                                    (__mmask8) -1,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the pass-through vector, __U the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __passthru = (__v8df) __W;

	  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
	                                                    (__v8df) __B,
	                                                    __passthru,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_scalef_pd (__mmask8 __U, __m512d __A, __m512d __B)
	{
	  __v8df __zeros = (__v8df) _mm512_setzero_pd ();

	  return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A,
	                                                    (__v8df) __B,
	                                                    __zeros,
	                                                    (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	/* Packed-float scalef with an explicit rounding argument R. Expands to
	 * __builtin_ia32_scalefps512_mask(A, B, passthru, mask, R); the unmasked
	 * form passes an undefined pass-through vector and an all-ones mask. */
	#define _mm512_scalef_round_ps(A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	/* Masked form: W is the pass-through vector, U the mask. */
	#define _mm512_mask_scalef_round_ps(W, U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	#define _mm512_maskz_scalef_round_ps(U, A, B, R) __extension__ ({ \
	(__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/*
	 * Function forms of packed-float scalef. Each calls
	 * __builtin_ia32_scalefps512_mask with _MM_FROUND_CUR_DIRECTION as the
	 * rounding argument; only the pass-through vector and mask differ.
	 */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_scalef_ps (__m512 __A, __m512 __B)
	{
	  __v16sf __passthru = (__v16sf) _mm512_undefined_ps ();

	  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
	                                                   (__v16sf) __B,
	                                                   __passthru,
	                                                   (__mmask16) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the pass-through vector, __U the mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __passthru = (__v16sf) __W;

	  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
	                                                   (__v16sf) __B,
	                                                   __passthru,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B)
	{
	  __v16sf __zeros = (__v16sf) _mm512_setzero_ps ();

	  return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A,
	                                                   (__v16sf) __B,
	                                                   __zeros,
	                                                   (__mmask16) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Scalar-double scalef family. Everything here expands to or calls
	 * __builtin_ia32_scalefsd_round_mask(A, B, passthru, mask, rounding).
	 * The "_round" macros forward R; the function forms pass
	 * _MM_FROUND_CUR_DIRECTION. Unmasked and zero-masked forms use a zero
	 * pass-through vector; merge-masked forms use W.
	 */
	#define _mm_scalef_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
	                                              (__v2df)(__m128d)(B), \
	                                              (__v2df)_mm_setzero_pd(), \
	                                              (__mmask8)-1, (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_scalef_sd (__m128d __A, __m128d __B)
	{
	  __v2df __zeros = (__v2df) _mm_setzero_pd();

	  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
	                                                       (__v2df) __B,
	                                                       __zeros,
	                                                       (__mmask8) -1,
	                                                       _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __passthru = (__v2df) __W;

	  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
	                                                       (__v2df) __B,
	                                                       __passthru,
	                                                       (__mmask8) __U,
	                                                       _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_scalef_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
	                                              (__v2df)(__m128d)(B), \
	                                              (__v2df)(__m128d)(W), \
	                                              (__mmask8)(U), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __zeros = (__v2df) _mm_setzero_pd ();

	  return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A,
	                                                       (__v2df) __B,
	                                                       __zeros,
	                                                       (__mmask8) __U,
	                                                       _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_maskz_scalef_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \
	                                              (__v2df)(__m128d)(B), \
	                                              (__v2df)_mm_setzero_pd(), \
	                                              (__mmask8)(U), (int)(R)); })

	/*
	 * Scalar-float scalef family. Everything here expands to or calls
	 * __builtin_ia32_scalefss_round_mask(A, B, passthru, mask, rounding).
	 * The "_round" macros forward R; the function forms pass
	 * _MM_FROUND_CUR_DIRECTION. Unmasked and zero-masked forms use a zero
	 * pass-through vector; merge-masked forms use W.
	 */
	#define _mm_scalef_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
	                                             (__v4sf)(__m128)(B), \
	                                             (__v4sf)_mm_setzero_ps(), \
	                                             (__mmask8)-1, (int)(R)); })

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_scalef_ss (__m128 __A, __m128 __B)
	{
	  __v4sf __zeros = (__v4sf) _mm_setzero_ps();

	  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
	                                                      (__v4sf) __B,
	                                                      __zeros,
	                                                      (__mmask8) -1,
	                                                      _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __passthru = (__v4sf) __W;

	  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
	                                                      (__v4sf) __B,
	                                                      __passthru,
	                                                      (__mmask8) __U,
	                                                      _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_scalef_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
	                                             (__v4sf)(__m128)(B), \
	                                             (__v4sf)(__m128)(W), \
	                                             (__mmask8)(U), (int)(R)); })

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __zeros = (__v4sf) _mm_setzero_ps ();

	  return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A,
	                                                      (__v4sf) __B,
	                                                      __zeros,
	                                                      (__mmask8) __U,
	                                                      _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked scalar-float scalef with an explicit rounding argument.
	 * Expands to __builtin_ia32_scalefss_round_mask(A, B, zero, U, R).
	 *
	 * The rounding argument R is forwarded to the builtin. The previous
	 * expansion passed _MM_FROUND_CUR_DIRECTION here, which silently ignored
	 * the caller's R. R must be a compile-time constant. */
	#define _mm_maskz_scalef_round_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \
	(__v4sf)(__m128)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), \
	(int)(R)); })

	/* Forwards __A (sixteen 32-bit lanes) and the scalar count __B to
	 * __builtin_ia32_psradi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srai_epi32(__m512i __A, int __B)
	{
	  __v16si __vals = (__v16si)__A;

	  return (__m512i)__builtin_ia32_psradi512(__vals, __B);
	}

	/* Masked form: select, under __U, between the _mm512_srai_epi32 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, int __B)
	{
	  __v16si __res = (__v16si)_mm512_srai_epi32(__A, __B);
	  __v16si __passthru = (__v16si)__W;

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, int __B) {
	  __v16si __res = (__v16si)_mm512_srai_epi32(__A, __B);
	  __v16si __zeros = (__v16si)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, __res, __zeros);
	}

	/* Forwards __A (eight 64-bit lanes) and the scalar count __B to
	 * __builtin_ia32_psraqi512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_srai_epi64(__m512i __A, int __B)
	{
	  __v8di __vals = (__v8di)__A;

	  return (__m512i)__builtin_ia32_psraqi512(__vals, __B);
	}

	/* Masked form: select, under __U, between the _mm512_srai_epi64 result
	 * and the pass-through vector __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __res = (__v8di)_mm512_srai_epi64(__A, __B);
	  __v8di __passthru = (__v8di)__W;

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __passthru);
	}

	/* Zero-masked form: an all-zero vector replaces __W. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, int __B)
	{
	  __v8di __res = (__v8di)_mm512_srai_epi64(__A, __B);
	  __v8di __zeros = (__v8di)_mm512_setzero_si512();

	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, __res, __zeros);
	}

	/* Shuffle of A and B driven by the immediate imm, via
	 * __builtin_ia32_shuf_f32x4_mask(A, B, imm, passthru, mask). The unmasked
	 * form passes an undefined pass-through vector and an all-ones mask. */
	#define _mm512_shuffle_f32x4(A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), (int)(imm), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1); })

	/* Masked form: W is the pass-through vector, U the mask. */
	#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), (int)(imm), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U)); })

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_shuf_f32x4_mask((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), (int)(imm), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U)); })

	/* Shuffle of A and B driven by the immediate imm, via
	 * __builtin_ia32_shuf_f64x2_mask(A, B, imm, passthru, mask). The unmasked
	 * form passes an undefined pass-through vector and an all-ones mask. */
	#define _mm512_shuffle_f64x2(A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), (int)(imm), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1); })

	/* Masked form: W is the pass-through vector, U the mask. */
	#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), (int)(imm), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U)); })

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_shuf_f64x2_mask((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), (int)(imm), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U)); })

	/* Shuffle of A and B driven by the immediate imm, via
	 * __builtin_ia32_shuf_i32x4_mask(A, B, imm, passthru, mask). The unmasked
	 * form passes a zero pass-through vector (not an undefined one, unlike the
	 * f32x4/f64x2 macros) together with an all-ones mask. */
	#define _mm512_shuffle_i32x4(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), (int)(imm), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)-1); })

	/* Masked form: W is the pass-through vector, U the mask. */
	#define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), (int)(imm), \
	(__v16si)(__m512i)(W), \
	(__mmask16)(U)); })

	/* Zero-masked form: an all-zero vector is the pass-through operand. */
	#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_shuf_i32x4_mask((__v16si)(__m512i)(A), \
	(__v16si)(__m512i)(B), (int)(imm), \
	(__v16si)_mm512_setzero_si512(), \
	(__mmask16)(U)); })

	#define _mm512_shuffle_i64x2(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), (int)(imm), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)-1); })

	#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), (int)(imm), \
	(__v8di)(__m512i)(W), \
	(__mmask8)(U)); })

	#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_shuf_i64x2_mask((__v8di)(__m512i)(A), \
	(__v8di)(__m512i)(B), (int)(imm), \
	(__v8di)_mm512_setzero_si512(), \
	(__mmask8)(U)); })

	/* Interleaving shuffle of doubles built on __builtin_shufflevector.
	 * Shuffle indices 0-7 address A and 8-15 address B, so even result
	 * elements come from A and odd ones from B; bit i of M picks the low or
	 * high double of the corresponding pair. M must be a compile-time
	 * constant because it forms the shuffle indices. */
	#define _mm512_shuffle_pd(A, B, M) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
	(__v8df)(__m512d)(B), \
	0 + (((M) >> 0) & 0x1), \
	8 + (((M) >> 1) & 0x1), \
	2 + (((M) >> 2) & 0x1), \
	10 + (((M) >> 3) & 0x1), \
	4 + (((M) >> 4) & 0x1), \
	12 + (((M) >> 5) & 0x1), \
	6 + (((M) >> 6) & 0x1), \
	14 + (((M) >> 7) & 0x1)); })

	/* Masked form: select, under U, between the shuffled vector and W. */
	#define _mm512_mask_shuffle_pd(W, U, A, B, M) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_shuffle_pd((A), (B), (M)), \
	(__v8df)(__m512d)(W)); })

	/* Zero-masked form: an all-zero vector replaces W. */
	#define _mm512_maskz_shuffle_pd(U, A, B, M) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_shuffle_pd((A), (B), (M)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Shuffle of sixteen floats expressed with __builtin_shufflevector.
	 * Within every group of four result lanes, the first two lanes use indices
	 * in A's range (0..15) and the last two use indices in B's range (16..31).
	 * The four 2-bit fields of M (bits 1:0, 3:2, 5:4, 7:6) select the element
	 * inside the matching 4-element group, and the same M is reused for all
	 * four groups.
	 * The result is cast to __m512 (single precision) to match the __v16sf
	 * operands; the previous __m512d cast mislabelled the result type.
	 * M is expanded sixteen times; it must be a side-effect-free constant. */
	#define _mm512_shuffle_ps(A, B, M) __extension__ ({ \
	(__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
	(__v16sf)(__m512)(B), \
	0 + (((M) >> 0) & 0x3), \
	0 + (((M) >> 2) & 0x3), \
	16 + (((M) >> 4) & 0x3), \
	16 + (((M) >> 6) & 0x3), \
	4 + (((M) >> 0) & 0x3), \
	4 + (((M) >> 2) & 0x3), \
	20 + (((M) >> 4) & 0x3), \
	20 + (((M) >> 6) & 0x3), \
	8 + (((M) >> 0) & 0x3), \
	8 + (((M) >> 2) & 0x3), \
	24 + (((M) >> 4) & 0x3), \
	24 + (((M) >> 6) & 0x3), \
	12 + (((M) >> 0) & 0x3), \
	12 + (((M) >> 2) & 0x3), \
	28 + (((M) >> 4) & 0x3), \
	28 + (((M) >> 6) & 0x3)); })

	/* Masked variants of _mm512_shuffle_ps: the shuffle result is handed to
	 * __builtin_ia32_selectps_512 together with mask U and either W (mask form)
	 * or the result of _mm512_setzero_ps() (maskz form). */
	#define _mm512_mask_shuffle_ps(W, U, A, B, M) __extension__ ({ \
	  (__m512)__builtin_ia32_selectps_512( \
	      (__mmask16)(U), \
	      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
	      (__v16sf)(__m512)(W)); })

	#define _mm512_maskz_shuffle_ps(U, A, B, M) __extension__ ({ \
	  (__m512)__builtin_ia32_selectps_512( \
	      (__mmask16)(U), \
	      (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \
	      (__v16sf)_mm512_setzero_ps()); })

	/* Scalar double-precision square root built on
	 * __builtin_ia32_sqrtsd_round_mask.  The *_round_* macros forward R as the
	 * last argument; the inline functions pass _MM_FROUND_CUR_DIRECTION. */
	#define _mm_sqrt_round_sd(A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_sqrtsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)-1, (int)(R)); })

	/* Masked form: __W is the third operand, __U the mask. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __fallback = (__v2df) __W;
	  return (__m128d) __builtin_ia32_sqrtsd_round_mask ((__v2df) __A, (__v2df) __B,
	                                                    __fallback, (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_sqrt_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_sqrtsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)(__m128d)(W), (__mmask8)(U), (int)(R)); })

	/* Zero-masked form: _mm_setzero_pd() supplies the third operand. */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	  __v2df __fallback = (__v2df) _mm_setzero_pd ();
	  return (__m128d) __builtin_ia32_sqrtsd_round_mask ((__v2df) __A, (__v2df) __B,
	                                                    __fallback, (__mmask8) __U,
	                                                    _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_maskz_sqrt_round_sd(U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_sqrtsd_round_mask( \
	      (__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \
	      (__v2df)_mm_setzero_pd(), (__mmask8)(U), (int)(R)); })

	/* Scalar single-precision square root built on
	 * __builtin_ia32_sqrtss_round_mask.  The *_round_* macros forward R as the
	 * last argument; the inline functions pass _MM_FROUND_CUR_DIRECTION. */
	#define _mm_sqrt_round_ss(A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_sqrtss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)-1, (int)(R)); })

	/* Masked form: __W is the third operand, __U the mask. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __fallback = (__v4sf) __W;
	  return (__m128) __builtin_ia32_sqrtss_round_mask ((__v4sf) __A, (__v4sf) __B,
	                                                   __fallback, (__mmask8) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_mask_sqrt_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_sqrtss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)(__m128)(W), (__mmask8)(U), (int)(R)); })

	/* Zero-masked form: _mm_setzero_ps() supplies the third operand. */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	  __v4sf __fallback = (__v4sf) _mm_setzero_ps ();
	  return (__m128) __builtin_ia32_sqrtss_round_mask ((__v4sf) __A, (__v4sf) __B,
	                                                   __fallback, (__mmask8) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_maskz_sqrt_round_ss(U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_sqrtss_round_mask( \
	      (__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \
	      (__v4sf)_mm_setzero_ps(), (__mmask8)(U), (int)(R)); })

	/* Repeats the four floats of __A four times (indices 0..3, x4) to form a
	 * 16-float vector. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_broadcast_f32x4(__m128 __A)
	{
	  __v4sf __quad = (__v4sf)__A;
	  return (__m512)__builtin_shufflevector(__quad, __quad,
	                                         0, 1, 2, 3, 0, 1, 2, 3,
	                                         0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Masked form: __M, the broadcast of __A and __O go to
	 * __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf)_mm512_broadcast_f32x4(__A);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, __bcast,
	                                             (__v16sf)__O);
	}

	/* Zero-masked form: _mm512_setzero_ps() takes the place of __O. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf)_mm512_broadcast_f32x4(__A);
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, __bcast,
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/* Repeats the four doubles of __A twice (indices 0..3, x2) to form an
	 * 8-double vector. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_broadcast_f64x4(__m256d __A)
	{
	  __v4df __quad = (__v4df)__A;
	  return (__m512d)__builtin_shufflevector(__quad, __quad,
	                                          0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Masked form: __M, the broadcast of __A and __O go to
	 * __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A)
	{
	  __v8df __bcast = (__v8df)_mm512_broadcast_f64x4(__A);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, __bcast,
	                                              (__v8df)__O);
	}

	/* Zero-masked form: _mm512_setzero_pd() takes the place of __O. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A)
	{
	  __v8df __bcast = (__v8df)_mm512_broadcast_f64x4(__A);
	  return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, __bcast,
	                                              (__v8df)_mm512_setzero_pd());
	}

	/* Repeats the four 32-bit integers of __A four times (indices 0..3, x4) to
	 * form a 16-element vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcast_i32x4(__m128i __A)
	{
	  __v4si __quad = (__v4si)__A;
	  return (__m512i)__builtin_shufflevector(__quad, __quad,
	                                          0, 1, 2, 3, 0, 1, 2, 3,
	                                          0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Masked form: __M, the broadcast of __A and __O go to
	 * __builtin_ia32_selectd_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A)
	{
	  __v16si __bcast = (__v16si)_mm512_broadcast_i32x4(__A);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __bcast,
	                                             (__v16si)__O);
	}

	/* Zero-masked form: _mm512_setzero_si512() takes the place of __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A)
	{
	  __v16si __bcast = (__v16si)_mm512_broadcast_i32x4(__A);
	  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, __bcast,
	                                             (__v16si)_mm512_setzero_si512());
	}

	/* Repeats the four 64-bit integers of __A twice (indices 0..3, x2) to form
	 * an 8-element vector. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_broadcast_i64x4(__m256i __A)
	{
	  __v4di __quad = (__v4di)__A;
	  return (__m512i)__builtin_shufflevector(__quad, __quad,
	                                          0, 1, 2, 3, 0, 1, 2, 3);
	}

	/* Masked form: __M, the broadcast of __A and __O go to
	 * __builtin_ia32_selectq_512. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A)
	{
	  __v8di __bcast = (__v8di)_mm512_broadcast_i64x4(__A);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __bcast,
	                                             (__v8di)__O);
	}

	/* Zero-masked form: _mm512_setzero_si512() takes the place of __O. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A)
	{
	  __v8di __bcast = (__v8di)_mm512_broadcast_i64x4(__A);
	  return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, __bcast,
	                                             (__v8di)_mm512_setzero_si512());
	}

	/* Masked form of _mm512_broadcastsd_pd: __M, the broadcast of __A and __O
	 * go to __builtin_ia32_selectpd_512. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A)
	{
	  __v8df __bcast = (__v8df) _mm512_broadcastsd_pd(__A);
	  return (__m512d)__builtin_ia32_selectpd_512(__M, __bcast, (__v8df) __O);
	}

	/* Zero-masked form: _mm512_setzero_pd() takes the place of __O. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A)
	{
	  __v8df __bcast = (__v8df) _mm512_broadcastsd_pd(__A);
	  return (__m512d)__builtin_ia32_selectpd_512(__M, __bcast,
	                                              (__v8df) _mm512_setzero_pd());
	}

	/* Masked form of _mm512_broadcastss_ps: __M, the broadcast of __A and __O
	 * go to __builtin_ia32_selectps_512. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf) _mm512_broadcastss_ps(__A);
	  return (__m512)__builtin_ia32_selectps_512(__M, __bcast, (__v16sf) __O);
	}

	/* Zero-masked form: _mm512_setzero_ps() takes the place of __O. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A)
	{
	  __v16sf __bcast = (__v16sf) _mm512_broadcastss_ps(__A);
	  return (__m512)__builtin_ia32_selectps_512(__M, __bcast,
	                                             (__v16sf) _mm512_setzero_ps());
	}

	/* 32-bit -> 8-bit conversions built on __builtin_ia32_pmovsdb512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi32_epi8 (__m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, __src,
	                                                  (__mmask16) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16qi *), __A and __M to
	 * __builtin_ia32_pmovsdb512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_storeu_epi8 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovsdb512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* 32-bit -> 16-bit conversions built on __builtin_ia32_pmovsdw512_mask.
	 * Plain form: all-ones mask, second operand from _mm256_undefined_si256(). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi32_epi16 (__m512i __A)
	{
	  __v16hi __src = (__v16hi) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, __src,
	                                                  (__mmask16) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
	{
	  __v16hi __src = (__v16hi) __O;
	  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, __src, __M);
	}

	/* Zero-masked form: _mm256_setzero_si256() supplies the second operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A)
	{
	  __v16hi __src = (__v16hi) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16hi *), __A and __M to
	 * __builtin_ia32_pmovsdw512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16hi *__dst = (__v16hi *) __P;
	  __builtin_ia32_pmovsdw512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* 64-bit -> 8-bit conversions built on __builtin_ia32_pmovsqb512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi64_epi8 (__m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, __src,
	                                                  (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16qi *), __A and __M to
	 * __builtin_ia32_pmovsqb512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_storeu_epi8 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovsqb512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 64-bit -> 32-bit conversions built on __builtin_ia32_pmovsqd512_mask.
	 * Plain form: all-ones mask, second operand from _mm256_undefined_si256(). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi64_epi32 (__m512i __A)
	{
	  __v8si __src = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, __src,
	                                                  (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
	{
	  __v8si __src = (__v8si) __O;
	  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm256_setzero_si256() supplies the second operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A)
	{
	  __v8si __src = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v8si *), __A and __M to
	 * __builtin_ia32_pmovsqd512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8si *__dst = (__v8si *) __P;
	  __builtin_ia32_pmovsqd512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 64-bit -> 16-bit conversions built on __builtin_ia32_pmovsqw512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtsepi64_epi16 (__m512i __A)
	{
	  __v8hi __src = (__v8hi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, __src,
	                                                  (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8hi __src = (__v8hi) __O;
	  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A)
	{
	  __v8hi __src = (__v8hi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v8hi *), __A and __M to
	 * __builtin_ia32_pmovsqw512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtsepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8hi *__dst = (__v8hi *) __P;
	  __builtin_ia32_pmovsqw512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 32-bit -> 8-bit conversions built on __builtin_ia32_pmovusdb512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi32_epi8 (__m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, __src,
	                                                   (__mmask16) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16qi *), __A and __M to
	 * __builtin_ia32_pmovusdb512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_storeu_epi8 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovusdb512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* 32-bit -> 16-bit conversions built on __builtin_ia32_pmovusdw512_mask.
	 * Plain form: all-ones mask, second operand from _mm256_undefined_si256(). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi32_epi16 (__m512i __A)
	{
	  __v16hi __src = (__v16hi) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, __src,
	                                                   (__mmask16) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
	{
	  __v16hi __src = (__v16hi) __O;
	  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, __src, __M);
	}

	/* Zero-masked form: _mm256_setzero_si256() supplies the second operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A)
	{
	  __v16hi __src = (__v16hi) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16hi *), __A and __M to
	 * __builtin_ia32_pmovusdw512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16hi *__dst = (__v16hi *) __P;
	  __builtin_ia32_pmovusdw512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* 64-bit -> 8-bit conversions built on __builtin_ia32_pmovusqb512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi64_epi8 (__m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, __src,
	                                                   (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16qi *), __A and __M to
	 * __builtin_ia32_pmovusqb512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_storeu_epi8 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovusqb512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 64-bit -> 32-bit conversions built on __builtin_ia32_pmovusqd512_mask.
	 * Plain form: all-ones mask, second operand from _mm256_undefined_si256(). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi64_epi32 (__m512i __A)
	{
	  __v8si __src = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, __src,
	                                                   (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
	{
	  __v8si __src = (__v8si) __O;
	  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm256_setzero_si256() supplies the second operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A)
	{
	  __v8si __src = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v8si *), __A and __M to
	 * __builtin_ia32_pmovusqd512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8si *__dst = (__v8si *) __P;
	  __builtin_ia32_pmovusqd512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 64-bit -> 16-bit conversions built on __builtin_ia32_pmovusqw512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtusepi64_epi16 (__m512i __A)
	{
	  __v8hi __src = (__v8hi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, __src,
	                                                   (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8hi __src = (__v8hi) __O;
	  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A)
	{
	  __v8hi __src = (__v8hi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v8hi *), __A and __M to
	 * __builtin_ia32_pmovusqw512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8hi *__dst = (__v8hi *) __P;
	  __builtin_ia32_pmovusqw512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 32-bit -> 8-bit conversions built on __builtin_ia32_pmovdb512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_epi8 (__m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, __src,
	                                                 (__mmask16) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16qi *), __A and __M to
	 * __builtin_ia32_pmovdb512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_storeu_epi8 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovdb512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* 32-bit -> 16-bit conversions built on __builtin_ia32_pmovdw512_mask.
	 * Plain form: all-ones mask, second operand from _mm256_undefined_si256(). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtepi32_epi16 (__m512i __A)
	{
	  __v16hi __src = (__v16hi) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, __src,
	                                                 (__mmask16) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A)
	{
	  __v16hi __src = (__v16hi) __O;
	  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, __src, __M);
	}

	/* Zero-masked form: _mm256_setzero_si256() supplies the second operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A)
	{
	  __v16hi __src = (__v16hi) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16hi *), __A and __M to
	 * __builtin_ia32_pmovdw512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A)
	{
	  __v16hi *__dst = (__v16hi *) __P;
	  __builtin_ia32_pmovdw512mem_mask (__dst, (__v16si) __A, __M);
	}

	/* 64-bit -> 8-bit conversions built on __builtin_ia32_pmovqb512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtepi64_epi8 (__m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, __src,
	                                                 (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) __O;
	  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A)
	{
	  __v16qi __src = (__v16qi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v16qi *), __A and __M to
	 * __builtin_ia32_pmovqb512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_storeu_epi8 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v16qi *__dst = (__v16qi *) __P;
	  __builtin_ia32_pmovqb512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 64-bit -> 32-bit conversions built on __builtin_ia32_pmovqd512_mask.
	 * Plain form: all-ones mask, second operand from _mm256_undefined_si256(). */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_cvtepi64_epi32 (__m512i __A)
	{
	  __v8si __src = (__v8si) _mm256_undefined_si256 ();
	  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, __src,
	                                                 (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A)
	{
	  __v8si __src = (__v8si) __O;
	  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm256_setzero_si256() supplies the second operand. */
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A)
	{
	  __v8si __src = (__v8si) _mm256_setzero_si256 ();
	  return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v8si *), __A and __M to
	 * __builtin_ia32_pmovqd512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8si *__dst = (__v8si *) __P;
	  __builtin_ia32_pmovqd512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* 64-bit -> 16-bit conversions built on __builtin_ia32_pmovqw512_mask.
	 * Plain form: all-ones mask, second operand from _mm_undefined_si128(). */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_cvtepi64_epi16 (__m512i __A)
	{
	  __v8hi __src = (__v8hi) _mm_undefined_si128 ();
	  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, __src,
	                                                 (__mmask8) -1);
	}

	/* Masked form: __O is the second operand, __M the mask. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A)
	{
	  __v8hi __src = (__v8hi) __O;
	  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, __src, __M);
	}

	/* Zero-masked form: _mm_setzero_si128() supplies the second operand. */
	static __inline__ __m128i __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A)
	{
	  __v8hi __src = (__v8hi) _mm_setzero_si128 ();
	  return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, __src, __M);
	}

	/* Store form: passes __P (as __v8hi *), __A and __M to
	 * __builtin_ia32_pmovqw512mem_mask. */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A)
	{
	  __v8hi *__dst = (__v8hi *) __P;
	  __builtin_ia32_pmovqw512mem_mask (__dst, (__v8di) __A, __M);
	}

	/* Extracts four consecutive 32-bit elements of A, starting at element
	 * ((imm) & 0x3) * 4, via __builtin_shufflevector.  The second shuffle
	 * operand is never indexed (all indices are below 16).
	 * imm is expanded four times; it must be a side-effect-free constant. */
	#define _mm512_extracti32x4_epi32(A, imm) __extension__ ({ \
	(__m128i)__builtin_shufflevector((__v16si)(__m512i)(A), \
	(__v16si)_mm512_undefined_epi32(), \
	0 + ((imm) & 0x3) * 4, \
	1 + ((imm) & 0x3) * 4, \
	2 + ((imm) & 0x3) * 4, \
	3 + ((imm) & 0x3) * 4); })

	/* Masked form: hands U, the extracted vector and W to
	 * __builtin_ia32_selectd_128. */
	#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) __extension__ ({ \
	(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
	(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
	(__v4si)(W)); })

	/* Zero-masked form: _mm_setzero_si128() takes the place of W. */
	#define _mm512_maskz_extracti32x4_epi32(U, A, imm) __extension__ ({ \
	(__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \
	(__v4si)_mm512_extracti32x4_epi32((A), (imm)), \
	(__v4si)_mm_setzero_si128()); })

	/* Extracts four 64-bit elements of A via __builtin_shufflevector:
	 * elements 4..7 when bit 0 of imm is set, elements 0..3 otherwise.
	 * imm is expanded four times; it must be a side-effect-free constant. */
	#define _mm512_extracti64x4_epi64(A, imm) __extension__ ({ \
	(__m256i)__builtin_shufflevector((__v8di)(__m512i)(A), \
	(__v8di)_mm512_undefined_epi32(), \
	((imm) & 1) ? 4 : 0, \
	((imm) & 1) ? 5 : 1, \
	((imm) & 1) ? 6 : 2, \
	((imm) & 1) ? 7 : 3); })

	/* Masked form: hands U, the extracted vector and W to
	 * __builtin_ia32_selectq_256. */
	#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) __extension__ ({ \
	(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
	(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
	(__v4di)(W)); })

	/* Zero-masked form: _mm256_setzero_si256() takes the place of W. */
	#define _mm512_maskz_extracti64x4_epi64(U, A, imm) __extension__ ({ \
	(__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \
	(__v4di)_mm512_extracti64x4_epi64((A), (imm)), \
	(__v4di)_mm256_setzero_si256()); })

	/* Builds an 8-double vector from A and the 4-double vector B with
	 * __builtin_shufflevector; indices 8..11 address the widened B.
	 * Bit 0 of imm set:   result = { A[0..3], B[0..3] }.
	 * Bit 0 of imm clear: result = { B[0..3], A[4..7] }.
	 * imm is expanded eight times; it must be a side-effect-free constant. */
	#define _mm512_insertf64x4(A, B, imm) __extension__ ({ \
	(__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
	(__v8df)_mm512_castpd256_pd512((__m256d)(B)), \
	((imm) & 0x1) ? 0 : 8, \
	((imm) & 0x1) ? 1 : 9, \
	((imm) & 0x1) ? 2 : 10, \
	((imm) & 0x1) ? 3 : 11, \
	((imm) & 0x1) ? 8 : 4, \
	((imm) & 0x1) ? 9 : 5, \
	((imm) & 0x1) ? 10 : 6, \
	((imm) & 0x1) ? 11 : 7); })

	/* Masked form: hands U, the insert result and W to
	 * __builtin_ia32_selectpd_512. */
	#define _mm512_mask_insertf64x4(W, U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_insertf64x4((A), (B), (imm)), \
	(__v8df)(W)); })

	/* Zero-masked form: _mm512_setzero_pd() takes the place of W. */
	#define _mm512_maskz_insertf64x4(U, A, B, imm) __extension__ ({ \
	(__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	(__v8df)_mm512_insertf64x4((A), (B), (imm)), \
	(__v8df)_mm512_setzero_pd()); })

	/* Builds an 8 x 64-bit integer vector from A and the 4-element vector B
	 * with __builtin_shufflevector; indices 8..11 address the widened B.
	 * Bit 0 of imm set:   result = { A[0..3], B[0..3] }.
	 * Bit 0 of imm clear: result = { B[0..3], A[4..7] }.
	 * imm is expanded eight times; it must be a side-effect-free constant. */
	#define _mm512_inserti64x4(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
	(__v8di)_mm512_castsi256_si512((__m256i)(B)), \
	((imm) & 0x1) ? 0 : 8, \
	((imm) & 0x1) ? 1 : 9, \
	((imm) & 0x1) ? 2 : 10, \
	((imm) & 0x1) ? 3 : 11, \
	((imm) & 0x1) ? 8 : 4, \
	((imm) & 0x1) ? 9 : 5, \
	((imm) & 0x1) ? 10 : 6, \
	((imm) & 0x1) ? 11 : 7); })

	/* Masked form: hands U, the insert result and W to
	 * __builtin_ia32_selectq_512. */
	#define _mm512_mask_inserti64x4(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_inserti64x4((A), (B), (imm)), \
	(__v8di)(W)); })

	/* Zero-masked form: _mm512_setzero_si512() takes the place of W. */
	#define _mm512_maskz_inserti64x4(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	(__v8di)_mm512_inserti64x4((A), (B), (imm)), \
	(__v8di)_mm512_setzero_si512()); })

	/* Builds a 16-float vector from A and the 4-float vector B with
	 * __builtin_shufflevector; indices 16..19 address the widened B.
	 * The 4-lane group whose number equals (imm & 0x3) is taken from B[0..3];
	 * every other group keeps the matching lanes of A.
	 * imm is expanded sixteen times; it must be a side-effect-free constant. */
	#define _mm512_insertf32x4(A, B, imm) __extension__ ({ \
	(__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
	(__v16sf)_mm512_castps128_ps512((__m128)(B)),\
	(((imm) & 0x3) == 0) ? 16 : 0, \
	(((imm) & 0x3) == 0) ? 17 : 1, \
	(((imm) & 0x3) == 0) ? 18 : 2, \
	(((imm) & 0x3) == 0) ? 19 : 3, \
	(((imm) & 0x3) == 1) ? 16 : 4, \
	(((imm) & 0x3) == 1) ? 17 : 5, \
	(((imm) & 0x3) == 1) ? 18 : 6, \
	(((imm) & 0x3) == 1) ? 19 : 7, \
	(((imm) & 0x3) == 2) ? 16 : 8, \
	(((imm) & 0x3) == 2) ? 17 : 9, \
	(((imm) & 0x3) == 2) ? 18 : 10, \
	(((imm) & 0x3) == 2) ? 19 : 11, \
	(((imm) & 0x3) == 3) ? 16 : 12, \
	(((imm) & 0x3) == 3) ? 17 : 13, \
	(((imm) & 0x3) == 3) ? 18 : 14, \
	(((imm) & 0x3) == 3) ? 19 : 15); })

	/* Masked form: hands U, the insert result and W to
	 * __builtin_ia32_selectps_512. */
	#define _mm512_mask_insertf32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
	(__v16sf)(W)); })

	/* Zero-masked form: _mm512_setzero_ps() takes the place of W. */
	#define _mm512_maskz_insertf32x4(U, A, B, imm) __extension__ ({ \
	(__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
	(__v16sf)_mm512_insertf32x4((A), (B), (imm)), \
	(__v16sf)_mm512_setzero_ps()); })

	/* Builds a 16 x 32-bit integer vector from A and the 4-element vector B
	 * with __builtin_shufflevector; indices 16..19 address the widened B.
	 * The 4-lane group whose number equals (imm & 0x3) is taken from B[0..3];
	 * every other group keeps the matching lanes of A.
	 * imm is expanded sixteen times; it must be a side-effect-free constant. */
	#define _mm512_inserti32x4(A, B, imm) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
	(__v16si)_mm512_castsi128_si512((__m128i)(B)),\
	(((imm) & 0x3) == 0) ? 16 : 0, \
	(((imm) & 0x3) == 0) ? 17 : 1, \
	(((imm) & 0x3) == 0) ? 18 : 2, \
	(((imm) & 0x3) == 0) ? 19 : 3, \
	(((imm) & 0x3) == 1) ? 16 : 4, \
	(((imm) & 0x3) == 1) ? 17 : 5, \
	(((imm) & 0x3) == 1) ? 18 : 6, \
	(((imm) & 0x3) == 1) ? 19 : 7, \
	(((imm) & 0x3) == 2) ? 16 : 8, \
	(((imm) & 0x3) == 2) ? 17 : 9, \
	(((imm) & 0x3) == 2) ? 18 : 10, \
	(((imm) & 0x3) == 2) ? 19 : 11, \
	(((imm) & 0x3) == 3) ? 16 : 12, \
	(((imm) & 0x3) == 3) ? 17 : 13, \
	(((imm) & 0x3) == 3) ? 18 : 14, \
	(((imm) & 0x3) == 3) ? 19 : 15); })

	/* Masked form: hands U, the insert result and W to
	 * __builtin_ia32_selectd_512. */
	#define _mm512_mask_inserti32x4(W, U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_inserti32x4((A), (B), (imm)), \
	(__v16si)(W)); })

	/* Zero-masked form: _mm512_setzero_si512() takes the place of W. */
	#define _mm512_maskz_inserti32x4(U, A, B, imm) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_inserti32x4((A), (B), (imm)), \
	(__v16si)_mm512_setzero_si512()); })

	/* Wrappers around __builtin_ia32_getmantpd512_mask with an explicit
	 * rounding argument R.  The builtin's immediate is built as
	 * ((C) << 2) | (B).  The stray backslash that preceded `|` was not valid
	 * C and has been removed.
	 * Plain form: all-ones mask, third operand from _mm512_undefined_pd().
	 * mask form: W and U.  maskz form: _mm512_setzero_pd() and U. */
	#define _mm512_getmant_round_pd(A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	(int)(((C)<<2) | (B)), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	(int)(((C)<<2) | (B)), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) __extension__ ({ \
	(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	(int)(((C)<<2) | (B)), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	/* Wrappers around __builtin_ia32_getmantpd512_mask that pass
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument.  The builtin's
	 * immediate is built as ((C) << 2) | (B).  The stray backslash that
	 * preceded `|` was not valid C and has been removed.
	 * Plain form: all-ones mask, third operand from _mm512_setzero_pd().
	 * mask form: W and U.  maskz form: _mm512_setzero_pd() and U. */
	#define _mm512_getmant_pd(A, B, C) __extension__ ({ \
	(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	(int)(((C)<<2) | (B)), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_getmant_pd(W, U, A, B, C) __extension__ ({ \
	(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	(int)(((C)<<2) | (B)), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_maskz_getmant_pd(U, A, B, C) __extension__ ({ \
	(__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \
	(int)(((C)<<2) | (B)), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Wrappers around __builtin_ia32_getmantps512_mask with an explicit
	 * rounding argument R.  The builtin's immediate is built as
	 * ((C) << 2) | (B).  The stray backslash that preceded `|` was not valid
	 * C and has been removed.
	 * Plain form: all-ones mask, third operand from _mm512_undefined_ps().
	 * mask form: W and U.  maskz form: _mm512_setzero_ps() and U. */
	#define _mm512_getmant_round_ps(A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	(int)(((C)<<2) | (B)), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, (int)(R)); })

	#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	(int)(((C)<<2) | (B)), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) __extension__ ({ \
	(__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	(int)(((C)<<2) | (B)), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), (int)(R)); })

	/* Wrappers around __builtin_ia32_getmantps512_mask that pass
	 * _MM_FROUND_CUR_DIRECTION as the rounding argument.  The builtin's
	 * immediate is built as ((C) << 2) | (B).  The stray backslash that
	 * preceded `|` was not valid C and has been removed.
	 * Plain form: all-ones mask, third operand from _mm512_undefined_ps().
	 * mask form: W and U.  maskz form: _mm512_setzero_ps() and U. */
	#define _mm512_getmant_ps(A, B, C) __extension__ ({ \
	(__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	(int)(((C)<<2)|(B)), \
	(__v16sf)_mm512_undefined_ps(), \
	(__mmask16)-1, \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_mask_getmant_ps(W, U, A, B, C) __extension__ ({ \
	(__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	(int)(((C)<<2)|(B)), \
	(__v16sf)(__m512)(W), \
	(__mmask16)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	#define _mm512_maskz_getmant_ps(U, A, B, C) __extension__ ({ \
	(__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \
	(int)(((C)<<2)|(B)), \
	(__v16sf)_mm512_setzero_ps(), \
	(__mmask16)(U), \
	_MM_FROUND_CUR_DIRECTION); })

	/* Wrappers around __builtin_ia32_getexppd512_mask with an explicit rounding
	 * argument R.  Plain form: all-ones mask, second operand from
	 * _mm512_undefined_pd().  mask form: W and U.  maskz form:
	 * _mm512_setzero_pd() and U. */
	#define _mm512_getexp_round_pd(A, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_getexppd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)_mm512_undefined_pd(), \
	      (__mmask8)-1, (int)(R)); })

	#define _mm512_mask_getexp_round_pd(W, U, A, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_getexppd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)(__m512d)(W), \
	      (__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_getexp_round_pd(U, A, R) __extension__ ({ \
	  (__m512d)__builtin_ia32_getexppd512_mask( \
	      (__v8df)(__m512d)(A), (__v8df)_mm512_setzero_pd(), \
	      (__mmask8)(U), (int)(R)); })

	/* __builtin_ia32_getexppd512_mask with _MM_FROUND_CUR_DIRECTION.
	 * Plain form: all-ones mask, second operand from _mm512_undefined_pd(). */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_getexp_pd (__m512d __A)
	{
	  __v8df __src = (__v8df) _mm512_undefined_pd ();
	  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, __src,
	                                                   (__mmask8) -1,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the second operand, __U the mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) __W;
	  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, __src,
	                                                   (__mmask8) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked form: _mm512_setzero_pd() supplies the second operand. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A)
	{
	  __v8df __src = (__v8df) _mm512_setzero_pd ();
	  return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, __src,
	                                                   (__mmask8) __U,
	                                                   _MM_FROUND_CUR_DIRECTION);
	}

	/* Wrappers around __builtin_ia32_getexpps512_mask with an explicit rounding
	 * argument R.  Plain form: all-ones mask, second operand from
	 * _mm512_undefined_ps().  mask form: W and U.  maskz form:
	 * _mm512_setzero_ps() and U. */
	#define _mm512_getexp_round_ps(A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_getexpps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)_mm512_undefined_ps(), \
	      (__mmask16)-1, (int)(R)); })

	#define _mm512_mask_getexp_round_ps(W, U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_getexpps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)(__m512)(W), \
	      (__mmask16)(U), (int)(R)); })

	#define _mm512_maskz_getexp_round_ps(U, A, R) __extension__ ({ \
	  (__m512)__builtin_ia32_getexpps512_mask( \
	      (__v16sf)(__m512)(A), (__v16sf)_mm512_setzero_ps(), \
	      (__mmask16)(U), (int)(R)); })

	/* __builtin_ia32_getexpps512_mask with _MM_FROUND_CUR_DIRECTION.
	 * Plain form: all-ones mask, second operand from _mm512_undefined_ps(). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_getexp_ps (__m512 __A)
	{
	  __v16sf __src = (__v16sf) _mm512_undefined_ps ();
	  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, __src,
	                                                  (__mmask16) -1,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Masked form: __W is the second operand, __U the mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) __W;
	  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, __src,
	                                                  (__mmask16) __U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Zero-masked form: _mm512_setzero_ps() supplies the second operand. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A)
	{
	  __v16sf __src = (__v16sf) _mm512_setzero_ps ();
	  return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, __src,
	                                                  (__mmask16) __U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Gathers via __builtin_ia32_gatherdiv16sf: float base pointer `addr`,
	 * __v8di `index`, int `scale`.  Plain form: all-ones mask with
	 * _mm256_undefined_ps() as first operand.  mask form: v1_old and mask. */
	#define _mm512_i64gather_ps(index, addr, scale) __extension__ ({ \
	  (__m256)__builtin_ia32_gatherdiv16sf( \
	      (__v8sf)_mm256_undefined_ps(), (float const *)(addr), \
	      (__v8di)(__m512i)(index), (__mmask8)-1, (int)(scale)); })

	#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m256)__builtin_ia32_gatherdiv16sf( \
	      (__v8sf)(__m256)(v1_old), (float const *)(addr), \
	      (__v8di)(__m512i)(index), (__mmask8)(mask), (int)(scale)); })

	/* Unmasked gather via __builtin_ia32_gatherdiv16si: int base pointer
	 * `addr`, __v8di `index`, int `scale`, all-ones mask.
	 * The first operand comes from the integer helper _mm256_undefined_si256()
	 * rather than the float helper _mm256_undefined_ps(), so no float-to-int
	 * vector cast is involved; this matches the other __v8si users in this
	 * file. */
	#define _mm512_i64gather_epi32(index, addr, scale) __extension__ ({\
	(__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \
	(int const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)-1, (int)(scale)); })

	/* Masked gather via __builtin_ia32_gatherdiv16si: v1_old is the first
	 * operand, `mask` the 8-bit mask. */
	#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m256i)__builtin_ia32_gatherdiv16si( \
	      (__v8si)(__m256i)(v1_old), (int const *)(addr), \
	      (__v8di)(__m512i)(index), (__mmask8)(mask), (int)(scale)); })

	/* Gathers via __builtin_ia32_gatherdiv8df: double base pointer `addr`,
	 * __v8di `index`, int `scale`.  Plain form: all-ones mask with
	 * _mm512_undefined_pd() as first operand.  mask form: v1_old and mask. */
	#define _mm512_i64gather_pd(index, addr, scale) __extension__ ({ \
	  (__m512d)__builtin_ia32_gatherdiv8df( \
	      (__v8df)_mm512_undefined_pd(), (double const *)(addr), \
	      (__v8di)(__m512i)(index), (__mmask8)-1, (int)(scale)); })

	#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m512d)__builtin_ia32_gatherdiv8df( \
	      (__v8df)(__m512d)(v1_old), (double const *)(addr), \
	      (__v8di)(__m512i)(index), (__mmask8)(mask), (int)(scale)); })

	/* Unmasked gather via __builtin_ia32_gatherdiv8di: long long base pointer
	 * `addr`, __v8di `index`, int `scale`, all-ones mask.
	 * The first operand comes from the integer helper _mm512_undefined_epi32()
	 * rather than the double helper _mm512_undefined_pd(), so no
	 * double-to-integer vector cast is involved; this matches the other __v8di
	 * users in this file. */
	#define _mm512_i64gather_epi64(index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \
	(long long const *)(addr), \
	(__v8di)(__m512i)(index), (__mmask8)-1, \
	(int)(scale)); })

	#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({\
	(__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \
	(long long const *)(addr), \
	(__v8di)(__m512i)(index), \
	(__mmask8)(mask), (int)(scale)); })

	/*
	 * Gathers addressed by a vector of 32-bit indices. Each macro forwards to
	 * the matching __builtin_ia32_gathersiv* builtin; unmasked forms pass an
	 * undefined source vector and an all-ones mask, _mask_ forms pass v1_old and
	 * mask.
	 * NOTE(review): the two _ps forms cast index through (__m512)/(__v16sf)
	 * while the _epi32 forms use (__m512i)/(__v16si) -- confirm this matches the
	 * index type the builtin is declared with.
	 */
	#define _mm512_i32gather_ps(index, addr, scale) __extension__ ({ \
	  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \
	                                       (float const *)(addr), \
	                                       (__v16sf)(__m512)(index), \
	                                       (__mmask16)-1, (int)(scale)); })

	#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \
	                                       (float const *)(addr), \
	                                       (__v16sf)(__m512)(index), \
	                                       (__mmask16)(mask), (int)(scale)); })

	#define _mm512_i32gather_epi32(index, addr, scale) __extension__ ({ \
	  (__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \
	                                        (int const *)(addr), \
	                                        (__v16si)(__m512i)(index), \
	                                        (__mmask16)-1, (int)(scale)); })

	#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \
	                                        (int const *)(addr), \
	                                        (__v16si)(__m512i)(index), \
	                                        (__mmask16)(mask), (int)(scale)); })

	#define _mm512_i32gather_pd(index, addr, scale) __extension__ ({ \
	  (__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \
	                                       (double const *)(addr), \
	                                       (__v8si)(__m256i)(index), \
	                                       (__mmask8)-1, (int)(scale)); })

	#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \
	                                       (double const *)(addr), \
	                                       (__v8si)(__m256i)(index), \
	                                       (__mmask8)(mask), (int)(scale)); })

	#define _mm512_i32gather_epi64(index, addr, scale) __extension__ ({ \
	  (__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \
	                                       (long long const *)(addr), \
	                                       (__v8si)(__m256i)(index), \
	                                       (__mmask8)-1, (int)(scale)); })

	#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) __extension__ ({ \
	  (__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \
	                                       (long long const *)(addr), \
	                                       (__v8si)(__m256i)(index), \
	                                       (__mmask8)(mask), (int)(scale)); })

	/*
	 * Scatters addressed by a vector of 64-bit indices. Each macro forwards to
	 * the matching __builtin_ia32_scatterdiv* builtin with the destination
	 * pointer, an 8-bit mask (all ones for the unmasked forms), the index
	 * vector, the data vector v1 and the scale.
	 */
	#define _mm512_i64scatter_ps(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)-1, \
	                                (__v8di)(__m512i)(index), \
	                                (__v8sf)(__m256)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv16sf((float *)(addr), (__mmask8)(mask), \
	                                (__v8di)(__m512i)(index), \
	                                (__v8sf)(__m256)(v1), (int)(scale)); })

	#define _mm512_i64scatter_epi32(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)-1, \
	                                (__v8di)(__m512i)(index), \
	                                (__v8si)(__m256i)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv16si((int *)(addr), (__mmask8)(mask), \
	                                (__v8di)(__m512i)(index), \
	                                (__v8si)(__m256i)(v1), (int)(scale)); })

	#define _mm512_i64scatter_pd(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)-1, \
	                               (__v8di)(__m512i)(index), \
	                               (__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv8df((double *)(addr), (__mmask8)(mask), \
	                               (__v8di)(__m512i)(index), \
	                               (__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_i64scatter_epi64(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)-1, \
	                               (__v8di)(__m512i)(index), \
	                               (__v8di)(__m512i)(v1), (int)(scale)); })

	#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scatterdiv8di((long long *)(addr), (__mmask8)(mask), \
	                               (__v8di)(__m512i)(index), \
	                               (__v8di)(__m512i)(v1), (int)(scale)); })

	/*
	 * Scatters addressed by a vector of 32-bit indices. Each macro forwards to
	 * the matching __builtin_ia32_scattersiv* builtin with the destination
	 * pointer, a mask (all ones for the unmasked forms), the index vector, the
	 * data vector v1 and the scale.
	 */
	#define _mm512_i32scatter_ps(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)-1, \
	                                (__v16si)(__m512i)(index), \
	                                (__v16sf)(__m512)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv16sf((float *)(addr), (__mmask16)(mask), \
	                                (__v16si)(__m512i)(index), \
	                                (__v16sf)(__m512)(v1), (int)(scale)); })

	#define _mm512_i32scatter_epi32(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)-1, \
	                                (__v16si)(__m512i)(index), \
	                                (__v16si)(__m512i)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv16si((int *)(addr), (__mmask16)(mask), \
	                                (__v16si)(__m512i)(index), \
	                                (__v16si)(__m512i)(v1), (int)(scale)); })

	#define _mm512_i32scatter_pd(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)-1, \
	                               (__v8si)(__m256i)(index), \
	                               (__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv8df((double *)(addr), (__mmask8)(mask), \
	                               (__v8si)(__m256i)(index), \
	                               (__v8df)(__m512d)(v1), (int)(scale)); })

	#define _mm512_i32scatter_epi64(addr, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)-1, \
	                               (__v8si)(__m256i)(index), \
	                               (__v8di)(__m512i)(v1), (int)(scale)); })

	#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) __extension__ ({ \
	  __builtin_ia32_scattersiv8di((long long *)(addr), (__mmask8)(mask), \
	                               (__v8si)(__m256i)(index), \
	                               (__v8di)(__m512i)(v1), (int)(scale)); })

	/*
	 * Masked scalar single-precision FMA: passes (__W, __A, __B), the mask __U
	 * and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddss3_mask.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W,
	                                               (__v4sf)__A,
	                                               (__v4sf)__B,
	                                               (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fmadd_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	                                        (__v4sf)(__m128)(A), \
	                                        (__v4sf)(__m128)(B), \
	                                        (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar single-precision FMA: passes (__A, __B, __C), the mask
	 * __U and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddss3_maskz.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
	                                                (__v4sf)__B,
	                                                (__v4sf)__C,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Zero-masked scalar single-precision FMA with explicit rounding: passes
	 * (A, B, C), the mask U and the rounding argument R to
	 * __builtin_ia32_vfmaddss3_maskz. R must be forwarded; passing
	 * _MM_FROUND_CUR_DIRECTION here would silently discard the caller's
	 * rounding mode and make this identical to _mm_maskz_fmadd_ss.
	 */
	#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
	                                         (__v4sf)(__m128)(B), \
	                                         (__v4sf)(__m128)(C), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar single-precision FMA: passes (__W, __X, __Y), the mask __U
	 * and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddss3_mask3.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)__W,
	                                                (__v4sf)__X,
	                                                (__v4sf)__Y,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \
	                                         (__v4sf)(__m128)(X), \
	                                         (__v4sf)(__m128)(Y), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar single-precision fused multiply-subtract, expressed as an
	 * FMA with the third operand negated: passes (__W, __A, -__B), the mask __U
	 * and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddss3_mask.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W,
	                                               (__v4sf)__A,
	                                               -(__v4sf)__B,
	                                               (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Masked scalar single-precision fused multiply-subtract with explicit
	 * rounding. The subtraction is obtained by negating B before handing it to
	 * the FMA builtin, exactly as _mm_mask_fmsub_ss and
	 * _mm_mask_fmsub_round_sd do; without the negation this macro would compute
	 * a multiply-add instead.
	 */
	#define _mm_mask_fmsub_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	                                        (__v4sf)(__m128)(A), \
	                                        -(__v4sf)(__m128)(B), \
	                                        (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar single-precision fused multiply-subtract: passes
	 * (__A, __B, -__C), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddss3_maskz.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)__A,
	                                                (__v4sf)__B,
	                                                -(__v4sf)__C,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \
	                                         (__v4sf)(__m128)(B), \
	                                         -(__v4sf)(__m128)(C), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar single-precision fused multiply-subtract: passes
	 * (__W, __X, __Y), the mask __U and _MM_FROUND_CUR_DIRECTION to the
	 * dedicated __builtin_ia32_vfmsubss3_mask3 builtin (no operand is negated
	 * here).
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	  return (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)__W,
	                                                (__v4sf)__X,
	                                                (__v4sf)__Y,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \
	                                         (__v4sf)(__m128)(X), \
	                                         (__v4sf)(__m128)(Y), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar single-precision negated multiply-add: passes
	 * (__W, -__A, __B), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddss3_mask.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fnmadd_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W,
	                                               -(__v4sf)__A,
	                                               (__v4sf)__B,
	                                               (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	                                        -(__v4sf)(__m128)(A), \
	                                        (__v4sf)(__m128)(B), \
	                                        (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar single-precision negated multiply-add: passes
	 * (-__A, __B, __C), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddss3_maskz.
	 * NOTE(review): the whole __A vector is negated, not just element 0 --
	 * confirm the builtin does not copy the upper elements of its first operand
	 * into the result.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fnmadd_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)__A,
	                                                (__v4sf)__B,
	                                                (__v4sf)__C,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
	                                         (__v4sf)(__m128)(B), \
	                                         (__v4sf)(__m128)(C), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar single-precision negated multiply-add: passes
	 * (-__W, __X, __Y), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddss3_mask3.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fnmadd_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)__W,
	                                                (__v4sf)__X,
	                                                (__v4sf)__Y,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_mask3(-(__v4sf)(__m128)(W), \
	                                         (__v4sf)(__m128)(X), \
	                                         (__v4sf)(__m128)(Y), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar single-precision negated multiply-subtract: passes
	 * (__W, -__A, -__B), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddss3_mask.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_fnmsub_ss(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)__W,
	                                               -(__v4sf)__A,
	                                               -(__v4sf)__B,
	                                               (__mmask8)__U,
	                                               _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \
	                                        -(__v4sf)(__m128)(A), \
	                                        -(__v4sf)(__m128)(B), \
	                                        (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar single-precision negated multiply-subtract: passes
	 * (-__A, __B, -__C), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddss3_maskz.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_fnmsub_ss(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C)
	{
	  return (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)__A,
	                                                (__v4sf)__B,
	                                                -(__v4sf)__C,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Zero-masked scalar single-precision negated multiply-subtract with
	 * explicit rounding: passes (-A, B, -C), the mask U and the rounding
	 * argument R to __builtin_ia32_vfmaddss3_maskz. R must be forwarded;
	 * hard-coding _MM_FROUND_CUR_DIRECTION would ignore the caller's rounding
	 * mode.
	 */
	#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfmaddss3_maskz(-(__v4sf)(__m128)(A), \
	                                         (__v4sf)(__m128)(B), \
	                                         -(__v4sf)(__m128)(C), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar single-precision negated multiply-subtract: passes
	 * (__W, __X, __Y), the mask __U and _MM_FROUND_CUR_DIRECTION to the
	 * dedicated __builtin_ia32_vfnmsubss3_mask3 builtin (no operand is negated
	 * here).
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask3_fnmsub_ss(__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U)
	{
	  return (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)__W,
	                                                 (__v4sf)__X,
	                                                 (__v4sf)__Y,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) __extension__ ({ \
	  (__m128)__builtin_ia32_vfnmsubss3_mask3((__v4sf)(__m128)(W), \
	                                          (__v4sf)(__m128)(X), \
	                                          (__v4sf)(__m128)(Y), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar double-precision FMA: passes (__W, __A, __B), the mask __U
	 * and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddsd3_mask.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W,
	                                                (__v2df)__A,
	                                                (__v2df)__B,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fmadd_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	                                         (__v2df)(__m128d)(A), \
	                                         (__v2df)(__m128d)(B), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar double-precision FMA: passes (__A, __B, __C), the mask
	 * __U and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddsd3_maskz.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
	                                                 (__v2df)__B,
	                                                 (__v2df)__C,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Zero-masked scalar double-precision FMA with explicit rounding: passes
	 * (A, B, C), the mask U and the rounding argument R to
	 * __builtin_ia32_vfmaddsd3_maskz. R must be forwarded; hard-coding
	 * _MM_FROUND_CUR_DIRECTION would ignore the caller's rounding mode and make
	 * this identical to _mm_maskz_fmadd_sd.
	 */
	#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
	                                          (__v2df)(__m128d)(B), \
	                                          (__v2df)(__m128d)(C), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar double-precision FMA: passes (__W, __X, __Y), the mask __U
	 * and _MM_FROUND_CUR_DIRECTION to __builtin_ia32_vfmaddsd3_mask3.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)__W,
	                                                 (__v2df)__X,
	                                                 (__v2df)__Y,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \
	                                          (__v2df)(__m128d)(X), \
	                                          (__v2df)(__m128d)(Y), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar double-precision fused multiply-subtract: passes
	 * (__W, __A, -__B), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_mask.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W,
	                                                (__v2df)__A,
	                                                -(__v2df)__B,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fmsub_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	                                         (__v2df)(__m128d)(A), \
	                                         -(__v2df)(__m128d)(B), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar double-precision fused multiply-subtract: passes
	 * (__A, __B, -__C), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_maskz.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)__A,
	                                                 (__v2df)__B,
	                                                 -(__v2df)__C,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \
	                                          (__v2df)(__m128d)(B), \
	                                          -(__v2df)(__m128d)(C), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar double-precision fused multiply-subtract: passes
	 * (__W, __X, __Y), the mask __U and _MM_FROUND_CUR_DIRECTION to the
	 * dedicated __builtin_ia32_vfmsubsd3_mask3 builtin (no operand is negated
	 * here).
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	  return (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)__W,
	                                                 (__v2df)__X,
	                                                 (__v2df)__Y,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \
	                                          (__v2df)(__m128d)(X), \
	                                          (__v2df)(__m128d)(Y), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar double-precision negated multiply-add: passes
	 * (__W, -__A, __B), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_mask.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fnmadd_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W,
	                                                -(__v2df)__A,
	                                                (__v2df)__B,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	                                         -(__v2df)(__m128d)(A), \
	                                         (__v2df)(__m128d)(B), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar double-precision negated multiply-add: passes
	 * (-__A, __B, __C), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_maskz.
	 * NOTE(review): the whole __A vector is negated, not just element 0 --
	 * confirm the builtin does not copy the upper element of its first operand
	 * into the result.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fnmadd_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)__A,
	                                                 (__v2df)__B,
	                                                 (__v2df)__C,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
	                                          (__v2df)(__m128d)(B), \
	                                          (__v2df)(__m128d)(C), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar double-precision negated multiply-add: passes
	 * (-__W, __X, __Y), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_mask3.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fnmadd_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)__W,
	                                                 (__v2df)__X,
	                                                 (__v2df)__Y,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_mask3(-(__v2df)(__m128d)(W), \
	                                          (__v2df)(__m128d)(X), \
	                                          (__v2df)(__m128d)(Y), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * Masked scalar double-precision negated multiply-subtract: passes
	 * (__W, -__A, -__B), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_mask.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_fnmsub_sd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)__W,
	                                                -(__v2df)__A,
	                                                -(__v2df)__B,
	                                                (__mmask8)__U,
	                                                _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \
	                                         -(__v2df)(__m128d)(A), \
	                                         -(__v2df)(__m128d)(B), \
	                                         (__mmask8)(U), (int)(R)); })

	/*
	 * Zero-masked scalar double-precision negated multiply-subtract: passes
	 * (-__A, __B, -__C), the mask __U and _MM_FROUND_CUR_DIRECTION to
	 * __builtin_ia32_vfmaddsd3_maskz.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_fnmsub_sd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C)
	{
	  return (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)__A,
	                                                 (__v2df)__B,
	                                                 -(__v2df)__C,
	                                                 (__mmask8)__U,
	                                                 _MM_FROUND_CUR_DIRECTION);
	}

	/*
	 * Zero-masked scalar double-precision negated multiply-subtract with
	 * explicit rounding: passes (-A, B, -C), the mask U and the rounding
	 * argument R to __builtin_ia32_vfmaddsd3_maskz. R must be forwarded;
	 * hard-coding _MM_FROUND_CUR_DIRECTION would ignore the caller's rounding
	 * mode.
	 */
	#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfmaddsd3_maskz(-(__v2df)(__m128d)(A), \
	                                          (__v2df)(__m128d)(B), \
	                                          -(__v2df)(__m128d)(C), \
	                                          (__mmask8)(U), (int)(R)); })

	/*
	 * mask3 scalar double-precision negated multiply-subtract: passes
	 * (__W, __X, __Y), the mask __U and _MM_FROUND_CUR_DIRECTION to the
	 * dedicated __builtin_ia32_vfnmsubsd3_mask3 builtin (no operand is negated
	 * here).
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask3_fnmsub_sd(__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U)
	{
	  return (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)__W,
	                                                  (__v2df)__X,
	                                                  (__v2df)__Y,
	                                                  (__mmask8)__U,
	                                                  _MM_FROUND_CUR_DIRECTION);
	}

	/* Same call with the caller-supplied rounding argument R. */
	#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) __extension__ ({ \
	  (__m128d)__builtin_ia32_vfnmsubsd3_mask3((__v2df)(__m128d)(W), \
	                                           (__v2df)(__m128d)(X), \
	                                           (__v2df)(__m128d)(Y), \
	                                           (__mmask8)(U), (int)(R)); })

	/*
	 * Permutes the doubles of X with __builtin_shufflevector. Each 2-bit field
	 * of C selects an element; the first four result indices are taken from
	 * elements 0..3 and the last four reuse the same fields offset by 4.
	 */
	#define _mm512_permutex_pd(X, C) __extension__ ({ \
	  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(X), \
	                                   (__v8df)_mm512_undefined_pd(), \
	                                   0 + (((C) >> 0) & 0x3), \
	                                   0 + (((C) >> 2) & 0x3), \
	                                   0 + (((C) >> 4) & 0x3), \
	                                   0 + (((C) >> 6) & 0x3), \
	                                   4 + (((C) >> 0) & 0x3), \
	                                   4 + (((C) >> 2) & 0x3), \
	                                   4 + (((C) >> 4) & 0x3), \
	                                   4 + (((C) >> 6) & 0x3)); })

	/* Selects between the permuted value and W using the mask U. */
	#define _mm512_mask_permutex_pd(W, U, X, C) __extension__ ({ \
	  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	                                       (__v8df)_mm512_permutex_pd((X), (C)), \
	                                       (__v8df)(__m512d)(W)); })

	/* Selects between the permuted value and _mm512_setzero_pd() using U. */
	#define _mm512_maskz_permutex_pd(U, X, C) __extension__ ({ \
	  (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
	                                       (__v8df)_mm512_permutex_pd((X), (C)), \
	                                       (__v8df)_mm512_setzero_pd()); })

	/*
	 * Permutes the 64-bit integers of X with __builtin_shufflevector. Each
	 * 2-bit field of C selects an element; the first four result indices are
	 * taken from elements 0..3 and the last four reuse the same fields offset
	 * by 4.
	 */
	#define _mm512_permutex_epi64(X, C) __extension__ ({ \
	  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(X), \
	                                   (__v8di)_mm512_undefined_epi32(), \
	                                   0 + (((C) >> 0) & 0x3), \
	                                   0 + (((C) >> 2) & 0x3), \
	                                   0 + (((C) >> 4) & 0x3), \
	                                   0 + (((C) >> 6) & 0x3), \
	                                   4 + (((C) >> 0) & 0x3), \
	                                   4 + (((C) >> 2) & 0x3), \
	                                   4 + (((C) >> 4) & 0x3), \
	                                   4 + (((C) >> 6) & 0x3)); })

	/* Selects between the permuted value and W using the mask U. */
	#define _mm512_mask_permutex_epi64(W, U, X, C) __extension__ ({ \
	  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
	                                      (__v8di)(__m512i)(W)); })

	/* Selects between the permuted value and _mm512_setzero_si512() using U. */
	#define _mm512_maskz_permutex_epi64(U, X, C) __extension__ ({ \
	  (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \
	                                      (__v8di)_mm512_permutex_epi64((X), (C)), \
	                                      (__v8di)_mm512_setzero_si512()); })

	/*
	 * Variable permute of doubles. All three forms call
	 * __builtin_ia32_permvardf512_mask with __Y first and the index vector __X
	 * second; they differ only in the third operand and the mask.
	 */

	/* Unmasked: undefined third operand, all-ones mask. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_permutexvar_pd(__m512i __X, __m512d __Y)
	{
	  return (__m512d)__builtin_ia32_permvardf512_mask((__v8df)__Y,
	                                                   (__v8di)__X,
	                                                   (__v8df)_mm512_undefined_pd(),
	                                                   (__mmask8)-1);
	}

	/* Masked: __W as third operand, mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_pd(__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y)
	{
	  return (__m512d)__builtin_ia32_permvardf512_mask((__v8df)__Y,
	                                                   (__v8di)__X,
	                                                   (__v8df)__W,
	                                                   (__mmask8)__U);
	}

	/* Zero-masked: _mm512_setzero_pd() as third operand, mask __U. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_pd(__mmask8 __U, __m512i __X, __m512d __Y)
	{
	  return (__m512d)__builtin_ia32_permvardf512_mask((__v8df)__Y,
	                                                   (__v8di)__X,
	                                                   (__v8df)_mm512_setzero_pd(),
	                                                   (__mmask8)__U);
	}

	/*
	 * Variable permute of 64-bit integers. All three forms call
	 * __builtin_ia32_permvardi512_mask with __Y first and the index vector __X
	 * second; they differ only in the third operand and the mask.
	 */

	/* Zero-masked: _mm512_setzero_si512() as third operand, mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_epi64(__mmask8 __M, __m512i __X, __m512i __Y)
	{
	  return (__m512i)__builtin_ia32_permvardi512_mask((__v8di)__Y,
	                                                   (__v8di)__X,
	                                                   (__v8di)_mm512_setzero_si512(),
	                                                   __M);
	}

	/* Unmasked: undefined third operand, all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_permutexvar_epi64(__m512i __X, __m512i __Y)
	{
	  return (__m512i)__builtin_ia32_permvardi512_mask((__v8di)__Y,
	                                                   (__v8di)__X,
	                                                   (__v8di)_mm512_undefined_epi32(),
	                                                   (__mmask8)-1);
	}

	/* Masked: __W as third operand, mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_epi64(__m512i __W, __mmask8 __M, __m512i __X,
	                              __m512i __Y)
	{
	  return (__m512i)__builtin_ia32_permvardi512_mask((__v8di)__Y,
	                                                   (__v8di)__X,
	                                                   (__v8di)__W,
	                                                   __M);
	}

	/*
	 * Variable permute of floats. All three forms call
	 * __builtin_ia32_permvarsf512_mask with __Y first and the index vector __X
	 * second; they differ only in the third operand and the mask.
	 */

	/* Unmasked: undefined third operand, all-ones mask. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_permutexvar_ps(__m512i __X, __m512 __Y)
	{
	  return (__m512)__builtin_ia32_permvarsf512_mask((__v16sf)__Y,
	                                                  (__v16si)__X,
	                                                  (__v16sf)_mm512_undefined_ps(),
	                                                  (__mmask16)-1);
	}

	/* Masked: __W as third operand, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_ps(__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y)
	{
	  return (__m512)__builtin_ia32_permvarsf512_mask((__v16sf)__Y,
	                                                  (__v16si)__X,
	                                                  (__v16sf)__W,
	                                                  (__mmask16)__U);
	}

	/* Zero-masked: _mm512_setzero_ps() as third operand, mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_ps(__mmask16 __U, __m512i __X, __m512 __Y)
	{
	  return (__m512)__builtin_ia32_permvarsf512_mask((__v16sf)__Y,
	                                                  (__v16si)__X,
	                                                  (__v16sf)_mm512_setzero_ps(),
	                                                  (__mmask16)__U);
	}

	/*
	 * Variable permute of 32-bit integers. All three forms call
	 * __builtin_ia32_permvarsi512_mask with __Y first and the index vector __X
	 * second; they differ only in the third operand and the mask.
	 */

	/* Zero-masked: _mm512_setzero_si512() as third operand, mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_permutexvar_epi32(__mmask16 __M, __m512i __X, __m512i __Y)
	{
	  return (__m512i)__builtin_ia32_permvarsi512_mask((__v16si)__Y,
	                                                   (__v16si)__X,
	                                                   (__v16si)_mm512_setzero_si512(),
	                                                   __M);
	}

	/* Unmasked: undefined third operand, all-ones mask. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_permutexvar_epi32(__m512i __X, __m512i __Y)
	{
	  return (__m512i)__builtin_ia32_permvarsi512_mask((__v16si)__Y,
	                                                   (__v16si)__X,
	                                                   (__v16si)_mm512_undefined_epi32(),
	                                                   (__mmask16)-1);
	}

	/* Alternate spelling of the unmasked form. */
	#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32

	/* Masked: __W as third operand, mask __M. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_permutexvar_epi32(__m512i __W, __mmask16 __M, __m512i __X,
	                              __m512i __Y)
	{
	  return (__m512i)__builtin_ia32_permvarsi512_mask((__v16si)__Y,
	                                                   (__v16si)__X,
	                                                   (__v16si)__W,
	                                                   __M);
	}

	/* Alternate spelling of the masked form. */
	#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32

	/* 16-bit mask helper: forwards both masks to __builtin_ia32_kandhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kand(__mmask16 __A, __mmask16 __B)
	{
	  return (__mmask16)__builtin_ia32_kandhi((__mmask16)__A,
	                                          (__mmask16)__B);
	}

	/* 16-bit mask helper: forwards both masks to __builtin_ia32_kandnhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kandn(__mmask16 __A, __mmask16 __B)
	{
	  return (__mmask16)__builtin_ia32_kandnhi((__mmask16)__A,
	                                           (__mmask16)__B);
	}

	/* 16-bit mask helper: forwards both masks to __builtin_ia32_korhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kor(__mmask16 __A, __mmask16 __B)
	{
	  return (__mmask16)__builtin_ia32_korhi((__mmask16)__A,
	                                         (__mmask16)__B);
	}

	/* Returns the int result of __builtin_ia32_kortestchi on the two masks. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_kortestc(__mmask16 __A, __mmask16 __B)
	{
	  return __builtin_ia32_kortestchi((__mmask16)__A,
	                                   (__mmask16)__B);
	}

	/* Returns the int result of __builtin_ia32_kortestzhi on the two masks. */
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_kortestz(__mmask16 __A, __mmask16 __B)
	{
	  return __builtin_ia32_kortestzhi((__mmask16)__A,
	                                   (__mmask16)__B);
	}

	/* 16-bit mask helper: forwards both masks to __builtin_ia32_kunpckhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kunpackb(__mmask16 __A, __mmask16 __B)
	{
	  return (__mmask16)__builtin_ia32_kunpckhi((__mmask16)__A,
	                                            (__mmask16)__B);
	}

	/* 16-bit mask helper: forwards both masks to __builtin_ia32_kxnorhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kxnor(__mmask16 __A, __mmask16 __B)
	{
	  return (__mmask16)__builtin_ia32_kxnorhi((__mmask16)__A,
	                                           (__mmask16)__B);
	}

	/* 16-bit mask helper: forwards both masks to __builtin_ia32_kxorhi. */
	static __inline__ __mmask16 __DEFAULT_FN_ATTRS
	_mm512_kxor(__mmask16 __A, __mmask16 __B)
	{
	  return (__mmask16)__builtin_ia32_kxorhi((__mmask16)__A,
	                                          (__mmask16)__B);
	}

	/*
	 * Stores __A to *__P with __builtin_nontemporal_store.
	 * Diff hunk: the removed line cast through plain __v8di; the added lines
	 * declare a local typedef of __v8di carrying aligned(64) and cast both the
	 * value and the pointer through it.
	 */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_stream_si512 (__m512i * __P, __m512i __A)
	{
	- __builtin_nontemporal_store((__v8di)__A, (__v8di*)__P);
	+ typedef __v8di __v8di_aligned __attribute__((aligned(64)));
	+ __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P);
	}

	/*
	 * Loads a __m512i from __P with __builtin_nontemporal_load.
	 * Diff hunk: the removed line read through a const __v8di pointer; the
	 * added lines declare a local typedef of __v8di carrying aligned(64) and
	 * read through a pointer to that type instead.
	 */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_stream_load_si512 (void *__P)
	{
	- return (__m512i) __builtin_nontemporal_load((const __v8di *)__P);
	+ typedef __v8di __v8di_aligned __attribute__((aligned(64)));
	+ return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P);
	}

	/*
	 * Stores __A to the memory at __P with __builtin_nontemporal_store.
	 * Diff hunk: the removed line cast through plain __v8df; the added lines
	 * declare a local typedef of __v8df carrying aligned(64) and cast both the
	 * value and the pointer through it.
	 */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_stream_pd (double *__P, __m512d __A)
	{
	- __builtin_nontemporal_store((__v8df)__A, (__v8df*)__P);
	+ typedef __v8df __v8df_aligned __attribute__((aligned(64)));
	+ __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P);
	}

	/*
	 * Stores __A to the memory at __P with __builtin_nontemporal_store.
	 * Diff hunk: the removed line cast through plain __v16sf; the added lines
	 * declare a local typedef of __v16sf carrying aligned(64) and cast both the
	 * value and the pointer through it.
	 */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_stream_ps (float *__P, __m512 __A)
	{
	- __builtin_nontemporal_store((__v16sf)__A, (__v16sf*)__P);
	+ typedef __v16sf __v16sf_aligned __attribute__((aligned(64)));
	+ __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P);
	}

	/*
	 * Forwards to __builtin_ia32_compressdf512_mask with __A as the source,
	 * __W as the second operand and the mask __U.
	 */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_compress_pd(__m512d __W, __mmask8 __U, __m512d __A)
	{
	  return (__m512d)__builtin_ia32_compressdf512_mask((__v8df)__A,
	                                                    (__v8df)__W,
	                                                    (__mmask8)__U);
	}

	/* Same builtin with _mm512_setzero_pd() as the second operand. */
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_pd(__mmask8 __U, __m512d __A)
	{
	  return (__m512d)__builtin_ia32_compressdf512_mask((__v8df)__A,
	                                                    (__v8df)_mm512_setzero_pd(),
	                                                    (__mmask8)__U);
	}

	/*
	 * Forwards to __builtin_ia32_compressdi512_mask with __A as the source,
	 * __W as the second operand and the mask __U.
	 */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_compress_epi64(__m512i __W, __mmask8 __U, __m512i __A)
	{
	  return (__m512i)__builtin_ia32_compressdi512_mask((__v8di)__A,
	                                                    (__v8di)__W,
	                                                    (__mmask8)__U);
	}

	/* Same builtin with _mm512_setzero_si512() as the second operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_epi64(__mmask8 __U, __m512i __A)
	{
	  return (__m512i)__builtin_ia32_compressdi512_mask((__v8di)__A,
	                                                    (__v8di)_mm512_setzero_si512(),
	                                                    (__mmask8)__U);
	}

	/*
	 * Forwards to __builtin_ia32_compresssf512_mask with __A as the source,
	 * __W as the second operand and the mask __U.
	 */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_compress_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_compresssf512_mask((__v16sf)__A,
	                                                   (__v16sf)__W,
	                                                   (__mmask16)__U);
	}

	/* Same builtin with _mm512_setzero_ps() as the second operand. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_ps(__mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_compresssf512_mask((__v16sf)__A,
	                                                   (__v16sf)_mm512_setzero_ps(),
	                                                   (__mmask16)__U);
	}

	/*
	 * Forwards to __builtin_ia32_compresssi512_mask with __A as the source,
	 * __W as the second operand and the mask __U.
	 */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_compress_epi32(__m512i __W, __mmask16 __U, __m512i __A)
	{
	  return (__m512i)__builtin_ia32_compresssi512_mask((__v16si)__A,
	                                                    (__v16si)__W,
	                                                    (__mmask16)__U);
	}

	/* Same builtin with _mm512_setzero_si512() as the second operand. */
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_compress_epi32(__mmask16 __U, __m512i __A)
	{
	  return (__m512i)__builtin_ia32_compresssi512_mask((__v16si)__A,
	                                                    (__v16si)_mm512_setzero_si512(),
	                                                    (__mmask16)__U);
	}

	/*
	 * Scalar single-precision compares producing an __mmask8. All four forward
	 * X, Y and the predicate P to __builtin_ia32_cmpss_mask; the _mask_ forms
	 * pass M as the input mask (otherwise all ones) and the _round_ forms pass R
	 * as the last argument (otherwise _MM_FROUND_CUR_DIRECTION).
	 */
	#define _mm_cmp_round_ss_mask(X, Y, P, R) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	                                      (__v4sf)(__m128)(Y), (int)(P), \
	                                      (__mmask8)-1, (int)(R)); })

	#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	                                      (__v4sf)(__m128)(Y), (int)(P), \
	                                      (__mmask8)(M), (int)(R)); })

	#define _mm_cmp_ss_mask(X, Y, P) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	                                      (__v4sf)(__m128)(Y), (int)(P), \
	                                      (__mmask8)-1, \
	                                      _MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_cmp_ss_mask(M, X, Y, P) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \
	                                      (__v4sf)(__m128)(Y), (int)(P), \
	                                      (__mmask8)(M), \
	                                      _MM_FROUND_CUR_DIRECTION); })

	/*
	 * Scalar double-precision compares producing an __mmask8. All four forward
	 * X, Y and the predicate P to __builtin_ia32_cmpsd_mask; the _mask_ forms
	 * pass M as the input mask (otherwise all ones) and the _round_ forms pass R
	 * as the last argument (otherwise _MM_FROUND_CUR_DIRECTION).
	 */
	#define _mm_cmp_round_sd_mask(X, Y, P, R) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	                                      (__v2df)(__m128d)(Y), (int)(P), \
	                                      (__mmask8)-1, (int)(R)); })

	#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	                                      (__v2df)(__m128d)(Y), (int)(P), \
	                                      (__mmask8)(M), (int)(R)); })

	#define _mm_cmp_sd_mask(X, Y, P) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	                                      (__v2df)(__m128d)(Y), (int)(P), \
	                                      (__mmask8)-1, \
	                                      _MM_FROUND_CUR_DIRECTION); })

	#define _mm_mask_cmp_sd_mask(M, X, Y, P) __extension__ ({ \
	  (__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \
	                                      (__v2df)(__m128d)(Y), (int)(P), \
	                                      (__mmask8)(M), \
	                                      _MM_FROUND_CUR_DIRECTION); })

	/*
	 * Duplicates each odd-indexed float of __A into the even slot below it
	 * (shuffle indices 1,1,3,3,...,15,15).
	 */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_movehdup_ps(__m512 __A)
	{
	  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
	                                         1, 1, 3, 3, 5, 5, 7, 7,
	                                         9, 9, 11, 11, 13, 13, 15, 15);
	}

	/* Selects between _mm512_movehdup_ps(__A) and __W using the mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_movehdup_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
	                                             (__v16sf)_mm512_movehdup_ps(__A),
	                                             (__v16sf)__W);
	}

	/* Selects between _mm512_movehdup_ps(__A) and _mm512_setzero_ps(). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_movehdup_ps(__mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
	                                             (__v16sf)_mm512_movehdup_ps(__A),
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/*
	 * Duplicates each even-indexed float of __A into the odd slot above it
	 * (shuffle indices 0,0,2,2,...,14,14).
	 */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_moveldup_ps(__m512 __A)
	{
	  return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A,
	                                         0, 0, 2, 2, 4, 4, 6, 6,
	                                         8, 8, 10, 10, 12, 12, 14, 14);
	}

	/* Selects between _mm512_moveldup_ps(__A) and __W using the mask __U. */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_moveldup_ps(__m512 __W, __mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
	                                             (__v16sf)_mm512_moveldup_ps(__A),
	                                             (__v16sf)__W);
	}

	/* Selects between _mm512_moveldup_ps(__A) and _mm512_setzero_ps(). */
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_moveldup_ps(__mmask16 __U, __m512 __A)
	{
	  return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
	                                             (__v16sf)_mm512_moveldup_ps(__A),
	                                             (__v16sf)_mm512_setzero_ps());
	}

	/*
	 * Starts from a copy of __A and replaces element 0 with __B[0] when bit 0
	 * of __U is set, otherwise with __W[0].
	 * Diff hunk: each -/+ pair reads the same as rendered here, so the change
	 * is presumably whitespace-only.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B)
	{
	- __m128 res = __A;
	+ __m128 res = __A;
	res[0] = (__U & 1) ? __B[0] : __W[0];
	- return res;
	+ return res;
	}

	/*
	 * Starts from a copy of __A and replaces element 0 with __B[0] when bit 0
	 * of __U is set, otherwise with 0.
	 * Diff hunk: the removed and added lines read the same as rendered here, so
	 * the change is presumably whitespace-only.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B)
	{
	- __m128 res = __A;
	- res[0] = (__U & 1) ? __B[0] : 0;
	- return res;
	+ __m128 res = __A;
	+ res[0] = (__U & 1) ? __B[0] : 0;
	+ return res;
	}

	/*
	 * Starts from a copy of __A and replaces element 0 with __B[0] when bit 0
	 * of __U is set, otherwise with __W[0].
	 * Diff hunk: each -/+ pair reads the same as rendered here, so the change
	 * is presumably whitespace-only.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B)
	{
	- __m128d res = __A;
	+ __m128d res = __A;
	res[0] = (__U & 1) ? __B[0] : __W[0];
	- return res;
	+ return res;
	}

	/*
	 * Starts from a copy of __A and replaces element 0 with __B[0] when bit 0
	 * of __U is set, otherwise with 0.
	 * Diff hunk: the removed and added lines read the same as rendered here, so
	 * the change is presumably whitespace-only.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B)
	{
	- __m128d res = __A;
	- res[0] = (__U & 1) ? __B[0] : 0;
	- return res;
	+ __m128d res = __A;
	+ res[0] = (__U & 1) ? __B[0] : 0;
	+ return res;
	}

	/*
	 * Calls __builtin_ia32_storess128_mask with __W as the destination, __A
	 * widened by _mm512_castps128_ps512, and a mask reduced to bit 0 of __U.
	 * Diff hunk: the -/+ pair reads the same as rendered here, so the change is
	 * presumably whitespace-only.
	 */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A)
	{
	- __builtin_ia32_storess128_mask ((__v16sf *)__W,
	+ __builtin_ia32_storess128_mask ((__v16sf *)__W,
	(__v16sf) _mm512_castps128_ps512(__A),
	(__mmask16) __U & (__mmask16)1);
	}

	/*
	 * Calls __builtin_ia32_storesd128_mask with __W as the destination, __A
	 * widened by _mm512_castpd128_pd512, and a mask reduced to bit 0 of __U.
	 * Diff hunk: the -/+ pair reads the same as rendered here, so the change is
	 * presumably whitespace-only.
	 */
	static __inline__ void __DEFAULT_FN_ATTRS
	_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A)
	{
	- __builtin_ia32_storesd128_mask ((__v8df *)__W,
	+ __builtin_ia32_storesd128_mask ((__v8df *)__W,
	(__v8df) _mm512_castpd128_pd512(__A),
	(__mmask8) __U & 1);
	}

	/*
	 * Masked scalar float load.
	 *
	 * The pass-through vector is built first: element 0 is taken from __W and
	 * elements 1..3 from a zero vector (shuffle indices 0, 4, 4, 4). It is
	 * widened with _mm512_castps128_ps512 and handed to
	 * __builtin_ia32_loadss128_mask together with the pointer __A and a mask
	 * reduced to bit 0 of __U. The low four elements of the result are then
	 * extracted.
	 *
	 * The local is spelled __src rather than src: this is an installed header,
	 * and a user macro named src would otherwise rewrite the body.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A)
	{
	  __m128 __src = (__v4sf) __builtin_shufflevector((__v4sf) __W,
	                                                  (__v4sf) {0.0, 0.0, 0.0, 0.0},
	                                                  0, 4, 4, 4);

	  return (__m128) __builtin_shufflevector(
	                     __builtin_ia32_loadss128_mask ((__v16sf *) __A,
	                                                    (__v16sf) _mm512_castps128_ps512(__src),
	                                                    (__mmask16) __U & 1),
	                     _mm512_undefined_ps(), 0, 1, 2, 3);
	}

	/*
	 * Zero-masked scalar float load: calls __builtin_ia32_loadss128_mask with
	 * the pointer __A, _mm512_setzero_ps() as the second operand and a mask
	 * reduced to bit 0 of __U, then extracts the low four elements of the
	 * result.
	 */
	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_load_ss(__mmask8 __U, const float *__A)
	{
	  return (__m128)__builtin_shufflevector(
	             __builtin_ia32_loadss128_mask((__v16sf *)__A,
	                                           (__v16sf)_mm512_setzero_ps(),
	                                           (__mmask16)__U & 1),
	             _mm512_undefined_ps(), 0, 1, 2, 3);
	}

	/*
	 * Masked scalar double load.
	 *
	 * The pass-through vector is built first: element 0 is taken from __W and
	 * element 1 from a zero vector (shuffle indices 0, 2). It is widened with
	 * _mm512_castpd128_pd512 and handed to __builtin_ia32_loadsd128_mask
	 * together with the pointer __A and a mask reduced to bit 0 of __U. The low
	 * two elements of the result are then extracted.
	 *
	 * The local is spelled __src rather than src: this is an installed header,
	 * and a user macro named src would otherwise rewrite the body.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A)
	{
	  __m128d __src = (__v2df) __builtin_shufflevector((__v2df) __W,
	                                                   (__v2df) {0.0, 0.0}, 0, 2);

	  return (__m128d) __builtin_shufflevector(
	                      __builtin_ia32_loadsd128_mask ((__v8df *) __A,
	                                                     (__v8df) _mm512_castpd128_pd512(__src),
	                                                     (__mmask8) __U & 1),
	                      _mm512_undefined_pd(), 0, 1);
	}

	/*
	 * Zero-masked scalar double load: calls __builtin_ia32_loadsd128_mask with
	 * the pointer __A, _mm512_setzero_pd() as the second operand and a mask
	 * reduced to bit 0 of __U, then extracts the low two elements of the
	 * result.
	 */
	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_load_sd(__mmask8 __U, const double *__A)
	{
	  return (__m128d)__builtin_shufflevector(
	             __builtin_ia32_loadsd128_mask((__v8df *)__A,
	                                           (__v8df)_mm512_setzero_pd(),
	                                           (__mmask8)__U & 1),
	             _mm512_undefined_pd(), 0, 1);
	}

	#define _mm512_shuffle_epi32(A, I) __extension__ ({ \
	(__m512i)__builtin_shufflevector((__v16si)(__m512i)(A), \
	(__v16si)_mm512_undefined_epi32(), \
	0 + (((I) >> 0) & 0x3), \
	0 + (((I) >> 2) & 0x3), \
	0 + (((I) >> 4) & 0x3), \
	0 + (((I) >> 6) & 0x3), \
	4 + (((I) >> 0) & 0x3), \
	4 + (((I) >> 2) & 0x3), \
	4 + (((I) >> 4) & 0x3), \
	4 + (((I) >> 6) & 0x3), \
	8 + (((I) >> 0) & 0x3), \
	8 + (((I) >> 2) & 0x3), \
	8 + (((I) >> 4) & 0x3), \
	8 + (((I) >> 6) & 0x3), \
	12 + (((I) >> 0) & 0x3), \
	12 + (((I) >> 2) & 0x3), \
	12 + (((I) >> 4) & 0x3), \
	12 + (((I) >> 6) & 0x3)); })

	#define _mm512_mask_shuffle_epi32(W, U, A, I) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_shuffle_epi32((A), (I)), \
	(__v16si)(__m512i)(W)); })

	#define _mm512_maskz_shuffle_epi32(U, A, I) __extension__ ({ \
	(__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
	(__v16si)_mm512_shuffle_epi32((A), (I)), \
	(__v16si)_mm512_setzero_si512()); })

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
	(__v8df) __W,
	(__mmask8) __U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A)
	{
	return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A,
	(__v8df) _mm512_setzero_pd (),
	(__mmask8) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A)
	{
	return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
	(__v8di) __W,
	(__mmask8) __U);
	}

	// Zero-masking expand of 64-bit integer elements: forwards __A, an all-zero
	// pass-through vector and the mask __U to the expanddi512 builtin.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A)
	{
	  // Use the integer zero vector rather than bit-casting a floating-point
	  // zero; the bit pattern is the same, but the type now matches the operand.
	  return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A,
	                (__v8di) _mm512_setzero_si512 (),
	                (__mmask8) __U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P)
	{
	return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
	(__v8df) __W,
	(__mmask8) __U);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P)
	{
	return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P,
	(__v8df) _mm512_setzero_pd(),
	(__mmask8) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P)
	{
	return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
	(__v8di) __W,
	(__mmask8) __U);
	}

	// Zero-masking expand-load of 64-bit integer elements from __P: forwards the
	// pointer, an all-zero pass-through vector and the mask __U to the
	// expandloaddi512 builtin.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P)
	{
	  // Use the integer zero vector rather than bit-casting a floating-point
	  // zero; the bit pattern is the same, but the type now matches the operand.
	  return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P,
	                (__v8di) _mm512_setzero_si512(),
	                (__mmask8) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P)
	{
	return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
	(__v16sf) __W,
	(__mmask16) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P)
	{
	return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P,
	(__v16sf) _mm512_setzero_ps(),
	(__mmask16) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P)
	{
	return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
	(__v16si) __W,
	(__mmask16) __U);
	}

	// Zero-masking expand-load of 32-bit integer elements from __P: forwards the
	// pointer, an all-zero pass-through vector and the mask __U to the
	// expandloadsi512 builtin.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P)
	{
	  // Use the integer zero vector rather than bit-casting a floating-point
	  // zero; the bit pattern is the same, but the type now matches the operand.
	  return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P,
	                (__v16si) _mm512_setzero_si512(),
	                (__mmask16) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
	(__v16sf) __W,
	(__mmask16) __U);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A,
	(__v16sf) _mm512_setzero_ps(),
	(__mmask16) __U);
	}

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A)
	{
	return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
	(__v16si) __W,
	(__mmask16) __U);
	}

	// Zero-masking expand of 32-bit integer elements: forwards __A, an all-zero
	// pass-through vector and the mask __U to the expandsi512 builtin.
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A)
	{
	  // Use the integer zero vector rather than bit-casting a floating-point
	  // zero; the bit pattern is the same, but the type now matches the operand.
	  return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A,
	                (__v16si) _mm512_setzero_si512(),
	                (__mmask16) __U);
	}

	#define _mm512_cvt_roundps_pd(A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
	(__v8df)_mm512_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	#define _mm512_mask_cvt_roundps_pd(W, U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
	(__v8df)(__m512d)(W), \
	(__mmask8)(U), (int)(R)); })

	#define _mm512_maskz_cvt_roundps_pd(U, A, R) __extension__ ({ \
	(__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \
	(__v8df)_mm512_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_cvtps_pd (__m256 __A)
	{
	return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
	(__v8df)
	_mm512_undefined_pd (),
	(__mmask8) -1,
	_MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A)
	{
	return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
	(__v8df) __W,
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A)
	{
	return (__m512d) __builtin_ia32_cvtps2pd512_mask ((__v8sf) __A,
	(__v8df)
	_mm512_setzero_pd (),
	(__mmask8) __U,
	_MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_cvtpslo_pd (__m512 __A)
	{
	return (__m512) _mm512_cvtps_pd(_mm512_castps512_ps256(__A));
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A)
	{
	return (__m512) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A));
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A)
	{
	return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
	(__v8df) __A,
	(__v8df) __W);
	}

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A)
	{
	return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U,
	(__v8df) __A,
	(__v8df) _mm512_setzero_pd ());
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
	(__v16sf) __A,
	(__v16sf) __W);
	}

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A)
	{
	return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U,
	(__v16sf) __A,
	(__v16sf) _mm512_setzero_ps ());
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A)
	{
	__builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A,
	(__mmask8) __U);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A)
	{
	__builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A,
	(__mmask8) __U);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A)
	{
	__builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A,
	(__mmask16) __U);
	}

	static __inline__ void __DEFAULT_FN_ATTRS
	_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A)
	{
	__builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A,
	(__mmask16) __U);
	}

	#define _mm_cvt_roundsd_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
	(__v2df)(__m128d)(B), \
	(__v4sf)_mm_undefined_ps(), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
	(__v2df)(__m128d)(B), \
	(__v4sf)(__m128)(W), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \
	(__v2df)(__m128d)(B), \
	(__v4sf)_mm_setzero_ps(), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B)
	{
	return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
	(__v2df)(__B),
	- (__v4sf)(__W),
	+ (__v4sf)(__W),
	(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B)
	{
	return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)(__A),
	(__v2df)(__B),
	- (__v4sf)_mm_setzero_ps(),
	+ (__v4sf)_mm_setzero_ps(),
	(__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
	}

	#define _mm_cvtss_i32 _mm_cvtss_si32
	#define _mm_cvtsd_i32 _mm_cvtsd_si32
	#define _mm_cvti32_sd _mm_cvtsi32_sd
	#define _mm_cvti32_ss _mm_cvtsi32_ss
	#ifdef __x86_64__
	#define _mm_cvtss_i64 _mm_cvtss_si64
	#define _mm_cvtsd_i64 _mm_cvtsd_si64
	#define _mm_cvti64_sd _mm_cvtsi64_sd
	#define _mm_cvti64_ss _mm_cvtsi64_ss
	#endif

	#ifdef __x86_64__
	#define _mm_cvt_roundi64_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
	(int)(R)); })

	#define _mm_cvt_roundsi64_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \
	(int)(R)); })
	#endif

	#define _mm_cvt_roundsi32_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })

	#define _mm_cvt_roundi32_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R)); })

	#ifdef __x86_64__
	#define _mm_cvt_roundsi64_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
	(int)(R)); })

	#define _mm_cvt_roundi64_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \
	(int)(R)); })
	#endif

	#define _mm_cvt_roundss_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
	(__v4sf)(__m128)(B), \
	(__v2df)_mm_undefined_pd(), \
	(__mmask8)-1, (int)(R)); })

	#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
	(__v4sf)(__m128)(B), \
	(__v2df)(__m128d)(W), \
	(__mmask8)(U), (int)(R)); })

	#define _mm_maskz_cvt_roundss_sd(U, A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \
	(__v4sf)(__m128)(B), \
	(__v2df)_mm_setzero_pd(), \
	(__mmask8)(U), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B)
	{
	return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
	(__v4sf)(__B),
	(__v2df)(__W),
	- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
	+ (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B)
	{
	return __builtin_ia32_cvtss2sd_round_mask((__v2df)(__A),
	(__v4sf)(__B),
	- (__v2df)_mm_setzero_pd(),
	- (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
	+ (__v2df)_mm_setzero_pd(),
	+ (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION);
	}

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_cvtu32_sd (__m128d __A, unsigned __B)
	{
	return (__m128d) __builtin_ia32_cvtusi2sd32 ((__v2df) __A, __B);
	}

	#ifdef __x86_64__
	#define _mm_cvt_roundu64_sd(A, B, R) __extension__ ({ \
	(__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \
	(unsigned long long)(B), (int)(R)); })

	static __inline__ __m128d __DEFAULT_FN_ATTRS
	_mm_cvtu64_sd (__m128d __A, unsigned long long __B)
	{
	return (__m128d) __builtin_ia32_cvtusi2sd64 ((__v2df) __A, __B,
	_MM_FROUND_CUR_DIRECTION);
	}
	#endif

	#define _mm_cvt_roundu32_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \
	(int)(R)); })

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_cvtu32_ss (__m128 __A, unsigned __B)
	{
	return (__m128) __builtin_ia32_cvtusi2ss32 ((__v4sf) __A, __B,
	_MM_FROUND_CUR_DIRECTION);
	}

	#ifdef __x86_64__
	#define _mm_cvt_roundu64_ss(A, B, R) __extension__ ({ \
	(__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \
	(unsigned long long)(B), (int)(R)); })

	static __inline__ __m128 __DEFAULT_FN_ATTRS
	_mm_cvtu64_ss (__m128 __A, unsigned long long __B)
	{
	return (__m128) __builtin_ia32_cvtusi2ss64 ((__v4sf) __A, __B,
	_MM_FROUND_CUR_DIRECTION);
	}
	#endif

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A)
	{
	return (__m512i) __builtin_ia32_pbroadcastd512_gpr_mask (__A, (__v16si) __O,
	__M);
	}

	#ifdef __x86_64__
	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A)
	{
	return (__m512i) __builtin_ia32_pbroadcastq512_gpr_mask (__A, (__v8di) __O,
	__M);
	}
	#endif

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59,
	char __e58, char __e57, char __e56, char __e55, char __e54, char __e53,
	char __e52, char __e51, char __e50, char __e49, char __e48, char __e47,
	char __e46, char __e45, char __e44, char __e43, char __e42, char __e41,
	char __e40, char __e39, char __e38, char __e37, char __e36, char __e35,
	char __e34, char __e33, char __e32, char __e31, char __e30, char __e29,
	char __e28, char __e27, char __e26, char __e25, char __e24, char __e23,
	char __e22, char __e21, char __e20, char __e19, char __e18, char __e17,
	char __e16, char __e15, char __e14, char __e13, char __e12, char __e11,
	char __e10, char __e9, char __e8, char __e7, char __e6, char __e5,
	char __e4, char __e3, char __e2, char __e1, char __e0) {

	return __extension__ (__m512i)(__v64qi)
	{__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
	__e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
	__e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
	__e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31,
	__e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39,
	__e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47,
	__e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55,
	__e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63};
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28,
	short __e27, short __e26, short __e25, short __e24, short __e23,
	short __e22, short __e21, short __e20, short __e19, short __e18,
	short __e17, short __e16, short __e15, short __e14, short __e13,
	short __e12, short __e11, short __e10, short __e9, short __e8,
	short __e7, short __e6, short __e5, short __e4, short __e3,
	short __e2, short __e1, short __e0) {
	return __extension__ (__m512i)(__v32hi)
	{__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7,
	__e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15,
	__e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23,
	__e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 };
	}

	static __inline __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi32 (int __A, int __B, int __C, int __D,
	int __E, int __F, int __G, int __H,
	int __I, int __J, int __K, int __L,
	int __M, int __N, int __O, int __P)
	{
	return __extension__ (__m512i)(__v16si)
	{ __P, __O, __N, __M, __L, __K, __J, __I,
	__H, __G, __F, __E, __D, __C, __B, __A };
	}

	#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \
	e8,e9,e10,e11,e12,e13,e14,e15) \
	_mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \
	(e5),(e4),(e3),(e2),(e1),(e0))

	static __inline__ __m512i __DEFAULT_FN_ATTRS
	_mm512_set_epi64 (long long __A, long long __B, long long __C,
	long long __D, long long __E, long long __F,
	long long __G, long long __H)
	{
	return __extension__ (__m512i) (__v8di)
	{ __H, __G, __F, __E, __D, __C, __B, __A };
	}

	#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \
	_mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_set_pd (double __A, double __B, double __C, double __D,
	double __E, double __F, double __G, double __H)
	{
	return __extension__ (__m512d)
	{ __H, __G, __F, __E, __D, __C, __B, __A };
	}

	#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \
	_mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0))

	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_set_ps (float __A, float __B, float __C, float __D,
	float __E, float __F, float __G, float __H,
	float __I, float __J, float __K, float __L,
	float __M, float __N, float __O, float __P)
	{
	return __extension__ (__m512)
	{ __P, __O, __N, __M, __L, __K, __J, __I,
	__H, __G, __F, __E, __D, __C, __B, __A };
	}

	#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \
	_mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \
	(e4),(e3),(e2),(e1),(e0))

	// Absolute value of packed single-precision elements: combines the bits of
	// __A with the constant 0x7FFFFFFF in every 32-bit lane via _mm512_and_epi32.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_abs_ps(__m512 __A)
	{
	  // 0x7FFFFFFF has every bit set except the top (sign) bit.
	  __m512i __abs_mask = _mm512_set1_epi32(0x7FFFFFFF);
	  __m512i __cleared = _mm512_and_epi32(__abs_mask, (__m512i)__A);
	  return (__m512)__cleared;
	}

	// Masked absolute value of packed single-precision elements: forwards __W as
	// the merge source and __K as the mask to _mm512_mask_and_epi32, which
	// combines __A with 0x7FFFFFFF in every 32-bit lane.
	static __inline__ __m512 __DEFAULT_FN_ATTRS
	_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, __m512 __A)
	{
	  // 0x7FFFFFFF has every bit set except the top (sign) bit.
	  __m512i __abs_mask = _mm512_set1_epi32(0x7FFFFFFF);
	  __m512i __merged = _mm512_mask_and_epi32((__m512i)__W, __K, __abs_mask,
	                                           (__m512i)__A);
	  return (__m512)__merged;
	}

	// Absolute value of packed double-precision elements: combines the bits of
	// __A with the constant 0x7FFFFFFFFFFFFFFF in every 64-bit lane via
	// _mm512_and_epi64.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_abs_pd(__m512d __A)
	{
	  // 0x7FFFFFFFFFFFFFFF has every bit set except the top (sign) bit.
	  __m512i __abs_mask = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF);
	  __v8di __src_bits = (__v8di)__A;
	  return (__m512d)_mm512_and_epi64(__abs_mask, __src_bits);
	}

	// Masked absolute value of packed double-precision elements: forwards __W as
	// the merge source and __K as the mask to _mm512_mask_and_epi64, which
	// combines __A with 0x7FFFFFFFFFFFFFFF in every 64-bit lane.
	static __inline__ __m512d __DEFAULT_FN_ATTRS
	_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A)
	{
	  // 0x7FFFFFFFFFFFFFFF has every bit set except the top (sign) bit.
	  __m512i __abs_mask = _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF);
	  __v8di __merge_src = (__v8di)__W;
	  __v8di __src_bits = (__v8di)__A;
	  return (__m512d)_mm512_mask_and_epi64(__merge_src, __K, __abs_mask,
	                                        __src_bits);
	}

	// Vector-reduction arithmetic accepts vectors as inputs and produces scalars as
	// outputs. This class of vector operation forms the basis of many scientific
	// computations. In vector-reduction arithmetic, the evaluation is
	// independent of the order of the input elements of the vector.

	// Used bisection method. At each step, we partition the vector with previous
	// step in half, and the operation is performed on its two halves.
	// This takes log2(n) steps where n is the number of elements in the vector.

	// Vec512 - Vector with size 512.
	// Operator - Can be one of following: +,*,&,|
	// T2 - Can get 'i' for int and 'f' for float.
	// T1 - Can get 'i' for int and 'd' for double.

	#define _mm512_reduce_operator_64bit(Vec512, Operator, T2, T1) \
	__extension__({ \
	__m256##T1 Vec256 = __builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, 1, 2, 3) \
	Operator \
	__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	4, 5, 6, 7); \
	__m128##T1 Vec128 = __builtin_shufflevector( \
	(__v4d##T2)Vec256, \
	(__v4d##T2)Vec256, \
	0, 1) \
	Operator \
	__builtin_shufflevector( \
	(__v4d##T2)Vec256, \
	(__v4d##T2)Vec256, \
	2, 3); \
	Vec128 = __builtin_shufflevector((__v2d##T2)Vec128, \
	(__v2d##T2)Vec128, 0, -1) \
	Operator \
	__builtin_shufflevector((__v2d##T2)Vec128, \
	(__v2d##T2)Vec128, 1, -1); \
	return Vec128[0]; \
	})

	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_add_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, +, i, i);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_mul_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, *, i, i);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_and_epi64(__m512i __W) {
	_mm512_reduce_operator_64bit(__W, &, i, i);
	}

	// Bitwise-OR reduction of the eight 64-bit elements of __W.
	// The macro expands to the halving steps and the final `return`.
	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_or_epi64(__m512i __W) {
	  // The operator argument is a bare `|`; an escaped `\|` is not valid C.
	  _mm512_reduce_operator_64bit(__W, |, i, i);
	}

	static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_add_pd(__m512d __W) {
	_mm512_reduce_operator_64bit(__W, +, f, d);
	}

	static __inline__ double __DEFAULT_FN_ATTRS _mm512_reduce_mul_pd(__m512d __W) {
	_mm512_reduce_operator_64bit(__W, *, f, d);
	}

	// Vec512 - Vector with size 512.
	-// Vec512Neutral - All vector elements set to the identity element.
	+// Vec512Neutral - All vector elements set to the identity element.
	// Identity element: {+,0},{*,1},{&,0xFFFFFFFFFFFFFFFF},{|,0}
	// Operator - Can be one of following: +,*,&,|
	// Mask - Intrinsic Mask
	// T2 - Can get 'i' for int and 'f' for float.
	// T1 - Can get 'i' for int and 'd' for packed double-precision.
	// T3 - Can be 'pd' for packed double or 'q' for q-word.

	#define _mm512_mask_reduce_operator_64bit(Vec512, Vec512Neutral, Operator, \
	Mask, T2, T1, T3) \
	__extension__({ \
	Vec512 = __builtin_ia32_select##T3##_512( \
	(__mmask8)Mask, \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512Neutral); \
	_mm512_reduce_operator_64bit(Vec512, Operator, T2, T1); \
	})

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), +, __M, i, i, q);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(1), *, __M, i, i, q);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) {
	- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
	+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
	&, __M, i, i, q);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) {
	- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), \|, __M,
	+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_epi64(0), \|, __M,
	i, i, q);
	}

	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) {
	- _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
	+ _mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(0), +, __M,
	f, d, pd);
	}

	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) {
	_mm512_mask_reduce_operator_64bit(__W, _mm512_set1_pd(1), *, __M,
	f, d, pd);
	}

	// Vec512 - Vector with size 512.
	// Operator - Can be one of following: +,*,&,|
	// T2 - Can get 'i' for int and 'f' for float. [__v16s{i|f}]
	// T1 - Can get 'i' for int and ' ' (empty) for packed single. [__m256{i|}]

	#define _mm512_reduce_operator_32bit(Vec512, Operator, T2, T1) __extension__({ \
	__m256##T1 Vec256 = \
	(__m256##T1)(__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, 2, 3, 4, 5, 6, 7) \
	Operator \
	__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	8, 9, 10, 11, 12, 13, 14, 15)); \
	__m128##T1 Vec128 = \
	(__m128##T1)(__builtin_shufflevector( \
	(__v8s##T2)Vec256, \
	(__v8s##T2)Vec256, \
	0, 1, 2, 3) \
	Operator \
	__builtin_shufflevector( \
	(__v8s##T2)Vec256, \
	(__v8s##T2)Vec256, \
	4, 5, 6, 7)); \
	Vec128 = (__m128##T1)(__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	0, 1, -1, -1) \
	Operator \
	__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	2, 3, -1, -1)); \
	Vec128 = (__m128##T1)(__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	0, -1, -1, -1) \
	Operator \
	__builtin_shufflevector( \
	(__v4s##T2)Vec128, \
	(__v4s##T2)Vec128, \
	1, -1, -1, -1)); \
	return Vec128[0]; \
	})

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_add_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, +, i, i);
	}

	-static __inline__ int __DEFAULT_FN_ATTRS
	+static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_mul_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, *, i, i);
	}

	-static __inline__ int __DEFAULT_FN_ATTRS
	+static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_and_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, &, i, i);
	}

	-static __inline__ int __DEFAULT_FN_ATTRS
	+static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_or_epi32(__m512i __W) {
	_mm512_reduce_operator_32bit(__W, \|, i, i);
	}

	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_reduce_add_ps(__m512 __W) {
	_mm512_reduce_operator_32bit(__W, +, f, );
	}

	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_reduce_mul_ps(__m512 __W) {
	_mm512_reduce_operator_32bit(__W, *, f, );
	}

	// Vec512 - Vector with size 512.
	-// Vec512Neutral - All vector elements set to the identity element.
	+// Vec512Neutral - All vector elements set to the identity element.
	// Identity element: {+,0},{*,1},{&,0xFFFFFFFF},{|,0}
	// Operator - Can be one of following: +,*,&,|
	// Mask - Intrinsic Mask
	// T2 - Can get 'i' for int and 'f' for float. [__v16s{i|f}]
	// T1 - Can get 'i' for int and ' ' (empty) for packed single. [__m512{i|}]
	// T3 - Can be 'ps' for packed single or 'd' for d-word.

	#define _mm512_mask_reduce_operator_32bit(Vec512, Vec512Neutral, Operator, \
	Mask, T2, T1, T3) \
	__extension__({ \
	Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
	(__mmask16)Mask, \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512Neutral); \
	_mm512_reduce_operator_32bit(Vec512, Operator, T2, T1); \
	})

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), +, __M, i, i, d);
	}

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(1), *, __M, i, i, d);
	}

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) {
	- _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
	+ _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0xFFFFFFFF), &, __M,
	i, i, d);
	}

	// Masked bitwise-OR reduction of the sixteen 32-bit elements of __W.
	// Lanes not selected by __M are replaced with 0, the identity element for
	// OR, before the reduction; the macro supplies the final `return`.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) {
	  // The operator argument is a bare `|`; an escaped `\|` is not valid C.
	  _mm512_mask_reduce_operator_32bit(__W, _mm512_set1_epi32(0), |, __M, i, i, d);
	}

	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(0), +, __M, f, , ps);
	}

	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 __W) {
	_mm512_mask_reduce_operator_32bit(__W, _mm512_set1_ps(1), *, __M, f, , ps);
	}

	// Used bisection method. At each step, we partition the vector with previous
	// step in half, and the operation is performed on its two halves.
	// This takes log2(n) steps where n is the number of elements in the vector.
	// This macro uses only intrinsics from the AVX512F feature.

	// Vec512 - Vector with size of 512.
	// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
	// max_epi64, which selects _mm512_max_epi64
	// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
	// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]

	#define _mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2) __extension__({ \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, 1, 2, 3, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	4, 5, 6, 7, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, 1, -1, -1, -1, -1, -1, -1),\
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	2, 3, -1, -1, -1, -1, -1, \
	-1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	0, -1, -1, -1, -1, -1, -1, -1),\
	(__m512##T1)__builtin_shufflevector( \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512, \
	1, -1, -1, -1, -1, -1, -1, -1))\
	; \
	return Vec512[0]; \
	})

	-static __inline__ long long __DEFAULT_FN_ATTRS
	+static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epi64(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, max_epi64, i, i);
	}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epu64(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, max_epu64, i, i);
	}

	-static __inline__ double __DEFAULT_FN_ATTRS
	+static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_reduce_max_pd(__m512d __V) {
	_mm512_reduce_maxMin_64bit(__V, max_pd, d, f);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS _mm512_reduce_min_epi64
	(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, min_epi64, i, i);
	}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_reduce_min_epu64(__m512i __V) {
	_mm512_reduce_maxMin_64bit(__V, min_epu64, i, i);
	}

	-static __inline__ double __DEFAULT_FN_ATTRS
	+static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_reduce_min_pd(__m512d __V) {
	_mm512_reduce_maxMin_64bit(__V, min_pd, d, f);
	}

	// Vec512 - Vector with size 512.
	// Vec512Neutral - A 512 length vector with elements set to the identity element
	// Identity element: {max_epi,0x8000000000000000}
	// {max_epu,0x0000000000000000}
	// {max_pd, 0xFFF0000000000000}
	// {min_epi,0x7FFFFFFFFFFFFFFF}
	// {min_epu,0xFFFFFFFFFFFFFFFF}
	// {min_pd, 0x7FF0000000000000}
	//
	// IntrinName - Can be one of following: {max|min}_{epi64|epu64|pd} for example:
	// max_epi64, which selects _mm512_max_epi64
	// T1 - Can get 'i' for int and 'd' for double.[__m512{i|d}]
	// T2 - Can get 'i' for int and 'f' for float. [__v8d{i|f}]
	// T3 - Can get 'q' for q-word and 'pd' for packed double.
	// [__builtin_ia32_select{q|pd}_512]
	// Mask - Intrinsic Mask

	#define _mm512_mask_reduce_maxMin_64bit(Vec512, Vec512Neutral, IntrinName, T1, \
	T2, T3, Mask) \
	__extension__({ \
	Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
	(__mmask8)Mask, \
	(__v8d##T2)Vec512, \
	(__v8d##T2)Vec512Neutral); \
	_mm512_reduce_maxMin_64bit(Vec512, IntrinName, T1, T2); \
	})

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x8000000000000000),
	max_epi64, i, i, q, __M);
	}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x0000000000000000),
	max_epu64, i, i, q, __M);
	}

	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, -_mm512_set1_pd(__builtin_inf()),
	max_pd, d, f, pd, __M);
	}

	static __inline__ long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
	min_epi64, i, i, q, __M);
	}

	static __inline__ unsigned long long __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_epi64(0xFFFFFFFFFFFFFFFF),
	min_epu64, i, i, q, __M);
	}

	static __inline__ double __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) {
	_mm512_mask_reduce_maxMin_64bit(__V, _mm512_set1_pd(__builtin_inf()),
	min_pd, d, f, pd, __M);
	}

	// Vec512 - Vector with size 512.
	// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
	// max_epi32, which selects _mm512_max_epi32
	// T1 - Can get 'i' for int and ' ' (empty) for packed single.[__m512{i|}]
	// T2 - Can get 'i' for int and 'f' for float.[__v16s{i|f}]

	// Horizontal max/min over the sixteen 32-bit lanes of Vec512, done in four
	// folding steps: lanes 0-7 are combined with lanes 8-15, then 0-3 with 4-7,
	// then 0-1 with 2-3, and finally lane 0 with lane 1. A shuffle index of -1
	// marks a lane whose value is not needed by the following steps.
	// NOTE: the macro overwrites Vec512 (it must be a modifiable lvalue) and ends
	// with a `return` statement, so it can only be used as the body of a function
	// that returns the reduced element.
	#define _mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2) __extension__({ \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, 2, 3, 4, 5, 6, 7, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	8, 9, 10, 11, 12, 13, 14, 15, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, 2, 3, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	4, 5, 6, 7, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, 1, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	2, 3, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	Vec512 = _mm512_##IntrinName( \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	0, -1, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1), \
	(__m512##T1)__builtin_shufflevector( \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512, \
	1, -1, -1, -1, -1, -1, -1, -1, \
	-1, -1, -1, -1, -1, -1, -1, -1)); \
	return Vec512[0]; \
	})

	// Reduces __V with _mm512_max_epi32 and returns the resulting element.
	// The parameter uses a reserved name (as the masked variants do) so that a
	// user macro called `a` cannot break this header.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epi32(__m512i __V) {
	  // The macro folds __V onto itself and issues the `return`.
	  _mm512_reduce_maxMin_32bit(__V, max_epi32, i, i);
	}

	// Reduces __V with _mm512_max_epu32 and returns the resulting element.
	// The parameter uses a reserved name (as the masked variants do) so that a
	// user macro called `a` cannot break this header.
	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_reduce_max_epu32(__m512i __V) {
	  // The macro folds __V onto itself and issues the `return`.
	  _mm512_reduce_maxMin_32bit(__V, max_epu32, i, i);
	}

	// Reduces __V with _mm512_max_ps and returns the resulting element.
	// The parameter uses a reserved name (as the masked variants do) so that a
	// user macro called `a` cannot break this header.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_reduce_max_ps(__m512 __V) {
	  // The macro folds __V onto itself and issues the `return`.
	  _mm512_reduce_maxMin_32bit(__V, max_ps, , f);
	}

	// Reduces __V with _mm512_min_epi32 and returns the resulting element.
	// The parameter uses a reserved name (as the masked variants do) so that a
	// user macro called `a` cannot break this header.
	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_reduce_min_epi32(__m512i __V) {
	  // The macro folds __V onto itself and issues the `return`.
	  _mm512_reduce_maxMin_32bit(__V, min_epi32, i, i);
	}

	// Reduces __V with _mm512_min_epu32 and returns the resulting element.
	// The parameter uses a reserved name (as the masked variants do) so that a
	// user macro called `a` cannot break this header.
	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_reduce_min_epu32(__m512i __V) {
	  // The macro folds __V onto itself and issues the `return`.
	  _mm512_reduce_maxMin_32bit(__V, min_epu32, i, i);
	}

	// Reduces __V with _mm512_min_ps and returns the resulting element.
	// The parameter uses a reserved name (as the masked variants do) so that a
	// user macro called `a` cannot break this header.
	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_reduce_min_ps(__m512 __V) {
	  // The macro folds __V onto itself and issues the `return`.
	  _mm512_reduce_maxMin_32bit(__V, min_ps, , f);
	}

	// Vec512 - Vector with size 512.
	// Vec512Neutral - A 512 length vector with elements set to the identity element
	// Identity element: {max_epi,0x80000000}
	// {max_epu,0x00000000}
	// {max_ps, 0xFF800000}
	// {min_epi,0x7FFFFFFF}
	// {min_epu,0xFFFFFFFF}
	// {min_ps, 0x7F800000}
	//
	// IntrinName - Can be one of following: {max|min}_{epi32|epu32|ps} for example:
	// _mm512_max_epi32
	// T1 - Can get 'i' for int and '' (empty) for float. [__m512{i|}]
	// T2 - Can get 'i' for int and 'f' for float. [__v16s{i|f}]
	// T3 - Can get 'd' for double word and 'ps' for packed single.
	// [__builtin_ia32_select{d|ps}_512]
	// Mask - Intrinsic Mask

	// Masked reduction: first rebuilds Vec512 through
	// __builtin_ia32_select<T3>_512 from Vec512 and Vec512Neutral under Mask
	// (presumably keeping Vec512 where the mask bit is set and Vec512Neutral
	// elsewhere - confirm against the builtin), then hands the result to
	// _mm512_reduce_maxMin_32bit. That macro ends in a `return`, so this one can
	// only be used as the body of a function returning the reduced element, and
	// Vec512 must be a modifiable lvalue.
	#define _mm512_mask_reduce_maxMin_32bit(Vec512, Vec512Neutral, IntrinName, T1, \
	T2, T3, Mask) \
	__extension__({ \
	Vec512 = (__m512##T1)__builtin_ia32_select##T3##_512( \
	(__mmask16)Mask, \
	(__v16s##T2)Vec512, \
	(__v16s##T2)Vec512Neutral); \
	_mm512_reduce_maxMin_32bit(Vec512, IntrinName, T1, T2); \
	})

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) {
	  // Neutral vector: 0x80000000 in every lane, the identity for a signed max.
	  __m512i __neutral = _mm512_set1_epi32(0x80000000);
	  _mm512_mask_reduce_maxMin_32bit(__V, __neutral, max_epi32, i, i, d, __M);
	}

	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) {
	  // Neutral vector: all-zero lanes, the identity for an unsigned max.
	  __m512i __neutral = _mm512_set1_epi32(0x00000000);
	  _mm512_mask_reduce_maxMin_32bit(__V, __neutral, max_epu32, i, i, d, __M);
	}

	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) {
	  // Neutral vector: negative infinity in every lane, the identity for max.
	  __m512 __neutral = -_mm512_set1_ps(__builtin_inff());
	  _mm512_mask_reduce_maxMin_32bit(__V, __neutral, max_ps, , f, ps, __M);
	}

	static __inline__ int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) {
	  // Neutral vector: 0x7FFFFFFF in every lane, the identity for a signed min.
	  __m512i __neutral = _mm512_set1_epi32(0x7FFFFFFF);
	  _mm512_mask_reduce_maxMin_32bit(__V, __neutral, min_epi32, i, i, d, __M);
	}

	static __inline__ unsigned int __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) {
	  // Neutral vector: all-ones lanes, the identity for an unsigned min.
	  __m512i __neutral = _mm512_set1_epi32(0xFFFFFFFF);
	  _mm512_mask_reduce_maxMin_32bit(__V, __neutral, min_epu32, i, i, d, __M);
	}

	static __inline__ float __DEFAULT_FN_ATTRS
	_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) {
	  // Neutral vector: positive infinity in every lane, the identity for min.
	  __m512 __neutral = _mm512_set1_ps(__builtin_inff());
	  _mm512_mask_reduce_maxMin_32bit(__V, __neutral, min_ps, , f, ps, __M);
	}

	#undef __DEFAULT_FN_ATTRS

	#endif // __AVX512FINTRIN_H
	Index: head/contrib/llvm/tools/clang/lib/Headers/avxintrin.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Headers/avxintrin.h (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Headers/avxintrin.h (revision 322320)
	@@ -1,5159 +1,5162 @@
	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	*===-----------------------------------------------------------------------===
	*/

	#ifndef __IMMINTRIN_H
	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
	#endif

	#ifndef __AVXINTRIN_H
	#define __AVXINTRIN_H

	/* Internal 256-bit (32-byte) element views used for casts in this header. */
	typedef double __v4df __attribute__((__vector_size__(32)));
	typedef float __v8sf __attribute__((__vector_size__(32)));
	typedef long long __v4di __attribute__((__vector_size__(32)));
	typedef int __v8si __attribute__((__vector_size__(32)));
	typedef short __v16hi __attribute__((__vector_size__(32)));
	typedef char __v32qi __attribute__((__vector_size__(32)));
	/* Unsigned element views. */
	typedef unsigned long long __v4du __attribute__((__vector_size__(32)));
	typedef unsigned int __v8su __attribute__((__vector_size__(32)));
	typedef unsigned short __v16hu __attribute__((__vector_size__(32)));
	typedef unsigned char __v32qu __attribute__((__vector_size__(32)));
	/* Plain char has implementation-defined signedness, so an explicitly signed
	 * byte view is provided as well. It is not meant to appear in the public
	 * interface. */
	typedef signed char __v32qs __attribute__((__vector_size__(32)));
	/* Public 256-bit vector types: float, double and integer. */
	typedef float __m256 __attribute__((__vector_size__(32)));
	typedef double __m256d __attribute__((__vector_size__(32)));
	typedef long long __m256i __attribute__((__vector_size__(32)));

	/* Define the default attributes for the functions in this file: each one is
	 * marked __always_inline__ and __nodebug__, and carries __target__("avx"). */
	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx")))

	/* Arithmetic */
	/// \brief Adds two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \returns A 256-bit vector of [4 x double] containing the sums of both
	/// operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_add_pd(__m256d __a, __m256d __b)
	{
	  /* Lane-wise sum of the two [4 x double] operands. */
	  __v4df __sum = (__v4df)__a + (__v4df)__b;
	  return (__m256d)__sum;
	}

	/// \brief Adds two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \returns A 256-bit vector of [8 x float] containing the sums of both
	/// operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_add_ps(__m256 __a, __m256 __b)
	{
	  /* Lane-wise sum of the two [8 x float] operands. */
	  __v8sf __sum = (__v8sf)__a + (__v8sf)__b;
	  return (__m256)__sum;
	}

	/// \brief Subtracts two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the minuend.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing the subtrahend.
	/// \returns A 256-bit vector of [4 x double] containing the differences between
	/// both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_sub_pd(__m256d __a, __m256d __b)
	{
	  /* Lane-wise difference: minuend __a, subtrahend __b. */
	  __v4df __diff = (__v4df)__a - (__v4df)__b;
	  return (__m256d)__diff;
	}

	/// \brief Subtracts two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the minuend.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing the subtrahend.
	/// \returns A 256-bit vector of [8 x float] containing the differences between
	/// both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_sub_ps(__m256 __a, __m256 __b)
	{
	  /* Lane-wise difference: minuend __a, subtrahend __b. */
	  __v8sf __diff = (__v8sf)__a - (__v8sf)__b;
	  return (__m256)__diff;
	}

	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
	/// two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the left source operand.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing the right source operand.
	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
	/// and differences between both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_addsub_pd(__m256d __a, __m256d __b)
	{
	  /* View both operands as [4 x double] before calling the builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return (__m256d)__builtin_ia32_addsubpd256(__lhs, __rhs);
	}

	/// \brief Adds the even-indexed values and subtracts the odd-indexed values of
	/// two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the left source operand.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing the right source operand.
	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
	/// differences between both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_addsub_ps(__m256 __a, __m256 __b)
	{
	  /* View both operands as [8 x float] before calling the builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return (__m256)__builtin_ia32_addsubps256(__lhs, __rhs);
	}

	/// \brief Divides two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the dividend.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing the divisor.
	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
	/// operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_div_pd(__m256d __a, __m256d __b)
	{
	  /* Lane-wise quotient: dividend __a, divisor __b. */
	  __v4df __quot = (__v4df)__a / (__v4df)__b;
	  return (__m256d)__quot;
	}

	/// \brief Divides two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the dividend.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing the divisor.
	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
	/// operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_div_ps(__m256 __a, __m256 __b)
	{
	  /* Lane-wise quotient: dividend __a, divisor __b. */
	  __v8sf __quot = (__v8sf)__a / (__v8sf)__b;
	  return (__m256)__quot;
	}

	/// \brief Compares two 256-bit vectors of [4 x double] and returns the greater
	/// of each pair of values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the operands.
	/// \returns A 256-bit vector of [4 x double] containing the maximum values
	/// between both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_max_pd(__m256d __a, __m256d __b)
	{
	  /* View both operands as [4 x double] before calling the builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return (__m256d)__builtin_ia32_maxpd256(__lhs, __rhs);
	}

	/// \brief Compares two 256-bit vectors of [8 x float] and returns the greater
	/// of each pair of values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the operands.
	/// \returns A 256-bit vector of [8 x float] containing the maximum values
	/// between both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_max_ps(__m256 __a, __m256 __b)
	{
	  /* View both operands as [8 x float] before calling the builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return (__m256)__builtin_ia32_maxps256(__lhs, __rhs);
	}

	/// \brief Compares two 256-bit vectors of [4 x double] and returns the lesser
	/// of each pair of values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the operands.
	/// \returns A 256-bit vector of [4 x double] containing the minimum values
	/// between both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_min_pd(__m256d __a, __m256d __b)
	{
	  /* View both operands as [4 x double] before calling the builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return (__m256d)__builtin_ia32_minpd256(__lhs, __rhs);
	}

	/// \brief Compares two 256-bit vectors of [8 x float] and returns the lesser
	/// of each pair of values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the operands.
	/// \returns A 256-bit vector of [8 x float] containing the minimum values
	/// between both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_min_ps(__m256 __a, __m256 __b)
	{
	  /* View both operands as [8 x float] before calling the builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return (__m256)__builtin_ia32_minps256(__lhs, __rhs);
	}

	/// \brief Multiplies two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the operands.
	/// \returns A 256-bit vector of [4 x double] containing the products of both
	/// operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_mul_pd(__m256d __a, __m256d __b)
	{
	  /* Lane-wise product of the two [4 x double] operands. */
	  __v4df __prod = (__v4df)__a * (__v4df)__b;
	  return (__m256d)__prod;
	}

	/// \brief Multiplies two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the operands.
	/// \returns A 256-bit vector of [8 x float] containing the products of both
	/// operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_mul_ps(__m256 __a, __m256 __b)
	{
	  /* Lane-wise product of the two [8 x float] operands. */
	  __v8sf __prod = (__v8sf)__a * (__v8sf)__b;
	  return (__m256)__prod;
	}

	/// \brief Calculates the square roots of the values in a 256-bit vector of
	/// [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
	/// values in the operand.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_sqrt_pd(__m256d __a)
	{
	  /* View the operand as [4 x double] before calling the builtin. */
	  __v4df __src = (__v4df)__a;
	  return (__m256d)__builtin_ia32_sqrtpd256(__src);
	}

	/// \brief Calculates the square roots of the values in a 256-bit vector of
	/// [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
	/// values in the operand.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_sqrt_ps(__m256 __a)
	{
	  /* View the operand as [8 x float] before calling the builtin. */
	  __v8sf __src = (__v8sf)__a;
	  return (__m256)__builtin_ia32_sqrtps256(__src);
	}

	/// \brief Calculates the reciprocal square roots of the values in a 256-bit
	/// vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
	/// roots of the values in the operand.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_rsqrt_ps(__m256 __a)
	{
	  /* View the operand as [8 x float] before calling the builtin. */
	  __v8sf __src = (__v8sf)__a;
	  return (__m256)__builtin_ia32_rsqrtps256(__src);
	}

	/// \brief Calculates the reciprocals of the values in a 256-bit vector of
	/// [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
	/// values in the operand.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_rcp_ps(__m256 __a)
	{
	  /* View the operand as [8 x float] before calling the builtin. */
	  __v8sf __src = (__v8sf)__a;
	  return (__m256)__builtin_ia32_rcpps256(__src);
	}

	/// \brief Rounds the values in a 256-bit vector of [4 x double] as specified
	/// by the byte operand. The source values are rounded to integer values and
	/// returned as 64-bit double-precision floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_round_pd(__m256d V, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [4 x double].
	/// \param M
	/// An integer value that specifies the rounding operation. \n
	/// Bits [7:4] are reserved. \n
	/// Bit [3] is a precision exception value: \n
	/// 0: A normal PE exception is used. \n
	/// 1: The PE field is not updated. \n
	/// Bit [2] is the rounding control source: \n
	/// 0: Use bits [1:0] of \a M. \n
	/// 1: Use the current MXCSR setting. \n
	/// Bits [1:0] contain the rounding control definition: \n
	/// 00: Nearest. \n
	/// 01: Downward (toward negative infinity). \n
	/// 10: Upward (toward positive infinity). \n
	/// 11: Truncated.
	/// \returns A 256-bit vector of [4 x double] containing the rounded values.
	#define _mm256_round_pd(V, M) \
	  ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))

	/// \brief Rounds the values stored in a 256-bit vector of [8 x float] as
	/// specified by the byte operand. The source values are rounded to integer
	/// values and returned as floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_round_ps(__m256 V, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [8 x float].
	/// \param M
	/// An integer value that specifies the rounding operation. \n
	/// Bits [7:4] are reserved. \n
	/// Bit [3] is a precision exception value: \n
	/// 0: A normal PE exception is used. \n
	/// 1: The PE field is not updated. \n
	/// Bit [2] is the rounding control source: \n
	/// 0: Use bits [1:0] of \a M. \n
	/// 1: Use the current MXCSR setting. \n
	/// Bits [1:0] contain the rounding control definition: \n
	/// 00: Nearest. \n
	/// 01: Downward (toward negative infinity). \n
	/// 10: Upward (toward positive infinity). \n
	/// 11: Truncated.
	/// \returns A 256-bit vector of [8 x float] containing the rounded values.
	#define _mm256_round_ps(V, M) \
	  ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))

	/// \brief Rounds up the values stored in a 256-bit vector of [4 x double]. The
	/// source values are rounded up to integer values and returned as 64-bit
	/// double-precision floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_ceil_pd(__m256d V);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [4 x double].
	/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
	#define _mm256_ceil_pd(V) \
	  _mm256_round_pd((V), _MM_FROUND_CEIL)

	/// \brief Rounds down the values stored in a 256-bit vector of [4 x double].
	/// The source values are rounded down to integer values and returned as
	/// 64-bit double-precision floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_floor_pd(__m256d V);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [4 x double].
	/// \returns A 256-bit vector of [4 x double] containing the rounded down
	/// values.
	#define _mm256_floor_pd(V) \
	  _mm256_round_pd((V), _MM_FROUND_FLOOR)

	/// \brief Rounds up the values stored in a 256-bit vector of [8 x float]. The
	/// source values are rounded up to integer values and returned as
	/// floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_ceil_ps(__m256 V);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
	#define _mm256_ceil_ps(V) \
	  _mm256_round_ps((V), _MM_FROUND_CEIL)

	/// \brief Rounds down the values stored in a 256-bit vector of [8 x float]. The
	/// source values are rounded down to integer values and returned as
	/// floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_floor_ps(__m256 V);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
	#define _mm256_floor_ps(V) \
	  _mm256_round_ps((V), _MM_FROUND_FLOOR)

	/* Logical */
	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
	/// values between both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_and_pd(__m256d __a, __m256d __b)
	{
	  /* The AND is done on the unsigned 64-bit integer view of the operands. */
	  __v4du __bits = (__v4du)__a & (__v4du)__b;
	  return (__m256d)__bits;
	}

	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
	/// values between both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_and_ps(__m256 __a, __m256 __b)
	{
	  /* The AND is done on the unsigned 32-bit integer view of the operands. */
	  __v8su __bits = (__v8su)__a & (__v8su)__b;
	  return (__m256)__bits;
	}

	/// \brief Performs a bitwise AND of two 256-bit vectors of [4 x double], using
	/// the one's complement of the values contained in the first source operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the left source operand. The
	/// one's complement of this value is used in the bitwise AND.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing the right source operand.
	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
	/// values of the second operand and the one's complement of the first
	/// operand.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_andnot_pd(__m256d __a, __m256d __b)
	{
	  /* Complement the first operand, then AND it with the second. */
	  __v4du __inverted = ~(__v4du)__a;
	  return (__m256d)(__inverted & (__v4du)__b);
	}

	/// \brief Performs a bitwise AND of two 256-bit vectors of [8 x float], using
	/// the one's complement of the values contained in the first source operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the left source operand. The
	/// one's complement of this value is used in the bitwise AND.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing the right source operand.
	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
	/// values of the second operand and the one's complement of the first
	/// operand.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_andnot_ps(__m256 __a, __m256 __b)
	{
	  /* Complement the first operand, then AND it with the second. */
	  __v8su __inverted = ~(__v8su)__a;
	  return (__m256)(__inverted & (__v8su)__b);
	}

	/// \brief Performs a bitwise OR of two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
	/// values between both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_or_pd(__m256d __a, __m256d __b)
	{
	  /* Bitwise OR on the unsigned 64-bit integer view of the operands. The
	   * operator is a plain `|`; a backslash in front of it is not valid C. */
	  return (__m256d)((__v4du)__a | (__v4du)__b);
	}

	/// \brief Performs a bitwise OR of two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
	/// values between both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_or_ps(__m256 __a, __m256 __b)
	{
	  /* Bitwise OR on the unsigned 32-bit integer view of the operands. The
	   * operator is a plain `|`; a backslash in front of it is not valid C. */
	  return (__m256)((__v8su)__a | (__v8su)__b);
	}

	/// \brief Performs a bitwise XOR of two 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
	/// values between both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_xor_pd(__m256d __a, __m256d __b)
	{
	  /* The XOR is done on the unsigned 64-bit integer view of the operands. */
	  __v4du __bits = (__v4du)__a ^ (__v4du)__b;
	  return (__m256d)__bits;
	}

	/// \brief Performs a bitwise XOR of two 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
	/// values between both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_xor_ps(__m256 __a, __m256 __b)
	{
	  /* The XOR is done on the unsigned 32-bit integer view of the operands. */
	  __v8su __bits = (__v8su)__a ^ (__v8su)__b;
	  return (__m256)__bits;
	}

	/* Horizontal arithmetic */
	/// \brief Horizontally adds the adjacent pairs of values contained in two
	/// 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// The horizontal sums of the values are returned in the even-indexed
	/// elements of a vector of [4 x double].
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// The horizontal sums of the values are returned in the odd-indexed
	/// elements of a vector of [4 x double].
	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
	/// both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_hadd_pd(__m256d __a, __m256d __b)
	{
	  /* View both operands as [4 x double] before calling the builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return (__m256d)__builtin_ia32_haddpd256(__lhs, __rhs);
	}

	/// \brief Horizontally adds the adjacent pairs of values contained in two
	/// 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// The horizontal sums of the values are returned in the elements with
	/// index 0, 1, 4, 5 of a vector of [8 x float].
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// The horizontal sums of the values are returned in the elements with
	/// index 2, 3, 6, 7 of a vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
	/// both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_hadd_ps(__m256 __a, __m256 __b)
	{
	  /* View both operands as [8 x float] before calling the builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return (__m256)__builtin_ia32_haddps256(__lhs, __rhs);
	}

	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
	/// 256-bit vectors of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// The horizontal differences between the values are returned in the
	/// even-indexed elements of a vector of [4 x double].
	/// \param __b
	/// A 256-bit vector of [4 x double] containing one of the source operands.
	/// The horizontal differences between the values are returned in the
	/// odd-indexed elements of a vector of [4 x double].
	/// \returns A 256-bit vector of [4 x double] containing the horizontal
	/// differences of both operands.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_hsub_pd(__m256d __a, __m256d __b)
	{
	  /* View both operands as [4 x double] before calling the builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return (__m256d)__builtin_ia32_hsubpd256(__lhs, __rhs);
	}

	/// \brief Horizontally subtracts the adjacent pairs of values contained in two
	/// 256-bit vectors of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// The horizontal differences between the values are returned in the
	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
	/// \param __b
	/// A 256-bit vector of [8 x float] containing one of the source operands.
	/// The horizontal differences between the values are returned in the
	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
	/// \returns A 256-bit vector of [8 x float] containing the horizontal
	/// differences of both operands.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_hsub_ps(__m256 __a, __m256 __b)
	{
	  /* View both operands as [8 x float] before calling the builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return (__m256)__builtin_ia32_hsubps256(__lhs, __rhs);
	}

	/* Vector permutations */
	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
	/// by the 128-bit integer vector operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \param __c
	/// A 128-bit integer vector operand specifying how the values are to be
	/// copied. \n
	/// Bit [1]: \n
	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
	/// vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
	/// returned vector. \n
	/// Bit [65]: \n
	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
	/// returned vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
	/// returned vector.
	/// \returns A 128-bit vector of [2 x double] containing the copied values.
	static __inline __m128d __DEFAULT_FN_ATTRS
	_mm_permutevar_pd(__m128d __a, __m128i __c)
	{
	  /* Data goes in as [2 x double]; the control vector as two 64-bit
	   * integers. */
	  __v2df __src = (__v2df)__a;
	  __v2di __ctl = (__v2di)__c;

	  return (__m128d)__builtin_ia32_vpermilvarpd(__src, __ctl);
	}

	/// \brief Copies the values in a 256-bit vector of [4 x double] as specified
	/// by the 256-bit integer vector operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \param __c
	/// A 256-bit integer vector operand specifying how the values are to be
	/// copied. \n
	/// Bit [1]: \n
	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
	/// vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
	/// returned vector. \n
	/// Bit [65]: \n
	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
	/// returned vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
	/// returned vector. \n
	/// Bit [129]: \n
	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
	/// returned vector. \n
	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
	/// returned vector. \n
	/// Bit [193]: \n
	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
	/// returned vector. \n
	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
	/// returned vector.
	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_permutevar_pd(__m256d __a, __m256i __c)
	{
	  /* Data goes in as [4 x double]; the control vector as four 64-bit
	   * integers. */
	  __v4df __src = (__v4df)__a;
	  __v4di __ctl = (__v4di)__c;

	  return (__m256d)__builtin_ia32_vpermilvarpd256(__src, __ctl);
	}

	/// \brief Copies the values stored in a 128-bit vector of [4 x float] as
	/// specified by the 128-bit integer vector operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \param __c
	/// A 128-bit integer vector operand specifying how the values are to be
	/// copied. \n
	/// Bits [1:0]: \n
	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// Bits [33:32]: \n
	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// Bits [65:64]: \n
	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// Bits [97:96]: \n
	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
	/// returned vector.
	/// \returns A 128-bit vector of [4 x float] containing the copied values.
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm_permutevar_ps(__m128 __a, __m128i __c)
	{
	  /* Data goes in as [4 x float]; the control vector as four 32-bit
	   * integers. */
	  __v4sf __src = (__v4sf)__a;
	  __v4si __ctl = (__v4si)__c;

	  return (__m128)__builtin_ia32_vpermilvarps(__src, __ctl);
	}

	/// \brief Copies the values stored in a 256-bit vector of [8 x float] as
	/// specified by the 256-bit integer vector operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \param __c
	/// A 256-bit integer vector operand specifying how the values are to be
	/// copied. \n
	/// Bits [1:0]: \n
	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// Bits [33:32]: \n
	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// Bits [65:64]: \n
	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// Bits [97:96]: \n
	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// Bits [129:128]: \n
	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// Bits [161:160]: \n
	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// Bits [193:192]: \n
	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// Bits [225:224]: \n
	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
	/// returned vector.
	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_permutevar_ps(__m256 __a, __m256i __c)
	{
	  /* Data goes in as [8 x float]; the control vector as eight 32-bit
	   * integers. */
	  __v8sf __src = (__v8sf)__a;
	  __v8si __ctl = (__v8si)__c;

	  return (__m256)__builtin_ia32_vpermilvarps256(__src, __ctl);
	}

	/// \brief Copies the values in a 128-bit vector of [2 x double] as specified
	/// by the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128d _mm_permute_pd(__m128d A, const int C);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
	///
	/// \param A
	/// A 128-bit vector of [2 x double].
	/// \param C
	/// An immediate integer operand specifying how the values are to be
	/// copied. \n
	/// Bit [0]: \n
	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
	/// vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
	/// returned vector. \n
	/// Bit [1]: \n
	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
	/// returned vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
	/// returned vector.
	/// \returns A 128-bit vector of [2 x double] containing the copied values.
	#define _mm_permute_pd(A, C) __extension__ ({ \
	  (__m128d)__builtin_shufflevector( \
	      (__v2df)(__m128d)(A), \
	      (__v2df)_mm_undefined_pd(), \
	      /* element 0 is picked by bit 0 of C, element 1 by bit 1; both */ \
	      /* indices are 0 or 1, so only A is ever selected */ \
	      ((C) & 0x1), \
	      (((C) >> 1) & 0x1)); })

	/// \brief Copies the values in a 256-bit vector of [4 x double] as specified by
	/// the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_permute_pd(__m256d A, const int C);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
	///
	/// \param A
	/// A 256-bit vector of [4 x double].
	/// \param C
	/// An immediate integer operand specifying how the values are to be
	/// copied. \n
	/// Bit [0]: \n
	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
	/// vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
	/// returned vector. \n
	/// Bit [1]: \n
	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
	/// returned vector. \n
	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
	/// returned vector. \n
	/// Bit [2]: \n
	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
	/// returned vector. \n
	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
	/// returned vector. \n
	/// Bit [3]: \n
	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
	/// returned vector. \n
	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
	/// returned vector.
	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	#define _mm256_permute_pd(A, C) __extension__ ({ \
	  (__m256d)__builtin_shufflevector( \
	      (__v4df)(__m256d)(A), \
	      (__v4df)_mm256_undefined_pd(), \
	      /* low 128-bit half: bits 0 and 1 of C choose between elements 0..1 */ \
	      ((C) & 0x1), \
	      (((C) >> 1) & 0x1), \
	      /* high 128-bit half: bits 2 and 3 of C choose between elements 2..3 */ \
	      (2 + (((C) >> 2) & 0x1)), \
	      (2 + (((C) >> 3) & 0x1))); })

	/// \brief Copies the values in a 128-bit vector of [4 x float] as specified by
	/// the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128 _mm_permute_ps(__m128 A, const int C);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
	///
	/// \param A
	/// A 128-bit vector of [4 x float].
	/// \param C
	/// An immediate integer operand specifying how the values are to be
	/// copied. \n
	/// Bits [1:0]: \n
	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// Bits [3:2]: \n
	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// Bits [5:4]: \n
	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// Bits [7:6]: \n
	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
	/// returned vector.
	/// \returns A 128-bit vector of [4 x float] containing the copied values.
	#define _mm_permute_ps(A, C) __extension__ ({ \
	  (__m128)__builtin_shufflevector( \
	      (__v4sf)(__m128)(A), \
	      (__v4sf)_mm_undefined_ps(), \
	      /* one 2-bit field of C per result element, lowest field first */ \
	      ((C) & 0x3), \
	      (((C) >> 2) & 0x3), \
	      (((C) >> 4) & 0x3), \
	      (((C) >> 6) & 0x3)); })

	/// \brief Copies the values in a 256-bit vector of [8 x float] as specified by
	/// the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_permute_ps(__m256 A, const int C);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
	///
	/// \param A
	/// A 256-bit vector of [8 x float].
	/// \param C
	/// An immediate integer operand specifying how the values are to be
	/// copied. \n
	/// Bits [1:0]: \n
	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
	/// returned vector. \n
	/// Bits [3:2]: \n
	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
	/// returned vector. \n
	/// Bits [5:4]: \n
	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
	/// returned vector. \n
	/// Bits [7:6]: \n
	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
	/// returned vector. \n
	/// Bits [1:0]: \n
	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
	/// returned vector. \n
	/// Bits [3:2]: \n
	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
	/// returned vector. \n
	/// Bits [5:4]: \n
	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
	/// returned vector. \n
	/// Bits [7:6]: \n
	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
	/// returned vector. \n
	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
	/// returned vector. \n
	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
	/// returned vector. \n
	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
	/// returned vector.
	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	#define _mm256_permute_ps(A, C) __extension__ ({ \
	  (__m256)__builtin_shufflevector( \
	      (__v8sf)(__m256)(A), \
	      (__v8sf)_mm256_undefined_ps(), \
	      /* low 128-bit half: each 2-bit field of C selects among elements 0..3 */ \
	      ((C) & 0x3), \
	      (((C) >> 2) & 0x3), \
	      (((C) >> 4) & 0x3), \
	      (((C) >> 6) & 0x3), \
	      /* high 128-bit half: the same four fields, offset to elements 4..7 */ \
	      (4 + ((C) & 0x3)), \
	      (4 + (((C) >> 2) & 0x3)), \
	      (4 + (((C) >> 4) & 0x3)), \
	      (4 + (((C) >> 6) & 0x3))); })

	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
	/// [4 x double], as specified by the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
	///
	/// \param V1
	/// A 256-bit vector of [4 x double].
	/// \param V2
	/// A 256-bit vector of [4 x double].
	/// \param M
	/// An immediate integer operand specifying how the values are to be
	/// permuted. \n
	/// Bits [1:0]: \n
	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
	/// destination. \n
	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
	/// destination. \n
	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
	/// destination. \n
	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
	/// destination. \n
	/// Bits [5:4]: \n
	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
	/// destination. \n
	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
	/// destination. \n
	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
	/// destination. \n
	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
	/// destination.
	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	#define _mm256_permute2f128_pd(V1, V2, M) __extension__ ({ /* statement expression: yields the value below */ \
	(__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), /* V1 cast to __m256d, then to __v4df */ \
	(__v4df)(__m256d)(V2), (M)); }) /* V2 cast likewise; M reaches the builtin unmodified */

	/// \brief Permutes 128-bit data values stored in two 256-bit vectors of
	/// [8 x float], as specified by the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
	///
	/// \param V1
	/// A 256-bit vector of [8 x float].
	/// \param V2
	/// A 256-bit vector of [8 x float].
	/// \param M
	/// An immediate integer operand specifying how the values are to be
	/// permuted. \n
	/// Bits [1:0]: \n
	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
	/// destination. \n
	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
	/// destination. \n
	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
	/// destination. \n
	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
	/// destination. \n
	/// Bits [5:4]: \n
	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
	/// destination. \n
	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
	/// destination. \n
	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
	/// destination. \n
	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
	/// destination.
	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	#define _mm256_permute2f128_ps(V1, V2, M) __extension__ ({ /* statement expression: yields the value below */ \
	(__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), /* V1 cast to __m256, then to __v8sf */ \
	(__v8sf)(__m256)(V2), (M)); }) /* V2 cast likewise; M reaches the builtin unmodified */

	/// \brief Permutes 128-bit data values stored in two 256-bit integer vectors,
	/// as specified by the immediate integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
	///
	/// \param V1
	/// A 256-bit integer vector.
	/// \param V2
	/// A 256-bit integer vector.
	/// \param M
	/// An immediate integer operand specifying how the values are to be copied. \n
	/// Bits [1:0]: \n
	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
	/// destination. \n
	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
	/// destination. \n
	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
	/// destination. \n
	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
	/// destination. \n
	/// Bits [5:4]: \n
	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
	/// destination. \n
	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
	/// destination. \n
	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
	/// destination. \n
	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
	/// destination.
	/// \returns A 256-bit integer vector containing the copied values.
	#define _mm256_permute2f128_si256(V1, V2, M) __extension__ ({ /* statement expression: yields the value below */ \
	(__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), /* V1 cast to __m256i, then to __v8si */ \
	(__v8si)(__m256i)(V2), (M)); }) /* V2 cast likewise; M reaches the builtin unmodified */

	/* Vector Blend */
	/// \brief Merges 64-bit double-precision data values stored in either of the
	/// two 256-bit vectors of [4 x double], as specified by the immediate
	/// integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
	///
	/// \param V1
	/// A 256-bit vector of [4 x double].
	/// \param V2
	/// A 256-bit vector of [4 x double].
	/// \param M
	/// An immediate integer operand, with mask bits [3:0] specifying how the
	/// values are to be copied. The position of the mask bit corresponds to the
	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
	/// element in operand \a V1 is copied to the same position in the
	/// destination. When a mask bit is 1, the corresponding 64-bit element in
	/// operand \a V2 is copied to the same position in the destination.
	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	#define _mm256_blend_pd(V1, V2, M) __extension__ ({ \
	  (__m256d)__builtin_shufflevector( \
	      (__v4df)(__m256d)(V1), \
	      (__v4df)(__m256d)(V2), \
	      /* result element i: mask bit i clear -> index i (from V1), */ \
	      /* mask bit i set -> index 4 + i (from V2) */ \
	      ((((M) & 0x01) != 0) ? 4 : 0), \
	      ((((M) & 0x02) != 0) ? 5 : 1), \
	      ((((M) & 0x04) != 0) ? 6 : 2), \
	      ((((M) & 0x08) != 0) ? 7 : 3)); })

	/// \brief Merges 32-bit single-precision data values stored in either of the
	/// two 256-bit vectors of [8 x float], as specified by the immediate
	/// integer operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
	///
	/// \param V1
	/// A 256-bit vector of [8 x float].
	/// \param V2
	/// A 256-bit vector of [8 x float].
	/// \param M
	/// An immediate integer operand, with mask bits [7:0] specifying how the
	/// values are to be copied. The position of the mask bit corresponds to the
	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
	/// element in operand \a V1 is copied to the same position in the
	/// destination. When a mask bit is 1, the corresponding 32-bit element in
	/// operand \a V2 is copied to the same position in the destination.
	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	#define _mm256_blend_ps(V1, V2, M) __extension__ ({ \
	  (__m256)__builtin_shufflevector( \
	      (__v8sf)(__m256)(V1), \
	      (__v8sf)(__m256)(V2), \
	      /* result element i: mask bit i clear -> index i (from V1), */ \
	      /* mask bit i set -> index 8 + i (from V2) */ \
	      ((((M) & 0x01) != 0) ?  8 : 0), \
	      ((((M) & 0x02) != 0) ?  9 : 1), \
	      ((((M) & 0x04) != 0) ? 10 : 2), \
	      ((((M) & 0x08) != 0) ? 11 : 3), \
	      ((((M) & 0x10) != 0) ? 12 : 4), \
	      ((((M) & 0x20) != 0) ? 13 : 5), \
	      ((((M) & 0x40) != 0) ? 14 : 6), \
	      ((((M) & 0x80) != 0) ? 15 : 7)); })

	/// \brief Merges 64-bit double-precision data values stored in either of the
	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
	/// operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \param __b
	/// A 256-bit vector of [4 x double].
	/// \param __c
	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
	/// how the values are to be copied. The position of the mask bit corresponds
	/// to the most significant bit of a copied value. When a mask bit is 0, the
	/// corresponding 64-bit element in operand \a __a is copied to the same
	/// position in the destination. When a mask bit is 1, the corresponding
	/// 64-bit element in operand \a __b is copied to the same position in the
	/// destination.
	/// \returns A 256-bit vector of [4 x double] containing the copied values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
	{
	  /* All three operands, the selector included, are passed as
	   * [4 x double]. */
	  __v4df __first = (__v4df)__a;
	  __v4df __second = (__v4df)__b;
	  __v4df __selector = (__v4df)__c;

	  return (__m256d)__builtin_ia32_blendvpd256(__first, __second, __selector);
	}

	/// \brief Merges 32-bit single-precision data values stored in either of the
	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
	/// operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \param __b
	/// A 256-bit vector of [8 x float].
	/// \param __c
	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
	/// and 31 specifying how the values are to be copied. The position of the
	/// mask bit corresponds to the most significant bit of a copied value. When
	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
	/// copied to the same position in the destination. When a mask bit is 1, the
	/// corresponding 32-bit element in operand \a __b is copied to the same
	/// position in the destination.
	/// \returns A 256-bit vector of [8 x float] containing the copied values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
	{
	  /* All three operands, the selector included, are passed as
	   * [8 x float]. */
	  __v8sf __first = (__v8sf)__a;
	  __v8sf __second = (__v8sf)__b;
	  __v8sf __selector = (__v8sf)__c;

	  return (__m256)__builtin_ia32_blendvps256(__first, __second, __selector);
	}

	/* Vector Dot Product */
	/// \brief Computes two dot products in parallel, using the lower and upper
	/// halves of two [8 x float] vectors as input to the two computations, and
	/// returning the two dot products in the lower and upper halves of the
	/// [8 x float] result.
	///
	/// The immediate integer operand controls which input elements will
	/// contribute to the dot product, and where the final results are returned.
	/// In general, for each dot product, the four corresponding elements of the
	/// input vectors are multiplied; the first two and second two products are
	/// summed, then the two sums are added to form the final result.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
	///
	/// \param V1
	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
	/// \param V2
	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
	/// \param M
	/// An immediate integer argument. Bits [7:4] determine which elements of
	/// the input vectors are used, with bit [4] corresponding to the lowest
	/// element and bit [7] corresponding to the highest element of each [4 x
	/// float] subvector. If a bit is set, the corresponding elements from the
	/// two input vectors are used as an input for dot product; otherwise that
	/// input is treated as zero. Bits [3:0] determine which elements of the
	/// result will receive a copy of the final dot product, with bit [0]
	/// corresponding to the lowest element and bit [3] corresponding to the
	/// highest element of each [4 x float] subvector. If a bit is set, the dot
	/// product is returned in the corresponding element; otherwise that element
	/// is set to zero. The bitmask is applied in the same way to each of the
	/// two parallel dot product computations.
	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
	#define _mm256_dp_ps(V1, V2, M) __extension__ ({ /* statement expression: yields the value below */ \
	(__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), /* V1 cast to __m256, then to __v8sf */ \
	(__v8sf)(__m256)(V2), (M)); }) /* V2 cast likewise; M reaches the builtin unmodified */

	/* Vector shuffle */
	/// \brief Selects 8 float values from the 256-bit operands of [8 x float], as
	/// specified by the immediate value operand.
	///
	/// The four selected elements in each operand are copied to the destination
	/// according to the bits specified in the immediate operand. The selected
	/// elements from the first 256-bit operand are copied to bits [63:0] and
	/// bits [191:128] of the destination, and the selected elements from the
	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
	/// the destination. For example, if bits [7:0] of the immediate operand
	/// contain a value of 0xFF, the 256-bit destination vector would contain the
	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
	///
	/// \param a
	/// A 256-bit vector of [8 x float]. The four selected elements in this
	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
	/// according to the bits specified in the immediate operand.
	/// \param b
	/// A 256-bit vector of [8 x float]. The four selected elements in this
	/// operand are copied to bits [127:64] and bits [255:192] in the
	/// destination, according to the bits specified in the immediate operand.
	/// \param mask
	/// An immediate value containing an 8-bit value specifying which elements to
	/// copy from \a a and \a b. \n
	/// Bits [3:0] specify the values copied from operand \a a. \n
	/// Bits [7:4] specify the values copied from operand \a b. \n
	/// The destinations within the 256-bit destination are assigned values as
	/// follows, according to the bit value assignments described below: \n
	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
	/// destination. \n
	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
	/// destination. \n
	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
	/// destination. \n
	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
	/// the destination. \n
	/// Bit value assignments: \n
	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
	/// 11: Bits [127:96] and [255:224] are copied from the selected operand.
	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
	#define _mm256_shuffle_ps(a, b, mask) \
	  __extension__ ({ \
	    (__m256)__builtin_shufflevector( \
	        (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), \
	        /* Shuffle indices 0-7 pick from a, 8-15 pick from b. */ \
	        /* Lower half: two elements of a[0..3], then two of b[0..3]. */ \
	        ((mask) & 0x3), \
	        (((mask) >> 2) & 0x3), \
	        ((((mask) >> 4) & 0x3) + 8), \
	        ((((mask) >> 6) & 0x3) + 8), \
	        /* Upper half: same selectors applied to a[4..7] and b[4..7]. */ \
	        (((mask) & 0x3) + 4), \
	        ((((mask) >> 2) & 0x3) + 4), \
	        ((((mask) >> 4) & 0x3) + 12), \
	        ((((mask) >> 6) & 0x3) + 12)); \
	  })

	/// \brief Builds a 256-bit vector of [4 x double] by selecting elements from
	///    the two 256-bit operands of [4 x double], as directed by the immediate
	///    value operand.
	///
	///    Elements selected from the first operand are written to bits [63:0]
	///    and bits [191:128] of the destination; elements selected from the
	///    second operand are written to bits [127:64] and bits [255:192]. For
	///    example, if bits [3:0] of the immediate operand contain a value of
	///    0xF, the 256-bit destination vector would contain the following
	///    values: b[3], a[3], b[1], a[1].
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
	///
	/// \param a
	///    A 256-bit vector of [4 x double].
	/// \param b
	///    A 256-bit vector of [4 x double].
	/// \param mask
	///    An immediate value whose bits [3:0] specify which elements to copy
	///    from \a a and \a b: \n
	///    Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
	///    destination. \n
	///    Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
	///    destination. \n
	///    Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
	///    destination. \n
	///    Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
	///    destination. \n
	///    Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of
	///    the destination. \n
	///    Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of
	///    the destination. \n
	///    Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of
	///    the destination. \n
	///    Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of
	///    the destination.
	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
	#define _mm256_shuffle_pd(a, b, mask) \
	  __extension__ ({ \
	    (__m256d)__builtin_shufflevector( \
	        (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), \
	        /* Shuffle indices 0-3 pick from a, 4-7 pick from b. */ \
	        ((mask) & 0x1), \
	        ((((mask) >> 1) & 0x1) + 4), \
	        ((((mask) >> 2) & 0x1) + 2), \
	        ((((mask) >> 3) & 0x1) + 6)); \
	  })

	/* Compare */
	/* Predicate encodings (0x00 through 0x1f) for the immediate operand of the
	 * _mm_cmp_pd/_ps/_sd/_ss and _mm256_cmp_pd/_ps macros in this header, whose
	 * documentation states that bits [4:0] of the immediate select the
	 * comparison operation.
	 *
	 * As the per-constant comments show, the name suffix encodes the behavior:
	 * O = ordered, U = unordered, Q = non-signaling, S = signaling. */
	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */

	/// \brief Performs an element-wise comparison of the double-precision values
	///    of two 128-bit vectors of [2 x double], using the operation selected
	///    by the immediate integer operand.
	///
	///    The result is a [2 x double] vector holding the two comparison
	///    results: all 1's where the comparison is true, and zero where it is
	///    false.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
	///
	/// \param a
	///    A 128-bit vector of [2 x double].
	/// \param b
	///    A 128-bit vector of [2 x double].
	/// \param c
	///    An immediate integer operand; bits [4:0] select the comparison
	///    operation. The \c _CMP_* macros in this header name these values: \n
	///    0x00: Equal (ordered, non-signaling) \n
	///    0x01: Less-than (ordered, signaling) \n
	///    0x02: Less-than-or-equal (ordered, signaling) \n
	///    0x03: Unordered (non-signaling) \n
	///    0x04: Not-equal (unordered, non-signaling) \n
	///    0x05: Not-less-than (unordered, signaling) \n
	///    0x06: Not-less-than-or-equal (unordered, signaling) \n
	///    0x07: Ordered (non-signaling) \n
	///    0x08: Equal (unordered, non-signaling) \n
	///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
	///    0x0a: Not-greater-than (unordered, signaling) \n
	///    0x0b: False (ordered, non-signaling) \n
	///    0x0c: Not-equal (ordered, non-signaling) \n
	///    0x0d: Greater-than-or-equal (ordered, signaling) \n
	///    0x0e: Greater-than (ordered, signaling) \n
	///    0x0f: True (unordered, non-signaling) \n
	///    0x10: Equal (ordered, signaling) \n
	///    0x11: Less-than (ordered, non-signaling) \n
	///    0x12: Less-than-or-equal (ordered, non-signaling) \n
	///    0x13: Unordered (signaling) \n
	///    0x14: Not-equal (unordered, signaling) \n
	///    0x15: Not-less-than (unordered, non-signaling) \n
	///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
	///    0x17: Ordered (signaling) \n
	///    0x18: Equal (unordered, signaling) \n
	///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
	///    0x1a: Not-greater-than (unordered, non-signaling) \n
	///    0x1b: False (ordered, signaling) \n
	///    0x1c: Not-equal (ordered, signaling) \n
	///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
	///    0x1e: Greater-than (ordered, non-signaling) \n
	///    0x1f: True (unordered, signaling)
	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
	#define _mm_cmp_pd(a, b, c) \
	  __extension__ ({ \
	    (__m128d)__builtin_ia32_cmppd( \
	        (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)); \
	  })

	/// \brief Performs an element-wise comparison of the values of two 128-bit
	///    vectors of [4 x float], using the operation selected by the immediate
	///    integer operand.
	///
	///    The result is a [4 x float] vector holding the four comparison
	///    results: all 1's where the comparison is true, and zero where it is
	///    false.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
	///
	/// \param a
	///    A 128-bit vector of [4 x float].
	/// \param b
	///    A 128-bit vector of [4 x float].
	/// \param c
	///    An immediate integer operand; bits [4:0] select the comparison
	///    operation. The \c _CMP_* macros in this header name these values: \n
	///    0x00: Equal (ordered, non-signaling) \n
	///    0x01: Less-than (ordered, signaling) \n
	///    0x02: Less-than-or-equal (ordered, signaling) \n
	///    0x03: Unordered (non-signaling) \n
	///    0x04: Not-equal (unordered, non-signaling) \n
	///    0x05: Not-less-than (unordered, signaling) \n
	///    0x06: Not-less-than-or-equal (unordered, signaling) \n
	///    0x07: Ordered (non-signaling) \n
	///    0x08: Equal (unordered, non-signaling) \n
	///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
	///    0x0a: Not-greater-than (unordered, signaling) \n
	///    0x0b: False (ordered, non-signaling) \n
	///    0x0c: Not-equal (ordered, non-signaling) \n
	///    0x0d: Greater-than-or-equal (ordered, signaling) \n
	///    0x0e: Greater-than (ordered, signaling) \n
	///    0x0f: True (unordered, non-signaling) \n
	///    0x10: Equal (ordered, signaling) \n
	///    0x11: Less-than (ordered, non-signaling) \n
	///    0x12: Less-than-or-equal (ordered, non-signaling) \n
	///    0x13: Unordered (signaling) \n
	///    0x14: Not-equal (unordered, signaling) \n
	///    0x15: Not-less-than (unordered, non-signaling) \n
	///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
	///    0x17: Ordered (signaling) \n
	///    0x18: Equal (unordered, signaling) \n
	///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
	///    0x1a: Not-greater-than (unordered, non-signaling) \n
	///    0x1b: False (ordered, signaling) \n
	///    0x1c: Not-equal (ordered, signaling) \n
	///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
	///    0x1e: Greater-than (ordered, non-signaling) \n
	///    0x1f: True (unordered, signaling)
	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
	#define _mm_cmp_ps(a, b, c) \
	  __extension__ ({ \
	    (__m128)__builtin_ia32_cmpps( \
	        (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)); \
	  })

	/// \brief Performs an element-wise comparison of the double-precision values
	///    of two 256-bit vectors of [4 x double], using the operation selected
	///    by the immediate integer operand.
	///
	///    The result is a [4 x double] vector holding the four comparison
	///    results: all 1's where the comparison is true, and zero where it is
	///    false.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
	///
	/// \param a
	///    A 256-bit vector of [4 x double].
	/// \param b
	///    A 256-bit vector of [4 x double].
	/// \param c
	///    An immediate integer operand; bits [4:0] select the comparison
	///    operation. The \c _CMP_* macros in this header name these values: \n
	///    0x00: Equal (ordered, non-signaling) \n
	///    0x01: Less-than (ordered, signaling) \n
	///    0x02: Less-than-or-equal (ordered, signaling) \n
	///    0x03: Unordered (non-signaling) \n
	///    0x04: Not-equal (unordered, non-signaling) \n
	///    0x05: Not-less-than (unordered, signaling) \n
	///    0x06: Not-less-than-or-equal (unordered, signaling) \n
	///    0x07: Ordered (non-signaling) \n
	///    0x08: Equal (unordered, non-signaling) \n
	///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
	///    0x0a: Not-greater-than (unordered, signaling) \n
	///    0x0b: False (ordered, non-signaling) \n
	///    0x0c: Not-equal (ordered, non-signaling) \n
	///    0x0d: Greater-than-or-equal (ordered, signaling) \n
	///    0x0e: Greater-than (ordered, signaling) \n
	///    0x0f: True (unordered, non-signaling) \n
	///    0x10: Equal (ordered, signaling) \n
	///    0x11: Less-than (ordered, non-signaling) \n
	///    0x12: Less-than-or-equal (ordered, non-signaling) \n
	///    0x13: Unordered (signaling) \n
	///    0x14: Not-equal (unordered, signaling) \n
	///    0x15: Not-less-than (unordered, non-signaling) \n
	///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
	///    0x17: Ordered (signaling) \n
	///    0x18: Equal (unordered, signaling) \n
	///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
	///    0x1a: Not-greater-than (unordered, non-signaling) \n
	///    0x1b: False (ordered, signaling) \n
	///    0x1c: Not-equal (ordered, signaling) \n
	///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
	///    0x1e: Greater-than (ordered, non-signaling) \n
	///    0x1f: True (unordered, signaling)
	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
	#define _mm256_cmp_pd(a, b, c) \
	  __extension__ ({ \
	    (__m256d)__builtin_ia32_cmppd256( \
	        (__v4df)(__m256d)(a), (__v4df)(__m256d)(b), (c)); \
	  })

	/// \brief Performs an element-wise comparison of the values of two 256-bit
	///    vectors of [8 x float], using the operation selected by the immediate
	///    integer operand.
	///
	///    The result is a [8 x float] vector holding the eight comparison
	///    results: all 1's where the comparison is true, and zero where it is
	///    false.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
	///
	/// \param a
	///    A 256-bit vector of [8 x float].
	/// \param b
	///    A 256-bit vector of [8 x float].
	/// \param c
	///    An immediate integer operand; bits [4:0] select the comparison
	///    operation. The \c _CMP_* macros in this header name these values: \n
	///    0x00: Equal (ordered, non-signaling) \n
	///    0x01: Less-than (ordered, signaling) \n
	///    0x02: Less-than-or-equal (ordered, signaling) \n
	///    0x03: Unordered (non-signaling) \n
	///    0x04: Not-equal (unordered, non-signaling) \n
	///    0x05: Not-less-than (unordered, signaling) \n
	///    0x06: Not-less-than-or-equal (unordered, signaling) \n
	///    0x07: Ordered (non-signaling) \n
	///    0x08: Equal (unordered, non-signaling) \n
	///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
	///    0x0a: Not-greater-than (unordered, signaling) \n
	///    0x0b: False (ordered, non-signaling) \n
	///    0x0c: Not-equal (ordered, non-signaling) \n
	///    0x0d: Greater-than-or-equal (ordered, signaling) \n
	///    0x0e: Greater-than (ordered, signaling) \n
	///    0x0f: True (unordered, non-signaling) \n
	///    0x10: Equal (ordered, signaling) \n
	///    0x11: Less-than (ordered, non-signaling) \n
	///    0x12: Less-than-or-equal (ordered, non-signaling) \n
	///    0x13: Unordered (signaling) \n
	///    0x14: Not-equal (unordered, signaling) \n
	///    0x15: Not-less-than (unordered, non-signaling) \n
	///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
	///    0x17: Ordered (signaling) \n
	///    0x18: Equal (unordered, signaling) \n
	///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
	///    0x1a: Not-greater-than (unordered, non-signaling) \n
	///    0x1b: False (ordered, signaling) \n
	///    0x1c: Not-equal (ordered, signaling) \n
	///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
	///    0x1e: Greater-than (ordered, non-signaling) \n
	///    0x1f: True (unordered, signaling)
	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
	#define _mm256_cmp_ps(a, b, c) \
	  __extension__ ({ \
	    (__m256)__builtin_ia32_cmpps256( \
	        (__v8sf)(__m256)(a), (__v8sf)(__m256)(b), (c)); \
	  })

	/// \brief Compares the corresponding scalar double-precision values of two
	///    128-bit vectors of [2 x double], using the operation selected by the
	///    immediate integer operand.
	///
	///    If the comparison is true, all 64 bits of the destination vector are
	///    set; otherwise they are cleared.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
	///
	/// \param a
	///    A 128-bit vector of [2 x double].
	/// \param b
	///    A 128-bit vector of [2 x double].
	/// \param c
	///    An immediate integer operand; bits [4:0] select the comparison
	///    operation. The \c _CMP_* macros in this header name these values: \n
	///    0x00: Equal (ordered, non-signaling) \n
	///    0x01: Less-than (ordered, signaling) \n
	///    0x02: Less-than-or-equal (ordered, signaling) \n
	///    0x03: Unordered (non-signaling) \n
	///    0x04: Not-equal (unordered, non-signaling) \n
	///    0x05: Not-less-than (unordered, signaling) \n
	///    0x06: Not-less-than-or-equal (unordered, signaling) \n
	///    0x07: Ordered (non-signaling) \n
	///    0x08: Equal (unordered, non-signaling) \n
	///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
	///    0x0a: Not-greater-than (unordered, signaling) \n
	///    0x0b: False (ordered, non-signaling) \n
	///    0x0c: Not-equal (ordered, non-signaling) \n
	///    0x0d: Greater-than-or-equal (ordered, signaling) \n
	///    0x0e: Greater-than (ordered, signaling) \n
	///    0x0f: True (unordered, non-signaling) \n
	///    0x10: Equal (ordered, signaling) \n
	///    0x11: Less-than (ordered, non-signaling) \n
	///    0x12: Less-than-or-equal (ordered, non-signaling) \n
	///    0x13: Unordered (signaling) \n
	///    0x14: Not-equal (unordered, signaling) \n
	///    0x15: Not-less-than (unordered, non-signaling) \n
	///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
	///    0x17: Ordered (signaling) \n
	///    0x18: Equal (unordered, signaling) \n
	///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
	///    0x1a: Not-greater-than (unordered, non-signaling) \n
	///    0x1b: False (ordered, signaling) \n
	///    0x1c: Not-equal (ordered, signaling) \n
	///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
	///    0x1e: Greater-than (ordered, non-signaling) \n
	///    0x1f: True (unordered, signaling)
	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
	#define _mm_cmp_sd(a, b, c) \
	  __extension__ ({ \
	    (__m128d)__builtin_ia32_cmpsd( \
	        (__v2df)(__m128d)(a), (__v2df)(__m128d)(b), (c)); \
	  })

	/// \brief Compares the corresponding scalar values of two 128-bit vectors of
	///    [4 x float], using the operation selected by the immediate integer
	///    operand.
	///
	///    If the comparison is true, all 32 bits of the destination vector are
	///    set; otherwise they are cleared.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
	///
	/// \param a
	///    A 128-bit vector of [4 x float].
	/// \param b
	///    A 128-bit vector of [4 x float].
	/// \param c
	///    An immediate integer operand; bits [4:0] select the comparison
	///    operation. The \c _CMP_* macros in this header name these values: \n
	///    0x00: Equal (ordered, non-signaling) \n
	///    0x01: Less-than (ordered, signaling) \n
	///    0x02: Less-than-or-equal (ordered, signaling) \n
	///    0x03: Unordered (non-signaling) \n
	///    0x04: Not-equal (unordered, non-signaling) \n
	///    0x05: Not-less-than (unordered, signaling) \n
	///    0x06: Not-less-than-or-equal (unordered, signaling) \n
	///    0x07: Ordered (non-signaling) \n
	///    0x08: Equal (unordered, non-signaling) \n
	///    0x09: Not-greater-than-or-equal (unordered, signaling) \n
	///    0x0a: Not-greater-than (unordered, signaling) \n
	///    0x0b: False (ordered, non-signaling) \n
	///    0x0c: Not-equal (ordered, non-signaling) \n
	///    0x0d: Greater-than-or-equal (ordered, signaling) \n
	///    0x0e: Greater-than (ordered, signaling) \n
	///    0x0f: True (unordered, non-signaling) \n
	///    0x10: Equal (ordered, signaling) \n
	///    0x11: Less-than (ordered, non-signaling) \n
	///    0x12: Less-than-or-equal (ordered, non-signaling) \n
	///    0x13: Unordered (signaling) \n
	///    0x14: Not-equal (unordered, signaling) \n
	///    0x15: Not-less-than (unordered, non-signaling) \n
	///    0x16: Not-less-than-or-equal (unordered, non-signaling) \n
	///    0x17: Ordered (signaling) \n
	///    0x18: Equal (unordered, signaling) \n
	///    0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
	///    0x1a: Not-greater-than (unordered, non-signaling) \n
	///    0x1b: False (ordered, signaling) \n
	///    0x1c: Not-equal (ordered, signaling) \n
	///    0x1d: Greater-than-or-equal (ordered, non-signaling) \n
	///    0x1e: Greater-than (ordered, non-signaling) \n
	///    0x1f: True (unordered, signaling)
	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
	#define _mm_cmp_ss(a, b, c) \
	  __extension__ ({ \
	    (__m128)__builtin_ia32_cmpss( \
	        (__v4sf)(__m128)(a), (__v4sf)(__m128)(b), (c)); \
	  })

	/// \brief Extracts one 32-bit element from a 256-bit vector of [8 x i32];
	///    the element is chosen by the immediate constant operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A 256-bit vector of [8 x i32].
	/// \param __imm
	///    An immediate integer operand. Only bits [2:0] are used; they give the
	///    index of the element that is extracted and returned.
	/// \returns A 32-bit integer containing the selected element.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_extract_epi32(__m256i __a, const int __imm)
	{
	  const int __idx = __imm & 7;
	  __v8si __elts = (__v8si)__a;
	  return __elts[__idx];
	}

	/// \brief Extracts one 16-bit element from a 256-bit vector of [16 x i16];
	///    the element is chosen by the immediate constant operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A 256-bit integer vector of [16 x i16].
	/// \param __imm
	///    An immediate integer operand. Only bits [3:0] are used; they give the
	///    index of the element that is extracted and returned.
	/// \returns A 32-bit integer containing the selected 16-bit element, zero
	///    extended.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_extract_epi16(__m256i __a, const int __imm)
	{
	  __v16hi __elts = (__v16hi)__a;
	  /* Go through an unsigned type so the widening to int zero-extends. */
	  unsigned short __elt = (unsigned short)__elts[__imm & 15];
	  return __elt;
	}

	/// \brief Extracts one 8-bit element from a 256-bit vector of [32 x i8];
	///    the element is chosen by the immediate constant operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A 256-bit integer vector of [32 x i8].
	/// \param __imm
	///    An immediate integer operand. Only bits [4:0] are used; they give the
	///    index of the element that is extracted and returned.
	/// \returns A 32-bit integer containing the selected 8-bit element, zero
	///    extended.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_extract_epi8(__m256i __a, const int __imm)
	{
	  __v32qi __elts = (__v32qi)__a;
	  /* Go through an unsigned type so the widening to int zero-extends. */
	  unsigned char __elt = (unsigned char)__elts[__imm & 31];
	  return __elt;
	}

	#ifdef __x86_64__
	/// \brief Extracts one 64-bit element from a 256-bit vector of [4 x i64];
	///    the element is chosen by the immediate constant operand.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A 256-bit integer vector of [4 x i64].
	/// \param __imm
	///    An immediate integer operand. Only bits [1:0] are used; they give the
	///    index of the element that is extracted and returned.
	/// \returns A 64-bit integer containing the selected element.
	static __inline long long __DEFAULT_FN_ATTRS
	_mm256_extract_epi64(__m256i __a, const int __imm)
	{
	  const int __idx = __imm & 3;
	  __v4di __elts = (__v4di)__a;
	  return __elts[__idx];
	}
	#endif

	/// \brief Returns a copy of a [8 x i32] vector in which the element indexed
	///    by the immediate constant operand has been replaced by a new value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A vector of [8 x i32] to be used by the insert operation.
	/// \param __b
	///    An integer value. The replacement value for the insert operation.
	/// \param __imm
	///    An immediate integer specifying the index of the vector element to be
	///    replaced. Only bits [2:0] are used.
	/// \returns A copy of vector \a __a, after replacing its element indexed by
	///    \a __imm with \a __b.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_insert_epi32(__m256i __a, int __b, int const __imm)
	{
	  const int __idx = __imm & 7;
	  __v8si __elts = (__v8si)__a;
	  __elts[__idx] = __b;
	  return (__m256i)__elts;
	}


	/// \brief Returns a copy of a [16 x i16] vector in which the element indexed
	///    by the immediate constant operand has been replaced by a new value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A vector of [16 x i16] to be used by the insert operation.
	/// \param __b
	///    An i16 integer value. The replacement value for the insert operation.
	/// \param __imm
	///    An immediate integer specifying the index of the vector element to be
	///    replaced. Only bits [3:0] are used.
	/// \returns A copy of vector \a __a, after replacing its element indexed by
	///    \a __imm with \a __b.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_insert_epi16(__m256i __a, int __b, int const __imm)
	{
	  const int __idx = __imm & 15;
	  __v16hi __elts = (__v16hi)__a;
	  __elts[__idx] = __b;
	  return (__m256i)__elts;
	}

	/// \brief Returns a copy of a [32 x i8] vector in which the element indexed
	///    by the immediate constant operand has been replaced by a new value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A vector of [32 x i8] to be used by the insert operation.
	/// \param __b
	///    An i8 integer value. The replacement value for the insert operation.
	/// \param __imm
	///    An immediate integer specifying the index of the vector element to be
	///    replaced. Only bits [4:0] are used.
	/// \returns A copy of vector \a __a, after replacing its element indexed by
	///    \a __imm with \a __b.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_insert_epi8(__m256i __a, int __b, int const __imm)
	{
	  const int __idx = __imm & 31;
	  __v32qi __elts = (__v32qi)__a;
	  __elts[__idx] = __b;
	  return (__m256i)__elts;
	}

	#ifdef __x86_64__
	/// \brief Returns a copy of a [4 x i64] vector in which the element indexed
	///    by the immediate constant operand has been replaced by a new value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
	/// instruction.
	///
	/// \param __a
	///    A vector of [4 x i64] to be used by the insert operation.
	/// \param __b
	///    A 64-bit integer value. The replacement value for the insert
	///    operation.
	/// \param __imm
	///    An immediate integer specifying the index of the vector element to be
	///    replaced. Only bits [1:0] are used.
	/// \returns A copy of vector \a __a, after replacing its element indexed by
	///    \a __imm with \a __b.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_insert_epi64(__m256i __a, long long __b, int const __imm)
	{
	  const int __idx = __imm & 3;
	  __v4di __elts = (__v4di)__a;
	  __elts[__idx] = __b;
	  return (__m256i)__elts;
	}
	#endif

	/* Conversion */
	/// \brief Converts each element of a vector of [4 x i32] to double
	///    precision, producing a vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
	///
	/// \param __a
	///    A 128-bit integer vector of [4 x i32].
	/// \returns A 256-bit vector of [4 x double] containing the converted values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_cvtepi32_pd(__m128i __a)
	{
	  __v4si __ints = (__v4si)__a;
	  return (__m256d)__builtin_convertvector(__ints, __v4df);
	}

	/// \brief Converts each element of a vector of [8 x i32] to single
	///    precision, producing a vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
	///
	/// \param __a
	///    A 256-bit integer vector.
	/// \returns A 256-bit vector of [8 x float] containing the converted values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_cvtepi32_ps(__m256i __a)
	{
	  __v8si __ints = (__v8si)__a;
	  return (__m256)__builtin_ia32_cvtdq2ps256(__ints);
	}

	/// \brief Narrows a 256-bit vector of [4 x double] to a 128-bit vector of
	///    [4 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
	///
	/// \param __a
	///    A 256-bit vector of [4 x double].
	/// \returns A 128-bit vector of [4 x float] containing the converted values.
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm256_cvtpd_ps(__m256d __a)
	{
	  __v4df __doubles = (__v4df)__a;
	  return (__m128)__builtin_ia32_cvtpd2ps256(__doubles);
	}

	/// \brief Converts each element of a vector of [8 x float] to a 32-bit
	///    integer, producing a vector of [8 x i32].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
	///
	/// \param __a
	///    A 256-bit vector of [8 x float].
	/// \returns A 256-bit integer vector containing the converted values.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_cvtps_epi32(__m256 __a)
	{
	  __v8sf __floats = (__v8sf)__a;
	  return (__m256i)__builtin_ia32_cvtps2dq256(__floats);
	}

	/// \brief Widens a 128-bit vector of [4 x float] to a 256-bit vector of
	///    [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
	///
	/// \param __a
	///    A 128-bit vector of [4 x float].
	/// \returns A 256-bit vector of [4 x double] containing the converted values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_cvtps_pd(__m128 __a)
	{
	  __v4sf __floats = (__v4sf)__a;
	  return (__m256d)__builtin_convertvector(__floats, __v4df);
	}

	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
	///    [4 x i32]. An inexact result is truncated, i.e. rounded towards zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
	///
	/// \param __a
	///    A 256-bit vector of [4 x double].
	/// \returns A 128-bit integer vector containing the converted values.
	static __inline __m128i __DEFAULT_FN_ATTRS
	_mm256_cvttpd_epi32(__m256d __a)
	{
	  __v4df __doubles = (__v4df)__a;
	  return (__m128i)__builtin_ia32_cvttpd2dq256(__doubles);
	}

	/// \brief Converts a 256-bit vector of [4 x double] into a 128-bit vector of
	///    [4 x i32]. An inexact result is rounded according to the rounding
	///    control bits in the MXCSR register.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
	///
	/// \param __a
	///    A 256-bit vector of [4 x double].
	/// \returns A 128-bit integer vector containing the converted values.
	static __inline __m128i __DEFAULT_FN_ATTRS
	_mm256_cvtpd_epi32(__m256d __a)
	{
	  __v4df __doubles = (__v4df)__a;
	  return (__m128i)__builtin_ia32_cvtpd2dq256(__doubles);
	}

	/// \brief Converts a vector of [8 x float] into a vector of [8 x i32]. An
	///    inexact result is truncated, i.e. rounded towards zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
	///
	/// \param __a
	///    A 256-bit vector of [8 x float].
	/// \returns A 256-bit integer vector containing the converted values.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_cvttps_epi32(__m256 __a)
	{
	  __v8sf __floats = (__v8sf)__a;
	  return (__m256i)__builtin_ia32_cvttps2dq256(__floats);
	}

	/// \brief Extracts element 0 of a 256-bit vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __a
	///    A 256-bit vector of [4 x double].
	/// \returns A 64 bit double containing the first element of the input vector.
	static __inline double __DEFAULT_FN_ATTRS
	_mm256_cvtsd_f64(__m256d __a)
	{
	  const double __first = __a[0];
	  return __first;
	}

	/// \brief Extracts element 0 of a 256-bit vector of [8 x i32].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __a
	///    A 256-bit vector of [8 x i32].
	/// \returns A 32 bit integer containing the first element of the input vector.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_cvtsi256_si32(__m256i __a)
	{
	  /* View the generic integer vector as eight 32-bit lanes before indexing. */
	  __v8si __elts = (__v8si)__a;
	  const int __first = __elts[0];
	  return __first;
	}

	/// \brief Extracts element 0 of a 256-bit vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __a
	///    A 256-bit vector of [8 x float].
	/// \returns A 32 bit float containing the first element of the input vector.
	static __inline float __DEFAULT_FN_ATTRS
	_mm256_cvtss_f32(__m256 __a)
	{
	  const float __first = __a[0];
	  return __first;
	}

	/* Vector replicate */
	/// \brief Moves and duplicates high-order (odd-indexed) values from a 256-bit
	/// vector of [8 x float] to float values in a 256-bit vector of
	/// [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float]. \n
	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
	/// the return value. \n
	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
	/// the return value. \n
	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
	/// return value. \n
	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
	/// return value.
	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
	/// values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_movehdup_ps(__m256 __a)
	{
	  /* Both shuffle operands are the same vector; each odd-indexed element
	   * is selected twice so it fills its own slot and the one below it. */
	  __v8sf __src = (__v8sf)__a;
	  return __builtin_shufflevector(__src, __src,
	                                 1, 1, 3, 3, 5, 5, 7, 7);
	}

	/// \brief Moves and duplicates low-order (even-indexed) values from a 256-bit
	/// vector of [8 x float] to float values in a 256-bit vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float]. \n
	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
	/// the return value. \n
	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
	/// the return value. \n
	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
	/// return value. \n
	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
	/// return value.
	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
	/// values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_moveldup_ps(__m256 __a)
	{
	  /* Both shuffle operands are the same vector; each even-indexed element
	   * is selected twice so it fills its own slot and the one above it. */
	  __v8sf __src = (__v8sf)__a;
	  return __builtin_shufflevector(__src, __src,
	                                 0, 0, 2, 2, 4, 4, 6, 6);
	}

	/// \brief Moves and duplicates double-precision floating point values from a
	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
	/// vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double]. \n
	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
	/// return value. \n
	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
	/// the return value.
	/// \returns A 256-bit vector of [4 x double] containing the moved and
	/// duplicated values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_movedup_pd(__m256d __a)
	{
	  /* Duplicate elements 0 and 2 into the pairs {0,1} and {2,3}. */
	  __v4df __src = (__v4df)__a;
	  return __builtin_shufflevector(__src, __src, 0, 0, 2, 2);
	}

	/* Unpack and Interleave */
	/// \brief Unpacks the odd-indexed vector elements from two 256-bit vectors of
	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [4 x double]. \n
	/// Bits [127:64] are written to bits [63:0] of the return value. \n
	/// Bits [255:192] are written to bits [191:128] of the return value. \n
	/// \param __b
	/// A 256-bit floating-point vector of [4 x double]. \n
	/// Bits [127:64] are written to bits [127:64] of the return value. \n
	/// Bits [255:192] are written to bits [255:192] of the return value. \n
	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
	{
	  /* Shuffle indices 0-3 select from __a, 4-7 from __b.  Pick the
	   * odd-indexed element of each 128-bit half, alternating __a / __b. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return __builtin_shufflevector(__lhs, __rhs, 1, 5, 3, 7);
	}

	/// \brief Unpacks the even-indexed vector elements from two 256-bit vectors of
	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [4 x double]. \n
	/// Bits [63:0] are written to bits [63:0] of the return value. \n
	/// Bits [191:128] are written to bits [191:128] of the return value.
	/// \param __b
	/// A 256-bit floating-point vector of [4 x double]. \n
	/// Bits [63:0] are written to bits [127:64] of the return value. \n
	/// Bits [191:128] are written to bits [255:192] of the return value. \n
	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
	{
	  /* Shuffle indices 0-3 select from __a, 4-7 from __b.  Pick the
	   * even-indexed element of each 128-bit half, alternating __a / __b. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return __builtin_shufflevector(__lhs, __rhs, 0, 4, 2, 6);
	}

	/// \brief Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
	/// vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float]. \n
	/// Bits [95:64] are written to bits [31:0] of the return value. \n
	/// Bits [127:96] are written to bits [95:64] of the return value. \n
	/// Bits [223:192] are written to bits [159:128] of the return value. \n
	/// Bits [255:224] are written to bits [223:192] of the return value.
	/// \param __b
	/// A 256-bit vector of [8 x float]. \n
	/// Bits [95:64] are written to bits [63:32] of the return value. \n
	/// Bits [127:96] are written to bits [127:96] of the return value. \n
	/// Bits [223:192] are written to bits [191:160] of the return value. \n
	/// Bits [255:224] are written to bits [255:224] of the return value.
	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
	{
	  /* Shuffle indices 0-7 select from __a, 8-15 from __b.  Elements 2, 3, 6
	   * and 7 of each source are interleaved, __a first. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return __builtin_shufflevector(__lhs, __rhs,
	                                 2, 10, 3, 11,
	                                 6, 14, 7, 15);
	}

	/// \brief Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
	/// vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float]. \n
	/// Bits [31:0] are written to bits [31:0] of the return value. \n
	/// Bits [63:32] are written to bits [95:64] of the return value. \n
	/// Bits [159:128] are written to bits [159:128] of the return value. \n
	/// Bits [191:160] are written to bits [223:192] of the return value.
	/// \param __b
	/// A 256-bit vector of [8 x float]. \n
	/// Bits [31:0] are written to bits [63:32] of the return value. \n
	/// Bits [63:32] are written to bits [127:96] of the return value. \n
	/// Bits [159:128] are written to bits [191:160] of the return value. \n
	/// Bits [191:160] are written to bits [255:224] of the return value.
	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
	{
	  /* Shuffle indices 0-7 select from __a, 8-15 from __b.  Elements 0, 1, 4
	   * and 5 of each source are interleaved, __a first. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return __builtin_shufflevector(__lhs, __rhs,
	                                 0, 8, 1, 9,
	                                 4, 12, 5, 13);
	}

	/* Bit Test */
	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
	/// element-by-element comparison of the double-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the ZF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \param __b
	/// A 128-bit vector of [2 x double].
	/// \returns the ZF flag in the EFLAGS register.
	static __inline int __DEFAULT_FN_ATTRS
	_mm_testz_pd(__m128d __a, __m128d __b)
	{
	  /* Forward both operands, typed as [2 x double], to the ZF-returning
	   * test builtin. */
	  __v2df __lhs = (__v2df)__a;
	  __v2df __rhs = (__v2df)__b;
	  return __builtin_ia32_vtestzpd(__lhs, __rhs);
	}

	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
	/// element-by-element comparison of the double-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the CF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \param __b
	/// A 128-bit vector of [2 x double].
	/// \returns the CF flag in the EFLAGS register.
	static __inline int __DEFAULT_FN_ATTRS
	_mm_testc_pd(__m128d __a, __m128d __b)
	{
	  /* Forward both operands, typed as [2 x double], to the CF-returning
	   * test builtin. */
	  __v2df __lhs = (__v2df)__a;
	  __v2df __rhs = (__v2df)__b;
	  return __builtin_ia32_vtestcpd(__lhs, __rhs);
	}

	/// \brief Given two 128-bit floating-point vectors of [2 x double], perform an
	/// element-by-element comparison of the double-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
	/// otherwise it returns 0.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \param __b
	/// A 128-bit vector of [2 x double].
	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
	static __inline int __DEFAULT_FN_ATTRS
	_mm_testnzc_pd(__m128d __a, __m128d __b)
	{
	  /* Forward both operands, typed as [2 x double], to the builtin that
	   * reports "ZF and CF both clear". */
	  __v2df __lhs = (__v2df)__a;
	  __v2df __rhs = (__v2df)__b;
	  return __builtin_ia32_vtestnzcpd(__lhs, __rhs);
	}

	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
	/// element-by-element comparison of the single-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the ZF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \param __b
	/// A 128-bit vector of [4 x float].
	/// \returns the ZF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm_testz_ps(__m128 __a, __m128 __b)
	{
	  /* Forward both operands, typed as [4 x float], to the ZF-returning
	   * test builtin. */
	  __v4sf __lhs = (__v4sf)__a;
	  __v4sf __rhs = (__v4sf)__b;
	  return __builtin_ia32_vtestzps(__lhs, __rhs);
	}

	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
	/// element-by-element comparison of the single-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the CF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \param __b
	/// A 128-bit vector of [4 x float].
	/// \returns the CF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm_testc_ps(__m128 __a, __m128 __b)
	{
	  /* Forward both operands, typed as [4 x float], to the CF-returning
	   * test builtin. */
	  __v4sf __lhs = (__v4sf)__a;
	  __v4sf __rhs = (__v4sf)__b;
	  return __builtin_ia32_vtestcps(__lhs, __rhs);
	}

	/// \brief Given two 128-bit floating-point vectors of [4 x float], perform an
	/// element-by-element comparison of the single-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
	/// otherwise it returns 0.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \param __b
	/// A 128-bit vector of [4 x float].
	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
	static __inline int __DEFAULT_FN_ATTRS
	_mm_testnzc_ps(__m128 __a, __m128 __b)
	{
	  /* Forward both operands, typed as [4 x float], to the builtin that
	   * reports "ZF and CF both clear". */
	  __v4sf __lhs = (__v4sf)__a;
	  __v4sf __rhs = (__v4sf)__b;
	  return __builtin_ia32_vtestnzcps(__lhs, __rhs);
	}

	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
	/// element-by-element comparison of the double-precision elements in the
	/// first source vector and the corresponding elements in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the ZF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \param __b
	/// A 256-bit vector of [4 x double].
	/// \returns the ZF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testz_pd(__m256d __a, __m256d __b)
	{
	  /* Forward both operands, typed as [4 x double], to the 256-bit
	   * ZF-returning test builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return __builtin_ia32_vtestzpd256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
	/// element-by-element comparison of the double-precision elements in the
	/// first source vector and the corresponding elements in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the CF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \param __b
	/// A 256-bit vector of [4 x double].
	/// \returns the CF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testc_pd(__m256d __a, __m256d __b)
	{
	  /* Forward both operands, typed as [4 x double], to the 256-bit
	   * CF-returning test builtin. */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return __builtin_ia32_vtestcpd256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit floating-point vectors of [4 x double], perform an
	/// element-by-element comparison of the double-precision elements in the
	/// first source vector and the corresponding elements in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of double-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
	/// otherwise it returns 0.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double].
	/// \param __b
	/// A 256-bit vector of [4 x double].
	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testnzc_pd(__m256d __a, __m256d __b)
	{
	  /* Forward both operands, typed as [4 x double], to the 256-bit builtin
	   * that reports "ZF and CF both clear". */
	  __v4df __lhs = (__v4df)__a;
	  __v4df __rhs = (__v4df)__b;
	  return __builtin_ia32_vtestnzcpd256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
	/// element-by-element comparison of the single-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the ZF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \param __b
	/// A 256-bit vector of [8 x float].
	/// \returns the ZF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testz_ps(__m256 __a, __m256 __b)
	{
	  /* Forward both operands, typed as [8 x float], to the 256-bit
	   * ZF-returning test builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return __builtin_ia32_vtestzps256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
	/// element-by-element comparison of the single-precision element in the
	/// first source vector and the corresponding element in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the CF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \param __b
	/// A 256-bit vector of [8 x float].
	/// \returns the CF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testc_ps(__m256 __a, __m256 __b)
	{
	  /* Forward both operands, typed as [8 x float], to the 256-bit
	   * CF-returning test builtin. */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return __builtin_ia32_vtestcps256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit floating-point vectors of [8 x float], perform an
	/// element-by-element comparison of the single-precision elements in the
	/// first source vector and the corresponding elements in the second source
	/// vector.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
	/// ZF flag is set to 1. \n
	/// If there is at least one pair of single-precision elements where the
	/// sign-bit of the first element is 0 and the sign-bit of the second element
	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
	/// otherwise it returns 0.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float].
	/// \param __b
	/// A 256-bit vector of [8 x float].
	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testnzc_ps(__m256 __a, __m256 __b)
	{
	  /* Forward both operands, typed as [8 x float], to the 256-bit builtin
	   * that reports "ZF and CF both clear". */
	  __v8sf __lhs = (__v8sf)__a;
	  __v8sf __rhs = (__v8sf)__b;
	  return __builtin_ia32_vtestnzcps256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
	/// of the two source vectors.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of bits where both bits are 1, the ZF flag
	/// is set to 0. Otherwise the ZF flag is set to 1. \n
	/// If there is at least one pair of bits where the bit from the first source
	/// vector is 0 and the bit from the second source vector is 1, the CF flag
	/// is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the ZF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \param __b
	/// A 256-bit integer vector.
	/// \returns the ZF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testz_si256(__m256i __a, __m256i __b)
	{
	  /* The builtin takes its operands as [4 x i64]; the result is the ZF
	   * flag of the bitwise test. */
	  __v4di __lhs = (__v4di)__a;
	  __v4di __rhs = (__v4di)__b;
	  return __builtin_ia32_ptestz256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
	/// of the two source vectors.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of bits where both bits are 1, the ZF flag
	/// is set to 0. Otherwise the ZF flag is set to 1. \n
	/// If there is at least one pair of bits where the bit from the first source
	/// vector is 0 and the bit from the second source vector is 1, the CF flag
	/// is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns the value of the CF flag.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \param __b
	/// A 256-bit integer vector.
	/// \returns the CF flag.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testc_si256(__m256i __a, __m256i __b)
	{
	  /* The builtin takes its operands as [4 x i64]; the result is the CF
	   * flag of the bitwise test. */
	  __v4di __lhs = (__v4di)__a;
	  __v4di __rhs = (__v4di)__b;
	  return __builtin_ia32_ptestc256(__lhs, __rhs);
	}

	/// \brief Given two 256-bit integer vectors, perform a bit-by-bit comparison
	/// of the two source vectors.
	///
	/// The EFLAGS register is updated as follows: \n
	/// If there is at least one pair of bits where both bits are 1, the ZF flag
	/// is set to 0. Otherwise the ZF flag is set to 1. \n
	/// If there is at least one pair of bits where the bit from the first source
	/// vector is 0 and the bit from the second source vector is 1, the CF flag
	/// is set to 0. Otherwise the CF flag is set to 1. \n
	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
	/// otherwise it returns 0.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \param __b
	/// A 256-bit integer vector.
	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_testnzc_si256(__m256i __a, __m256i __b)
	{
	  /* The builtin takes its operands as [4 x i64]; the result is 1 only
	   * when the bitwise test leaves both ZF and CF clear. */
	  __v4di __lhs = (__v4di)__a;
	  __v4di __rhs = (__v4di)__b;
	  return __builtin_ia32_ptestnzc256(__lhs, __rhs);
	}

	/* Vector extract sign mask */
	/// \brief Extracts the sign bits of double-precision floating point elements
	/// in a 256-bit vector of [4 x double] and writes them to the lower order
	/// bits of the return value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the double-precision
	/// floating point values with sign bits to be extracted.
	/// \returns The sign bits from the operand, written to bits [3:0].
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_movemask_pd(__m256d __a)
	{
	  /* Pass the operand, typed as [4 x double], to the sign-mask builtin. */
	  __v4df __src = (__v4df)__a;
	  return __builtin_ia32_movmskpd256(__src);
	}

	/// \brief Extracts the sign bits of single-precision floating point elements
	/// in a 256-bit vector of [8 x float] and writes them to the lower order
	/// bits of the return value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
	///
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the single-precision floating
	/// point values with sign bits to be extracted.
	/// \returns The sign bits from the operand, written to bits [7:0].
	static __inline int __DEFAULT_FN_ATTRS
	_mm256_movemask_ps(__m256 __a)
	{
	  /* Pass the operand, typed as [8 x float], to the sign-mask builtin. */
	  __v8sf __src = (__v8sf)__a;
	  return __builtin_ia32_movmskps256(__src);
	}

	/* Vector __zero */
	/// \brief Zeroes the contents of all XMM or YMM registers.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_zeroall(void)
	{
	/* No operands and no return value: the whole effect is the builtin call. */
	__builtin_ia32_vzeroall();
	}

	/// \brief Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_zeroupper(void)
	{
	/* No operands and no return value: the whole effect is the builtin call. */
	__builtin_ia32_vzeroupper();
	}

	/* Vector load with broadcast */
	/// \brief Loads a scalar single-precision floating point value from the
	/// specified address pointed to by \a __a and broadcasts it to the elements
	/// of a [4 x float] vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
	///
	/// \param __a
	/// A pointer to the single-precision floating point value to be broadcast.
	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
	/// equal to the broadcast value.
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm_broadcast_ss(float const *__a)
	{
	  /* Read the scalar once, then replicate it into all four elements. */
	  float __scalar = *__a;
	  __v4sf __splat = { __scalar, __scalar, __scalar, __scalar };
	  return (__m128)__splat;
	}

	/// \brief Loads a scalar double-precision floating point value from the
	/// specified address pointed to by \a __a and broadcasts it to the elements
	/// of a [4 x double] vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
	///
	/// \param __a
	/// A pointer to the double-precision floating point value to be broadcast.
	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
	/// equal to the broadcast value.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_broadcast_sd(double const *__a)
	{
	  /* Read the scalar once, then replicate it into all four elements. */
	  double __scalar = *__a;
	  __v4df __splat = { __scalar, __scalar, __scalar, __scalar };
	  return (__m256d)__splat;
	}

	/// \brief Loads a scalar single-precision floating point value from the
	/// specified address pointed to by \a __a and broadcasts it to the elements
	/// of a [8 x float] vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
	///
	/// \param __a
	/// A pointer to the single-precision floating point value to be broadcast.
	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
	/// equal to the broadcast value.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_broadcast_ss(float const *__a)
	{
	  /* Read the scalar once, then replicate it into all eight elements. */
	  float __scalar = *__a;
	  __v8sf __splat = { __scalar, __scalar, __scalar, __scalar,
	                     __scalar, __scalar, __scalar, __scalar };
	  return (__m256)__splat;
	}

	/// \brief Loads the data from a 128-bit vector of [2 x double] from the
	/// specified address pointed to by \a __a and broadcasts it to 128-bit
	/// elements in a 256-bit vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
	///
	/// \param __a
	/// A pointer to the 128-bit vector of [2 x double] to be broadcast.
	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
	/// equal to the broadcast value.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_broadcast_pd(__m128d const *__a)
	{
	  /* The builtin takes the source address typed as a [2 x double] pointer. */
	  __v2df const *__src = (__v2df const *)__a;
	  return (__m256d)__builtin_ia32_vbroadcastf128_pd256(__src);
	}

	/// \brief Loads the data from a 128-bit vector of [4 x float] from the
	/// specified address pointed to by \a __a and broadcasts it to 128-bit
	/// elements in a 256-bit vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
	///
	/// \param __a
	/// A pointer to the 128-bit vector of [4 x float] to be broadcast.
	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
	/// equal to the broadcast value.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_broadcast_ps(__m128 const *__a)
	{
	  /* The builtin takes the source address typed as a [4 x float] pointer. */
	  __v4sf const *__src = (__v4sf const *)__a;
	  return (__m256)__builtin_ia32_vbroadcastf128_ps256(__src);
	}

	/* SIMD load ops */
	/// \brief Loads 4 double-precision floating point values from a 32-byte aligned
	/// memory location pointed to by \a __p into a vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
	///
	/// \param __p
	/// A 32-byte aligned pointer to a memory location containing
	/// double-precision floating point values.
	/// \returns A 256-bit vector of [4 x double] containing the moved values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_load_pd(double const *__p)
	{
	  /* Load the four doubles at __p as one 256-bit vector.
	   *
	   * The pointer must be dereferenced: converting the pointer value itself
	   * to __m256d is not a load and is not a valid conversion.  The access
	   * is made through a __m256d pointer, so __p has to satisfy that type's
	   * alignment, matching the documented 32-byte requirement. */
	  return *(const __m256d *)__p;
	}

	/// \brief Loads 8 single-precision floating point values from a 32-byte aligned
	/// memory location pointed to by \a __p into a vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
	///
	/// \param __p
	/// A 32-byte aligned pointer to a memory location containing float values.
	/// \returns A 256-bit vector of [8 x float] containing the moved values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_load_ps(float const *__p)
	{
	  /* Load the eight floats at __p as one 256-bit vector.
	   *
	   * The pointer must be dereferenced: converting the pointer value itself
	   * to __m256 is not a load and is not a valid conversion.  The access
	   * is made through a __m256 pointer, so __p has to satisfy that type's
	   * alignment, matching the documented 32-byte requirement. */
	  return *(const __m256 *)__p;
	}

	/// \brief Loads 4 double-precision floating point values from an unaligned
	/// memory location pointed to by \a __p into a vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location containing double-precision floating
	/// point values.
	/// \returns A 256-bit vector of [4 x double] containing the moved values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_loadu_pd(double const *__p)
	{
	  /* Wrapper struct marked packed and may_alias, so the vector member can
	   * be read through it from an address with no particular alignment. */
	  struct __loadu_pd {
	    __m256d __v;
	  } __attribute__((__packed__, __may_alias__));
	  const struct __loadu_pd *__src = (const struct __loadu_pd *)__p;
	  return __src->__v;
	}

	/// \brief Loads 8 single-precision floating point values from an unaligned
	/// memory location pointed to by \a __p into a vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location containing single-precision floating
	/// point values.
	/// \returns A 256-bit vector of [8 x float] containing the moved values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_loadu_ps(float const *__p)
	{
	  /* Wrapper struct marked packed and may_alias, so the vector member can
	   * be read through it from an address with no particular alignment. */
	  struct __loadu_ps {
	    __m256 __v;
	  } __attribute__((__packed__, __may_alias__));
	  const struct __loadu_ps *__src = (const struct __loadu_ps *)__p;
	  return __src->__v;
	}

	/// \brief Loads 256 bits of integer data from a 32-byte aligned memory
	/// location pointed to by \a __p into elements of a 256-bit integer vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
	///
	/// \param __p
	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
	/// values.
	/// \returns A 256-bit integer vector containing the moved values.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_load_si256(__m256i const *__p)
	{
	  /* Plain read of the pointed-to vector. */
	  return __p[0];
	}

	/// \brief Loads 256 bits of integer data from an unaligned memory location
	/// pointed to by \a __p into a 256-bit integer vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
	///
	/// \param __p
	/// A pointer to a 256-bit integer vector containing integer values.
	/// \returns A 256-bit integer vector containing the moved values.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_loadu_si256(__m256i const *__p)
	{
	  /* Packed, may_alias wrapper so the vector member can be read from an
	     address of any alignment and any effective type. */
	  struct __unaligned_v4di_src {
	    __m256i __val;
	  } __attribute__((__packed__, __may_alias__));
	  struct __unaligned_v4di_src *__src = (struct __unaligned_v4di_src *)__p;
	  return __src->__val;
	}

	/// \brief Loads 256 bits of integer data from an unaligned memory location
	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
	/// line boundary.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
	///
	/// \param __p
	/// A pointer to a 256-bit integer vector containing integer values.
	/// \returns A 256-bit integer vector containing the moved values.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_lddqu_si256(__m256i const *__p)
	{
	  /* The builtin takes a byte pointer; convert once, then load. */
	  char const *__bytes = (char const *)__p;
	  return (__m256i)__builtin_ia32_lddqu256(__bytes);
	}

	/* SIMD store ops */
	/// \brief Stores double-precision floating point values from a 256-bit vector
	/// of [4 x double] to a 32-byte aligned memory location pointed to by
	/// \a __p.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
	///
	/// \param __p
	/// A 32-byte aligned pointer to a memory location that will receive the
	/// double-precision floating point values.
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_store_pd(double *__p, __m256d __a)
	{
	  /* Write the vector through the (aligned) address. A cast expression is
	     not an lvalue, so the destination must be dereferenced. */
	  *(__m256d *)__p = __a;
	}

	/// \brief Stores single-precision floating point values from a 256-bit vector
	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
	///
	/// \param __p
	/// A 32-byte aligned pointer to a memory location that will receive the
	/// float values.
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_store_ps(float *__p, __m256 __a)
	{
	  /* Write the vector through the (aligned) address. A cast expression is
	     not an lvalue, so the destination must be dereferenced. */
	  *(__m256 *)__p = __a;
	}

	/// \brief Stores double-precision floating point values from a 256-bit vector
	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the double-precision
	/// floating point values.
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_storeu_pd(double *__p, __m256d __a)
	{
	  /* Packed, may_alias wrapper so the vector member can be written to an
	     address of any alignment and any effective type. */
	  struct __unaligned_v4df_dst {
	    __m256d __val;
	  } __attribute__((__packed__, __may_alias__));
	  struct __unaligned_v4df_dst *__dst = (struct __unaligned_v4df_dst *)__p;
	  __dst->__val = __a;
	}

	/// \brief Stores single-precision floating point values from a 256-bit vector
	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the float values.
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_storeu_ps(float *__p, __m256 __a)
	{
	  /* Packed, may_alias wrapper so the vector member can be written to an
	     address of any alignment and any effective type. */
	  struct __unaligned_v8sf_dst {
	    __m256 __val;
	  } __attribute__((__packed__, __may_alias__));
	  struct __unaligned_v8sf_dst *__dst = (struct __unaligned_v8sf_dst *)__p;
	  __dst->__val = __a;
	}

	/// \brief Stores integer values from a 256-bit integer vector to a 32-byte
	/// aligned memory location pointed to by \a __p.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
	///
	/// \param __p
	/// A 32-byte aligned pointer to a memory location that will receive the
	/// integer values.
	/// \param __a
	/// A 256-bit integer vector containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_store_si256(__m256i *__p, __m256i __a)
	{
	  /* Plain write of the vector to the pointed-to location. */
	  __p[0] = __a;
	}

	/// \brief Stores integer values from a 256-bit integer vector to an unaligned
	/// memory location pointed to by \a __p.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the integer values.
	/// \param __a
	/// A 256-bit integer vector containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_storeu_si256(__m256i *__p, __m256i __a)
	{
	  /* Packed, may_alias wrapper so the vector member can be written to an
	     address of any alignment and any effective type. */
	  struct __unaligned_v4di_dst {
	    __m256i __val;
	  } __attribute__((__packed__, __may_alias__));
	  struct __unaligned_v4di_dst *__dst = (struct __unaligned_v4di_dst *)__p;
	  __dst->__val = __a;
	}

	/* Conditional load ops */
	/// \brief Conditionally loads double-precision floating point elements from a
	/// memory location pointed to by \a __p into a 128-bit vector of
	/// [2 x double], depending on the mask bits associated with each data
	/// element.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that contains the double-precision
	/// floating point values.
	/// \param __m
	/// A 128-bit integer vector containing the mask. The most significant bit of
	/// each data element represents the mask bits. If a mask bit is zero, the
	/// corresponding value in the memory location is not loaded and the
	/// corresponding field in the return value is set to zero.
	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
	static __inline __m128d __DEFAULT_FN_ATTRS
	_mm_maskload_pd(double const *__p, __m128i __m)
	{
	  /* Convert the arguments to the types the builtin expects, then load. */
	  const __v2df *__src = (const __v2df *)__p;
	  __v2di __mask = (__v2di)__m;
	  return (__m128d)__builtin_ia32_maskloadpd(__src, __mask);
	}

	/// \brief Conditionally loads double-precision floating point elements from a
	/// memory location pointed to by \a __p into a 256-bit vector of
	/// [4 x double], depending on the mask bits associated with each data
	/// element.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that contains the double-precision
	/// floating point values.
	/// \param __m
	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
	/// significant bit of each quadword element represents the mask bits. If a
	/// mask bit is zero, the corresponding value in the memory location is not
	/// loaded and the corresponding field in the return value is set to zero.
	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_maskload_pd(double const *__p, __m256i __m)
	{
	  /* Convert the arguments to the types the builtin expects, then load. */
	  const __v4df *__src = (const __v4df *)__p;
	  __v4di __mask = (__v4di)__m;
	  return (__m256d)__builtin_ia32_maskloadpd256(__src, __mask);
	}

	/// \brief Conditionally loads single-precision floating point elements from a
	/// memory location pointed to by \a __p into a 128-bit vector of
	/// [4 x float], depending on the mask bits associated with each data
	/// element.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that contains the single-precision
	/// floating point values.
	/// \param __m
	/// A 128-bit integer vector containing the mask. The most significant bit of
	/// each data element represents the mask bits. If a mask bit is zero, the
	/// corresponding value in the memory location is not loaded and the
	/// corresponding field in the return value is set to zero.
	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm_maskload_ps(float const *__p, __m128i __m)
	{
	  /* Convert the arguments to the types the builtin expects, then load. */
	  const __v4sf *__src = (const __v4sf *)__p;
	  __v4si __mask = (__v4si)__m;
	  return (__m128)__builtin_ia32_maskloadps(__src, __mask);
	}

	/// \brief Conditionally loads single-precision floating point elements from a
	/// memory location pointed to by \a __p into a 256-bit vector of
	/// [8 x float], depending on the mask bits associated with each data
	/// element.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that contains the single-precision
	/// floating point values.
	/// \param __m
	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
	/// significant bit of each dword element represents the mask bits. If a mask
	/// bit is zero, the corresponding value in the memory location is not loaded
	/// and the corresponding field in the return value is set to zero.
	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_maskload_ps(float const *__p, __m256i __m)
	{
	  /* Convert the arguments to the types the builtin expects, then load. */
	  const __v8sf *__src = (const __v8sf *)__p;
	  __v8si __mask = (__v8si)__m;
	  return (__m256)__builtin_ia32_maskloadps256(__src, __mask);
	}

	/* Conditional store ops */
	/// \brief Moves single-precision floating point values from a 256-bit vector
	/// of [8 x float] to a memory location pointed to by \a __p, according to
	/// the specified mask.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the float values.
	/// \param __m
	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
	/// significant bit of each dword element in the mask vector represents the
	/// mask bits. If a mask bit is zero, the corresponding value from vector
	/// \a __a is not stored and the corresponding field in the memory location
	/// pointed to by \a __p is not changed.
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the values to be stored.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
	{
	  /* Convert the arguments to the types the builtin expects, then store. */
	  __v8sf *__dst = (__v8sf *)__p;
	  __v8si __mask = (__v8si)__m;
	  __v8sf __src = (__v8sf)__a;
	  __builtin_ia32_maskstoreps256(__dst, __mask, __src);
	}

	/// \brief Moves double-precision values from a 128-bit vector of [2 x double]
	/// to a memory location pointed to by \a __p, according to the specified
	/// mask.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the double-precision values.
	/// \param __m
	/// A 128-bit integer vector containing the mask. The most significant bit of
	/// each field in the mask vector represents the mask bits. If a mask bit is
	/// zero, the corresponding value from vector \a __a is not stored and the
	/// corresponding field in the memory location pointed to by \a __p is not
	/// changed.
	/// \param __a
	/// A 128-bit vector of [2 x double] containing the values to be stored.
	static __inline void __DEFAULT_FN_ATTRS
	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
	{
	  /* Convert the arguments to the types the builtin expects, then store. */
	  __v2df *__dst = (__v2df *)__p;
	  __v2di __mask = (__v2di)__m;
	  __v2df __src = (__v2df)__a;
	  __builtin_ia32_maskstorepd(__dst, __mask, __src);
	}

	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
	/// to a memory location pointed to by \a __p, according to the specified
	/// mask.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the double-precision values.
	/// \param __m
	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
	/// significant bit of each quadword element in the mask vector represents
	/// the mask bits. If a mask bit is zero, the corresponding value from vector
	/// \a __a is not stored and the corresponding field in the memory location
	/// pointed to by \a __p is not changed.
	/// \param __a
	/// A 256-bit vector of [4 x double] containing the values to be stored.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
	{
	  /* Convert the arguments to the types the builtin expects, then store. */
	  __v4df *__dst = (__v4df *)__p;
	  __v4di __mask = (__v4di)__m;
	  __v4df __src = (__v4df)__a;
	  __builtin_ia32_maskstorepd256(__dst, __mask, __src);
	}

	/// \brief Moves single-precision floating point values from a 128-bit vector
	/// of [4 x float] to a memory location pointed to by \a __p, according to
	/// the specified mask.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a memory location that will receive the float values.
	/// \param __m
	/// A 128-bit integer vector containing the mask. The most significant bit of
	/// each field in the mask vector represents the mask bits. If a mask bit is
	/// zero, the corresponding value from vector \a __a is not stored and the
	/// corresponding field in the memory location pointed to by \a __p is not
	/// changed.
	/// \param __a
	/// A 128-bit vector of [4 x float] containing the values to be stored.
	static __inline void __DEFAULT_FN_ATTRS
	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
	{
	  /* Convert the arguments to the types the builtin expects, then store. */
	  __v4sf *__dst = (__v4sf *)__p;
	  __v4si __mask = (__v4si)__m;
	  __v4sf __src = (__v4sf)__a;
	  __builtin_ia32_maskstoreps(__dst, __mask, __src);
	}

	/* Cacheability support ops */
	/// \brief Moves integer data from a 256-bit integer vector to a 32-byte
	/// aligned memory location. To minimize caching, the data is flagged as
	/// non-temporal (unlikely to be used again soon).
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
	///
	/// \param __a
	/// A pointer to a 32-byte aligned memory location that will receive the
	/// integer values.
	/// \param __b
	/// A 256-bit integer vector containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_stream_si256(__m256i *__a, __m256i __b)
	{
	/* Diff hunk: the '-' line (old) performs the nontemporal store through a
	   plain __v4di pointer; the '+' lines (new) declare a typedef of __v4di
	   with aligned(32) and store through that type instead.
	   NOTE(review): presumably this makes the 32-byte alignment explicit to
	   the builtin -- confirm against the upstream commit message. */
	- __builtin_nontemporal_store((__v4di)__b, (__v4di*)__a);
	+ typedef __v4di __v4di_aligned __attribute__((aligned(32)));
	+ __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
	}

	/// \brief Moves double-precision values from a 256-bit vector of [4 x double]
	/// to a 32-byte aligned memory location. To minimize caching, the data is
	/// flagged as non-temporal (unlikely to be used again soon).
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
	///
	/// \param __a
	/// A pointer to a 32-byte aligned memory location that will receive the
	/// double-precision floating-point values.
	/// \param __b
	/// A 256-bit vector of [4 x double] containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_stream_pd(double *__a, __m256d __b)
	{
	/* Diff hunk: the '-' line (old) performs the nontemporal store through a
	   plain __v4df pointer; the '+' lines (new) declare a typedef of __v4df
	   with aligned(32) and store through that type instead.
	   NOTE(review): presumably this makes the 32-byte alignment explicit to
	   the builtin -- confirm against the upstream commit message. */
	- __builtin_nontemporal_store((__v4df)__b, (__v4df*)__a);
	+ typedef __v4df __v4df_aligned __attribute__((aligned(32)));
	+ __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
	}

	/// \brief Moves single-precision floating point values from a 256-bit vector
	/// of [8 x float] to a 32-byte aligned memory location. To minimize
	/// caching, the data is flagged as non-temporal (unlikely to be used again
	/// soon).
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
	///
	/// \param __p
	/// A pointer to a 32-byte aligned memory location that will receive the
	/// single-precision floating point values.
	/// \param __a
	/// A 256-bit vector of [8 x float] containing the values to be moved.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_stream_ps(float *__p, __m256 __a)
	{
	/* Diff hunk: the '-' line (old) performs the nontemporal store through a
	   plain __v8sf pointer; the '+' lines (new) declare a typedef of __v8sf
	   with aligned(32) and store through that type instead.
	   NOTE(review): presumably this makes the 32-byte alignment explicit to
	   the builtin -- confirm against the upstream commit message. */
	- __builtin_nontemporal_store((__v8sf)__a, (__v8sf*)__p);
	+ typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
	+ __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
	}

	/* Create vectors */
	/// \brief Create a 256-bit vector of [4 x double] with undefined values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \returns A 256-bit vector of [4 x double] containing undefined values.
	static __inline__ __m256d __DEFAULT_FN_ATTRS
	_mm256_undefined_pd(void)
	{
	  /* The builtin's result is reinterpreted as [4 x double]. */
	  __m256d __undef = (__m256d)__builtin_ia32_undef256();
	  return __undef;
	}

	/// \brief Create a 256-bit vector of [8 x float] with undefined values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \returns A 256-bit vector of [8 x float] containing undefined values.
	static __inline__ __m256 __DEFAULT_FN_ATTRS
	_mm256_undefined_ps(void)
	{
	  /* The builtin's result is reinterpreted as [8 x float]. */
	  __m256 __undef = (__m256)__builtin_ia32_undef256();
	  return __undef;
	}

	/// \brief Create a 256-bit integer vector with undefined values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \returns A 256-bit integer vector containing undefined values.
	static __inline__ __m256i __DEFAULT_FN_ATTRS
	_mm256_undefined_si256(void)
	{
	  /* The builtin's result is reinterpreted as a 256-bit integer vector. */
	  __m256i __undef = (__m256i)__builtin_ia32_undef256();
	  return __undef;
	}

	/// \brief Constructs a 256-bit floating-point vector of [4 x double]
	/// initialized with the specified double-precision floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
	/// instruction.
	///
	/// \param __a
	/// A double-precision floating-point value used to initialize bits [255:192]
	/// of the result.
	/// \param __b
	/// A double-precision floating-point value used to initialize bits [191:128]
	/// of the result.
	/// \param __c
	/// A double-precision floating-point value used to initialize bits [127:64]
	/// of the result.
	/// \param __d
	/// A double-precision floating-point value used to initialize bits [63:0]
	/// of the result.
	/// \returns An initialized 256-bit floating-point vector of [4 x double].
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_set_pd(double __a, double __b, double __c, double __d)
	{
	  /* Element 0 is listed first, so the arguments appear in reverse:
	     __d lands in the lowest element and __a in the highest. */
	  __m256d __vec = { __d, __c, __b, __a };
	  return __vec;
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float] initialized
	/// with the specified single-precision floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __a
	/// A single-precision floating-point value used to initialize bits [255:224]
	/// of the result.
	/// \param __b
	/// A single-precision floating-point value used to initialize bits [223:192]
	/// of the result.
	/// \param __c
	/// A single-precision floating-point value used to initialize bits [191:160]
	/// of the result.
	/// \param __d
	/// A single-precision floating-point value used to initialize bits [159:128]
	/// of the result.
	/// \param __e
	/// A single-precision floating-point value used to initialize bits [127:96]
	/// of the result.
	/// \param __f
	/// A single-precision floating-point value used to initialize bits [95:64]
	/// of the result.
	/// \param __g
	/// A single-precision floating-point value used to initialize bits [63:32]
	/// of the result.
	/// \param __h
	/// A single-precision floating-point value used to initialize bits [31:0]
	/// of the result.
	/// \returns An initialized 256-bit floating-point vector of [8 x float].
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_set_ps(float __a, float __b, float __c, float __d,
	              float __e, float __f, float __g, float __h)
	{
	  /* Element 0 is listed first, so the arguments appear in reverse:
	     __h lands in the lowest element and __a in the highest. */
	  __m256 __vec = {
	    __h, __g, __f, __e,
	    __d, __c, __b, __a
	  };
	  return __vec;
	}

	/// \brief Constructs a 256-bit integer vector initialized with the specified
	/// 32-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __i0
	/// A 32-bit integral value used to initialize bits [255:224] of the result.
	/// \param __i1
	/// A 32-bit integral value used to initialize bits [223:192] of the result.
	/// \param __i2
	/// A 32-bit integral value used to initialize bits [191:160] of the result.
	/// \param __i3
	/// A 32-bit integral value used to initialize bits [159:128] of the result.
	/// \param __i4
	/// A 32-bit integral value used to initialize bits [127:96] of the result.
	/// \param __i5
	/// A 32-bit integral value used to initialize bits [95:64] of the result.
	/// \param __i6
	/// A 32-bit integral value used to initialize bits [63:32] of the result.
	/// \param __i7
	/// A 32-bit integral value used to initialize bits [31:0] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
	                 int __i4, int __i5, int __i6, int __i7)
	{
	  /* Element 0 is listed first, so the arguments appear in reverse:
	     __i7 lands in the lowest element and __i0 in the highest. */
	  __v8si __vec = {
	    __i7, __i6, __i5, __i4,
	    __i3, __i2, __i1, __i0
	  };
	  return (__m256i)__vec;
	}

	/// \brief Constructs a 256-bit integer vector initialized with the specified
	/// 16-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __w15
	/// A 16-bit integral value used to initialize bits [255:240] of the result.
	/// \param __w14
	/// A 16-bit integral value used to initialize bits [239:224] of the result.
	/// \param __w13
	/// A 16-bit integral value used to initialize bits [223:208] of the result.
	/// \param __w12
	/// A 16-bit integral value used to initialize bits [207:192] of the result.
	/// \param __w11
	/// A 16-bit integral value used to initialize bits [191:176] of the result.
	/// \param __w10
	/// A 16-bit integral value used to initialize bits [175:160] of the result.
	/// \param __w09
	/// A 16-bit integral value used to initialize bits [159:144] of the result.
	/// \param __w08
	/// A 16-bit integral value used to initialize bits [143:128] of the result.
	/// \param __w07
	/// A 16-bit integral value used to initialize bits [127:112] of the result.
	/// \param __w06
	/// A 16-bit integral value used to initialize bits [111:96] of the result.
	/// \param __w05
	/// A 16-bit integral value used to initialize bits [95:80] of the result.
	/// \param __w04
	/// A 16-bit integral value used to initialize bits [79:64] of the result.
	/// \param __w03
	/// A 16-bit integral value used to initialize bits [63:48] of the result.
	/// \param __w02
	/// A 16-bit integral value used to initialize bits [47:32] of the result.
	/// \param __w01
	/// A 16-bit integral value used to initialize bits [31:16] of the result.
	/// \param __w00
	/// A 16-bit integral value used to initialize bits [15:0] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
	                 short __w11, short __w10, short __w09, short __w08,
	                 short __w07, short __w06, short __w05, short __w04,
	                 short __w03, short __w02, short __w01, short __w00)
	{
	  /* Elements are listed from lowest (__w00) to highest (__w15). */
	  __v16hi __vec = {
	    __w00, __w01, __w02, __w03,
	    __w04, __w05, __w06, __w07,
	    __w08, __w09, __w10, __w11,
	    __w12, __w13, __w14, __w15
	  };
	  return (__m256i)__vec;
	}

	/// \brief Constructs a 256-bit integer vector initialized with the specified
	/// 8-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __b31
	/// An 8-bit integral value used to initialize bits [255:248] of the result.
	/// \param __b30
	/// An 8-bit integral value used to initialize bits [247:240] of the result.
	/// \param __b29
	/// An 8-bit integral value used to initialize bits [239:232] of the result.
	/// \param __b28
	/// An 8-bit integral value used to initialize bits [231:224] of the result.
	/// \param __b27
	/// An 8-bit integral value used to initialize bits [223:216] of the result.
	/// \param __b26
	/// An 8-bit integral value used to initialize bits [215:208] of the result.
	/// \param __b25
	/// An 8-bit integral value used to initialize bits [207:200] of the result.
	/// \param __b24
	/// An 8-bit integral value used to initialize bits [199:192] of the result.
	/// \param __b23
	/// An 8-bit integral value used to initialize bits [191:184] of the result.
	/// \param __b22
	/// An 8-bit integral value used to initialize bits [183:176] of the result.
	/// \param __b21
	/// An 8-bit integral value used to initialize bits [175:168] of the result.
	/// \param __b20
	/// An 8-bit integral value used to initialize bits [167:160] of the result.
	/// \param __b19
	/// An 8-bit integral value used to initialize bits [159:152] of the result.
	/// \param __b18
	/// An 8-bit integral value used to initialize bits [151:144] of the result.
	/// \param __b17
	/// An 8-bit integral value used to initialize bits [143:136] of the result.
	/// \param __b16
	/// An 8-bit integral value used to initialize bits [135:128] of the result.
	/// \param __b15
	/// An 8-bit integral value used to initialize bits [127:120] of the result.
	/// \param __b14
	/// An 8-bit integral value used to initialize bits [119:112] of the result.
	/// \param __b13
	/// An 8-bit integral value used to initialize bits [111:104] of the result.
	/// \param __b12
	/// An 8-bit integral value used to initialize bits [103:96] of the result.
	/// \param __b11
	/// An 8-bit integral value used to initialize bits [95:88] of the result.
	/// \param __b10
	/// An 8-bit integral value used to initialize bits [87:80] of the result.
	/// \param __b09
	/// An 8-bit integral value used to initialize bits [79:72] of the result.
	/// \param __b08
	/// An 8-bit integral value used to initialize bits [71:64] of the result.
	/// \param __b07
	/// An 8-bit integral value used to initialize bits [63:56] of the result.
	/// \param __b06
	/// An 8-bit integral value used to initialize bits [55:48] of the result.
	/// \param __b05
	/// An 8-bit integral value used to initialize bits [47:40] of the result.
	/// \param __b04
	/// An 8-bit integral value used to initialize bits [39:32] of the result.
	/// \param __b03
	/// An 8-bit integral value used to initialize bits [31:24] of the result.
	/// \param __b02
	/// An 8-bit integral value used to initialize bits [23:16] of the result.
	/// \param __b01
	/// An 8-bit integral value used to initialize bits [15:8] of the result.
	/// \param __b00
	/// An 8-bit integral value used to initialize bits [7:0] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
	                char __b27, char __b26, char __b25, char __b24,
	                char __b23, char __b22, char __b21, char __b20,
	                char __b19, char __b18, char __b17, char __b16,
	                char __b15, char __b14, char __b13, char __b12,
	                char __b11, char __b10, char __b09, char __b08,
	                char __b07, char __b06, char __b05, char __b04,
	                char __b03, char __b02, char __b01, char __b00)
	{
	  /* Elements are listed from lowest (__b00) to highest (__b31). */
	  __v32qi __vec = {
	    __b00, __b01, __b02, __b03,
	    __b04, __b05, __b06, __b07,
	    __b08, __b09, __b10, __b11,
	    __b12, __b13, __b14, __b15,
	    __b16, __b17, __b18, __b19,
	    __b20, __b21, __b22, __b23,
	    __b24, __b25, __b26, __b27,
	    __b28, __b29, __b30, __b31
	  };
	  return (__m256i)__vec;
	}

	/// \brief Constructs a 256-bit integer vector initialized with the specified
	/// 64-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
	/// instruction.
	///
	/// \param __a
	/// A 64-bit integral value used to initialize bits [255:192] of the result.
	/// \param __b
	/// A 64-bit integral value used to initialize bits [191:128] of the result.
	/// \param __c
	/// A 64-bit integral value used to initialize bits [127:64] of the result.
	/// \param __d
	/// A 64-bit integral value used to initialize bits [63:0] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
	{
	  /* Element 0 is listed first, so the arguments appear in reverse:
	     __d lands in the lowest element and __a in the highest. */
	  __v4di __vec = { __d, __c, __b, __a };
	  return (__m256i)__vec;
	}

	/* Create vectors with elements in reverse order */
	/// \brief Constructs a 256-bit floating-point vector of [4 x double],
	/// initialized in reverse order with the specified double-precision
	/// floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
	/// instruction.
	///
	/// \param __a
	/// A double-precision floating-point value used to initialize bits [63:0]
	/// of the result.
	/// \param __b
	/// A double-precision floating-point value used to initialize bits [127:64]
	/// of the result.
	/// \param __c
	/// A double-precision floating-point value used to initialize bits [191:128]
	/// of the result.
	/// \param __d
	/// A double-precision floating-point value used to initialize bits [255:192]
	/// of the result.
	/// \returns An initialized 256-bit floating-point vector of [4 x double].
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_setr_pd(double __a, double __b, double __c, double __d)
	{
	  /* Arguments map to elements in order: __a is the lowest element. */
	  __m256d __vec = { __a, __b, __c, __d };
	  return __vec;
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float],
	/// initialized in reverse order with the specified single-precision
	/// floating-point values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __a
	/// A single-precision floating-point value used to initialize bits [31:0]
	/// of the result.
	/// \param __b
	/// A single-precision floating-point value used to initialize bits [63:32]
	/// of the result.
	/// \param __c
	/// A single-precision floating-point value used to initialize bits [95:64]
	/// of the result.
	/// \param __d
	/// A single-precision floating-point value used to initialize bits [127:96]
	/// of the result.
	/// \param __e
	/// A single-precision floating-point value used to initialize bits [159:128]
	/// of the result.
	/// \param __f
	/// A single-precision floating-point value used to initialize bits [191:160]
	/// of the result.
	/// \param __g
	/// A single-precision floating-point value used to initialize bits [223:192]
	/// of the result.
	/// \param __h
	/// A single-precision floating-point value used to initialize bits [255:224]
	/// of the result.
	/// \returns An initialized 256-bit floating-point vector of [8 x float].
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_setr_ps(float __a, float __b, float __c, float __d,
	               float __e, float __f, float __g, float __h)
	{
	  /* Arguments map to elements in order: __a is the lowest element. */
	  __m256 __vec = {
	    __a, __b, __c, __d,
	    __e, __f, __g, __h
	  };
	  return __vec;
	}

	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
	/// with the specified 32-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __i0
	/// A 32-bit integral value used to initialize bits [31:0] of the result.
	/// \param __i1
	/// A 32-bit integral value used to initialize bits [63:32] of the result.
	/// \param __i2
	/// A 32-bit integral value used to initialize bits [95:64] of the result.
	/// \param __i3
	/// A 32-bit integral value used to initialize bits [127:96] of the result.
	/// \param __i4
	/// A 32-bit integral value used to initialize bits [159:128] of the result.
	/// \param __i5
	/// A 32-bit integral value used to initialize bits [191:160] of the result.
	/// \param __i6
	/// A 32-bit integral value used to initialize bits [223:192] of the result.
	/// \param __i7
	/// A 32-bit integral value used to initialize bits [255:224] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
	                  int __i4, int __i5, int __i6, int __i7)
	{
	  /* __i0 occupies bits [31:0]; __i7 occupies bits [255:224]. */
	  __v8si __elts = { __i0, __i1, __i2, __i3, __i4, __i5, __i6, __i7 };
	  return (__m256i)__elts;
	}

	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
	/// with the specified 16-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __w15
	/// A 16-bit integral value used to initialize bits [15:0] of the result.
	/// \param __w14
	/// A 16-bit integral value used to initialize bits [31:16] of the result.
	/// \param __w13
	/// A 16-bit integral value used to initialize bits [47:32] of the result.
	/// \param __w12
	/// A 16-bit integral value used to initialize bits [63:48] of the result.
	/// \param __w11
	/// A 16-bit integral value used to initialize bits [79:64] of the result.
	/// \param __w10
	/// A 16-bit integral value used to initialize bits [95:80] of the result.
	/// \param __w09
	/// A 16-bit integral value used to initialize bits [111:96] of the result.
	/// \param __w08
	/// A 16-bit integral value used to initialize bits [127:112] of the result.
	/// \param __w07
	/// A 16-bit integral value used to initialize bits [143:128] of the result.
	/// \param __w06
	/// A 16-bit integral value used to initialize bits [159:144] of the result.
	/// \param __w05
	/// A 16-bit integral value used to initialize bits [175:160] of the result.
	/// \param __w04
	/// A 16-bit integral value used to initialize bits [191:176] of the result.
	/// \param __w03
	/// A 16-bit integral value used to initialize bits [207:192] of the result.
	/// \param __w02
	/// A 16-bit integral value used to initialize bits [223:208] of the result.
	/// \param __w01
	/// A 16-bit integral value used to initialize bits [239:224] of the result.
	/// \param __w00
	/// A 16-bit integral value used to initialize bits [255:240] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
	                  short __w11, short __w10, short __w09, short __w08,
	                  short __w07, short __w06, short __w05, short __w04,
	                  short __w03, short __w02, short __w01, short __w00)
	{
	  /* The first argument (__w15) occupies bits [15:0]; the last (__w00)
	     occupies bits [255:240]. */
	  __v16hi __elts = {
	    __w15, __w14, __w13, __w12, __w11, __w10, __w09, __w08,
	    __w07, __w06, __w05, __w04, __w03, __w02, __w01, __w00
	  };
	  return (__m256i)__elts;
	}

	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
	/// with the specified 8-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic is a utility function and does not correspond to a specific
	/// instruction.
	///
	/// \param __b31
	/// An 8-bit integral value used to initialize bits [7:0] of the result.
	/// \param __b30
	/// An 8-bit integral value used to initialize bits [15:8] of the result.
	/// \param __b29
	/// An 8-bit integral value used to initialize bits [23:16] of the result.
	/// \param __b28
	/// An 8-bit integral value used to initialize bits [31:24] of the result.
	/// \param __b27
	/// An 8-bit integral value used to initialize bits [39:32] of the result.
	/// \param __b26
	/// An 8-bit integral value used to initialize bits [47:40] of the result.
	/// \param __b25
	/// An 8-bit integral value used to initialize bits [55:48] of the result.
	/// \param __b24
	/// An 8-bit integral value used to initialize bits [63:56] of the result.
	/// \param __b23
	/// An 8-bit integral value used to initialize bits [71:64] of the result.
	/// \param __b22
	/// An 8-bit integral value used to initialize bits [79:72] of the result.
	/// \param __b21
	/// An 8-bit integral value used to initialize bits [87:80] of the result.
	/// \param __b20
	/// An 8-bit integral value used to initialize bits [95:88] of the result.
	/// \param __b19
	/// An 8-bit integral value used to initialize bits [103:96] of the result.
	/// \param __b18
	/// An 8-bit integral value used to initialize bits [111:104] of the result.
	/// \param __b17
	/// An 8-bit integral value used to initialize bits [119:112] of the result.
	/// \param __b16
	/// An 8-bit integral value used to initialize bits [127:120] of the result.
	/// \param __b15
	/// An 8-bit integral value used to initialize bits [135:128] of the result.
	/// \param __b14
	/// An 8-bit integral value used to initialize bits [143:136] of the result.
	/// \param __b13
	/// An 8-bit integral value used to initialize bits [151:144] of the result.
	/// \param __b12
	/// An 8-bit integral value used to initialize bits [159:152] of the result.
	/// \param __b11
	/// An 8-bit integral value used to initialize bits [167:160] of the result.
	/// \param __b10
	/// An 8-bit integral value used to initialize bits [175:168] of the result.
	/// \param __b09
	/// An 8-bit integral value used to initialize bits [183:176] of the result.
	/// \param __b08
	/// An 8-bit integral value used to initialize bits [191:184] of the result.
	/// \param __b07
	/// An 8-bit integral value used to initialize bits [199:192] of the result.
	/// \param __b06
	/// An 8-bit integral value used to initialize bits [207:200] of the result.
	/// \param __b05
	/// An 8-bit integral value used to initialize bits [215:208] of the result.
	/// \param __b04
	/// An 8-bit integral value used to initialize bits [223:216] of the result.
	/// \param __b03
	/// An 8-bit integral value used to initialize bits [231:224] of the result.
	/// \param __b02
	/// An 8-bit integral value used to initialize bits [239:232] of the result.
	/// \param __b01
	/// An 8-bit integral value used to initialize bits [247:240] of the result.
	/// \param __b00
	/// An 8-bit integral value used to initialize bits [255:248] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
	                 char __b27, char __b26, char __b25, char __b24,
	                 char __b23, char __b22, char __b21, char __b20,
	                 char __b19, char __b18, char __b17, char __b16,
	                 char __b15, char __b14, char __b13, char __b12,
	                 char __b11, char __b10, char __b09, char __b08,
	                 char __b07, char __b06, char __b05, char __b04,
	                 char __b03, char __b02, char __b01, char __b00)
	{
	  /* The first argument (__b31) occupies bits [7:0]; the last (__b00)
	     occupies bits [255:248]. */
	  __v32qi __elts = {
	    __b31, __b30, __b29, __b28, __b27, __b26, __b25, __b24,
	    __b23, __b22, __b21, __b20, __b19, __b18, __b17, __b16,
	    __b15, __b14, __b13, __b12, __b11, __b10, __b09, __b08,
	    __b07, __b06, __b05, __b04, __b03, __b02, __b01, __b00
	  };
	  return (__m256i)__elts;
	}

	/// \brief Constructs a 256-bit integer vector, initialized in reverse order
	/// with the specified 64-bit integral values.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
	/// instruction.
	///
	/// \param __a
	/// A 64-bit integral value used to initialize bits [63:0] of the result.
	/// \param __b
	/// A 64-bit integral value used to initialize bits [127:64] of the result.
	/// \param __c
	/// A 64-bit integral value used to initialize bits [191:128] of the result.
	/// \param __d
	/// A 64-bit integral value used to initialize bits [255:192] of the result.
	/// \returns An initialized 256-bit integer vector.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
	{
	  /* __a occupies bits [63:0]; __d occupies bits [255:192]. */
	  __v4di __elts = { __a, __b, __c, __d };
	  return (__m256i)__elts;
	}

	/* Create vectors with repeated elements */
	/// \brief Constructs a 256-bit floating-point vector of [4 x double], with each
	/// of the four double-precision floating-point vector elements set to the
	/// specified double-precision floating-point value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
	///
	/// \param __w
	/// A double-precision floating-point value used to initialize each vector
	/// element of the result.
	/// \returns An initialized 256-bit floating-point vector of [4 x double].
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_set1_pd(double __w)
	{
	  /* Replicate __w into all four double-precision elements. */
	  __m256d __splat = { __w, __w, __w, __w };
	  return __splat;
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float], with each
	/// of the eight single-precision floating-point vector elements set to the
	/// specified single-precision floating-point value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
	/// instruction.
	///
	/// \param __w
	/// A single-precision floating-point value used to initialize each vector
	/// element of the result.
	/// \returns An initialized 256-bit floating-point vector of [8 x float].
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_set1_ps(float __w)
	{
	  /* Replicate __w into all eight single-precision elements. */
	  __m256 __splat = { __w, __w, __w, __w,
	                     __w, __w, __w, __w };
	  return __splat;
	}

	/// \brief Constructs a 256-bit integer vector of [8 x i32], with each of the
	/// 32-bit integral vector elements set to the specified 32-bit integral
	/// value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
	/// instruction.
	///
	/// \param __i
	/// A 32-bit integral value used to initialize each vector element of the
	/// result.
	/// \returns An initialized 256-bit integer vector of [8 x i32].
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set1_epi32(int __i)
	{
	  /* Replicate __i into all eight 32-bit elements. */
	  __v8si __splat = { __i, __i, __i, __i,
	                     __i, __i, __i, __i };
	  return (__m256i)__splat;
	}

	/// \brief Constructs a 256-bit integer vector of [16 x i16], with each of the
	/// 16-bit integral vector elements set to the specified 16-bit integral
	/// value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
	///
	/// \param __w
	/// A 16-bit integral value used to initialize each vector element of the
	/// result.
	/// \returns An initialized 256-bit integer vector of [16 x i16].
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set1_epi16(short __w)
	{
	  /* Replicate __w into all sixteen 16-bit elements. */
	  __v16hi __splat = {
	    __w, __w, __w, __w, __w, __w, __w, __w,
	    __w, __w, __w, __w, __w, __w, __w, __w
	  };
	  return (__m256i)__splat;
	}

	/// \brief Constructs a 256-bit integer vector of [32 x i8], with each of the
	/// 8-bit integral vector elements set to the specified 8-bit integral value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
	///
	/// \param __b
	/// An 8-bit integral value used to initialize each vector element of the
	/// result.
	/// \returns An initialized 256-bit integer vector of [32 x i8].
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set1_epi8(char __b)
	{
	  /* Replicate __b into all thirty-two 8-bit elements. */
	  __v32qi __splat = {
	    __b, __b, __b, __b, __b, __b, __b, __b,
	    __b, __b, __b, __b, __b, __b, __b, __b,
	    __b, __b, __b, __b, __b, __b, __b, __b,
	    __b, __b, __b, __b, __b, __b, __b, __b
	  };
	  return (__m256i)__splat;
	}

	/// \brief Constructs a 256-bit integer vector of [4 x i64], with each of the
	/// 64-bit integral vector elements set to the specified 64-bit integral
	/// value.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
	///
	/// \param __q
	/// A 64-bit integral value used to initialize each vector element of the
	/// result.
	/// \returns An initialized 256-bit integer vector of [4 x i64].
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set1_epi64x(long long __q)
	{
	  /* Replicate __q into all four 64-bit elements. */
	  __v4di __splat = { __q, __q, __q, __q };
	  return (__m256i)__splat;
	}

	/* Create zeroed vectors */
	/// \brief Constructs a 256-bit floating-point vector of [4 x double] with all
	/// vector elements initialized to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
	///
	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_setzero_pd(void)
	{
	  /* All four double-precision elements are zero. */
	  __m256d __zero = { 0.0, 0.0, 0.0, 0.0 };
	  return __zero;
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float] with all
	/// vector elements initialized to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
	///
	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_setzero_ps(void)
	{
	  /* All eight single-precision elements are zero. */
	  __m256 __zero = { 0.0f, 0.0f, 0.0f, 0.0f,
	                    0.0f, 0.0f, 0.0f, 0.0f };
	  return __zero;
	}

	/// \brief Constructs a 256-bit integer vector initialized to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
	///
	/// \returns A 256-bit integer vector initialized to zero.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_setzero_si256(void)
	{
	  /* All four 64-bit lanes are zero, i.e. all 256 bits are clear. */
	  __m256i __zero = { 0LL, 0LL, 0LL, 0LL };
	  return __zero;
	}

	/* Cast between vector types */
	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
	/// floating-point vector of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [4 x double].
	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
	/// bitwise pattern as the parameter.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_castpd_ps(__m256d __a)
	{
	  /* Reinterpret the 256 bits as [8 x float]; the bit pattern is kept. */
	  __m256 __bits = (__m256)__a;
	  return __bits;
	}

	/// \brief Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
	/// integer vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [4 x double].
	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
	/// parameter.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_castpd_si256(__m256d __a)
	{
	  /* Reinterpret the 256 bits as an integer vector; the bit pattern is
	     kept. */
	  __m256i __bits = (__m256i)__a;
	  return __bits;
	}

	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
	/// floating-point vector of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [8 x float].
	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
	/// bitwise pattern as the parameter.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_castps_pd(__m256 __a)
	{
	  /* Reinterpret the 256 bits as [4 x double]; the bit pattern is kept. */
	  __m256d __bits = (__m256d)__a;
	  return __bits;
	}

	/// \brief Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
	/// integer vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [8 x float].
	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
	/// parameter.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_castps_si256(__m256 __a)
	{
	  /* Reinterpret the 256 bits as an integer vector; the bit pattern is
	     kept. */
	  __m256i __bits = (__m256i)__a;
	  return __bits;
	}

	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
	/// of [8 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
	/// bitwise pattern as the parameter.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_castsi256_ps(__m256i __a)
	{
	  /* Reinterpret the 256 bits as [8 x float]; the bit pattern is kept. */
	  __m256 __bits = (__m256)__a;
	  return __bits;
	}

	/// \brief Casts a 256-bit integer vector into a 256-bit floating-point vector
	/// of [4 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
	/// bitwise pattern as the parameter.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_castsi256_pd(__m256i __a)
	{
	  /* Reinterpret the 256 bits as [4 x double]; the bit pattern is kept. */
	  __m256d __bits = (__m256d)__a;
	  return __bits;
	}

	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [4 x double].
	/// \returns A 128-bit floating-point vector of [2 x double] containing the
	/// lower 128 bits of the parameter.
	static __inline __m128d __DEFAULT_FN_ATTRS
	_mm256_castpd256_pd128(__m256d __a)
	{
	  /* Keep elements 0 and 1, i.e. the lower 128 bits of the source. */
	  __v4df __src = (__v4df)__a;
	  return __builtin_shufflevector(__src, __src, 0, 1);
	}

	/// \brief Returns the lower 128 bits of a 256-bit floating-point vector of
	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit floating-point vector of [8 x float].
	/// \returns A 128-bit floating-point vector of [4 x float] containing the
	/// lower 128 bits of the parameter.
	static __inline __m128 __DEFAULT_FN_ATTRS
	_mm256_castps256_ps128(__m256 __a)
	{
	  /* Keep elements 0-3, i.e. the lower 128 bits of the source. */
	  __v8sf __src = (__v8sf)__a;
	  return __builtin_shufflevector(__src, __src, 0, 1, 2, 3);
	}

	/// \brief Truncates a 256-bit integer vector into a 128-bit integer vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 256-bit integer vector.
	/// \returns A 128-bit integer vector containing the lower 128 bits of the
	/// parameter.
	static __inline __m128i __DEFAULT_FN_ATTRS
	_mm256_castsi256_si128(__m256i __a)
	{
	  /* Keep the two low 64-bit elements, i.e. the lower 128 bits. */
	  __v4di __src = (__v4di)__a;
	  return __builtin_shufflevector(__src, __src, 0, 1);
	}

	/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
	/// 128-bit floating-point vector of [2 x double].
	///
	/// The lower 128 bits contain the value of the source vector. The contents
	/// of the upper 128 bits are undefined.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
	/// contain the value of the parameter. The contents of the upper 128 bits
	/// are undefined.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_castpd128_pd256(__m128d __a)
	{
	  /* Elements 0 and 1 come from __a; the -1 indices leave the upper two
	     elements undefined. */
	  __v2df __src = (__v2df)__a;
	  return __builtin_shufflevector(__src, __src, 0, 1, -1, -1);
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
	/// 128-bit floating-point vector of [4 x float].
	///
	/// The lower 128 bits contain the value of the source vector. The contents
	/// of the upper 128 bits are undefined.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
	/// contain the value of the parameter. The contents of the upper 128 bits
	/// are undefined.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_castps128_ps256(__m128 __a)
	{
	  /* Elements 0-3 come from __a; the -1 indices leave the upper four
	     elements undefined. */
	  __v4sf __src = (__v4sf)__a;
	  return __builtin_shufflevector(__src, __src,
	                                 0, 1, 2, 3, -1, -1, -1, -1);
	}

	/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
	///
	/// The lower 128 bits contain the value of the source vector. The contents
	/// of the upper 128 bits are undefined.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit integer vector.
	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
	/// the parameter. The contents of the upper 128 bits are undefined.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_castsi128_si256(__m128i __a)
	{
	  /* Elements 0 and 1 come from __a; the -1 indices leave the upper two
	     64-bit elements undefined. */
	  __v2di __src = (__v2di)__a;
	  return __builtin_shufflevector(__src, __src, 0, 1, -1, -1);
	}

	/// \brief Constructs a 256-bit floating-point vector of [4 x double] from a
	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
	/// contain the value of the source vector. The upper 128 bits are set
	/// to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [2 x double].
	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
	/// contain the value of the parameter. The upper 128 bits are set to zero.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_zextpd128_pd256(__m128d __a)
	{
	  /* Concatenate __a (indices 0-1) with a zero vector (indices 2-3). */
	  __v2df __lo = (__v2df)__a;
	  __v2df __zero = (__v2df)_mm_setzero_pd();
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3);
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float] from a
	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
	/// the value of the source vector. The upper 128 bits are set to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit vector of [4 x float].
	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
	/// contain the value of the parameter. The upper 128 bits are set to zero.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_zextps128_ps256(__m128 __a)
	{
	  /* Concatenate __a (indices 0-3) with a zero vector (indices 4-7). */
	  __v4sf __lo = (__v4sf)__a;
	  __v4sf __zero = (__v4sf)_mm_setzero_ps();
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3, 4, 5, 6, 7);
	}

	/// \brief Constructs a 256-bit integer vector from a 128-bit integer vector.
	/// The lower 128 bits contain the value of the source vector. The upper
	/// 128 bits are set to zero.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic has no corresponding instruction.
	///
	/// \param __a
	/// A 128-bit integer vector.
	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
	/// the parameter. The upper 128 bits are set to zero.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_zextsi128_si256(__m128i __a)
	{
	  /* Concatenate __a (indices 0-1) with a zero vector (indices 2-3). */
	  __v2di __lo = (__v2di)__a;
	  __v2di __zero = (__v2di)_mm_setzero_si128();
	  return __builtin_shufflevector(__lo, __zero, 0, 1, 2, 3);
	}

	/*
	Vector insert.
	We use macros rather than inlines because we only want to accept
	invocations where the immediate M is a constant expression.
	*/
	/// \brief Constructs a new 256-bit vector of [8 x float] by first duplicating
	/// a 256-bit vector of [8 x float] given in the first parameter, and then
	/// replacing either the upper or the lower 128 bits with the contents of a
	/// 128-bit vector of [4 x float] in the second parameter.
	///
	/// The immediate integer parameter selects whether the upper or the lower
	/// 128 bits are replaced.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param V1
	/// A 256-bit vector of [8 x float]. This vector is copied to the result
	/// first, and then either the upper or the lower 128 bits of the result will
	/// be replaced by the contents of \a V2.
	/// \param V2
	/// A 128-bit vector of [4 x float]. The contents of this parameter are
	/// written to either the upper or the lower 128 bits of the result depending
	/// on the value of parameter \a M.
	/// \param M
	/// An immediate integer. The least significant bit determines how the values
	/// from the two parameters are interleaved: \n
	/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
	/// result. \n
	/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
	/// result.
	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
	#define _mm256_insertf128_ps(V1, V2, M) __extension__ ({ \
	  /* Indices 0-7 select elements of V1; indices 8-11 select the four */ \
	  /* elements of V2 after it has been widened to 256 bits. */ \
	  (__m256)__builtin_shufflevector( \
	    (__v8sf)(__m256)(V1), \
	    (__v8sf)_mm256_castps128_ps256((__m128)(V2)), \
	    /* result elements 0-3: bits [127:0] */ \
	    ((((M) & 1) == 0) ?  8 : 0), \
	    ((((M) & 1) == 0) ?  9 : 1), \
	    ((((M) & 1) == 0) ? 10 : 2), \
	    ((((M) & 1) == 0) ? 11 : 3), \
	    /* result elements 4-7: bits [255:128] */ \
	    ((((M) & 1) == 0) ? 4 :  8), \
	    ((((M) & 1) == 0) ? 5 :  9), \
	    ((((M) & 1) == 0) ? 6 : 10), \
	    ((((M) & 1) == 0) ? 7 : 11) );})

	/// \brief Constructs a new 256-bit vector of [4 x double] by first duplicating
	/// a 256-bit vector of [4 x double] given in the first parameter, and then
	/// replacing either the upper or the lower 128 bits with the contents of a
	/// 128-bit vector of [2 x double] in the second parameter.
	///
	/// The immediate integer parameter selects whether the upper or the lower
	/// 128 bits are replaced.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param V1
	/// A 256-bit vector of [4 x double]. This vector is copied to the result
	/// first, and then either the upper or the lower 128 bits of the result will
	/// be replaced by the contents of \a V2.
	/// \param V2
	/// A 128-bit vector of [2 x double]. The contents of this parameter are
	/// written to either the upper or the lower 128 bits of the result depending
	/// on the value of parameter \a M.
	/// \param M
	/// An immediate integer. The least significant bit determines how the values
	/// from the two parameters are interleaved: \n
	/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
	/// result. \n
	/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
	/// result.
	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
	#define _mm256_insertf128_pd(V1, V2, M) __extension__ ({ \
	  /* Indices 0-3 select elements of V1; indices 4-5 select the two */ \
	  /* elements of V2 after it has been widened to 256 bits. */ \
	  (__m256d)__builtin_shufflevector( \
	    (__v4df)(__m256d)(V1), \
	    (__v4df)_mm256_castpd128_pd256((__m128d)(V2)), \
	    /* result elements 0-1: bits [127:0] */ \
	    ((((M) & 1) == 0) ? 4 : 0), \
	    ((((M) & 1) == 0) ? 5 : 1), \
	    /* result elements 2-3: bits [255:128] */ \
	    ((((M) & 1) == 0) ? 2 : 4), \
	    ((((M) & 1) == 0) ? 3 : 5) );})

	/// \brief Constructs a new 256-bit integer vector by first duplicating a
	/// 256-bit integer vector given in the first parameter, and then replacing
	/// either the upper or the lower 128 bits with the contents of a 128-bit
	/// integer vector in the second parameter.
	///
	/// The immediate integer parameter selects whether the upper or the lower
	/// 128 bits are replaced.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param V1
	/// A 256-bit integer vector. This vector is copied to the result first, and
	/// then either the upper or the lower 128 bits of the result will be
	/// replaced by the contents of \a V2.
	/// \param V2
	/// A 128-bit integer vector. The contents of this parameter are written to
	/// either the upper or the lower 128 bits of the result depending on the
	/// value of parameter \a M.
	/// \param M
	/// An immediate integer. The least significant bit determines how the values
	/// from the two parameters are interleaved: \n
	/// If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the result,
	/// and bits [255:128] of \a V1 are copied to bits [255:128] of the
	/// result. \n
	/// If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
	/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
	/// result.
	/// \returns A 256-bit integer vector containing the interleaved values.
	#define _mm256_insertf128_si256(V1, V2, M) __extension__ ({ \
	  /* Indices 0-3 select 64-bit elements of V1; indices 4-5 select the */ \
	  /* two 64-bit elements of V2 after it has been widened to 256 bits. */ \
	  (__m256i)__builtin_shufflevector( \
	    (__v4di)(__m256i)(V1), \
	    (__v4di)_mm256_castsi128_si256((__m128i)(V2)), \
	    /* result elements 0-1: bits [127:0] */ \
	    ((((M) & 1) == 0) ? 4 : 0), \
	    ((((M) & 1) == 0) ? 5 : 1), \
	    /* result elements 2-3: bits [255:128] */ \
	    ((((M) & 1) == 0) ? 2 : 4), \
	    ((((M) & 1) == 0) ? 3 : 5) );})

	/*
	Vector extract.
	We use macros rather than inlines because we only want to accept
	invocations where the immediate M is a constant expression.
	*/
	/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
	/// of [8 x float], as determined by the immediate integer parameter, and
	/// returns the extracted bits as a 128-bit vector of [4 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [8 x float].
	/// \param M
	/// An immediate integer. The least significant bit determines which bits are
	/// extracted from the first parameter: \n
	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
	/// result. \n
	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
	/// \returns A 128-bit vector of [4 x float] containing the extracted bits.
	#define _mm256_extractf128_ps(V, M) __extension__ ({ \
	  /* Every index is below 8, so only elements of V are ever selected. */ \
	  (__m128)__builtin_shufflevector( \
	    (__v8sf)(__m256)(V), \
	    (__v8sf)(_mm256_undefined_ps()), \
	    ((((M) & 1) == 0) ? 0 : 4), \
	    ((((M) & 1) == 0) ? 1 : 5), \
	    ((((M) & 1) == 0) ? 2 : 6), \
	    ((((M) & 1) == 0) ? 3 : 7) );})

	/// \brief Extracts either the upper or the lower 128 bits from a 256-bit vector
	/// of [4 x double], as determined by the immediate integer parameter, and
	/// returns the extracted bits as a 128-bit vector of [2 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
	///
	/// \param V
	/// A 256-bit vector of [4 x double].
	/// \param M
	/// An immediate integer. The least significant bit determines which bits are
	/// extracted from the first parameter: \n
	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
	/// result. \n
	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
	/// \returns A 128-bit vector of [2 x double] containing the extracted bits.
	#define _mm256_extractf128_pd(V, M) __extension__ ({ \
	  /* Every index is below 4, so only elements of V are ever selected. */ \
	  (__m128d)__builtin_shufflevector( \
	    (__v4df)(__m256d)(V), \
	    (__v4df)(_mm256_undefined_pd()), \
	    ((((M) & 1) == 0) ? 0 : 2), \
	    ((((M) & 1) == 0) ? 1 : 3) );})

	/// \brief Extracts either the upper or the lower 128 bits from a 256-bit
	/// integer vector, as determined by the immediate integer parameter, and
	/// returns the extracted bits as a 128-bit integer vector.
	///
	/// \headerfile <x86intrin.h>
	///
	/// \code
	/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
	/// \endcode
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
	///
	/// \param V
	/// A 256-bit integer vector.
	/// \param M
	/// An immediate integer. The least significant bit determines which bits are
	/// extracted from the first parameter: \n
	/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
	/// result. \n
	/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result.
	/// \returns A 128-bit integer vector containing the extracted bits.
	#define _mm256_extractf128_si256(V, M) __extension__ ({ \
	  /* Every index is below 4, so only elements of V are ever selected. */ \
	  (__m128i)__builtin_shufflevector( \
	    (__v4di)(__m256i)(V), \
	    (__v4di)(_mm256_undefined_si256()), \
	    ((((M) & 1) == 0) ? 0 : 2), \
	    ((((M) & 1) == 0) ? 1 : 3) );})

	/* SIMD load ops (unaligned) */
	/// \brief Loads two 128-bit floating-point vectors of [4 x float] from
	/// unaligned memory locations and constructs a 256-bit floating-point vector
	/// of [8 x float] by concatenating the two 128-bit vectors.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to load instructions followed by the
	/// <c> VINSERTF128 </c> instruction.
	///
	/// \param __addr_hi
	/// A pointer to a 128-bit memory location containing 4 consecutive
	/// single-precision floating-point values. These values are to be copied to
	/// bits[255:128] of the result. The address of the memory location does not
	/// have to be aligned.
	/// \param __addr_lo
	/// A pointer to a 128-bit memory location containing 4 consecutive
	/// single-precision floating-point values. These values are to be copied to
	/// bits[127:0] of the result. The address of the memory location does not
	/// have to be aligned.
	/// \returns A 256-bit floating-point vector of [8 x float] containing the
	/// concatenated result.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo)
	{
	  /* Both parameters are pointers to four consecutive floats, as the
	     parameter documentation states; they are handed to _mm_loadu_ps
	     unchanged. */
	  /* Load the low half first. The upper 128 bits of __v256 are undefined
	     until the insert below replaces them with the data at __addr_hi. */
	  __m256 __v256 = _mm256_castps128_ps256(_mm_loadu_ps(__addr_lo));
	  return _mm256_insertf128_ps(__v256, _mm_loadu_ps(__addr_hi), 1);
	}

	/// \brief Loads two 128-bit floating-point vectors of [2 x double] from
	/// unaligned memory locations and constructs a 256-bit floating-point vector
	/// of [4 x double] by concatenating the two 128-bit vectors.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to load instructions followed by the
	/// <c> VINSERTF128 </c> instruction.
	///
	/// \param __addr_hi
	/// A pointer to a 128-bit memory location containing two consecutive
	/// double-precision floating-point values. These values are to be copied to
	/// bits[255:128] of the result. The address of the memory location does not
	/// have to be aligned.
	/// \param __addr_lo
	/// A pointer to a 128-bit memory location containing two consecutive
	/// double-precision floating-point values. These values are to be copied to
	/// bits[127:0] of the result. The address of the memory location does not
	/// have to be aligned.
	/// \returns A 256-bit floating-point vector of [4 x double] containing the
	/// concatenated result.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo)
	{
	  /* Both arguments are addresses of two consecutive doubles; the pointer
	     declarators are required for the _mm_loadu_pd() calls to type-check. */
	  /* Load the low half and widen it to a 256-bit vector. */
	  __m256d __v256 = _mm256_castpd128_pd256(_mm_loadu_pd(__addr_lo));
	  /* Load the high half and place it in the upper 128-bit lane (index 1). */
	  return _mm256_insertf128_pd(__v256, _mm_loadu_pd(__addr_hi), 1);
	}

	/// \brief Loads two 128-bit integer vectors from unaligned memory locations and
	/// constructs a 256-bit integer vector by concatenating the two 128-bit
	/// vectors.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to load instructions followed by the
	/// <c> VINSERTF128 </c> instruction.
	///
	/// \param __addr_hi
	/// A pointer to a 128-bit memory location containing a 128-bit integer
	/// vector. This vector is to be copied to bits[255:128] of the result. The
	/// address of the memory location does not have to be aligned.
	/// \param __addr_lo
	/// A pointer to a 128-bit memory location containing a 128-bit integer
	/// vector. This vector is to be copied to bits[127:0] of the result. The
	/// address of the memory location does not have to be aligned.
	/// \returns A 256-bit integer vector containing the concatenated result.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_loadu2_m128i(__m128i const *__addr_hi, __m128i const *__addr_lo)
	{
	  /* Both arguments are addresses of 128-bit integer vectors; the pointer
	     declarators are required for the _mm_loadu_si128() calls to type-check. */
	  /* Load the low half and widen it to a 256-bit vector. */
	  __m256i __v256 = _mm256_castsi128_si256(_mm_loadu_si128(__addr_lo));
	  /* Load the high half and place it in the upper 128-bit lane (index 1). */
	  return _mm256_insertf128_si256(__v256, _mm_loadu_si128(__addr_hi), 1);
	}

	/* SIMD store ops (unaligned) */
	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
	/// vector of [8 x float] into two different unaligned memory locations.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
	/// store instructions.
	///
	/// \param __addr_hi
	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
	/// copied to this memory location. The address of this memory location does
	/// not have to be aligned.
	/// \param __addr_lo
	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
	/// copied to this memory location. The address of this memory location does
	/// not have to be aligned.
	/// \param __a
	/// A 256-bit floating-point vector of [8 x float].
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a)
	{
	  /* The two destinations are addresses; the pointer declarators are required
	     for the _mm_storeu_ps() calls to type-check. */
	  __m128 __v128;

	  /* Lower 128 bits of __a go to __addr_lo. */
	  __v128 = _mm256_castps256_ps128(__a);
	  _mm_storeu_ps(__addr_lo, __v128);
	  /* Upper 128 bits (lane 1) of __a go to __addr_hi. */
	  __v128 = _mm256_extractf128_ps(__a, 1);
	  _mm_storeu_ps(__addr_hi, __v128);
	}

	/// \brief Stores the upper and lower 128 bits of a 256-bit floating-point
	/// vector of [4 x double] into two different unaligned memory locations.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
	/// store instructions.
	///
	/// \param __addr_hi
	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
	/// copied to this memory location. The address of this memory location does
	/// not have to be aligned.
	/// \param __addr_lo
	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
	/// copied to this memory location. The address of this memory location does
	/// not have to be aligned.
	/// \param __a
	/// A 256-bit floating-point vector of [4 x double].
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a)
	{
	  /* The two destinations are addresses; the pointer declarators are required
	     for the _mm_storeu_pd() calls to type-check. */
	  __m128d __v128;

	  /* Lower 128 bits of __a go to __addr_lo. */
	  __v128 = _mm256_castpd256_pd128(__a);
	  _mm_storeu_pd(__addr_lo, __v128);
	  /* Upper 128 bits (lane 1) of __a go to __addr_hi. */
	  __v128 = _mm256_extractf128_pd(__a, 1);
	  _mm_storeu_pd(__addr_hi, __v128);
	}

	/// \brief Stores the upper and lower 128 bits of a 256-bit integer vector into
	/// two different unaligned memory locations.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
	/// store instructions.
	///
	/// \param __addr_hi
	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
	/// copied to this memory location. The address of this memory location does
	/// not have to be aligned.
	/// \param __addr_lo
	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
	/// copied to this memory location. The address of this memory location does
	/// not have to be aligned.
	/// \param __a
	/// A 256-bit integer vector.
	static __inline void __DEFAULT_FN_ATTRS
	_mm256_storeu2_m128i(__m128i *__addr_hi, __m128i *__addr_lo, __m256i __a)
	{
	  /* The two destinations are addresses; the pointer declarators are required
	     for the _mm_storeu_si128() calls to type-check. */
	  __m128i __v128;

	  /* Lower 128 bits of __a go to __addr_lo. */
	  __v128 = _mm256_castsi256_si128(__a);
	  _mm_storeu_si128(__addr_lo, __v128);
	  /* Upper 128 bits (lane 1) of __a go to __addr_hi. */
	  __v128 = _mm256_extractf128_si256(__a, 1);
	  _mm_storeu_si128(__addr_hi, __v128);
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
	/// concatenating two 128-bit floating-point vectors of [4 x float].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param __hi
	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
	/// 128 bits of the result.
	/// \param __lo
	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
	/// 128 bits of the result.
	/// \returns A 256-bit floating-point vector of [8 x float] containing the
	/// concatenated result.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_set_m128 (__m128 __hi, __m128 __lo)
	{
	  /* Shuffle indices 0-3 select the elements of the first operand (__lo),
	     indices 4-7 those of the second operand (__hi). */
	  __v4sf __lower = (__v4sf)__lo;
	  __v4sf __upper = (__v4sf)__hi;
	  return (__m256) __builtin_shufflevector(__lower, __upper,
	                                          0, 1, 2, 3, 4, 5, 6, 7);
	}

	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
	/// concatenating two 128-bit floating-point vectors of [2 x double].
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param __hi
	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
	/// 128 bits of the result.
	/// \param __lo
	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
	/// 128 bits of the result.
	/// \returns A 256-bit floating-point vector of [4 x double] containing the
	/// concatenated result.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_set_m128d (__m128d __hi, __m128d __lo)
	{
	  /* Reuse the single-precision concatenation; the casts only reinterpret
	     the vector type. */
	  __m128 __hi_ps = (__m128)__hi;
	  __m128 __lo_ps = (__m128)__lo;
	  __m256 __joined = _mm256_set_m128(__hi_ps, __lo_ps);
	  return (__m256d)__joined;
	}

	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
	/// integer vectors.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param __hi
	/// A 128-bit integer vector to be copied to the upper 128 bits of the
	/// result.
	/// \param __lo
	/// A 128-bit integer vector to be copied to the lower 128 bits of the
	/// result.
	/// \returns A 256-bit integer vector containing the concatenated result.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_set_m128i (__m128i __hi, __m128i __lo)
	{
	  /* Reuse the single-precision concatenation; the casts only reinterpret
	     the vector type. */
	  __m128 __hi_ps = (__m128)__hi;
	  __m128 __lo_ps = (__m128)__lo;
	  __m256 __joined = _mm256_set_m128(__hi_ps, __lo_ps);
	  return (__m256i)__joined;
	}

	/// \brief Constructs a 256-bit floating-point vector of [8 x float] by
	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
	/// similar to _mm256_set_m128, but the order of the input parameters is
	/// swapped.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param __lo
	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
	/// 128 bits of the result.
	/// \param __hi
	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
	/// 128 bits of the result.
	/// \returns A 256-bit floating-point vector of [8 x float] containing the
	/// concatenated result.
	static __inline __m256 __DEFAULT_FN_ATTRS
	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
	{
	  /* Same concatenation as _mm256_set_m128(__hi, __lo), written out directly:
	     __lo supplies elements 0-3, __hi supplies elements 4-7. */
	  return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi,
	                                          0, 1, 2, 3, 4, 5, 6, 7);
	}

	/// \brief Constructs a 256-bit floating-point vector of [4 x double] by
	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
	/// similar to _mm256_set_m128d, but the order of the input parameters is
	/// swapped.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param __lo
	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
	/// 128 bits of the result.
	/// \param __hi
	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
	/// 128 bits of the result.
	/// \returns A 256-bit floating-point vector of [4 x double] containing the
	/// concatenated result.
	static __inline __m256d __DEFAULT_FN_ATTRS
	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
	{
	  /* Only the parameter order differs from _mm256_set_m128d, so forward to it
	     with the arguments swapped back. */
	  return _mm256_set_m128d(__hi, __lo);
	}

	/// \brief Constructs a 256-bit integer vector by concatenating two 128-bit
	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
	/// the input parameters is swapped.
	///
	/// \headerfile <x86intrin.h>
	///
	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
	///
	/// \param __lo
	/// A 128-bit integer vector to be copied to the lower 128 bits of the
	/// result.
	/// \param __hi
	/// A 128-bit integer vector to be copied to the upper 128 bits of the
	/// result.
	/// \returns A 256-bit integer vector containing the concatenated result.
	static __inline __m256i __DEFAULT_FN_ATTRS
	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
	{
	  /* Only the parameter order differs from _mm256_set_m128i, so forward to it
	     with the arguments swapped back. */
	  return _mm256_set_m128i(__hi, __lo);
	}

	#undef __DEFAULT_FN_ATTRS

	#endif /* __AVXINTRIN_H */
	Index: head/contrib/llvm/tools/clang/lib/Headers/float.h
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Headers/float.h (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Headers/float.h (revision 322320)
	@@ -1,137 +1,146 @@
	/*===---- float.h - Characteristics of floating point types ----------------===
	*
	* Permission is hereby granted, free of charge, to any person obtaining a copy
	* of this software and associated documentation files (the "Software"), to deal
	* in the Software without restriction, including without limitation the rights
	* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	* copies of the Software, and to permit persons to whom the Software is
	* furnished to do so, subject to the following conditions:
	*
	* The above copyright notice and this permission notice shall be included in
	* all copies or substantial portions of the Software.
	*
	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
	* THE SOFTWARE.
	*
	*===-----------------------------------------------------------------------===
	*/

	#ifndef __FLOAT_H
	#define __FLOAT_H

	/* If we're on MinGW, fall back to the system's float.h, which might have
	* additional definitions provided for Windows.
	* For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx
	*
	* Also fall back on Darwin to allow additional definitions and
	* implementation-defined values.
	*/
	#if (defined(__APPLE__) || (defined(__MINGW32__) || defined(_MSC_VER))) && \
	__STDC_HOSTED__ && __has_include_next(<float.h>)
	+
	+/* Prior to Apple's 10.7 SDK, float.h SDK header used to apply an extra level
	+ * of #include_next<float.h> to keep Metrowerks compilers happy. Avoid this
	+ * extra indirection.
	+ */
	+#ifdef __APPLE__
	+#define _FLOAT_H_
	+#endif
	+
	# include_next <float.h>

	/* Undefine anything that we'll be redefining below. */
	# undef FLT_EVAL_METHOD
	# undef FLT_ROUNDS
	# undef FLT_RADIX
	# undef FLT_MANT_DIG
	# undef DBL_MANT_DIG
	# undef LDBL_MANT_DIG
	/* DECIMAL_DIG is only redefined below under this same condition. */
	# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
	# undef DECIMAL_DIG
	# endif
	# undef FLT_DIG
	# undef DBL_DIG
	# undef LDBL_DIG
	# undef FLT_MIN_EXP
	# undef DBL_MIN_EXP
	# undef LDBL_MIN_EXP
	# undef FLT_MIN_10_EXP
	# undef DBL_MIN_10_EXP
	# undef LDBL_MIN_10_EXP
	# undef FLT_MAX_EXP
	# undef DBL_MAX_EXP
	# undef LDBL_MAX_EXP
	# undef FLT_MAX_10_EXP
	# undef DBL_MAX_10_EXP
	# undef LDBL_MAX_10_EXP
	# undef FLT_MAX
	# undef DBL_MAX
	# undef LDBL_MAX
	# undef FLT_EPSILON
	# undef DBL_EPSILON
	# undef LDBL_EPSILON
	# undef FLT_MIN
	# undef DBL_MIN
	# undef LDBL_MIN
	/* The *_TRUE_MIN and *_DECIMAL_DIG macros are only redefined below under
	 * this same condition. */
	# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
	# undef FLT_TRUE_MIN
	# undef DBL_TRUE_MIN
	# undef LDBL_TRUE_MIN
	# undef FLT_DECIMAL_DIG
	# undef DBL_DECIMAL_DIG
	# undef LDBL_DECIMAL_DIG
	# endif
	#endif

	/* Characteristics of floating point types, C99 5.2.4.2.2 */

	/* Every macro below maps directly onto a compiler-predefined macro (or
	 * builtin, for FLT_ROUNDS). */
	#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
	#define FLT_ROUNDS (__builtin_flt_rounds())
	#define FLT_RADIX __FLT_RADIX__

	#define FLT_MANT_DIG __FLT_MANT_DIG__
	#define DBL_MANT_DIG __DBL_MANT_DIG__
	#define LDBL_MANT_DIG __LDBL_MANT_DIG__

	/* Only exposed for C99 and later, or when not in strict ANSI mode. */
	#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__)
	# define DECIMAL_DIG __DECIMAL_DIG__
	#endif

	#define FLT_DIG __FLT_DIG__
	#define DBL_DIG __DBL_DIG__
	#define LDBL_DIG __LDBL_DIG__

	#define FLT_MIN_EXP __FLT_MIN_EXP__
	#define DBL_MIN_EXP __DBL_MIN_EXP__
	#define LDBL_MIN_EXP __LDBL_MIN_EXP__

	#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__
	#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__
	#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__

	#define FLT_MAX_EXP __FLT_MAX_EXP__
	#define DBL_MAX_EXP __DBL_MAX_EXP__
	#define LDBL_MAX_EXP __LDBL_MAX_EXP__

	#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__
	#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__
	#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__

	#define FLT_MAX __FLT_MAX__
	#define DBL_MAX __DBL_MAX__
	#define LDBL_MAX __LDBL_MAX__

	#define FLT_EPSILON __FLT_EPSILON__
	#define DBL_EPSILON __DBL_EPSILON__
	#define LDBL_EPSILON __LDBL_EPSILON__

	#define FLT_MIN __FLT_MIN__
	#define DBL_MIN __DBL_MIN__
	#define LDBL_MIN __LDBL_MIN__

	/* Only exposed for C11 and later, or when not in strict ANSI mode. */
	#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__)
	# define FLT_TRUE_MIN __FLT_DENORM_MIN__
	# define DBL_TRUE_MIN __DBL_DENORM_MIN__
	# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__
	# define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__
	# define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__
	# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__
	#endif

	#endif /* __FLOAT_H */
	Index: head/contrib/llvm/tools/clang/lib/Sema/Sema.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Sema/Sema.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Sema/Sema.cpp (revision 322320)
	@@ -1,1708 +1,1711 @@
	//===--- Sema.cpp - AST Builder and Semantic Analysis Implementation ------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements the actions class which performs semantic analysis and
	// builds an AST out of a parse stream.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/AST/ASTContext.h"
	#include "clang/AST/ASTDiagnostic.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclFriend.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/StmtCXX.h"
	#include "clang/Basic/DiagnosticOptions.h"
	#include "clang/Basic/PartialDiagnostic.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Lex/HeaderSearch.h"
	#include "clang/Lex/Preprocessor.h"
	#include "clang/Sema/CXXFieldCollector.h"
	#include "clang/Sema/DelayedDiagnostic.h"
	#include "clang/Sema/ExternalSemaSource.h"
	#include "clang/Sema/Initialization.h"
	#include "clang/Sema/MultiplexExternalSemaSource.h"
	#include "clang/Sema/ObjCMethodList.h"
	#include "clang/Sema/PrettyDeclStackTrace.h"
	#include "clang/Sema/Scope.h"
	#include "clang/Sema/ScopeInfo.h"
	#include "clang/Sema/SemaConsumer.h"
	#include "clang/Sema/SemaInternal.h"
	#include "clang/Sema/TemplateDeduction.h"
	#include "llvm/ADT/DenseMap.h"
	#include "llvm/ADT/SmallSet.h"
	using namespace clang;
	using namespace sema;

	SourceLocation Sema::getLocForEndOfToken(SourceLocation Loc, unsigned Offset) {
	  // Forward to the Lexer, supplying Sema's own source manager and language
	  // options.
	  SourceLocation EndLoc =
	      Lexer::getLocForEndOfToken(Loc, Offset, SourceMgr, LangOpts);
	  return EndLoc;
	}

	ModuleLoader &Sema::getModuleLoader() const {
	  // The module loader is obtained from the preprocessor.
	  ModuleLoader &Loader = PP.getModuleLoader();
	  return Loader;
	}

	PrintingPolicy Sema::getPrintingPolicy(const ASTContext &Context,
	                                       const Preprocessor &PP) {
	  // Start from the ASTContext's policy. It is overwritten with ours whenever
	  // a diagnostic is emitted, so the Bool setting has to be recomputed here.
	  PrintingPolicy Policy = Context.getPrintingPolicy();
	  Policy.Bool = Context.getLangOpts().Bool;
	  if (Policy.Bool)
	    return Policy;

	  // The language has no native Bool: see whether the bool name is a macro.
	  const MacroInfo *BoolMacro = PP.getMacroInfo(Context.getBoolName());
	  if (!BoolMacro)
	    return Policy;

	  // Accept only an object-like macro that expands to the single token _Bool.
	  Policy.Bool = BoolMacro->isObjectLike() &&
	                BoolMacro->getNumTokens() == 1 &&
	                BoolMacro->getReplacementToken(0).is(tok::kw__Bool);
	  return Policy;
	}

	// Records S as the translation-unit scope and pushes the translation unit
	// declaration as the declaration context associated with that scope.
	void Sema::ActOnTranslationUnitScope(Scope *S) {
	// Remember the scope first; Initialize() returns early while TUScope is null.
	TUScope = S;
	PushDeclContext(S, Context.getTranslationUnitDecl());
	}

	// Constructs the semantic analyzer. Binds it to the given preprocessor, AST
	// context and consumer, takes diagnostics, source manager and language
	// options from the preprocessor, and starts the cached declaration pointers,
	// pragma stacks, flags and counters in their initial state. The body then
	// sets up the NSAPI / field-collector helpers, the diagnostic argument
	// formatter, the first expression evaluation context and the first function
	// scope.
	Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer,
	TranslationUnitKind TUKind, CodeCompleteConsumer *CodeCompleter)
	: ExternalSource(nullptr), isMultiplexExternalSource(false),
	FPFeatures(pp.getLangOpts()), LangOpts(pp.getLangOpts()), PP(pp),
	Context(ctxt), Consumer(consumer), Diags(PP.getDiagnostics()),
	SourceMgr(PP.getSourceManager()), CollectStats(false),
	CodeCompleter(CodeCompleter), CurContext(nullptr),
	OriginalLexicalContext(nullptr), MSStructPragmaOn(false),
	MSPointerToMemberRepresentationMethod(
	LangOpts.getMSPointerToMemberRepresentationMethod()),
	VtorDispStack(MSVtorDispAttr::Mode(LangOpts.VtorDispMode)), PackStack(0),
	DataSegStack(nullptr), BSSSegStack(nullptr), ConstSegStack(nullptr),
	CodeSegStack(nullptr), CurInitSeg(nullptr), VisContext(nullptr),
	PragmaAttributeCurrentTargetDecl(nullptr),
	IsBuildingRecoveryCallExpr(false), Cleanup{}, LateTemplateParser(nullptr),
	LateTemplateParserCleanup(nullptr), OpaqueParser(nullptr), IdResolver(pp),
	StdExperimentalNamespaceCache(nullptr), StdInitializerList(nullptr),
	CXXTypeInfoDecl(nullptr), MSVCGuidDecl(nullptr), NSNumberDecl(nullptr),
	NSValueDecl(nullptr), NSStringDecl(nullptr),
	StringWithUTF8StringMethod(nullptr),
	ValueWithBytesObjCTypeMethod(nullptr), NSArrayDecl(nullptr),
	ArrayWithObjectsMethod(nullptr), NSDictionaryDecl(nullptr),
	DictionaryWithObjectsMethod(nullptr), GlobalNewDeleteDeclared(false),
	TUKind(TUKind), NumSFINAEErrors(0), AccessCheckingSFINAE(false),
	InNonInstantiationSFINAEContext(false), NonInstantiationEntries(0),
	ArgumentPackSubstitutionIndex(-1), CurrentInstantiationScope(nullptr),
	DisableTypoCorrection(false), TyposCorrected(0), AnalysisWarnings(*this),
	ThreadSafetyDeclCache(nullptr), VarDataSharingAttributesStack(nullptr),
	CurScope(nullptr), Ident_super(nullptr), Ident___float128(nullptr) {
	// No translation-unit scope yet; ActOnTranslationUnitScope() assigns it.
	TUScope = nullptr;

	LoadedExternalKnownNamespaces = false;
	// Clear every cached NSNumber literal method slot.
	for (unsigned I = 0; I != NSAPI::NumNSNumberLiteralMethods; ++I)
	NSNumberLiteralMethods[I] = nullptr;

	// The NSAPI helper is only created for Objective-C.
	if (getLangOpts().ObjC1)
	NSAPIObj.reset(new NSAPI(Context));

	// The field collector is only created for C++.
	if (getLangOpts().CPlusPlus)
	FieldCollector.reset(new CXXFieldCollector());

	// Tell diagnostics how to render things from the AST library.
	Diags.SetArgToStringFn(&FormatASTNodeDiagnosticArgument, &Context);

	// Start with a single, potentially-evaluated expression context.
	ExprEvalContexts.emplace_back(
	ExpressionEvaluationContext::PotentiallyEvaluated, 0, CleanupInfo{},
	nullptr, false);

	// Seed the function scope stack with one entry; ~Sema deletes it.
	FunctionScopes.push_back(new FunctionScopeInfo(Diags));

	// Initialization of data sharing attributes stack for OpenMP
	InitDataSharingAttributesStack();
	}

	void Sema::addImplicitTypedef(StringRef Name, QualType T) {
	  // Look the name up in the identifier resolver; an existing declaration
	  // wins and nothing is added.
	  DeclarationName DN = &Context.Idents.get(Name);
	  const bool AlreadyDeclared = !(IdResolver.begin(DN) == IdResolver.end());
	  if (AlreadyDeclared)
	    return;

	  // Otherwise build the implicit typedef and push it into the TU scope.
	  PushOnScopeChains(Context.buildImplicitTypedef(T, Name), TUScope);
	}

	// Hooks this Sema up to its consumer and external source, then (once a
	// translation-unit scope exists) pushes the predefined declarations into
	// that scope: 128-bit integers, Objective-C builtins, __NSConstantString,
	// MSVC-compat types, OpenCL types and the va_list builtins. Each
	// declaration is only added when the name is not already known to the
	// identifier resolver.
	void Sema::Initialize() {
	if (SemaConsumer *SC = dyn_cast<SemaConsumer>(&Consumer))
	SC->InitializeSema(*this);

	// Tell the external Sema source about this Sema object.
	if (ExternalSemaSource *ExternalSema
	= dyn_cast_or_null<ExternalSemaSource>(Context.getExternalSource()))
	ExternalSema->InitializeSema(*this);

	// This needs to happen after ExternalSemaSource::InitializeSema(this) or we
	// will not be able to merge any duplicate __va_list_tag decls correctly.
	VAListTagName = PP.getIdentifierInfo("__va_list_tag");

	// Everything below pushes declarations into the TU scope, so there is
	// nothing more to do without one.
	if (!TUScope)
	return;

	// Initialize predefined 128-bit integer types, if needed.
	if (Context.getTargetInfo().hasInt128Type()) {
	// If either of the 128-bit integer types are unavailable to name lookup,
	// define them now.
	DeclarationName Int128 = &Context.Idents.get("__int128_t");
	if (IdResolver.begin(Int128) == IdResolver.end())
	PushOnScopeChains(Context.getInt128Decl(), TUScope);

	DeclarationName UInt128 = &Context.Idents.get("__uint128_t");
	if (IdResolver.begin(UInt128) == IdResolver.end())
	PushOnScopeChains(Context.getUInt128Decl(), TUScope);
	}


	// Initialize predefined Objective-C types:
	if (getLangOpts().ObjC1) {
	// If 'SEL' does not yet refer to any declarations, make it refer to the
	// predefined 'SEL'.
	DeclarationName SEL = &Context.Idents.get("SEL");
	if (IdResolver.begin(SEL) == IdResolver.end())
	PushOnScopeChains(Context.getObjCSelDecl(), TUScope);

	// If 'id' does not yet refer to any declarations, make it refer to the
	// predefined 'id'.
	DeclarationName Id = &Context.Idents.get("id");
	if (IdResolver.begin(Id) == IdResolver.end())
	PushOnScopeChains(Context.getObjCIdDecl(), TUScope);

	// Create the built-in typedef for 'Class'.
	DeclarationName Class = &Context.Idents.get("Class");
	if (IdResolver.begin(Class) == IdResolver.end())
	PushOnScopeChains(Context.getObjCClassDecl(), TUScope);

	// Create the built-in forward declaration for 'Protocol'.
	DeclarationName Protocol = &Context.Idents.get("Protocol");
	if (IdResolver.begin(Protocol) == IdResolver.end())
	PushOnScopeChains(Context.getObjCProtocolDecl(), TUScope);
	}

	// Create the internal type for the *StringMakeConstantString builtins.
	DeclarationName ConstantString = &Context.Idents.get("__NSConstantString");
	if (IdResolver.begin(ConstantString) == IdResolver.end())
	PushOnScopeChains(Context.getCFConstantStringDecl(), TUScope);

	// Initialize Microsoft "predefined C++ types".
	if (getLangOpts().MSVCCompat) {
	// 'type_info' is only predeclared when compiling C++.
	if (getLangOpts().CPlusPlus &&
	IdResolver.begin(&Context.Idents.get("type_info")) == IdResolver.end())
	PushOnScopeChains(Context.buildImplicitRecord("type_info", TTK_Class),
	TUScope);

	addImplicitTypedef("size_t", Context.getSizeType());
	}

	// Initialize predefined OpenCL types and supported extensions and (optional)
	// core features.
	if (getLangOpts().OpenCL) {
	getOpenCLOptions().addSupport(Context.getTargetInfo().getSupportedOpenCLOpts());
	getOpenCLOptions().enableSupportedCore(getLangOpts().OpenCLVersion);
	addImplicitTypedef("sampler_t", Context.OCLSamplerTy);
	addImplicitTypedef("event_t", Context.OCLEventTy);
	// The remaining typedefs are only added for OpenCL version 200 and up.
	if (getLangOpts().OpenCLVersion >= 200) {
	addImplicitTypedef("clk_event_t", Context.OCLClkEventTy);
	addImplicitTypedef("queue_t", Context.OCLQueueTy);
	addImplicitTypedef("reserve_id_t", Context.OCLReserveIDTy);
	addImplicitTypedef("atomic_int", Context.getAtomicType(Context.IntTy));
	addImplicitTypedef("atomic_uint",
	Context.getAtomicType(Context.UnsignedIntTy));
	auto AtomicLongT = Context.getAtomicType(Context.LongTy);
	addImplicitTypedef("atomic_long", AtomicLongT);
	auto AtomicULongT = Context.getAtomicType(Context.UnsignedLongTy);
	addImplicitTypedef("atomic_ulong", AtomicULongT);
	addImplicitTypedef("atomic_float",
	Context.getAtomicType(Context.FloatTy));
	auto AtomicDoubleT = Context.getAtomicType(Context.DoubleTy);
	addImplicitTypedef("atomic_double", AtomicDoubleT);
	// OpenCLC v2.0, s6.13.11.6 requires that atomic_flag is implemented as
	// 32-bit integer and OpenCLC v2.0, s6.1.1 int is always 32-bit wide.
	addImplicitTypedef("atomic_flag", Context.getAtomicType(Context.IntTy));
	auto AtomicIntPtrT = Context.getAtomicType(Context.getIntPtrType());
	addImplicitTypedef("atomic_intptr_t", AtomicIntPtrT);
	auto AtomicUIntPtrT = Context.getAtomicType(Context.getUIntPtrType());
	addImplicitTypedef("atomic_uintptr_t", AtomicUIntPtrT);
	auto AtomicSizeT = Context.getAtomicType(Context.getSizeType());
	addImplicitTypedef("atomic_size_t", AtomicSizeT);
	auto AtomicPtrDiffT = Context.getAtomicType(Context.getPointerDiffType());
	addImplicitTypedef("atomic_ptrdiff_t", AtomicPtrDiffT);

	// OpenCL v2.0 s6.13.11.6:
	// - The atomic_long and atomic_ulong types are supported if the
	// cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics
	// extensions are supported.
	// - The atomic_double type is only supported if double precision
	// is supported and the cl_khr_int64_base_atomics and
	// cl_khr_int64_extended_atomics extensions are supported.
	// - If the device address space is 64-bits, the data types
	// atomic_intptr_t, atomic_uintptr_t, atomic_size_t and
	// atomic_ptrdiff_t are supported if the cl_khr_int64_base_atomics and
	// cl_khr_int64_extended_atomics extensions are supported.
	std::vector<QualType> Atomic64BitTypes;
	Atomic64BitTypes.push_back(AtomicLongT);
	Atomic64BitTypes.push_back(AtomicULongT);
	Atomic64BitTypes.push_back(AtomicDoubleT);
	// The size of atomic_size_t decides whether the pointer-sized atomics
	// are treated as 64-bit types too.
	if (Context.getTypeSize(AtomicSizeT) == 64) {
	Atomic64BitTypes.push_back(AtomicSizeT);
	Atomic64BitTypes.push_back(AtomicIntPtrT);
	Atomic64BitTypes.push_back(AtomicUIntPtrT);
	Atomic64BitTypes.push_back(AtomicPtrDiffT);
	}
	for (auto &I : Atomic64BitTypes)
	setOpenCLExtensionForType(I,
	"cl_khr_int64_base_atomics cl_khr_int64_extended_atomics");

	// atomic_double is additionally tied to the fp64 extension.
	setOpenCLExtensionForType(AtomicDoubleT, "cl_khr_fp64");
	}

	setOpenCLExtensionForType(Context.DoubleTy, "cl_khr_fp64");

	// Tie each image type listed in OpenCLImageTypes.def to its extension.
	#define GENERIC_IMAGE_TYPE_EXT(Type, Id, Ext) \
	setOpenCLExtensionForType(Context.Id, Ext);
	#include "clang/Basic/OpenCLImageTypes.def"
	};

	if (Context.getTargetInfo().hasBuiltinMSVaList()) {
	DeclarationName MSVaList = &Context.Idents.get("__builtin_ms_va_list");
	if (IdResolver.begin(MSVaList) == IdResolver.end())
	PushOnScopeChains(Context.getBuiltinMSVaListDecl(), TUScope);
	}

	DeclarationName BuiltinVaList = &Context.Idents.get("__builtin_va_list");
	if (IdResolver.begin(BuiltinVaList) == IdResolver.end())
	PushOnScopeChains(Context.getBuiltinVaListDecl(), TUScope);
	}

	// Tears Sema down: frees the visibility context and remaining function
	// scopes, detaches from the consumer and external source, releases the
	// multiplexing external source if Sema created it, and cleans up the
	// thread-safety cache and the OpenMP data-sharing stack.
	Sema::~Sema() {
	if (VisContext) FreeVisContext();
	// Kill all the active scopes.
	// Entries 1..N-1 are always deleted; entry 0 is deleted only when it is the
	// sole entry. NOTE(review): presumably slot 0 is released elsewhere when
	// more scopes remain -- confirm.
	for (unsigned I = 1, E = FunctionScopes.size(); I != E; ++I)
	delete FunctionScopes[I];
	if (FunctionScopes.size() == 1)
	delete FunctionScopes[0];

	// Tell the SemaConsumer to forget about us; we're going out of scope.
	if (SemaConsumer *SC = dyn_cast<SemaConsumer>(&Consumer))
	SC->ForgetSema();

	// Detach from the external Sema source.
	if (ExternalSemaSource *ExternalSema
	= dyn_cast_or_null<ExternalSemaSource>(Context.getExternalSource()))
	ExternalSema->ForgetSema();

	// If Sema's ExternalSource is the multiplexer - we own it.
	if (isMultiplexExternalSource)
	delete ExternalSource;

	threadSafety::threadSafetyCleanup(ThreadSafetyDeclCache);

	// Destroys data sharing attributes stack for OpenMP
	DestroyDataSharingAttributesStack();

	// Every delayed typo must have been handled by now.
	assert(DelayedTypos.empty() && "Uncorrected typos!");
	}

	/// makeUnavailableInSystemHeader - There is an error in the current
	/// context. If we're still in a system header, and we can plausibly
	/// make the relevant declaration unavailable instead of erroring, do
	/// so and return true.
	bool Sema::makeUnavailableInSystemHeader(SourceLocation loc,
	                                         UnavailableAttr::ImplicitReason reason) {
	  // It stays an error unless we are inside a function, not instantiating a
	  // template, and the location is in a system header (checked in that order).
	  FunctionDecl *fn = dyn_cast<FunctionDecl>(CurContext);
	  const bool canDowngrade =
	      fn && !inTemplateInstantiation() &&
	      Context.getSourceManager().isInSystemHeader(loc);
	  if (!canDowngrade)
	    return false;

	  // Mark the function unavailable once; an existing attribute is left alone.
	  if (!fn->hasAttr<UnavailableAttr>())
	    fn->addAttr(UnavailableAttr::CreateImplicit(Context, "", reason, loc));
	  return true;
	}

	ASTMutationListener *Sema::getASTMutationListener() const {
	  // The AST consumer provides the mutation listener.
	  ASTMutationListener *Listener = getASTConsumer().GetASTMutationListener();
	  return Listener;
	}

	///\brief Registers an external source. If an external source already exists,
	/// creates a multiplex external source and appends to it.
	///
	///\param[in] E - A non-null external sema source.
	///
	void Sema::addExternalSource(ExternalSemaSource *E) {
	  assert(E && "Cannot use with NULL ptr");

	  // First source: use it directly, no multiplexer needed.
	  if (!ExternalSource) {
	    ExternalSource = E;
	    return;
	  }

	  if (isMultiplexExternalSource) {
	    // ExternalSource is already the multiplexer Sema created; append to it.
	    // The cast target must be a pointer type for the '->' call to be valid.
	    static_cast<MultiplexExternalSemaSource*>(ExternalSource)->addSource(*E);
	  } else {
	    // Second source: wrap the existing one and the new one in a multiplexer.
	    // ~Sema deletes it when isMultiplexExternalSource is set.
	    ExternalSource = new MultiplexExternalSemaSource(*ExternalSource, *E);
	    isMultiplexExternalSource = true;
	  }
	}

	/// \brief Print out statistics about the semantic analysis.
	void Sema::PrintStats() const {
	  // Header and SFINAE counter go to the error stream in a single chain.
	  auto &OS = llvm::errs();
	  OS << "\n*** Semantic Analysis Stats:\n"
	     << NumSFINAEErrors << " SFINAE diagnostics trapped.\n";

	  // The allocator and the analysis-warnings helper print their own stats.
	  BumpAlloc.PrintStats();
	  AnalysisWarnings.PrintStats();
	}

	// Warns (warn_nullability_lost) when a value whose type is marked nullable
	// is converted to a type marked nonnull. Does nothing when either type
	// carries no nullability or a different kind.
	void Sema::diagnoseNullableToNonnullConversion(QualType DstType,
	                                               QualType SrcType,
	                                               SourceLocation Loc) {
	  // The source must be explicitly nullable.
	  Optional<NullabilityKind> ExprNullability = SrcType->getNullability(Context);
	  if (!ExprNullability || *ExprNullability != NullabilityKind::Nullable)
	    return;

	  // The destination must be explicitly nonnull.
	  Optional<NullabilityKind> TypeNullability = DstType->getNullability(Context);
	  if (!TypeNullability || *TypeNullability != NullabilityKind::NonNull)
	    return;

	  Diag(Loc, diag::warn_nullability_lost) << SrcType << DstType;
	}

	void Sema::diagnoseZeroToNullptrConversion(CastKind Kind, const Expr* E) {
	  // Only null-to-pointer and null-to-member-pointer casts are candidates.
	  const bool IsNullCast =
	      Kind == CK_NullToPointer || Kind == CK_NullToMemberPointer;
	  if (!IsNullCast)
	    return;

	  // An expression that already has nullptr type needs no suggestion, and
	  // nullptr only exists from C++11 on, so don't warn on its absence earlier.
	  if (E->getType()->isNullPtrType() || !getLangOpts().CPlusPlus11)
	    return;

	  // Warn and offer a fix-it replacing the expression with 'nullptr'.
	  Diag(E->getLocStart(), diag::warn_zero_as_null_pointer_constant)
	      << FixItHint::CreateReplacement(E->getSourceRange(), "nullptr");
	}

	/// ImpCastExprToType - If Expr is not of type 'Type', insert an implicit cast.
	/// If there is already an implicit cast, merge into the existing one.
	/// The result is of the given category.
	ExprResult Sema::ImpCastExprToType(Expr *E, QualType Ty,
	                                   CastKind Kind, ExprValueKind VK,
	                                   const CXXCastPath *BasePath,
	                                   CheckedConversionKind CCK) {
	#ifndef NDEBUG
	  // Debug-only sanity checks on the requested value category: an lvalue may
	  // only become an rvalue through one of the cast kinds listed below, and an
	  // rvalue may never become an lvalue.
	  if (VK == VK_RValue && !E->isRValue()) {
	    switch (Kind) {
	    default:
	      llvm_unreachable("can't implicitly cast lvalue to rvalue with this cast "
	                       "kind");
	    case CK_LValueToRValue:
	    case CK_ArrayToPointerDecay:
	    case CK_FunctionToPointerDecay:
	    case CK_ToVoid:
	      break;
	    }
	  }
	  assert((VK == VK_RValue || !E->isRValue()) && "can't cast rvalue to lvalue");
	#endif

	  // Conversion warnings are issued before the no-op check below, so they
	  // fire even when no cast node ends up being created.
	  diagnoseNullableToNonnullConversion(Ty, E->getType(), E->getLocStart());
	  diagnoseZeroToNullptrConversion(Kind, E);

	  QualType ExprTy = Context.getCanonicalType(E->getType());
	  QualType TypeTy = Context.getCanonicalType(Ty);

	  // Same canonical type: nothing to cast.
	  if (ExprTy == TypeTy)
	    return E;

	  // C++1z [conv.array]: The temporary materialization conversion is applied.
	  // We also use this to fuel C++ DR1213, which applies to C++11 onwards.
	  if (Kind == CK_ArrayToPointerDecay && getLangOpts().CPlusPlus &&
	      E->getValueKind() == VK_RValue) {
	    // The temporary is an lvalue in C++98 and an xvalue otherwise.
	    ExprResult Materialized = CreateMaterializeTemporaryExpr(
	        E->getType(), E, !getLangOpts().CPlusPlus11);
	    if (Materialized.isInvalid())
	      return ExprError();
	    E = Materialized.get();
	  }

	  // Merge into an existing implicit cast of the same kind instead of
	  // stacking a second node, provided no base path has to be recorded.
	  if (ImplicitCastExpr *ImpCast = dyn_cast<ImplicitCastExpr>(E)) {
	    if (ImpCast->getCastKind() == Kind && (!BasePath || BasePath->empty())) {
	      ImpCast->setType(Ty);
	      ImpCast->setValueKind(VK);
	      return E;
	    }
	  }

	  return ImplicitCastExpr::Create(Context, Ty, Kind, E, BasePath, VK);
	}

	/// ScalarTypeToBooleanCastKind - Map the scalar kind of \p ScalarTy to the
	/// cast kind that converts a value of that type to the Boolean type.
	///
	/// \returns CK_NoOp for a Boolean source, the matching *ToBoolean kind for
	/// the other scalar kinds, or CK_Invalid if no case applies.
	CastKind Sema::ScalarTypeToBooleanCastKind(QualType ScalarTy) {
	  switch (ScalarTy->getScalarTypeKind()) {
	  case Type::STK_Bool:
	    return CK_NoOp;

	  // C, block and Objective-C object pointers all share one cast kind.
	  case Type::STK_CPointer:
	  case Type::STK_BlockPointer:
	  case Type::STK_ObjCObjectPointer:
	    return CK_PointerToBoolean;

	  case Type::STK_MemberPointer:
	    return CK_MemberPointerToBoolean;
	  case Type::STK_Integral:
	    return CK_IntegralToBoolean;
	  case Type::STK_Floating:
	    return CK_FloatingToBoolean;
	  case Type::STK_IntegralComplex:
	    return CK_IntegralComplexToBoolean;
	  case Type::STK_FloatingComplex:
	    return CK_FloatingComplexToBoolean;
	  }
	  return CK_Invalid;
	}

	/// \brief Used to prune the decls of Sema's UnusedFileScopedDecls vector.
	static bool ShouldRemoveFromUnused(Sema SemaRef, const DeclaratorDecl D) {
	if (D->getMostRecentDecl()->isUsed())
	return true;

	if (D->isExternallyVisible())
	return true;

	if (const FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	// If this is a function template and none of its specializations is used,
	// we should warn.
	if (FunctionTemplateDecl *Template = FD->getDescribedFunctionTemplate())
	for (const auto *Spec : Template->specializations())
	if (ShouldRemoveFromUnused(SemaRef, Spec))
	return true;

	// UnusedFileScopedDecls stores the first declaration.
	// The declaration may have become definition so check again.
	const FunctionDecl *DeclToCheck;
	if (FD->hasBody(DeclToCheck))
	return !SemaRef->ShouldWarnIfUnusedFileScopedDecl(DeclToCheck);

	// Later redecls may add new information resulting in not having to warn,
	// so check again.
	DeclToCheck = FD->getMostRecentDecl();
	if (DeclToCheck != FD)
	return !SemaRef->ShouldWarnIfUnusedFileScopedDecl(DeclToCheck);
	}

	if (const VarDecl *VD = dyn_cast<VarDecl>(D)) {
	// If a variable usable in constant expressions is referenced,
	// don't warn if it isn't used: if the value of a variable is required
	// for the computation of a constant expression, it doesn't make sense to
	// warn even if the variable isn't odr-used. (isReferenced doesn't
	// precisely reflect that, but it's a decent approximation.)
	if (VD->isReferenced() &&
	VD->isUsableInConstantExpressions(SemaRef->Context))
	return true;

	if (VarTemplateDecl *Template = VD->getDescribedVarTemplate())
	// If this is a variable template and none of its specializations is used,
	// we should warn.
	for (const auto *Spec : Template->specializations())
	if (ShouldRemoveFromUnused(SemaRef, Spec))
	return true;

	// UnusedFileScopedDecls stores the first declaration.
	// The declaration may have become definition so check again.
	const VarDecl *DeclToCheck = VD->getDefinition();
	if (DeclToCheck)
	return !SemaRef->ShouldWarnIfUnusedFileScopedDecl(DeclToCheck);

	// Later redecls may add new information resulting in not having to warn,
	// so check again.
	DeclToCheck = VD->getMostRecentDecl();
	if (DeclToCheck != VD)
	return !SemaRef->ShouldWarnIfUnusedFileScopedDecl(DeclToCheck);
	}

	return false;
	}

	/// Obtains a sorted list of functions and variables that are undefined but
	/// ODR-used.
	///
	/// \param Undefined Receives one (declaration, location) pair for every
	/// entry of UndefinedButUsed that survives the filters in the loop below.
	void Sema::getUndefinedButUsed(
	SmallVectorImpl<std::pair<NamedDecl *, SourceLocation> > &Undefined) {
	for (const auto &UndefinedUse : UndefinedButUsed) {
	NamedDecl *ND = UndefinedUse.first;

	// Ignore declarations that have become invalid.
	if (ND->isInvalidDecl()) continue;

	// __attribute__((weakref)) is basically a definition.
	if (ND->hasAttr<WeakRefAttr>()) continue;

	+ if (isa<CXXDeductionGuideDecl>(ND))
	+ continue;
	+
	if (FunctionDecl *FD = dyn_cast<FunctionDecl>(ND)) {
	// Skip functions that have since been defined, and externally visible
	// functions whose most recent declaration is not inlined.
	if (FD->isDefined())
	continue;
	if (FD->isExternallyVisible() &&
	!FD->getMostRecentDecl()->isInlined())
	continue;
	} else {
	// Anything that is not a function is required to be a variable (cast<>
	// asserts otherwise); apply the equivalent filters to it.
	auto *VD = cast<VarDecl>(ND);
	if (VD->hasDefinition() != VarDecl::DeclarationOnly)
	continue;
	if (VD->isExternallyVisible() && !VD->getMostRecentDecl()->isInline())
	continue;
	}

	Undefined.push_back(std::make_pair(ND, UndefinedUse.second));
	}
	}

	/// checkUndefinedButUsed - Check for undefined objects with internal linkage
	/// or that are inline.
	///
	/// Diagnoses every entry returned by Sema::getUndefinedButUsed, except those
	/// carrying a dllimport or dllexport attribute, attaches a "used here" note
	/// when a use location is known, and finally clears S.UndefinedButUsed.
	///
	/// \param S The Sema instance whose UndefinedButUsed set is examined.
	static void checkUndefinedButUsed(Sema &S) {
	  if (S.UndefinedButUsed.empty()) return;

	  // Collect all the still-undefined entities with internal linkage.
	  SmallVector<std::pair<NamedDecl *, SourceLocation>, 16> Undefined;
	  S.getUndefinedButUsed(Undefined);
	  if (Undefined.empty()) return;

	  for (SmallVectorImpl<std::pair<NamedDecl *, SourceLocation> >::iterator
	         I = Undefined.begin(), E = Undefined.end(); I != E; ++I) {
	    NamedDecl *ND = I->first;

	    if (ND->hasAttr<DLLImportAttr>() || ND->hasAttr<DLLExportAttr>()) {
	      // An exported function will always be emitted when defined, so even if
	      // the function is inline, it doesn't have to be emitted in this TU. An
	      // imported function implies that it has been exported somewhere else.
	      continue;
	    }

	    if (!ND->isExternallyVisible()) {
	      S.Diag(ND->getLocation(), diag::warn_undefined_internal)
	          << isa<VarDecl>(ND) << ND;
	    } else if (auto *FD = dyn_cast<FunctionDecl>(ND)) {
	      // FD is only read by the assert; silence the unused-variable warning
	      // in builds where asserts are compiled out.
	      (void)FD;
	      assert(FD->getMostRecentDecl()->isInlined() &&
	             "used object requires definition but isn't inline or internal?");
	      // FIXME: This is ill-formed; we should reject.
	      S.Diag(ND->getLocation(), diag::warn_undefined_inline) << ND;
	    } else {
	      assert(cast<VarDecl>(ND)->getMostRecentDecl()->isInline() &&
	             "used var requires definition but isn't inline or internal?");
	      S.Diag(ND->getLocation(), diag::err_undefined_inline_var) << ND;
	    }
	    if (I->second.isValid())
	      S.Diag(I->second, diag::note_used_here);
	  }

	  S.UndefinedButUsed.clear();
	}

	void Sema::LoadExternalWeakUndeclaredIdentifiers() {
	if (!ExternalSource)
	return;

	SmallVector<std::pair<IdentifierInfo *, WeakInfo>, 4> WeakIDs;
	ExternalSource->ReadWeakUndeclaredIdentifiers(WeakIDs);
	for (auto &WeakID : WeakIDs)
	WeakUndeclaredIdentifiers.insert(WeakID);
	}


	// Cache mapping a class to a previously computed completeness verdict.
	using RecordCompleteMap = llvm::DenseMap<const CXXRecordDecl*, bool>;

	/// \brief Returns true, if all methods and nested classes of the given
	/// CXXRecordDecl are defined in this translation unit.
	///
	/// Should only be called from ActOnEndOfTranslationUnit so that all
	/// definitions are actually read.
	static bool MethodsAndNestedClassesComplete(const CXXRecordDecl *RD,
	RecordCompleteMap &MNCComplete) {
	RecordCompleteMap::iterator Cache = MNCComplete.find(RD);
	if (Cache != MNCComplete.end())
	return Cache->second;
	if (!RD->isCompleteDefinition())
	return false;
	bool Complete = true;
	for (DeclContext::decl_iterator I = RD->decls_begin(),
	E = RD->decls_end();
	I != E && Complete; ++I) {
	if (const CXXMethodDecl M = dyn_cast<CXXMethodDecl>(I))
	Complete = M->isDefined() \|\| (M->isPure() && !isa<CXXDestructorDecl>(M));
	else if (const FunctionTemplateDecl F = dyn_cast<FunctionTemplateDecl>(I))
	// If the template function is marked as late template parsed at this
	// point, it has not been instantiated and therefore we have not
	// performed semantic analysis on it yet, so we cannot know if the type
	// can be considered complete.
	Complete = !F->getTemplatedDecl()->isLateTemplateParsed() &&
	F->getTemplatedDecl()->isDefined();
	else if (const CXXRecordDecl R = dyn_cast<CXXRecordDecl>(I)) {
	if (R->isInjectedClassName())
	continue;
	if (R->hasDefinition())
	Complete = MethodsAndNestedClassesComplete(R->getDefinition(),
	MNCComplete);
	else
	Complete = false;
	}
	}
	MNCComplete[RD] = Complete;
	return Complete;
	}

	/// \brief Returns true, if the given CXXRecordDecl is fully defined in this
	/// translation unit, i.e. all methods are defined or pure virtual and all
	/// friends, friend functions and nested classes are fully defined in this
	/// translation unit.
	///
	/// Should only be called from ActOnEndOfTranslationUnit so that all
	/// definitions are actually read.
	///
	/// \param RD The class to examine.
	/// \param RecordsComplete Cache of results of this function; consulted
	/// first and updated with the verdict for \p RD.
	/// \param MNCComplete Cache passed through to
	/// MethodsAndNestedClassesComplete.
	static bool IsRecordFullyDefined(const CXXRecordDecl *RD,
	                                 RecordCompleteMap &RecordsComplete,
	                                 RecordCompleteMap &MNCComplete) {
	  RecordCompleteMap::iterator Cache = RecordsComplete.find(RD);
	  if (Cache != RecordsComplete.end())
	    return Cache->second;
	  bool Complete = MethodsAndNestedClassesComplete(RD, MNCComplete);
	  // Stop at the first friend that makes the class incomplete.
	  for (CXXRecordDecl::friend_iterator I = RD->friend_begin(),
	                                      E = RD->friend_end();
	       I != E && Complete; ++I) {
	    // Check if friend classes and methods are complete.
	    if (TypeSourceInfo *TSI = (*I)->getFriendType()) {
	      // Friend classes are available as the TypeSourceInfo of the FriendDecl.
	      if (CXXRecordDecl *FriendD = TSI->getType()->getAsCXXRecordDecl())
	        Complete = MethodsAndNestedClassesComplete(FriendD, MNCComplete);
	      else
	        Complete = false;
	    } else {
	      // Friend functions are available through the NamedDecl of FriendDecl.
	      if (const FunctionDecl *FD =
	              dyn_cast<FunctionDecl>((*I)->getFriendDecl()))
	        Complete = FD->isDefined();
	      else
	        // This is a template friend, give up.
	        Complete = false;
	    }
	  }
	  RecordsComplete[RD] = Complete;
	  return Complete;
	}

	/// Warn about every local typedef-name candidate that was never referenced,
	/// then empty the candidate list.
	///
	/// Candidates held by the external source, if any, are read into the list
	/// first.
	void Sema::emitAndClearUnusedLocalTypedefWarnings() {
	  if (ExternalSource)
	    ExternalSource->ReadUnusedLocalTypedefNameCandidates(
	        UnusedLocalTypedefNameCandidates);

	  for (const TypedefNameDecl *Candidate : UnusedLocalTypedefNameCandidates) {
	    if (!Candidate->isReferenced())
	      Diag(Candidate->getLocation(), diag::warn_unused_local_typedef)
	          << isa<TypeAliasDecl>(Candidate) << Candidate->getDeclName();
	  }

	  UnusedLocalTypedefNameCandidates.clear();
	}

	/// This is called before the very first declaration in the translation unit
	/// is parsed. Note that the ASTContext may have already injected some
	/// declarations.
	void Sema::ActOnStartOfTranslationUnit() {
	if (getLangOpts().ModulesTS) {
	// We start in the global module; all those declarations are implicitly
	// module-private (though they do not have module linkage).
	Context.getTranslationUnitDecl()->setModuleOwnershipKind(
	Decl::ModuleOwnershipKind::ModulePrivate);
	}
	}

	/// ActOnEndOfTranslationUnit - This is called at the very end of the
	/// translation unit when EOF is reached and all but the top-level scope is
	/// popped.
	void Sema::ActOnEndOfTranslationUnit() {
	assert(DelayedDiagnostics.getCurrentPool() == nullptr
	&& "reached end of translation unit with a pool attached?");

	// If code completion is enabled, don't perform any end-of-translation-unit
	// work.
	if (PP.isCodeCompletionEnabled())
	return;

	// Complete translation units and modules define vtables and perform implicit
	// instantiations. PCH files do not.
	if (TUKind != TU_Prefix) {
	DiagnoseUseOfUnimplementedSelectors();

	// If DefinedUsedVTables ends up marking any virtual member functions it
	// might lead to more pending template instantiations, which we then need
	// to instantiate.
	DefineUsedVTables();

	// C++: Perform implicit template instantiations.
	//
	// FIXME: When we perform these implicit instantiations, we do not
	// carefully keep track of the point of instantiation (C++ [temp.point]).
	// This means that name lookup that occurs within the template
	// instantiation will always happen at the end of the translation unit,
	// so it will find some names that are not required to be found. This is
	// valid, but we could do better by diagnosing if an instantiation uses a
	// name that was not visible at its first point of instantiation.
	if (ExternalSource) {
	// Load pending instantiations from the external source.
	SmallVector<PendingImplicitInstantiation, 4> Pending;
	ExternalSource->ReadPendingInstantiations(Pending);
	for (auto PII : Pending)
	if (auto Func = dyn_cast<FunctionDecl>(PII.first))
	Func->setInstantiationIsPending(true);
	PendingInstantiations.insert(PendingInstantiations.begin(),
	Pending.begin(), Pending.end());
	}
	PerformPendingInstantiations();

	if (LateTemplateParserCleanup)
	LateTemplateParserCleanup(OpaqueParser);

	CheckDelayedMemberExceptionSpecs();
	}

	DiagnoseUnterminatedPragmaAttribute();

	// All delayed member exception specs should be checked or we end up accepting
	// incompatible declarations.
	// FIXME: This is wrong for TUKind == TU_Prefix. In that case, we need to
	// write out the lists to the AST file (if any).
	assert(DelayedDefaultedMemberExceptionSpecs.empty());
	assert(DelayedExceptionSpecChecks.empty());

	// All dllexport classes should have been processed already.
	assert(DelayedDllExportClasses.empty());

	// Remove file scoped decls that turned out to be used.
	UnusedFileScopedDecls.erase(
	std::remove_if(UnusedFileScopedDecls.begin(nullptr, true),
	UnusedFileScopedDecls.end(),
	[this](const DeclaratorDecl *DD) {
	return ShouldRemoveFromUnused(this, DD);
	}),
	UnusedFileScopedDecls.end());

	if (TUKind == TU_Prefix) {
	// Translation unit prefixes don't need any of the checking below.
	if (!PP.isIncrementalProcessingEnabled())
	TUScope = nullptr;
	return;
	}

	// Check for #pragma weak identifiers that were never declared
	LoadExternalWeakUndeclaredIdentifiers();
	for (auto WeakID : WeakUndeclaredIdentifiers) {
	if (WeakID.second.getUsed())
	continue;

	Decl *PrevDecl = LookupSingleName(TUScope, WeakID.first, SourceLocation(),
	LookupOrdinaryName);
	if (PrevDecl != nullptr &&
	!(isa<FunctionDecl>(PrevDecl) \|\| isa<VarDecl>(PrevDecl)))
	Diag(WeakID.second.getLocation(), diag::warn_attribute_wrong_decl_type)
	<< "'weak'" << ExpectedVariableOrFunction;
	else
	Diag(WeakID.second.getLocation(), diag::warn_weak_identifier_undeclared)
	<< WeakID.first;
	}

	if (LangOpts.CPlusPlus11 &&
	!Diags.isIgnored(diag::warn_delegating_ctor_cycle, SourceLocation()))
	CheckDelegatingCtorCycles();

	if (!Diags.hasErrorOccurred()) {
	if (ExternalSource)
	ExternalSource->ReadUndefinedButUsed(UndefinedButUsed);
	checkUndefinedButUsed(*this);
	}

	if (TUKind == TU_Module) {
	// If we are building a module, resolve all of the exported declarations
	// now.
	if (Module *CurrentModule = PP.getCurrentModule()) {
	ModuleMap &ModMap = PP.getHeaderSearchInfo().getModuleMap();

	SmallVector<Module *, 2> Stack;
	Stack.push_back(CurrentModule);
	while (!Stack.empty()) {
	Module *Mod = Stack.pop_back_val();

	// Resolve the exported declarations and conflicts.
	// FIXME: Actually complain, once we figure out how to teach the
	// diagnostic client to deal with complaints in the module map at this
	// point.
	ModMap.resolveExports(Mod, /Complain=/false);
	ModMap.resolveUses(Mod, /Complain=/false);
	ModMap.resolveConflicts(Mod, /Complain=/false);

	// Queue the submodules, so their exports will also be resolved.
	Stack.append(Mod->submodule_begin(), Mod->submodule_end());
	}
	}

	// Warnings emitted in ActOnEndOfTranslationUnit() should be emitted for
	// modules when they are built, not every time they are used.
	emitAndClearUnusedLocalTypedefWarnings();

	// Modules don't need any of the checking below.
	if (!PP.isIncrementalProcessingEnabled())
	TUScope = nullptr;
	return;
	}

	// C99 6.9.2p2:
	// A declaration of an identifier for an object that has file
	// scope without an initializer, and without a storage-class
	// specifier or with the storage-class specifier static,
	// constitutes a tentative definition. If a translation unit
	// contains one or more tentative definitions for an identifier,
	// and the translation unit contains no external definition for
	// that identifier, then the behavior is exactly as if the
	// translation unit contains a file scope declaration of that
	// identifier, with the composite type as of the end of the
	// translation unit, with an initializer equal to 0.
	llvm::SmallSet<VarDecl *, 32> Seen;
	for (TentativeDefinitionsType::iterator
	T = TentativeDefinitions.begin(ExternalSource),
	TEnd = TentativeDefinitions.end();
	T != TEnd; ++T)
	{
	VarDecl VD = (T)->getActingDefinition();

	// If the tentative definition was completed, getActingDefinition() returns
	// null. If we've already seen this variable before, insert()'s second
	// return value is false.
	if (!VD \|\| VD->isInvalidDecl() \|\| !Seen.insert(VD).second)
	continue;

	if (const IncompleteArrayType *ArrayT
	= Context.getAsIncompleteArrayType(VD->getType())) {
	// Set the length of the array to 1 (C99 6.9.2p5).
	Diag(VD->getLocation(), diag::warn_tentative_incomplete_array);
	llvm::APInt One(Context.getTypeSize(Context.getSizeType()), true);
	QualType T = Context.getConstantArrayType(ArrayT->getElementType(),
	One, ArrayType::Normal, 0);
	VD->setType(T);
	} else if (RequireCompleteType(VD->getLocation(), VD->getType(),
	diag::err_tentative_def_incomplete_type))
	VD->setInvalidDecl();

	// No initialization is performed for a tentative definition.
	CheckCompleteVariableDeclaration(VD);

	// Notify the consumer that we've completed a tentative definition.
	if (!VD->isInvalidDecl())
	Consumer.CompleteTentativeDefinition(VD);

	}

	// If there were errors, disable 'unused' warnings since they will mostly be
	// noise.
	if (!Diags.hasErrorOccurred()) {
	// Output warning for unused file scoped decls.
	for (UnusedFileScopedDeclsType::iterator
	I = UnusedFileScopedDecls.begin(ExternalSource),
	E = UnusedFileScopedDecls.end(); I != E; ++I) {
	if (ShouldRemoveFromUnused(this, *I))
	continue;

	if (const FunctionDecl FD = dyn_cast<FunctionDecl>(I)) {
	const FunctionDecl *DiagD;
	if (!FD->hasBody(DiagD))
	DiagD = FD;
	if (DiagD->isDeleted())
	continue; // Deleted functions are supposed to be unused.
	if (DiagD->isReferenced()) {
	if (isa<CXXMethodDecl>(DiagD))
	Diag(DiagD->getLocation(), diag::warn_unneeded_member_function)
	<< DiagD->getDeclName();
	else {
	if (FD->getStorageClass() == SC_Static &&
	!FD->isInlineSpecified() &&
	!SourceMgr.isInMainFile(
	SourceMgr.getExpansionLoc(FD->getLocation())))
	Diag(DiagD->getLocation(),
	diag::warn_unneeded_static_internal_decl)
	<< DiagD->getDeclName();
	else
	Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl)
	<< /function/0 << DiagD->getDeclName();
	}
	} else {
	if (FD->getDescribedFunctionTemplate())
	Diag(DiagD->getLocation(), diag::warn_unused_template)
	<< /function/0 << DiagD->getDeclName();
	else
	Diag(DiagD->getLocation(),
	isa<CXXMethodDecl>(DiagD) ? diag::warn_unused_member_function
	: diag::warn_unused_function)
	<< DiagD->getDeclName();
	}
	} else {
	const VarDecl DiagD = cast<VarDecl>(I)->getDefinition();
	if (!DiagD)
	DiagD = cast<VarDecl>(*I);
	if (DiagD->isReferenced()) {
	Diag(DiagD->getLocation(), diag::warn_unneeded_internal_decl)
	<< /variable/1 << DiagD->getDeclName();
	} else if (DiagD->getType().isConstQualified()) {
	const SourceManager &SM = SourceMgr;
	if (SM.getMainFileID() != SM.getFileID(DiagD->getLocation()) \|\|
	!PP.getLangOpts().IsHeaderFile)
	Diag(DiagD->getLocation(), diag::warn_unused_const_variable)
	<< DiagD->getDeclName();
	} else {
	if (DiagD->getDescribedVarTemplate())
	Diag(DiagD->getLocation(), diag::warn_unused_template)
	<< /variable/1 << DiagD->getDeclName();
	else
	Diag(DiagD->getLocation(), diag::warn_unused_variable)
	<< DiagD->getDeclName();
	}
	}
	}

	emitAndClearUnusedLocalTypedefWarnings();
	}

	if (!Diags.isIgnored(diag::warn_unused_private_field, SourceLocation())) {
	RecordCompleteMap RecordsComplete;
	RecordCompleteMap MNCComplete;
	for (NamedDeclSetType::iterator I = UnusedPrivateFields.begin(),
	E = UnusedPrivateFields.end(); I != E; ++I) {
	const NamedDecl D = I;
	const CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(D->getDeclContext());
	if (RD && !RD->isUnion() &&
	IsRecordFullyDefined(RD, RecordsComplete, MNCComplete)) {
	Diag(D->getLocation(), diag::warn_unused_private_field)
	<< D->getDeclName();
	}
	}
	}

	if (!Diags.isIgnored(diag::warn_mismatched_delete_new, SourceLocation())) {
	if (ExternalSource)
	ExternalSource->ReadMismatchingDeleteExpressions(DeleteExprs);
	for (const auto &DeletedFieldInfo : DeleteExprs) {
	for (const auto &DeleteExprLoc : DeletedFieldInfo.second) {
	AnalyzeDeleteExprMismatch(DeletedFieldInfo.first, DeleteExprLoc.first,
	DeleteExprLoc.second);
	}
	}
	}

	// Check we've noticed that we're no longer parsing the initializer for every
	// variable. If we miss cases, then at best we have a performance issue and
	// at worst a rejects-valid bug.
	assert(ParsingInitForAutoVars.empty() &&
	"Didn't unmark var as having its initializer parsed");

	if (!PP.isIncrementalProcessingEnabled())
	TUScope = nullptr;
	}


	//===----------------------------------------------------------------------===//
	// Helper functions.
	//===----------------------------------------------------------------------===//

	/// Walk outward from CurContext to the enclosing function-level context.
	///
	/// Block, enum and captured-statement contexts are stepped over, and a
	/// lambda's call operator is skipped together with its closure class.
	///
	/// \returns The first context reached that is none of the above.
	DeclContext *Sema::getFunctionLevelDeclContext() {
	  DeclContext *DC = CurContext;

	  while (true) {
	    if (isa<BlockDecl>(DC) || isa<EnumDecl>(DC) || isa<CapturedDecl>(DC)) {
	      DC = DC->getParent();
	    } else if (isa<CXXMethodDecl>(DC) &&
	               cast<CXXMethodDecl>(DC)->getOverloadedOperator() == OO_Call &&
	               cast<CXXRecordDecl>(DC->getParent())->isLambda()) {
	      // operator() of a lambda: hop over both it and the closure type.
	      DC = DC->getParent()->getParent();
	    }
	    else break;
	  }

	  return DC;
	}

	/// getCurFunctionDecl - Return the function-level declaration context as a
	/// FunctionDecl.
	///
	/// \returns The result of getFunctionLevelDeclContext() if it is a
	/// FunctionDecl, null otherwise.
	FunctionDecl *Sema::getCurFunctionDecl() {
	  return dyn_cast<FunctionDecl>(getFunctionLevelDeclContext());
	}

	/// Return the Objective-C method found at function level, looking outward
	/// through any enclosing record contexts first.
	///
	/// \returns The enclosing ObjCMethodDecl, or null if the context reached is
	/// not an Objective-C method.
	ObjCMethodDecl *Sema::getCurMethodDecl() {
	  DeclContext *Ctx = getFunctionLevelDeclContext();
	  for (; isa<RecordDecl>(Ctx); Ctx = Ctx->getParent()) {
	    // Step out of each enclosing record.
	  }
	  return dyn_cast<ObjCMethodDecl>(Ctx);
	}

	/// Return the function-level declaration context as a NamedDecl.
	///
	/// \returns The result of getFunctionLevelDeclContext() when it is either
	/// an Objective-C method or a function, null otherwise.
	NamedDecl *Sema::getCurFunctionOrMethodDecl() {
	  DeclContext *DC = getFunctionLevelDeclContext();
	  if (isa<ObjCMethodDecl>(DC) || isa<FunctionDecl>(DC))
	    return cast<NamedDecl>(DC);
	  return nullptr;
	}

	/// Emit the diagnostic currently held by the DiagnosticsEngine, applying
	/// SFINAE handling first.
	///
	/// In a SFINAE context the diagnostic may instead be recorded on the
	/// template-deduction info and suppressed, depending on its SFINAE response
	/// class. If it is actually emitted and is not a builtin note, the context
	/// stack is printed afterwards.
	///
	/// \param DiagID The ID of the diagnostic; only used here to decide whether
	/// to print the context stack.
	void Sema::EmitCurrentDiagnostic(unsigned DiagID) {
	  // FIXME: It doesn't make sense to me that DiagID is an incoming argument here
	  // and yet we also use the current diag ID on the DiagnosticsEngine. This has
	  // been made more painfully obvious by the refactor that introduced this
	  // function, but it is possible that the incoming argument can be
	  // eliminated. If it truly cannot be (for example, there is some reentrancy
	  // issue I am not seeing yet), then there should at least be a clarifying
	  // comment somewhere.
	  if (Optional<TemplateDeductionInfo*> Info = isSFINAEContext()) {
	    switch (DiagnosticIDs::getDiagnosticSFINAEResponse(
	              Diags.getCurrentDiagID())) {
	    case DiagnosticIDs::SFINAE_Report:
	      // We'll report the diagnostic below.
	      break;

	    case DiagnosticIDs::SFINAE_SubstitutionFailure:
	      // Count this failure so that we know that template argument deduction
	      // has failed.
	      ++NumSFINAEErrors;

	      // Make a copy of this suppressed diagnostic and store it with the
	      // template-deduction information.
	      if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
	        Diagnostic DiagInfo(&Diags);
	        (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
	                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
	      }

	      Diags.setLastDiagnosticIgnored();
	      Diags.Clear();
	      return;

	    case DiagnosticIDs::SFINAE_AccessControl: {
	      // Per C++ Core Issue 1170, access control is part of SFINAE.
	      // Additionally, the AccessCheckingSFINAE flag can be used to temporarily
	      // make access control a part of SFINAE for the purposes of checking
	      // type traits.
	      if (!AccessCheckingSFINAE && !getLangOpts().CPlusPlus11)
	        break;

	      // Capture the location before the diagnostic state is cleared below.
	      SourceLocation Loc = Diags.getCurrentDiagLoc();

	      // Suppress this diagnostic.
	      ++NumSFINAEErrors;

	      // Make a copy of this suppressed diagnostic and store it with the
	      // template-deduction information.
	      if (*Info && !(*Info)->hasSFINAEDiagnostic()) {
	        Diagnostic DiagInfo(&Diags);
	        (*Info)->addSFINAEDiagnostic(DiagInfo.getLocation(),
	                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
	      }

	      Diags.setLastDiagnosticIgnored();
	      Diags.Clear();

	      // Now the diagnostic state is clear, produce a C++98 compatibility
	      // warning.
	      Diag(Loc, diag::warn_cxx98_compat_sfinae_access_control);

	      // The last diagnostic which Sema produced was ignored. Suppress any
	      // notes attached to it.
	      Diags.setLastDiagnosticIgnored();
	      return;
	    }

	    case DiagnosticIDs::SFINAE_Suppress:
	      // Make a copy of this suppressed diagnostic and store it with the
	      // template-deduction information;
	      if (*Info) {
	        Diagnostic DiagInfo(&Diags);
	        (*Info)->addSuppressedDiagnostic(DiagInfo.getLocation(),
	                       PartialDiagnostic(DiagInfo, Context.getDiagAllocator()));
	      }

	      // Suppress this diagnostic.
	      Diags.setLastDiagnosticIgnored();
	      Diags.Clear();
	      return;
	    }
	  }

	  // Set up the context's printing policy based on our current state.
	  Context.setPrintingPolicy(getPrintingPolicy());

	  // Emit the diagnostic.
	  if (!Diags.EmitCurrentDiagnostic())
	    return;

	  // If this is not a note, and we're in a template instantiation
	  // that is different from the last template instantiation where
	  // we emitted an error, print a template instantiation
	  // backtrace.
	  if (!DiagnosticIDs::isBuiltinNote(DiagID))
	    PrintContextStack();
	}

	/// Start a diagnostic at \p Loc using the ID stored in \p PD, and let \p PD
	/// emit its contents into the resulting builder.
	///
	/// \returns The builder, so callers can stream further arguments into it.
	Sema::SemaDiagnosticBuilder
	Sema::Diag(SourceLocation Loc, const PartialDiagnostic& PD) {
	  const unsigned ID = PD.getDiagID();
	  SemaDiagnosticBuilder DB(Diag(Loc, ID));
	  PD.Emit(DB);

	  return DB;
	}

	/// \brief Check whether the macro expansion containing the given location
	/// is spelled with the given name.
	///
	/// \param locref On input, the location to examine. On success it is
	/// overwritten with the expansion location; otherwise it is left untouched.
	/// \param name The macro name to look for.
	/// \returns true if \p locref is a macro location whose expansion location
	/// is spelled \p name, false otherwise.
	bool Sema::findMacroSpelling(SourceLocation &locref, StringRef name) {
	  if (!locref.isMacroID())
	    return false;

	  // There's no good way right now to look at the intermediate
	  // expansions, so just jump to the expansion location.
	  SourceLocation expansionLoc = getSourceManager().getExpansionLoc(locref);

	  // Only a spelling that matches the requested name counts as a hit.
	  SmallVector<char, 16> spellingBuf;
	  if (getPreprocessor().getSpelling(expansionLoc, spellingBuf) == name) {
	    locref = expansionLoc;
	    return true;
	  }

	  return false;
	}

	/// \brief Determines the active Scope associated with the given declaration
	/// context.
	///
	/// This routine maps a declaration context to the active Scope object that
	/// represents that declaration context in the parser. It is typically used
	/// from "scope-less" code (e.g., template instantiation, lazy creation of
	/// declarations) that injects a name for name-lookup purposes and, therefore,
	/// must update the Scope.
	///
	/// \param Ctx The declaration context to look up; may be null.
	///
	/// \returns The scope corresponding to the given declaration context, or NULL
	/// if no such scope is open.
	Scope *Sema::getScopeForContext(DeclContext *Ctx) {

	  if (!Ctx)
	    return nullptr;

	  // Compare primary contexts so that any redeclaration of the context matches.
	  Ctx = Ctx->getPrimaryContext();
	  for (Scope *S = getCurScope(); S; S = S->getParent()) {
	    // Ignore scopes that cannot have declarations. This is important for
	    // out-of-line definitions of static class members.
	    if (S->getFlags() & (Scope::DeclScope | Scope::TemplateParamScope))
	      if (DeclContext *Entity = S->getEntity())
	        if (Ctx == Entity->getPrimaryContext())
	          return S;
	  }

	  return nullptr;
	}

	/// \brief Enter a new function scope.
	///
	/// When only one scope is on the stack, that scope is cleared and pushed
	/// again instead of allocating a new one. With OpenMP enabled, an OpenMP
	/// function region is pushed as well.
	void Sema::PushFunctionScope() {
	  if (FunctionScopes.size() == 1) {
	    // Use the "top" function scope rather than having to allocate
	    // memory for a new scope.
	    FunctionScopes.back()->Clear();
	    FunctionScopes.push_back(FunctionScopes.back());
	  } else {
	    FunctionScopes.push_back(new FunctionScopeInfo(getDiagnostics()));
	  }

	  if (LangOpts.OpenMP)
	    pushOpenMPFunctionRegion();
	}

	void Sema::PushBlockScope(Scope BlockScope, BlockDecl Block) {
	FunctionScopes.push_back(new BlockScopeInfo(getDiagnostics(),
	BlockScope, Block));
	}

	/// Push a newly allocated LambdaScopeInfo onto the function scope stack.
	///
	/// \returns The scope that was pushed.
	LambdaScopeInfo *Sema::PushLambdaScope() {
	  LambdaScopeInfo *const NewScope = new LambdaScopeInfo(getDiagnostics());
	  FunctionScopes.push_back(NewScope);
	  return NewScope;
	}

	void Sema::RecordParsingTemplateParameterDepth(unsigned Depth) {
	if (LambdaScopeInfo *const LSI = getCurLambda()) {
	LSI->AutoTemplateParameterDepth = Depth;
	return;
	}
	llvm_unreachable(
	"Remove assertion if intentionally called in a non-lambda context.");
	}

	/// Pop the innermost function scope and flush its pending diagnostics.
	///
	/// \param WP Policy for analysis-based warnings; may be null.
	/// \param D The declaration whose body was just finished; may be null.
	/// \param blkExpr The block expression, forwarded to IssueWarnings.
	///
	/// When both \p WP and \p D are given, analysis-based warnings are issued;
	/// otherwise the scope's possibly-unreachable diagnostics are emitted
	/// directly.
	void Sema::PopFunctionScopeInfo(const AnalysisBasedWarnings::Policy *WP,
	                                const Decl *D, const BlockExpr *blkExpr) {
	  FunctionScopeInfo *Scope = FunctionScopes.pop_back_val();
	  assert(!FunctionScopes.empty() && "mismatched push/pop!");

	  if (LangOpts.OpenMP)
	    popOpenMPFunctionRegion(Scope);

	  // Issue any analysis-based warnings.
	  if (WP && D)
	    AnalysisWarnings.IssueWarnings(*WP, Scope, D, blkExpr);
	  else
	    for (const auto &PUD : Scope->PossiblyUnreachableDiags)
	      Diag(PUD.Loc, PUD.PD);

	  // PushFunctionScope may push the bottom scope a second time; only delete
	  // the popped scope if it is not still on the stack.
	  if (FunctionScopes.back() != Scope)
	    delete Scope;
	}

	void Sema::PushCompoundScope() {
	getCurFunction()->CompoundScopes.push_back(CompoundScopeInfo());
	}

	/// Close the innermost compound-statement scope of the current function
	/// scope. Asserts that one is open.
	void Sema::PopCompoundScope() {
	  auto &OpenScopes = getCurFunction()->CompoundScopes;
	  assert(!OpenScopes.empty() && "mismatched push/pop");

	  OpenScopes.pop_back();
	}

	/// \brief Determine whether any errors occurred within this function/method/
	/// block.
	bool Sema::hasAnyUnrecoverableErrorsInThisFunction() const {
	return getCurFunction()->ErrorTrap.hasUnrecoverableErrorOccurred();
	}

	/// Return the innermost function scope if it is a block scope.
	///
	/// \returns Null when the scope stack is empty, when the innermost scope is
	/// not a block scope, or when the block's declaration does not enclose
	/// CurContext; otherwise the block scope.
	BlockScopeInfo *Sema::getCurBlock() {
	  if (FunctionScopes.empty())
	    return nullptr;

	  auto CurBSI = dyn_cast<BlockScopeInfo>(FunctionScopes.back());
	  const bool ContextSwitched = CurBSI && CurBSI->TheDecl &&
	                               !CurBSI->TheDecl->Encloses(CurContext);
	  if (!ContextSwitched)
	    return CurBSI;

	  // We have switched contexts due to template instantiation.
	  assert(!CodeSynthesisContexts.empty());
	  return nullptr;
	}

	/// Return the current lambda scope, if any.
	///
	/// \param IgnoreNonLambdaCapturingScope When true, capturing scopes that are
	/// not lambda scopes are skipped, innermost first, before the lookup.
	/// \returns Null when the scope stack is empty, when skipping exhausts the
	/// stack, when the scope examined is not a lambda scope, or when the
	/// lambda's class does not enclose CurContext; otherwise the lambda scope.
	LambdaScopeInfo *Sema::getCurLambda(bool IgnoreNonLambdaCapturingScope) {
	  if (FunctionScopes.empty())
	    return nullptr;

	  // Walk from the innermost scope outward.
	  auto I = FunctionScopes.rbegin();
	  if (IgnoreNonLambdaCapturingScope) {
	    auto E = FunctionScopes.rend();
	    while (I != E && isa<CapturingScopeInfo>(*I) && !isa<LambdaScopeInfo>(*I))
	      ++I;
	    if (I == E)
	      return nullptr;
	  }
	  auto *CurLSI = dyn_cast<LambdaScopeInfo>(*I);
	  if (CurLSI && CurLSI->Lambda &&
	      !CurLSI->Lambda->Encloses(CurContext)) {
	    // We have switched contexts due to template instantiation.
	    assert(!CodeSynthesisContexts.empty());
	    return nullptr;
	  }

	  return CurLSI;
	}
	// We have a generic lambda if we parsed auto parameters, or we have
	// an associated template parameter list.
	//
	// Returns the current lambda scope when it is generic in that sense, and
	// null when there is no current lambda or it is not generic.
	LambdaScopeInfo *Sema::getCurGenericLambda() {
	  if (LambdaScopeInfo *LSI =  getCurLambda()) {
	    return (LSI->AutoTemplateParams.size() ||
	                    LSI->GLTemplateParameterList) ? LSI : nullptr;
	  }
	  return nullptr;
	}


	/// Record a comment with the ASTContext.
	///
	/// Comments in system headers are dropped unless
	/// RetainCommentsFromSystemHeaders is set. A comment that is almost a
	/// trailing Doxygen comment also gets a warning with a fix-it that replaces
	/// its opening marker.
	///
	/// \param Comment The source range of the comment.
	void Sema::ActOnComment(SourceRange Comment) {
	  const SourceLocation Begin = Comment.getBegin();
	  if (!LangOpts.RetainCommentsFromSystemHeaders &&
	      SourceMgr.isInSystemHeader(Begin))
	    return;

	  RawComment RC(SourceMgr, Comment, false,
	                LangOpts.CommentOpts.ParseAllComments);

	  if (RC.isAlmostTrailingComment()) {
	    // Choose the marker matching the comment's flavour.
	    StringRef Replacement;
	    switch (RC.getKind()) {
	    case RawComment::RCK_OrdinaryBCPL:
	      Replacement = "///<";
	      break;
	    case RawComment::RCK_OrdinaryC:
	      Replacement = "/**<";
	      break;
	    default:
	      llvm_unreachable("if this is an almost Doxygen comment, "
	                       "it should be ordinary");
	    }

	    // The fix-it replaces the range from the comment's start to offset 3.
	    SourceRange MarkerRange(Begin, Begin.getLocWithOffset(3));
	    Diag(Begin, diag::warn_not_a_doxygen_trailing_member_comment)
	        << FixItHint::CreateReplacement(MarkerRange, Replacement);
	  }

	  Context.addComment(RC);
	}

	// Pin this vtable to this file.
	ExternalSemaSource::~ExternalSemaSource() {}

	// The ExternalSemaSource member functions defined below all have empty
	// bodies.

	void ExternalSemaSource::ReadMethodPool(Selector Sel) {}

	void ExternalSemaSource::updateOutOfDateSelector(Selector Sel) {}

	void ExternalSemaSource::ReadKnownNamespaces(
	    SmallVectorImpl<NamespaceDecl *> &Namespaces) {}

	void ExternalSemaSource::ReadUndefinedButUsed(
	    llvm::MapVector<NamedDecl *, SourceLocation> &Undefined) {}

	void ExternalSemaSource::ReadMismatchingDeleteExpressions(
	    llvm::MapVector<FieldDecl *,
	                    llvm::SmallVector<std::pair<SourceLocation, bool>, 4>> &) {
	}

	/// \brief Print one line describing this stack-trace entry.
	///
	/// The line consists of an optional "<location>: " prefix, the message, an
	/// optional quoted declaration name, and a trailing newline. When the stored
	/// location is invalid, the location of the declaration (if any) is used.
	void PrettyDeclStackTraceEntry::print(raw_ostream &OS) const {
	  SourceLocation Where = this->Loc;
	  if (Where.isInvalid() && TheDecl)
	    Where = TheDecl->getLocation();

	  if (Where.isValid()) {
	    Where.print(OS, S.getSourceManager());
	    OS << ": ";
	  }

	  OS << Message;

	  // Only named declarations contribute a quoted name.
	  auto *Named = dyn_cast_or_null<NamedDecl>(TheDecl);
	  if (Named) {
	    OS << " '";
	    Named->getNameForDiagnostic(
	        OS, Named->getASTContext().getPrintingPolicy(), true);
	    OS << "'";
	  }

	  OS << '\n';
	}

	/// \brief Figure out if an expression could be turned into a call.
	///
	/// Use this when trying to recover from an error where the programmer may have
	/// written just the name of a function instead of actually calling it.
	///
	/// \param E - The expression to examine.
	/// \param ZeroArgCallReturnTy - If the expression can be turned into a call
	/// with no arguments, this parameter is set to the type returned by such a
	/// call; otherwise, it is set to an empty QualType.
	/// \param OverloadSet - If the expression is an overloaded function
	/// name, this parameter is populated with the decls of the various overloads.
	///
	/// \returns true if the expression looks callable. Note that on some paths
	/// (a direct function reference or a function prototype type) true is
	/// returned even when ZeroArgCallReturnTy is left empty because arguments
	/// are required.
	bool Sema::tryExprAsCall(Expr &E, QualType &ZeroArgCallReturnTy,
	UnresolvedSetImpl &OverloadSet) {
	// Reset both out-parameters before any early return.
	ZeroArgCallReturnTy = QualType();
	OverloadSet.clear();

	const OverloadExpr *Overloads = nullptr;
	bool IsMemExpr = false;
	if (E.getType() == Context.OverloadTy) {
	OverloadExpr::FindResult FR = OverloadExpr::find(const_cast<Expr*>(&E));

	// Ignore overloads that are pointer-to-member constants.
	if (FR.HasFormOfMemberPointer)
	return false;

	Overloads = FR.Expression;
	} else if (E.getType() == Context.BoundMemberTy) {
	// May leave Overloads null if this is not an UnresolvedMemberExpr.
	Overloads = dyn_cast<UnresolvedMemberExpr>(E.IgnoreParens());
	IsMemExpr = true;
	}

	// Set when a second candidate callable with zero arguments is found.
	bool Ambiguous = false;

	if (Overloads) {
	for (OverloadExpr::decls_iterator it = Overloads->decls_begin(),
	DeclsEnd = Overloads->decls_end(); it != DeclsEnd; ++it) {
	OverloadSet.addDecl(*it);

	// Check whether the function is a non-template, non-member which takes no
	// arguments.
	if (IsMemExpr)
	continue;
	if (const FunctionDecl *OverloadDecl
	= dyn_cast<FunctionDecl>((*it)->getUnderlyingDecl())) {
	if (OverloadDecl->getMinRequiredArguments() == 0) {
	// The first zero-argument candidate records its return type; the second
	// one clears it again and marks the result as ambiguous.
	if (!ZeroArgCallReturnTy.isNull() && !Ambiguous) {
	ZeroArgCallReturnTy = QualType();
	Ambiguous = true;
	} else
	ZeroArgCallReturnTy = OverloadDecl->getReturnType();
	}
	}
	}

	// If it's not a member, use better machinery to try to resolve the call
	if (!IsMemExpr)
	return !ZeroArgCallReturnTy.isNull();
	}

	// Attempt to call the member with no arguments - this will correctly handle
	// member templates with defaults/deduction of template arguments, overloads
	// with default arguments, etc.
	if (IsMemExpr && !E.isTypeDependent()) {
	// Build the call with all diagnostics suppressed, then restore the previous
	// suppression state.
	bool Suppress = getDiagnostics().getSuppressAllDiagnostics();
	getDiagnostics().setSuppressAllDiagnostics(true);
	ExprResult R = BuildCallToMemberFunction(nullptr, &E, SourceLocation(),
	None, SourceLocation());
	getDiagnostics().setSuppressAllDiagnostics(Suppress);
	if (R.isUsable()) {
	ZeroArgCallReturnTy = R.get()->getType();
	return true;
	}
	return false;
	}

	// A direct reference to a function: always reported as callable; the return
	// type is only provided when no arguments are required.
	if (const DeclRefExpr *DeclRef = dyn_cast<DeclRefExpr>(E.IgnoreParens())) {
	if (const FunctionDecl *Fun = dyn_cast<FunctionDecl>(DeclRef->getDecl())) {
	if (Fun->getMinRequiredArguments() == 0)
	ZeroArgCallReturnTy = Fun->getReturnType();
	return true;
	}
	}

	// We don't have an expression that's convenient to get a FunctionDecl from,
	// but we can at least check if the type is "function of 0 arguments".
	QualType ExprTy = E.getType();
	const FunctionType *FunTy = nullptr;
	// Prefer the pointee type (when there is one) over the expression type.
	QualType PointeeTy = ExprTy->getPointeeType();
	if (!PointeeTy.isNull())
	FunTy = PointeeTy->getAs<FunctionType>();
	if (!FunTy)
	FunTy = ExprTy->getAs<FunctionType>();

	if (const FunctionProtoType *FPT =
	dyn_cast_or_null<FunctionProtoType>(FunTy)) {
	if (FPT->getNumParams() == 0)
	ZeroArgCallReturnTy = FunTy->getReturnType();
	return true;
	}
	return false;
	}

	/// \brief Give notes for a set of overloads.
	///
	/// A companion to tryExprAsCall. In cases when the name that the programmer
	/// wrote was an overloaded function, we may be able to make some guesses about
	/// plausible overloads based on their return types; such guesses can be handed
	/// off to this method to be emitted as notes.
	///
	/// \param S - The Sema instance used to emit the notes.
	/// \param Overloads - The overloads to note.
	/// \param FinalNoteLoc - If we've suppressed printing some overloads due to
	/// -fshow-overloads=best, this is the location to attach to the note about too
	/// many candidates. Typically this will be the location of the original
	/// ill-formed expression.
	static void noteOverloads(Sema &S, const UnresolvedSetImpl &Overloads,
	                          const SourceLocation FinalNoteLoc) {
	  int ShownOverloads = 0;
	  int SuppressedOverloads = 0;
	  for (UnresolvedSetImpl::iterator It = Overloads.begin(),
	       DeclsEnd = Overloads.end(); It != DeclsEnd; ++It) {
	    // FIXME: Magic number for max shown overloads stolen from
	    // OverloadCandidateSet::NoteCandidates.
	    if (ShownOverloads >= 4 && S.Diags.getShowOverloads() == Ovl_Best) {
	      ++SuppressedOverloads;
	      continue;
	    }

	    // Note the underlying declaration of the candidate the iterator refers to.
	    NamedDecl *Fn = (*It)->getUnderlyingDecl();
	    S.Diag(Fn->getLocation(), diag::note_possible_target_of_call);
	    ++ShownOverloads;
	  }

	  // Tell the user how many candidates were left out.
	  if (SuppressedOverloads)
	    S.Diag(FinalNoteLoc, diag::note_ovl_too_many_candidates)
	      << SuppressedOverloads;
	}

	/// \brief Note the overloads whose return type passes a plausibility check.
	///
	/// \param S - The Sema instance used to emit the notes.
	/// \param Loc - Location passed on to noteOverloads as its final-note
	/// location.
	/// \param Overloads - The candidate overloads.
	/// \param IsPlausibleResult - Optional predicate on the return type. When
	/// null, every overload is noted without filtering.
	static void notePlausibleOverloads(Sema &S, SourceLocation Loc,
	                                   const UnresolvedSetImpl &Overloads,
	                                   bool (*IsPlausibleResult)(QualType)) {
	  if (!IsPlausibleResult)
	    return noteOverloads(S, Overloads, Loc);

	  UnresolvedSet<2> PlausibleOverloads;
	  for (OverloadExpr::decls_iterator It = Overloads.begin(),
	       DeclsEnd = Overloads.end(); It != DeclsEnd; ++It) {
	    // Dereference the iterator to get the declaration before casting it.
	    const FunctionDecl *OverloadDecl = cast<FunctionDecl>(*It);
	    QualType OverloadResultTy = OverloadDecl->getReturnType();
	    if (IsPlausibleResult(OverloadResultTy))
	      PlausibleOverloads.addDecl(It.getDecl());
	  }
	  noteOverloads(S, PlausibleOverloads, Loc);
	}

	/// Decide whether appending "()" directly after the given expression would
	/// form a call of that expression. Casts and operator expressions are
	/// rejected because the operator would start parsing outside the call.
	static bool IsCallableWithAppend(Expr *E) {
	  E = E->IgnoreImplicit();

	  if (isa<CStyleCastExpr>(E))
	    return false;
	  if (isa<UnaryOperator>(E))
	    return false;
	  if (isa<BinaryOperator>(E))
	    return false;
	  if (isa<CXXOperatorCallExpr>(E))
	    return false;

	  return true;
	}

	/// \brief Try to recover by pretending that an expression was called with
	/// no arguments.
	///
	/// \param E - The expression to examine. On successful recovery it is
	/// replaced by the result of building the call; when only a complaint is
	/// emitted it is replaced by ExprError().
	/// \param PD - The diagnostic to emit. It is given a select value of 1 on
	/// the zero-argument recovery path and 0 otherwise, followed by the source
	/// range of the expression.
	/// \param ForceComplain - If true, emit the diagnostic even when recovery is
	/// not possible.
	/// \param IsPlausibleResult - Optional predicate used to reject call results
	/// of an unsuitable type; may be null.
	///
	/// \returns true if a diagnostic was emitted (and E was updated), false if
	/// nothing was done.
	bool Sema::tryToRecoverWithCall(ExprResult &E, const PartialDiagnostic &PD,
	                                bool ForceComplain,
	                                bool (*IsPlausibleResult)(QualType)) {
	  SourceLocation Loc = E.get()->getExprLoc();
	  SourceRange Range = E.get()->getSourceRange();

	  QualType ZeroArgCallTy;
	  UnresolvedSet<4> Overloads;
	  if (tryExprAsCall(*E.get(), ZeroArgCallTy, Overloads) &&
	      !ZeroArgCallTy.isNull() &&
	      (!IsPlausibleResult || IsPlausibleResult(ZeroArgCallTy))) {
	    // At this point, we know E is potentially callable with 0
	    // arguments and that it returns something of a reasonable type,
	    // so we can emit a fixit and carry on pretending that E was
	    // actually a CallExpr.
	    SourceLocation ParenInsertionLoc = getLocForEndOfToken(Range.getEnd());
	    Diag(Loc, PD)
	      << /*zero-arg*/ 1 << Range
	      << (IsCallableWithAppend(E.get())
	          ? FixItHint::CreateInsertion(ParenInsertionLoc, "()")
	          : FixItHint());
	    notePlausibleOverloads(*this, Loc, Overloads, IsPlausibleResult);

	    // FIXME: Try this before emitting the fixit, and suppress diagnostics
	    // while doing so.
	    E = ActOnCallExpr(nullptr, E.get(), Range.getEnd(), None,
	                      Range.getEnd().getLocWithOffset(1));
	    return true;
	  }

	  if (!ForceComplain) return false;

	  Diag(Loc, PD) << /*not zero-arg*/ 0 << Range;
	  notePlausibleOverloads(*this, Loc, Overloads, IsPlausibleResult);
	  E = ExprError();
	  return true;
	}

	/// Return the identifier "super", looking it up in the identifier table on
	/// first use and reusing the cached pointer afterwards.
	IdentifierInfo *Sema::getSuperIdentifier() const {
	  if (Ident_super)
	    return Ident_super;

	  Ident_super = &Context.Idents.get("super");
	  return Ident_super;
	}

	/// Return the identifier "__float128", looking it up in the identifier table
	/// on first use and reusing the cached pointer afterwards.
	IdentifierInfo *Sema::getFloat128Identifier() const {
	  if (Ident___float128)
	    return Ident___float128;

	  Ident___float128 = &Context.Idents.get("__float128");
	  return Ident___float128;
	}

	void Sema::PushCapturedRegionScope(Scope S, CapturedDecl CD, RecordDecl *RD,
	CapturedRegionKind K) {
	CapturingScopeInfo *CSI = new CapturedRegionScopeInfo(
	getDiagnostics(), S, CD, RD, CD->getContextParam(), K,
	(getLangOpts().OpenMP && K == CR_OpenMP) ? getOpenMPNestingLevel() : 0);
	CSI->ReturnType = Context.VoidTy;
	FunctionScopes.push_back(CSI);
	}

	/// Return the innermost function scope if it is a captured-region scope;
	/// null when the scope stack is empty or the innermost scope is of another
	/// kind.
	CapturedRegionScopeInfo *Sema::getCurCapturedRegion() {
	  if (!FunctionScopes.empty()) {
	    auto *Innermost = FunctionScopes.back();
	    return dyn_cast<CapturedRegionScopeInfo>(Innermost);
	  }

	  return nullptr;
	}

	/// Read-only accessor for the DeleteExprs map (FieldDecl -> DeleteLocs).
	/// Returns a reference to the member itself; no copy is made.
	const llvm::MapVector<FieldDecl *, Sema::DeleteLocs> &
	Sema::getMismatchingDeleteExpressions() const {
	return DeleteExprs;
	}

	/// \brief Associate a type with a list of OpenCL extensions.
	///
	/// \param T - The type; the association is stored for its canonical type.
	/// \param ExtStr - Space-separated extension names. An empty string is a
	/// no-op; empty pieces between separators are dropped.
	void Sema::setOpenCLExtensionForType(QualType T, llvm::StringRef ExtStr) {
	  if (ExtStr.empty())
	    return;
	  llvm::SmallVector<StringRef, 1> Exts;
	  // Split on every space (no limit) and discard empty pieces.
	  ExtStr.split(Exts, " ", /* limit */ -1, /* keep empty */ false);
	  auto CanT = T.getCanonicalType().getTypePtr();
	  for (auto &I : Exts)
	    OpenCLTypeExtMap[CanT].insert(I.str());
	}

	/// \brief Associate a declaration with a list of OpenCL extensions.
	///
	/// \param FD - The declaration used as the map key.
	/// \param ExtStr - Space-separated extension names. If no non-empty name is
	/// present, nothing is recorded.
	void Sema::setOpenCLExtensionForDecl(Decl *FD, StringRef ExtStr) {
	  llvm::SmallVector<StringRef, 1> Exts;
	  // Split on every space (no limit) and discard empty pieces.
	  ExtStr.split(Exts, " ", /* limit */ -1, /* keep empty */ false);
	  if (Exts.empty())
	    return;
	  for (auto &I : Exts)
	    OpenCLDeclExtMap[FD].insert(I.str());
	}

	/// Associate the type T with the current OpenCL extension, if one is set.
	void Sema::setCurrentOpenCLExtensionForType(QualType T) {
	  if (!CurrOpenCLExtension.empty())
	    setOpenCLExtensionForType(T, CurrOpenCLExtension);
	}

	/// Associate the declaration D with the current OpenCL extension, if one is
	/// set.
	void Sema::setCurrentOpenCLExtensionForDecl(Decl *D) {
	  if (!CurrOpenCLExtension.empty())
	    setOpenCLExtensionForDecl(D, CurrOpenCLExtension);
	}

	/// Return true if the declaration is associated with at least one OpenCL
	/// extension that is not currently enabled. Declarations with no recorded
	/// extensions are never considered disabled.
	bool Sema::isOpenCLDisabledDecl(Decl *FD) {
	  auto Entry = OpenCLDeclExtMap.find(FD);
	  if (Entry != OpenCLDeclExtMap.end()) {
	    for (auto &ExtName : Entry->second)
	      if (!getOpenCLOptions().isEnabled(ExtName))
	        return true;
	  }
	  return false;
	}

	/// \brief Diagnose every not-enabled OpenCL extension required by D.
	///
	/// Looks D up in Map and, for each associated extension that is neither the
	/// current extension nor enabled, emits err_opencl_requires_extension with
	/// Selector, DiagInfo, the extension name and SrcRange.
	///
	/// \returns true if at least one diagnostic was emitted.
	template <typename T, typename DiagLocT, typename DiagInfoT, typename MapT>
	bool Sema::checkOpenCLDisabledTypeOrDecl(T D, DiagLocT DiagLoc,
	                                         DiagInfoT DiagInfo, MapT &Map,
	                                         unsigned Selector,
	                                         SourceRange SrcRange) {
	  auto Entry = Map.find(D);
	  if (Entry == Map.end())
	    return false;

	  bool AnyDisabled = false;
	  for (auto &ExtName : Entry->second) {
	    // The extension currently being declared never counts as missing.
	    if (!(ExtName != CurrOpenCLExtension))
	      continue;
	    if (getOpenCLOptions().isEnabled(ExtName))
	      continue;

	    Diag(DiagLoc, diag::err_opencl_requires_extension)
	        << Selector << DiagInfo << ExtName << SrcRange;
	    AnyDisabled = true;
	  }
	  return AnyDisabled;
	}

	bool Sema::checkOpenCLDisabledTypeDeclSpec(const DeclSpec &DS, QualType QT) {
	// Check extensions for declared types.
	Decl *Decl = nullptr;
	if (auto TypedefT = dyn_cast<TypedefType>(QT.getTypePtr()))
	Decl = TypedefT->getDecl();
	if (auto TagT = dyn_cast<TagType>(QT.getCanonicalType().getTypePtr()))
	Decl = TagT->getDecl();
	auto Loc = DS.getTypeSpecTypeLoc();
	if (checkOpenCLDisabledTypeOrDecl(Decl, Loc, QT, OpenCLDeclExtMap))
	return true;

	// Check extensions for builtin types.
	return checkOpenCLDisabledTypeOrDecl(QT.getCanonicalType().getTypePtr(), Loc,
	QT, OpenCLTypeExtMap);
	}

	bool Sema::checkOpenCLDisabledDecl(const NamedDecl &D, const Expr &E) {
	IdentifierInfo *FnName = D.getIdentifier();
	return checkOpenCLDisabledTypeOrDecl(&D, E.getLocStart(), FnName,
	OpenCLDeclExtMap, 1, D.getSourceRange());
	}
	Index: head/contrib/llvm/tools/clang/lib/Sema/SemaExpr.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Sema/SemaExpr.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Sema/SemaExpr.cpp (revision 322320)
	@@ -1,15697 +1,15697 @@
	//===--- SemaExpr.cpp - Semantic Analysis for Expressions -----------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// This file implements semantic analysis for expressions.
	//
	//===----------------------------------------------------------------------===//

	#include "TreeTransform.h"
	#include "clang/AST/ASTConsumer.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/ASTLambda.h"
	#include "clang/AST/ASTMutationListener.h"
	#include "clang/AST/CXXInheritance.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/EvaluatedExprVisitor.h"
	#include "clang/AST/Expr.h"
	#include "clang/AST/ExprCXX.h"
	#include "clang/AST/ExprObjC.h"
	#include "clang/AST/ExprOpenMP.h"
	#include "clang/AST/RecursiveASTVisitor.h"
	#include "clang/AST/TypeLoc.h"
	#include "clang/Basic/PartialDiagnostic.h"
	#include "clang/Basic/SourceManager.h"
	#include "clang/Basic/TargetInfo.h"
	#include "clang/Lex/LiteralSupport.h"
	#include "clang/Lex/Preprocessor.h"
	#include "clang/Sema/AnalysisBasedWarnings.h"
	#include "clang/Sema/DeclSpec.h"
	#include "clang/Sema/DelayedDiagnostic.h"
	#include "clang/Sema/Designator.h"
	#include "clang/Sema/Initialization.h"
	#include "clang/Sema/Lookup.h"
	#include "clang/Sema/ParsedTemplate.h"
	#include "clang/Sema/Scope.h"
	#include "clang/Sema/ScopeInfo.h"
	#include "clang/Sema/SemaFixItUtils.h"
	#include "clang/Sema/SemaInternal.h"
	#include "clang/Sema/Template.h"
	#include "llvm/Support/ConvertUTF.h"
	using namespace clang;
	using namespace sema;

	/// \brief Determine whether the use of this declaration is valid, without
	/// emitting diagnostics.
	///
	/// \param D - The declaration being referenced.
	/// \param TreatUnavailableAsInvalid - If true, an unavailable declaration is
	/// rejected unless the current context is itself unavailable.
	///
	/// \returns false if D is an auto variable whose initializer is being
	/// parsed, a deleted function, a function whose return type cannot be
	/// deduced, or (optionally) unavailable; true otherwise.
	bool Sema::CanUseDecl(NamedDecl *D, bool TreatUnavailableAsInvalid) {
	  // See if this is an auto-typed variable whose initializer we are parsing.
	  if (ParsingInitForAutoVars.count(D))
	    return false;

	  // See if this is a deleted function.
	  if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	    if (FD->isDeleted())
	      return false;

	    // If the function has a deduced return type, and we can't deduce it,
	    // then we can't use it either.
	    if (getLangOpts().CPlusPlus14 && FD->getReturnType()->isUndeducedType() &&
	        DeduceReturnType(FD, SourceLocation(), /*Diagnose*/ false))
	      return false;
	  }

	  // See if this function is unavailable.
	  if (TreatUnavailableAsInvalid && D->getAvailability() == AR_Unavailable &&
	      cast<Decl>(CurContext)->getAvailability() != AR_Unavailable)
	    return false;

	  return true;
	}

	/// Warn when a declaration carrying an 'unused' attribute is used anyway.
	/// The [[maybe_unused]] spelling never warns, and no warning is given when
	/// the current lexical context is missing or is itself marked unused.
	static void DiagnoseUnusedOfDecl(Sema &S, NamedDecl *D, SourceLocation Loc) {
	  const auto *Unused = D->getAttr<UnusedAttr>();
	  if (!Unused)
	    return;

	  // [[maybe_unused]] should not diagnose uses, but __attribute__((unused))
	  // should diagnose them.
	  if (Unused->getSemanticSpelling() == UnusedAttr::CXX11_maybe_unused)
	    return;

	  const Decl *Enclosing = cast_or_null<Decl>(S.getCurObjCLexicalContext());
	  if (!Enclosing)
	    return;
	  if (Enclosing->hasAttr<UnusedAttr>())
	    return;

	  S.Diag(Loc, diag::warn_used_but_marked_unused) << D->getDeclName();
	}

	/// \brief Emit a note explaining that this function is deleted.
	///
	/// Three cases are distinguished: a defaulted method that ended up deleted
	/// (the reason for the implicit deletion is diagnosed when the special
	/// member kind is known), an inheriting constructor, and any other deleted
	/// function (a note at its declaration).
	///
	/// \param Decl - The deleted function; must satisfy isDeleted().
	void Sema::NoteDeletedFunction(FunctionDecl *Decl) {
	  assert(Decl->isDeleted());

	  CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Decl);

	  if (Method && Method->isDeleted() && Method->isDefaulted()) {
	    // If the method was explicitly defaulted, point at that declaration.
	    if (!Method->isImplicit())
	      Diag(Decl->getLocation(), diag::note_implicitly_deleted);

	    // Try to diagnose why this special member function was implicitly
	    // deleted. This might fail, if that reason no longer applies.
	    CXXSpecialMember CSM = getSpecialMember(Method);
	    if (CSM != CXXInvalid)
	      ShouldDeleteSpecialMember(Method, CSM, nullptr, /*Diagnose=*/true);

	    return;
	  }

	  auto *Ctor = dyn_cast<CXXConstructorDecl>(Decl);
	  if (Ctor && Ctor->isInheritingConstructor())
	    return NoteDeletedInheritingConstructor(Ctor);

	  Diag(Decl->getLocation(), diag::note_availability_specified_here)
	    << Decl << true;
	}

	/// \brief Report whether any redeclaration of the function carries a
	/// storage class other than SC_None.
	static bool hasAnyExplicitStorageClass(const FunctionDecl *D) {
	  bool FoundExplicit = false;
	  for (auto Redecl : D->redecls()) {
	    if (Redecl->getStorageClass() == SC_None)
	      continue;
	    FoundExplicit = true;
	    break;
	  }
	  return FoundExplicit;
	}

	/// \brief Check whether we're in an extern inline function and referring to a
	/// variable or function with internal linkage (C11 6.7.4p3).
	///
	/// This is only a warning because we used to silently accept this code, but
	/// in many cases it will not behave correctly. This is not enabled in C++ mode
	/// because the restriction language is a bit weaker (C++11 [basic.def.odr]p6)
	/// and so while there may still be user mistakes, most of the time we can't
	/// prove that there are errors.
	///
	/// \param S - The Sema instance used for queries and diagnostics.
	/// \param D - The declaration being referenced.
	/// \param Loc - The location of the reference.
	static void diagnoseUseOfInternalDeclInInlineFunction(Sema &S,
	                                                      const NamedDecl *D,
	                                                      SourceLocation Loc) {
	  // This is disabled under C++; there are too many ways for this to fire in
	  // contexts where the warning is a false positive, or where it is technically
	  // correct but benign.
	  if (S.getLangOpts().CPlusPlus)
	    return;

	  // Check if this is an inlined function or method.
	  FunctionDecl *Current = S.getCurFunctionDecl();
	  if (!Current)
	    return;
	  if (!Current->isInlined())
	    return;
	  if (!Current->isExternallyVisible())
	    return;

	  // Check if the decl has internal linkage.
	  if (D->getFormalLinkage() != InternalLinkage)
	    return;

	  // Downgrade from ExtWarn to Extension if
	  // (1) the supposedly external inline function is in the main file,
	  // and probably won't be included anywhere else.
	  // (2) the thing we're referencing is a pure function.
	  // (3) the thing we're referencing is another inline function.
	  // This last can give us false negatives, but it's better than warning on
	  // wrappers for simple C library functions.
	  const FunctionDecl *UsedFn = dyn_cast<FunctionDecl>(D);
	  bool DowngradeWarning = S.getSourceManager().isInMainFile(Loc);
	  if (!DowngradeWarning && UsedFn)
	    DowngradeWarning = UsedFn->isInlined() || UsedFn->hasAttr<ConstAttr>();

	  S.Diag(Loc, DowngradeWarning ? diag::ext_internal_in_extern_inline_quiet
	                               : diag::ext_internal_in_extern_inline)
	    << /*IsVar=*/!UsedFn << D;

	  S.MaybeSuggestAddingStaticToDecl(Current);

	  S.Diag(D->getCanonicalDecl()->getLocation(), diag::note_entity_declared_at)
	      << D;
	}

	void Sema::MaybeSuggestAddingStaticToDecl(const FunctionDecl *Cur) {
	const FunctionDecl *First = Cur->getFirstDecl();

	// Suggest "static" on the function, if possible.
	if (!hasAnyExplicitStorageClass(First)) {
	SourceLocation DeclBegin = First->getSourceRange().getBegin();
	Diag(DeclBegin, diag::note_convert_inline_to_static)
	<< Cur << FixItHint::CreateInsertion(DeclBegin, "static ");
	}
	}

	/// \brief Determine whether the use of this declaration is valid, and
	/// emit any corresponding diagnostics.
	///
	/// This routine diagnoses various problems with referencing
	/// declarations that can occur when using a declaration. For example,
	/// it might warn if a deprecated or unavailable declaration is being
	/// used, or produce an error (and return true) if a C++0x deleted
	/// function is being used.
	///
	/// \param D - The declaration being referenced.
	/// \param Loc - The location of the reference; diagnostics are reported here.
	/// \param UnknownObjCClass - Forwarded to DiagnoseAvailabilityOfDecl.
	/// \param ObjCPropertyAccess - Forwarded to DiagnoseAvailabilityOfDecl.
	/// \param AvoidPartialAvailabilityChecks - Forwarded to
	/// DiagnoseAvailabilityOfDecl.
	///
	/// \returns true if there was an error (this declaration cannot be
	/// referenced), false otherwise.
	///
	bool Sema::DiagnoseUseOfDecl(NamedDecl *D, SourceLocation Loc,
	const ObjCInterfaceDecl *UnknownObjCClass,
	bool ObjCPropertyAccess,
	bool AvoidPartialAvailabilityChecks) {
	if (getLangOpts().CPlusPlus && isa<FunctionDecl>(D)) {
	// If there were any diagnostics suppressed by template argument deduction,
	// emit them now.
	auto Pos = SuppressedDiagnostics.find(D->getCanonicalDecl());
	if (Pos != SuppressedDiagnostics.end()) {
	for (const PartialDiagnosticAt &Suppressed : Pos->second)
	Diag(Suppressed.first, Suppressed.second);

	// Clear out the list of suppressed diagnostics, so that we don't emit
	// them again for this specialization. However, we don't obsolete this
	// entry from the table, because we want to avoid ever emitting these
	// diagnostics again.
	Pos->second.clear();
	}

	// C++ [basic.start.main]p3:
	// The function 'main' shall not be used within a program.
	if (cast<FunctionDecl>(D)->isMain())
	Diag(Loc, diag::ext_main_used);
	}

	// See if this is an auto-typed variable whose initializer we are parsing.
	if (ParsingInitForAutoVars.count(D)) {
	if (isa<BindingDecl>(D)) {
	Diag(Loc, diag::err_binding_cannot_appear_in_own_initializer)
	<< D->getDeclName();
	} else {
	Diag(Loc, diag::err_auto_variable_cannot_appear_in_own_initializer)
	<< D->getDeclName() << cast<VarDecl>(D)->getType();
	}
	return true;
	}

	// See if this is a deleted function.
	if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	if (FD->isDeleted()) {
	// Inheriting constructors get a dedicated error that names both classes.
	auto *Ctor = dyn_cast<CXXConstructorDecl>(FD);
	if (Ctor && Ctor->isInheritingConstructor())
	Diag(Loc, diag::err_deleted_inherited_ctor_use)
	<< Ctor->getParent()
	<< Ctor->getInheritedConstructor().getConstructor()->getParent();
	else
	Diag(Loc, diag::err_deleted_function_use);
	NoteDeletedFunction(FD);
	return true;
	}

	// If the function has a deduced return type, and we can't deduce it,
	// then we can't use it either.
	if (getLangOpts().CPlusPlus14 && FD->getReturnType()->isUndeducedType() &&
	DeduceReturnType(FD, Loc))
	return true;

	// In CUDA mode, a failed CheckCUDACall makes the use an error.
	if (getLangOpts().CUDA && !CheckCUDACall(Loc, FD))
	return true;
	}

	// For an Objective-C method, look up the property declaration it is
	// associated with (null for any other kind of declaration).
	auto getReferencedObjCProp = [](const NamedDecl *D) ->
	const ObjCPropertyDecl * {
	if (const auto *MD = dyn_cast<ObjCMethodDecl>(D))
	return MD->findPropertyDecl();
	return nullptr;
	};
	// Check diagnose_if attributes on the property when there is one, and on
	// the declaration itself otherwise.
	if (const ObjCPropertyDecl *ObjCPDecl = getReferencedObjCProp(D)) {
	if (diagnoseArgIndependentDiagnoseIfAttrs(ObjCPDecl, Loc))
	return true;
	} else if (diagnoseArgIndependentDiagnoseIfAttrs(D, Loc)) {
	return true;
	}

	// [OpenMP 4.0], 2.15 declare reduction Directive, Restrictions
	// Only the variables omp_in and omp_out are allowed in the combiner.
	// Only the variables omp_priv and omp_orig are allowed in the
	// initializer-clause.
	auto *DRD = dyn_cast<OMPDeclareReductionDecl>(CurContext);
	if (LangOpts.OpenMP && DRD && !CurContext->containsDecl(D) &&
	isa<VarDecl>(D)) {
	Diag(Loc, diag::err_omp_wrong_var_in_declare_reduction)
	<< getCurFunction()->HasOMPDeclareReductionCombiner;
	Diag(D->getLocation(), diag::note_entity_declared_at) << D;
	return true;
	}

	// The remaining checks emit diagnostics only; their results do not affect
	// the return value.
	DiagnoseAvailabilityOfDecl(D, Loc, UnknownObjCClass, ObjCPropertyAccess,
	AvoidPartialAvailabilityChecks);

	DiagnoseUnusedOfDecl(*this, D, Loc);

	diagnoseUseOfInternalDeclInInlineFunction(*this, D, Loc);

	return false;
	}

	/// \brief Build the suffix appended to a diagnostic about a deleted or
	/// unavailable function.
	///
	/// \returns ": " followed by the availability message when getAvailability
	/// reports a non-zero result, and an empty string otherwise.
	std::string Sema::getDeletedOrUnavailableSuffix(const FunctionDecl *FD) {
	  std::string AvailabilityMsg;
	  if (!FD->getAvailability(&AvailabilityMsg))
	    return std::string();

	  return ": " + AvailabilityMsg;
	}

	/// DiagnoseSentinelCalls - This routine checks whether a call or
	/// message-send is to a declaration with the sentinel attribute, and
	/// if so, it checks that the requirements of the sentinel are
	/// satisfied.
	///
	/// \param D - The callee declaration (method, function, or variable of
	/// function-pointer or block-pointer type). Other declarations are ignored.
	/// \param Loc - The location used for diagnostics that cannot be attached
	/// to the end of the sentinel argument.
	/// \param Args - The arguments of the call.
	void Sema::DiagnoseSentinelCalls(NamedDecl *D, SourceLocation Loc,
	                                 ArrayRef<Expr *> Args) {
	  const SentinelAttr *attr = D->getAttr<SentinelAttr>();
	  if (!attr)
	    return;

	  // The number of formal parameters of the declaration.
	  unsigned numFormalParams;

	  // The kind of declaration. This is also an index into a %select in
	  // the diagnostic.
	  enum CalleeType { CT_Function, CT_Method, CT_Block } calleeType;

	  if (ObjCMethodDecl *MD = dyn_cast<ObjCMethodDecl>(D)) {
	    numFormalParams = MD->param_size();
	    calleeType = CT_Method;
	  } else if (FunctionDecl *FD = dyn_cast<FunctionDecl>(D)) {
	    numFormalParams = FD->param_size();
	    calleeType = CT_Function;
	  } else if (isa<VarDecl>(D)) {
	    QualType type = cast<ValueDecl>(D)->getType();
	    const FunctionType *fn = nullptr;
	    if (const PointerType *ptr = type->getAs<PointerType>()) {
	      fn = ptr->getPointeeType()->getAs<FunctionType>();
	      if (!fn) return;
	      calleeType = CT_Function;
	    } else if (const BlockPointerType *ptr = type->getAs<BlockPointerType>()) {
	      fn = ptr->getPointeeType()->castAs<FunctionType>();
	      calleeType = CT_Block;
	    } else {
	      return;
	    }

	    // Without a prototype there are no known formal parameters.
	    if (const FunctionProtoType *proto = dyn_cast<FunctionProtoType>(fn)) {
	      numFormalParams = proto->getNumParams();
	    } else {
	      numFormalParams = 0;
	    }
	  } else {
	    return;
	  }

	  // "nullPos" is the number of formal parameters at the end which
	  // effectively count as part of the variadic arguments. This is
	  // useful if you would prefer to not have any formal parameters,
	  // but the language forces you to have at least one.
	  unsigned nullPos = attr->getNullPos();
	  assert((nullPos == 0 || nullPos == 1) && "invalid null position on sentinel");
	  numFormalParams = (nullPos > numFormalParams ? 0 : numFormalParams - nullPos);

	  // The number of arguments which should follow the sentinel.
	  unsigned numArgsAfterSentinel = attr->getSentinel();

	  // If there aren't enough arguments for all the formal parameters,
	  // the sentinel, and the args after the sentinel, complain.
	  if (Args.size() < numFormalParams + numArgsAfterSentinel + 1) {
	    Diag(Loc, diag::warn_not_enough_argument) << D->getDeclName();
	    Diag(D->getLocation(), diag::note_sentinel_here) << int(calleeType);
	    return;
	  }

	  // Otherwise, find the sentinel expression.
	  Expr *sentinelExpr = Args[Args.size() - numArgsAfterSentinel - 1];
	  if (!sentinelExpr) return;
	  if (sentinelExpr->isValueDependent()) return;
	  if (Context.isSentinelNullExpr(sentinelExpr)) return;

	  // Pick a reasonable string to insert. Optimistically use 'nil', 'nullptr',
	  // or 'NULL' if those are actually defined in the context. Only use
	  // 'nil' for ObjC methods, where it's much more likely that the
	  // variadic arguments form a list of object pointers.
	  SourceLocation MissingNilLoc
	    = getLocForEndOfToken(sentinelExpr->getLocEnd());
	  std::string NullValue;
	  if (calleeType == CT_Method && PP.isMacroDefined("nil"))
	    NullValue = "nil";
	  else if (getLangOpts().CPlusPlus11)
	    NullValue = "nullptr";
	  else if (PP.isMacroDefined("NULL"))
	    NullValue = "NULL";
	  else
	    NullValue = "(void*) 0";

	  // Without a valid insertion point, warn at the call location and omit the
	  // fix-it.
	  if (MissingNilLoc.isInvalid())
	    Diag(Loc, diag::warn_missing_sentinel) << int(calleeType);
	  else
	    Diag(MissingNilLoc, diag::warn_missing_sentinel)
	      << int(calleeType)
	      << FixItHint::CreateInsertion(MissingNilLoc, ", " + NullValue);
	  Diag(D->getLocation(), diag::note_sentinel_here) << int(calleeType);
	}

	/// Return the source range of E, or a default-constructed SourceRange when
	/// E is null.
	SourceRange Sema::getExprRange(Expr *E) const {
	  if (!E)
	    return SourceRange();
	  return E->getSourceRange();
	}

	//===----------------------------------------------------------------------===//
	// Standard Promotions and Conversions
	//===----------------------------------------------------------------------===//

	/// DefaultFunctionArrayConversion (C99 6.3.2.1p3, C99 6.3.2.1p4).
	///
	/// Converts an expression of function type to a pointer to that function and
	/// an expression of array type to a pointer to its element type. Expressions
	/// of other types are returned unchanged.
	///
	/// \param E - The expression to convert.
	/// \param Diagnose - Whether to emit diagnostics when the conversion is not
	/// permitted.
	///
	/// \returns the (possibly converted) expression, or ExprError() on failure.
	ExprResult Sema::DefaultFunctionArrayConversion(Expr *E, bool Diagnose) {
	  // Handle any placeholder expressions which made it here.
	  if (E->getType()->isPlaceholderType()) {
	    ExprResult result = CheckPlaceholderExpr(E);
	    if (result.isInvalid()) return ExprError();
	    E = result.get();
	  }

	  QualType Ty = E->getType();
	  assert(!Ty.isNull() && "DefaultFunctionArrayConversion - missing type");

	  if (Ty->isFunctionType()) {
	    // If we are here, we are not calling a function but taking
	    // its address (which is not allowed in OpenCL v1.0 s6.8.a.3).
	    if (getLangOpts().OpenCL) {
	      if (Diagnose)
	        Diag(E->getExprLoc(), diag::err_opencl_taking_function_address);
	      return ExprError();
	    }

	    if (auto *DRE = dyn_cast<DeclRefExpr>(E->IgnoreParenCasts()))
	      if (auto *FD = dyn_cast<FunctionDecl>(DRE->getDecl()))
	        if (!checkAddressOfFunctionIsAvailable(FD, Diagnose, E->getExprLoc()))
	          return ExprError();

	    E = ImpCastExprToType(E, Context.getPointerType(Ty),
	                          CK_FunctionToPointerDecay).get();
	  } else if (Ty->isArrayType()) {
	    // In C90 mode, arrays only promote to pointers if the array expression is
	    // an lvalue. The relevant legalese is C90 6.2.2.1p3: "an lvalue that has
	    // type 'array of type' is converted to an expression that has type 'pointer
	    // to type'...". In C99 this was changed to: C99 6.3.2.1p3: "an expression
	    // that has type 'array of type' ...". The relevant change is "an lvalue"
	    // (C90) to "an expression" (C99).
	    //
	    // C++ 4.2p1:
	    // An lvalue or rvalue of type "array of N T" or "array of unknown bound of
	    // T" can be converted to an rvalue of type "pointer to T".
	    //
	    if (getLangOpts().C99 || getLangOpts().CPlusPlus || E->isLValue())
	      E = ImpCastExprToType(E, Context.getArrayDecayedType(Ty),
	                            CK_ArrayToPointerDecay).get();
	  }
	  return E;
	}

	/// Warn about the syntactic pattern "*null": a dereference whose operand is a
	/// null pointer constant and whose result is not volatile-qualified. Such
	/// code is undefined behavior that the optimizer will delete, which
	/// surprises people who expect a deterministic trap.
	static void CheckForNullPointerDereference(Sema &S, Expr *E) {
	  UnaryOperator *Deref = dyn_cast<UnaryOperator>(E->IgnoreParenCasts());
	  if (!Deref)
	    return;
	  if (Deref->getOpcode() != UO_Deref)
	    return;

	  // Only a very syntactic check: the operand itself must be a null pointer
	  // constant once parentheses and casts are stripped.
	  Expr *Operand = Deref->getSubExpr()->IgnoreParenCasts();
	  if (!Operand->isNullPointerConstant(S.Context,
	                                      Expr::NPC_ValueDependentIsNotNull))
	    return;

	  // Volatile accesses are left alone.
	  if (Deref->getType().isVolatileQualified())
	    return;

	  S.DiagRuntimeBehavior(Deref->getOperatorLoc(), Deref,
	                        S.PDiag(diag::warn_indirection_through_null)
	                            << Deref->getSubExpr()->getSourceRange());
	  S.DiagRuntimeBehavior(Deref->getOperatorLoc(), Deref,
	                        S.PDiag(diag::note_indirection_through_null));
	}

	/// \brief Warn about direct access to an Objective-C 'isa' instance variable.
	///
	/// The warning is only given when the ivar is named "isa" and is the first
	/// ivar of a class without a superclass. When the matching runtime function
	/// (object_setClass / object_getClass) can be found by name lookup, fix-its
	/// that rewrite the access into a call are attached.
	///
	/// \param S - The Sema instance used for lookup and diagnostics.
	/// \param OIRE - The ivar reference expression being examined.
	/// \param AssignLoc - End of the range replaced by "," in the assignment
	/// fix-it; only used when RHS is non-null.
	/// \param RHS - The assigned value when the access is the target of an
	/// assignment, or null for a plain read.
	static void DiagnoseDirectIsaAccess(Sema &S, const ObjCIvarRefExpr *OIRE,
	                                    SourceLocation AssignLoc,
	                                    const Expr* RHS) {
	  const ObjCIvarDecl *IV = OIRE->getDecl();
	  if (!IV)
	    return;

	  DeclarationName MemberName = IV->getDeclName();
	  IdentifierInfo *Member = MemberName.getAsIdentifierInfo();
	  if (!Member || !Member->isStr("isa"))
	    return;

	  const Expr *Base = OIRE->getBase();
	  QualType BaseType = Base->getType();
	  if (OIRE->isArrow())
	    BaseType = BaseType->getPointeeType();
	  if (const ObjCObjectType *OTy = BaseType->getAs<ObjCObjectType>())
	    if (ObjCInterfaceDecl *IDecl = OTy->getInterface()) {
	      ObjCInterfaceDecl *ClassDeclared = nullptr;
	      ObjCIvarDecl *IV = IDecl->lookupInstanceVariable(Member, ClassDeclared);
	      // ClassDeclared is initialized to null above; do not dereference it
	      // unless the lookup actually filled it in.
	      if (ClassDeclared && !ClassDeclared->getSuperClass()
	          && (*ClassDeclared->ivar_begin()) == IV) {
	        if (RHS) {
	          NamedDecl *ObjectSetClass =
	            S.LookupSingleName(S.TUScope,
	                               &S.Context.Idents.get("object_setClass"),
	                               SourceLocation(), S.LookupOrdinaryName);
	          if (ObjectSetClass) {
	            SourceLocation RHSLocEnd = S.getLocForEndOfToken(RHS->getLocEnd());
	            S.Diag(OIRE->getExprLoc(), diag::warn_objc_isa_assign) <<
	            FixItHint::CreateInsertion(OIRE->getLocStart(), "object_setClass(") <<
	            FixItHint::CreateReplacement(SourceRange(OIRE->getOpLoc(),
	                                                     AssignLoc), ",") <<
	            FixItHint::CreateInsertion(RHSLocEnd, ")");
	          }
	          else
	            S.Diag(OIRE->getLocation(), diag::warn_objc_isa_assign);
	        } else {
	          NamedDecl *ObjectGetClass =
	            S.LookupSingleName(S.TUScope,
	                               &S.Context.Idents.get("object_getClass"),
	                               SourceLocation(), S.LookupOrdinaryName);
	          if (ObjectGetClass)
	            S.Diag(OIRE->getExprLoc(), diag::warn_objc_isa_use) <<
	            FixItHint::CreateInsertion(OIRE->getLocStart(), "object_getClass(") <<
	            FixItHint::CreateReplacement(
	                                         SourceRange(OIRE->getOpLoc(),
	                                                     OIRE->getLocEnd()), ")");
	          else
	            S.Diag(OIRE->getLocation(), diag::warn_objc_isa_use);
	        }
	        S.Diag(IV->getLocation(), diag::note_ivar_decl);
	      }
	    }
	}

	/// DefaultLvalueConversion - Apply the lvalue-to-rvalue conversion to \p E.
	///
	/// Placeholder expressions are resolved first. Expressions that are not
	/// glvalues are returned unchanged, as are (in C++) overloaded, dependent and
	/// record-typed expressions, and expressions of void type. Otherwise \p E is
	/// wrapped in a CK_LValueToRValue implicit cast to the unqualified type,
	/// followed by a CK_AtomicToNonAtomic cast when that type is atomic.
	///
	/// \returns the converted expression, or ExprError() if placeholder
	/// resolution fails or an OpenCL 'half' value is loaded while cl_khr_fp16 is
	/// not enabled.
	ExprResult Sema::DefaultLvalueConversion(Expr *E) {
	  // Handle any placeholder expressions which made it here.
	  if (E->getType()->isPlaceholderType()) {
	    ExprResult result = CheckPlaceholderExpr(E);
	    if (result.isInvalid()) return ExprError();
	    E = result.get();
	  }

	  // C++ [conv.lval]p1:
	  //   A glvalue of a non-function, non-array type T can be
	  //   converted to a prvalue.
	  if (!E->isGLValue()) return E;

	  QualType T = E->getType();
	  assert(!T.isNull() && "r-value conversion on typeless expression?");

	  // We don't want to throw lvalue-to-rvalue casts on top of
	  // expressions of certain types in C++.
	  if (getLangOpts().CPlusPlus &&
	      (E->getType() == Context.OverloadTy ||
	       T->isDependentType() ||
	       T->isRecordType()))
	    return E;

	  // The C standard is actually really unclear on this point, and
	  // DR106 tells us what the result should be but not why. It's
	  // generally best to say that void types just don't undergo
	  // lvalue-to-rvalue at all. Note that expressions of unqualified
	  // 'void' type are never l-values, but qualified void can be.
	  if (T->isVoidType())
	    return E;

	  // OpenCL usually rejects direct accesses to values of 'half' type.
	  if (getLangOpts().OpenCL && !getOpenCLOptions().isEnabled("cl_khr_fp16") &&
	      T->isHalfType()) {
	    Diag(E->getExprLoc(), diag::err_opencl_half_load_store)
	      << 0 << T;
	    return ExprError();
	  }

	  CheckForNullPointerDereference(*this, E);

	  // Warn about direct reads of an Objective-C object's 'isa'. A fix-it to
	  // object_getClass() is attached only when that function can be found by
	  // name lookup at translation-unit scope.
	  if (const ObjCIsaExpr *OISA = dyn_cast<ObjCIsaExpr>(E->IgnoreParenCasts())) {
	    NamedDecl *ObjectGetClass = LookupSingleName(TUScope,
	                                                 &Context.Idents.get("object_getClass"),
	                                                 SourceLocation(), LookupOrdinaryName);
	    if (ObjectGetClass)
	      Diag(E->getExprLoc(), diag::warn_objc_isa_use) <<
	        FixItHint::CreateInsertion(OISA->getLocStart(), "object_getClass(") <<
	        FixItHint::CreateReplacement(
	            SourceRange(OISA->getOpLoc(), OISA->getIsaMemberLoc()), ")");
	    else
	      Diag(E->getExprLoc(), diag::warn_objc_isa_use);
	  }
	  else if (const ObjCIvarRefExpr *OIRE =
	               dyn_cast<ObjCIvarRefExpr>(E->IgnoreParenCasts()))
	    DiagnoseDirectIsaAccess(*this, OIRE, SourceLocation(), /*Expr*/nullptr);

	  // C++ [conv.lval]p1:
	  //   [...] If T is a non-class type, the type of the prvalue is the
	  //   cv-unqualified version of T. Otherwise, the type of the
	  //   rvalue is T.
	  //
	  // C99 6.3.2.1p2:
	  //   If the lvalue has qualified type, the value has the unqualified
	  //   version of the type of the lvalue; otherwise, the value has the
	  //   type of the lvalue.
	  if (T.hasQualifiers())
	    T = T.getUnqualifiedType();

	  // Under the MS ABI, lock down the inheritance model now.
	  if (T->isMemberPointerType() &&
	      Context.getTargetInfo().getCXXABI().isMicrosoft())
	    (void)isCompleteType(E->getExprLoc(), T);

	  UpdateMarkingForLValueToRValue(E);

	  // Loading a __weak object implicitly retains the value, so we need a cleanup to
	  // balance that.
	  if (E->getType().getObjCLifetime() == Qualifiers::OCL_Weak)
	    Cleanup.setExprNeedsCleanups(true);

	  ExprResult Res = ImplicitCastExpr::Create(Context, T, CK_LValueToRValue, E,
	                                            nullptr, VK_RValue);

	  // C11 6.3.2.1p2:
	  //   ... if the lvalue has atomic type, the value has the non-atomic version
	  //   of the type of the lvalue ...
	  if (const AtomicType *Atomic = T->getAs<AtomicType>()) {
	    T = Atomic->getValueType().getUnqualifiedType();
	    Res = ImplicitCastExpr::Create(Context, T, CK_AtomicToNonAtomic, Res.get(),
	                                   nullptr, VK_RValue);
	  }

	  return Res;
	}

	/// Apply function/array decay to \p E and then the lvalue-to-rvalue
	/// conversion to the decayed expression. \p Diagnose is forwarded to
	/// DefaultFunctionArrayConversion(). Yields ExprError() as soon as either
	/// step fails.
	ExprResult Sema::DefaultFunctionArrayLvalueConversion(Expr *E, bool Diagnose) {
	  ExprResult Decayed = DefaultFunctionArrayConversion(E, Diagnose);
	  if (Decayed.isInvalid())
	    return ExprError();

	  ExprResult Loaded = DefaultLvalueConversion(Decayed.get());
	  if (Loaded.isInvalid())
	    return ExprError();

	  return Loaded;
	}

	/// CallExprUnaryConversions - The unary conversions applied to the function
	/// designator of a call expression: an expression of function type decays to
	/// a pointer to that function (an expression that is already a pointer to
	/// function gets no such cast), and the result then undergoes the
	/// lvalue-to-rvalue conversion.
	ExprResult Sema::CallExprUnaryConversions(Expr *E) {
	  ExprResult Converted = E;
	  QualType CalleeTy = E->getType();

	  // Function-to-pointer decay applies to function types only.
	  if (CalleeTy->isFunctionType()) {
	    QualType FnPtrTy = Context.getPointerType(CalleeTy);
	    Converted =
	        ImpCastExprToType(E, FnPtrTy, CK_FunctionToPointerDecay).get();
	    if (Converted.isInvalid())
	      return ExprError();
	  }

	  Converted = DefaultLvalueConversion(Converted.get());
	  if (Converted.isInvalid())
	    return ExprError();

	  return Converted.get();
	}

	/// UsualUnaryConversions - Apply the conversions shared by most operators
	/// (C99 6.3): function/array decay plus lvalue-to-rvalue conversion, then
	/// promotion of half-precision values to float (unless half is a native
	/// type) and the integer promotions. Array and function conversions are
	/// sometimes suppressed, e.g. for the operand of sizeof or of the address
	/// (&) operator; callers in those situations should not use this routine.
	ExprResult Sema::UsualUnaryConversions(Expr *E) {
	  // Step one: obtain an r-value.
	  ExprResult RValue = DefaultFunctionArrayLvalueConversion(E);
	  if (RValue.isInvalid())
	    return ExprError();
	  E = RValue.get();

	  QualType OperandTy = E->getType();
	  assert(!OperandTy.isNull() && "UsualUnaryConversions - missing type");

	  // A half-precision operand becomes float unless half is natively supported.
	  if (OperandTy->isHalfType() && !getLangOpts().NativeHalfType)
	    return ImpCastExprToType(RValue.get(), Context.FloatTy, CK_FloatingCast);

	  // Only integral and unscoped-enumeration operands can be promoted further.
	  if (!OperandTy->isIntegralOrUnscopedEnumerationType())
	    return E;

	  // C99 6.3.1.1p2:
	  //
	  //   The following may be used in an expression wherever an int or
	  //   unsigned int may be used:
	  //     - an object or expression with an integer type whose integer
	  //       conversion rank is less than or equal to the rank of int
	  //       and unsigned int.
	  //     - A bit-field of type _Bool, int, signed int, or unsigned int.
	  //
	  //   If an int can represent all values of the original type, the
	  //   value is converted to an int; otherwise, it is converted to an
	  //   unsigned int. These are called the integer promotions. All
	  //   other types are unchanged by the integer promotions.

	  // Bit-field promotion is tried before the ordinary integer promotion.
	  QualType BitFieldTy = Context.isPromotableBitField(E);
	  if (!BitFieldTy.isNull())
	    return ImpCastExprToType(E, BitFieldTy, CK_IntegralCast).get();

	  if (OperandTy->isPromotableIntegerType()) {
	    QualType PromotedTy = Context.getPromotedIntegerType(OperandTy);
	    return ImpCastExprToType(E, PromotedTy, CK_IntegralCast).get();
	  }

	  return E;
	}

	/// DefaultArgumentPromotion (C99 6.5.2.2p6). Used for function calls that
	/// do not have a prototype. Arguments that have type float or __fp16
	/// are promoted to double (under OpenCL without cl_khr_fp64, __fp16 is
	/// promoted to float and float is left alone). All other argument types are
	/// converted by UsualUnaryConversions().
	///
	/// \returns the promoted expression, or ExprError() if the unary conversions
	/// or the C++ temporary copy-initialization fail.
	ExprResult Sema::DefaultArgumentPromotion(Expr *E) {
	  // Note: Ty is the type of E *before* the usual unary conversions below.
	  QualType Ty = E->getType();
	  assert(!Ty.isNull() && "DefaultArgumentPromotion - missing type");

	  ExprResult Res = UsualUnaryConversions(E);
	  if (Res.isInvalid())
	    return ExprError();
	  E = Res.get();

	  // If this is a 'float' or '__fp16' (CVR qualified or typedef) promote to
	  // double.
	  const BuiltinType *BTy = Ty->getAs<BuiltinType>();
	  if (BTy && (BTy->getKind() == BuiltinType::Half ||
	              BTy->getKind() == BuiltinType::Float)) {
	    if (getLangOpts().OpenCL &&
	        !getOpenCLOptions().isEnabled("cl_khr_fp64")) {
	      // No double available: only widen half to float.
	      if (BTy->getKind() == BuiltinType::Half) {
	        E = ImpCastExprToType(E, Context.FloatTy, CK_FloatingCast).get();
	      }
	    } else {
	      E = ImpCastExprToType(E, Context.DoubleTy, CK_FloatingCast).get();
	    }
	  }

	  // C++ performs lvalue-to-rvalue conversion as a default argument
	  // promotion, even on class types, but note:
	  //   C++11 [conv.lval]p2:
	  //     When an lvalue-to-rvalue conversion occurs in an unevaluated
	  //     operand or a subexpression thereof the value contained in the
	  //     referenced object is not accessed. Otherwise, if the glvalue
	  //     has a class type, the conversion copy-initializes a temporary
	  //     of type T from the glvalue and the result of the conversion
	  //     is a prvalue for the temporary.
	  // FIXME: add some way to gate this entire thing for correctness in
	  // potentially evaluated contexts.
	  if (getLangOpts().CPlusPlus && E->isGLValue() && !isUnevaluatedContext()) {
	    ExprResult Temp = PerformCopyInitialization(
	                       InitializedEntity::InitializeTemporary(E->getType()),
	                                                E->getExprLoc(), E);
	    if (Temp.isInvalid())
	      return ExprError();
	    E = Temp.get();
	  }

	  return E;
	}

	/// Classify \p Ty by how acceptable a value of that type is as an argument
	/// passed through a variadic ('...') parameter.
	/// Incomplete types (other than void and Objective-C object types) are
	/// treated as valid, since this check can be performed when we're in an
	/// unevaluated context.
	Sema::VarArgKind Sema::isValidVarArgType(const QualType &Ty) {
	  if (Ty->isIncompleteType()) {
	    // C++11 [expr.call]p7:
	    //   After these conversions, if the argument does not have arithmetic,
	    //   enumeration, pointer, pointer to member, or class type, the program
	    //   is ill-formed.
	    //
	    // Since we've already performed array-to-pointer and function-to-pointer
	    // decay, the only such type in C++ is cv void. This also handles
	    // initializer lists as variadic arguments.
	    const bool Rejected = Ty->isVoidType() || Ty->isObjCObjectType();
	    return Rejected ? VAK_Invalid : VAK_Valid;
	  }

	  if (Ty.isCXX98PODType(Context))
	    return VAK_Valid;

	  // C++11 [expr.call]p7:
	  //   Passing a potentially-evaluated argument of class type (Clause 9)
	  //   having a non-trivial copy constructor, a non-trivial move constructor,
	  //   or a non-trivial destructor, with no corresponding parameter,
	  //   is conditionally-supported with implementation-defined semantics.
	  if (getLangOpts().CPlusPlus11 && !Ty->isDependentType()) {
	    if (CXXRecordDecl *ClassDecl = Ty->getAsCXXRecordDecl()) {
	      const bool HasNonTrivialMember =
	          ClassDecl->hasNonTrivialCopyConstructor() ||
	          ClassDecl->hasNonTrivialMoveConstructor() ||
	          ClassDecl->hasNonTrivialDestructor();
	      if (!HasNonTrivialMember)
	        return VAK_ValidInCXX11;
	    }
	  }

	  if (getLangOpts().ObjCAutoRefCount && Ty->isObjCLifetimeType())
	    return VAK_Valid;

	  if (Ty->isObjCObjectType())
	    return VAK_Invalid;

	  // FIXME: In C++11, these cases are conditionally-supported, meaning we're
	  // permitted to reject them. We should consider doing so.
	  return getLangOpts().MSVCCompat ? VAK_MSVCUndefined : VAK_Undefined;
	}

	void Sema::checkVariadicArgument(const Expr *E, VariadicCallType CT) {
	// Don't allow one to pass an Objective-C interface to a vararg.
	const QualType &Ty = E->getType();
	VarArgKind VAK = isValidVarArgType(Ty);

	// Complain about passing non-POD types through varargs.
	switch (VAK) {
	case VAK_ValidInCXX11:
	DiagRuntimeBehavior(
	E->getLocStart(), nullptr,
	PDiag(diag::warn_cxx98_compat_pass_non_pod_arg_to_vararg)
	<< Ty << CT);
	// Fall through.
	case VAK_Valid:
	if (Ty->isRecordType()) {
	// This is unlikely to be what the user intended. If the class has a
	// 'c_str' member function, the user probably meant to call that.
	DiagRuntimeBehavior(E->getLocStart(), nullptr,
	PDiag(diag::warn_pass_class_arg_to_vararg)
	<< Ty << CT << hasCStrMethod(E) << ".c_str()");
	}
	break;

	case VAK_Undefined:
	case VAK_MSVCUndefined:
	DiagRuntimeBehavior(
	E->getLocStart(), nullptr,
	PDiag(diag::warn_cannot_pass_non_pod_arg_to_vararg)
	<< getLangOpts().CPlusPlus11 << Ty << CT);
	break;

	case VAK_Invalid:
	if (Ty->isObjCObjectType())
	DiagRuntimeBehavior(
	E->getLocStart(), nullptr,
	PDiag(diag::err_cannot_pass_objc_interface_to_vararg)
	<< Ty << CT);
	else
	Diag(E->getLocStart(), diag::err_cannot_pass_to_vararg)
	<< isa<InitListExpr>(E) << Ty << CT;
	break;
	}
	}

	/// DefaultVariadicArgumentPromotion - Like DefaultArgumentPromotion, but
	/// when the promoted type is classified as VAK_Undefined by
	/// isValidVarArgType(), the argument is replaced by
	/// '(__builtin_trap(), arg)' so that evaluating it traps.
	///
	/// \param CT    the kind of variadic call the argument is passed to.
	/// \param FDecl the callee declaration, may be null.
	/// \returns the promoted (possibly trap-wrapped) argument, or ExprError().
	ExprResult Sema::DefaultVariadicArgumentPromotion(Expr *E, VariadicCallType CT,
	                                                  FunctionDecl *FDecl) {
	  if (const BuiltinType *PlaceholderTy = E->getType()->getAsPlaceholderType()) {
	    // Strip the unbridged-cast placeholder expression off, if applicable.
	    if (PlaceholderTy->getKind() == BuiltinType::ARCUnbridgedCast &&
	        (CT == VariadicMethod ||
	         (FDecl && FDecl->hasAttr<CFAuditedTransferAttr>()))) {
	      E = stripARCUnbridgedCast(E);

	    // Otherwise, do normal placeholder checking.
	    } else {
	      ExprResult ExprRes = CheckPlaceholderExpr(E);
	      if (ExprRes.isInvalid())
	        return ExprError();
	      E = ExprRes.get();
	    }
	  }

	  ExprResult ExprRes = DefaultArgumentPromotion(E);
	  if (ExprRes.isInvalid())
	    return ExprError();
	  E = ExprRes.get();

	  // Diagnostics regarding non-POD argument types are
	  // emitted along with format string checking in Sema::CheckFunctionCall().
	  if (isValidVarArgType(E->getType()) == VAK_Undefined) {
	    // Turn this into a trap.
	    CXXScopeSpec SS;
	    SourceLocation TemplateKWLoc;
	    UnqualifiedId Name;
	    Name.setIdentifier(PP.getIdentifierInfo("__builtin_trap"),
	                       E->getLocStart());
	    ExprResult TrapFn = ActOnIdExpression(TUScope, SS, TemplateKWLoc,
	                                          Name, true, false);
	    if (TrapFn.isInvalid())
	      return ExprError();

	    ExprResult Call = ActOnCallExpr(TUScope, TrapFn.get(),
	                                    E->getLocStart(), None,
	                                    E->getLocEnd());
	    if (Call.isInvalid())
	      return ExprError();

	    // Sequence the trap call before the original argument.
	    ExprResult Comma = ActOnBinOp(TUScope, E->getLocStart(), tok::comma,
	                                  Call.get(), E);
	    if (Comma.isInvalid())
	      return ExprError();
	    return Comma.get();
	  }

	  // In C, the argument type must be complete.
	  if (!getLangOpts().CPlusPlus &&
	      RequireCompleteType(E->getExprLoc(), E->getType(),
	                          diag::err_call_incomplete_argument))
	    return ExprError();

	  return E;
	}

	/// \brief Converts an integer to complex float type.  Helper function of
	/// UsualArithmeticConversions()
	///
	/// \p ComplexExpr is accepted for symmetry with the callers but is not
	/// touched here.
	///
	/// \return true if \p IntTy is a complex or real floating type, in which case
	/// nothing is done; false otherwise, after \p IntExpr has been cast to
	/// \p ComplexTy (the cast is omitted when \p SkipCast is set).
	static bool handleIntegerToComplexFloatConversion(Sema &S, ExprResult &IntExpr,
	                                                  ExprResult &ComplexExpr,
	                                                  QualType IntTy,
	                                                  QualType ComplexTy,
	                                                  bool SkipCast) {
	  if (IntTy->isComplexType() || IntTy->isRealFloatingType()) return true;
	  if (SkipCast) return false;
	  if (IntTy->isIntegerType()) {
	    // int -> element floating type -> complex floating type.
	    QualType fpTy = cast<ComplexType>(ComplexTy)->getElementType();
	    IntExpr = S.ImpCastExprToType(IntExpr.get(), fpTy, CK_IntegralToFloating);
	    IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexTy,
	                                  CK_FloatingRealToComplex);
	  } else {
	    assert(IntTy->isComplexIntegerType());
	    IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexTy,
	                                  CK_IntegralComplexToFloatingComplex);
	  }
	  return false;
	}

	/// \brief Handle arithmetic conversion with complex types.  Helper function of
	/// UsualArithmeticConversions()
	///
	/// \return the type the operation is performed in. Operands are cast to it
	/// as needed, except that the LHS is left untouched when \p IsCompAssign is
	/// set.
	static QualType handleComplexFloatConversion(Sema &S, ExprResult &LHS,
	                                             ExprResult &RHS, QualType LHSType,
	                                             QualType RHSType,
	                                             bool IsCompAssign) {
	  // if we have an integer operand, the result is the complex type.
	  if (!handleIntegerToComplexFloatConversion(S, RHS, LHS, RHSType, LHSType,
	                                             /*skipCast*/false))
	    return LHSType;
	  if (!handleIntegerToComplexFloatConversion(S, LHS, RHS, LHSType, RHSType,
	                                             /*skipCast*/IsCompAssign))
	    return RHSType;

	  // This handles complex/complex, complex/float, or float/complex.
	  // When both operands are complex, the shorter operand is converted to the
	  // type of the longer, and that is the type of the result. This corresponds
	  // to what is done when combining two real floating-point operands.
	  // The fun begins when size promotion occurs across type domains.
	  // From H&S 6.3.4: When one operand is complex and the other is a real
	  // floating-point type, the less precise type is converted, within its
	  // real or complex domain, to the precision of the other type. For example,
	  // when combining a "long double" with a "double _Complex", the
	  // "double _Complex" is promoted to "long double _Complex".

	  // Compute the rank of the two types, regardless of whether they are complex.
	  int Order = S.Context.getFloatingTypeOrder(LHSType, RHSType);

	  auto *LHSComplexType = dyn_cast<ComplexType>(LHSType);
	  auto *RHSComplexType = dyn_cast<ComplexType>(RHSType);
	  QualType LHSElementType =
	      LHSComplexType ? LHSComplexType->getElementType() : LHSType;
	  QualType RHSElementType =
	      RHSComplexType ? RHSComplexType->getElementType() : RHSType;

	  QualType ResultType = S.Context.getComplexType(LHSElementType);
	  if (Order < 0) {
	    // Promote the precision of the LHS if not an assignment.
	    ResultType = S.Context.getComplexType(RHSElementType);
	    if (!IsCompAssign) {
	      if (LHSComplexType)
	        LHS =
	            S.ImpCastExprToType(LHS.get(), ResultType, CK_FloatingComplexCast);
	      else
	        LHS = S.ImpCastExprToType(LHS.get(), RHSElementType, CK_FloatingCast);
	    }
	  } else if (Order > 0) {
	    // Promote the precision of the RHS.
	    if (RHSComplexType)
	      RHS = S.ImpCastExprToType(RHS.get(), ResultType, CK_FloatingComplexCast);
	    else
	      RHS = S.ImpCastExprToType(RHS.get(), LHSElementType, CK_FloatingCast);
	  }
	  return ResultType;
	}

	/// \brief Handle arithmetic conversion between a real floating operand and an
	/// integer (or complex integer) operand.  Helper function of
	/// UsualArithmeticConversions()
	///
	/// \p ConvertFloat and \p ConvertInt control whether \p FloatExpr and
	/// \p IntExpr, respectively, may be rewritten with an implicit cast.
	/// \return \p FloatTy for a plain integer operand, otherwise the complex
	/// type whose element type is \p FloatTy.
	static QualType handleIntToFloatConversion(Sema &S, ExprResult &FloatExpr,
	                                           ExprResult &IntExpr,
	                                           QualType FloatTy, QualType IntTy,
	                                           bool ConvertFloat, bool ConvertInt) {
	  if (!IntTy->isIntegerType()) {
	    // Complex integer operand: both sides end up as the matching complex
	    // floating type.
	    assert(IntTy->isComplexIntegerType());
	    QualType ComplexFloatTy = S.Context.getComplexType(FloatTy);

	    // _Complex int -> _Complex float
	    if (ConvertInt) {
	      IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexFloatTy,
	                                    CK_IntegralComplexToFloatingComplex);
	    }

	    // float -> _Complex float
	    if (ConvertFloat) {
	      FloatExpr = S.ImpCastExprToType(FloatExpr.get(), ComplexFloatTy,
	                                      CK_FloatingRealToComplex);
	    }

	    return ComplexFloatTy;
	  }

	  // Plain integer operand: bring it to the floating point type.
	  if (ConvertInt) {
	    IntExpr = S.ImpCastExprToType(IntExpr.get(), FloatTy,
	                                  CK_IntegralToFloating);
	  }
	  return FloatTy;
	}

	/// \brief Handle arithmetic conversion with floating point types.  Helper
	/// function of UsualArithmeticConversions()
	///
	/// At least one of \p LHSType / \p RHSType is expected to be a real floating
	/// type. The LHS expression is never rewritten when \p IsCompAssign is set.
	/// \return the type the operation is performed in.
	static QualType handleFloatConversion(Sema &S, ExprResult &LHS,
	                                      ExprResult &RHS, QualType LHSType,
	                                      QualType RHSType, bool IsCompAssign) {
	  bool LHSFloat = LHSType->isRealFloatingType();
	  bool RHSFloat = RHSType->isRealFloatingType();

	  // If we have two real floating types, convert the smaller operand
	  // to the bigger result.
	  if (LHSFloat && RHSFloat) {
	    int order = S.Context.getFloatingTypeOrder(LHSType, RHSType);
	    if (order > 0) {
	      RHS = S.ImpCastExprToType(RHS.get(), LHSType, CK_FloatingCast);
	      return LHSType;
	    }

	    assert(order < 0 && "illegal float comparison");
	    if (!IsCompAssign)
	      LHS = S.ImpCastExprToType(LHS.get(), RHSType, CK_FloatingCast);
	    return RHSType;
	  }

	  if (LHSFloat) {
	    // Half FP has to be promoted to float unless it is natively supported
	    if (LHSType->isHalfType() && !S.getLangOpts().NativeHalfType)
	      LHSType = S.Context.FloatTy;

	    // The float operand is the LHS, so it is the one protected by
	    // IsCompAssign.
	    return handleIntToFloatConversion(S, LHS, RHS, LHSType, RHSType,
	                                      /*ConvertFloat=*/!IsCompAssign,
	                                      /*ConvertInt=*/ true);
	  }
	  assert(RHSFloat);
	  // The float operand is the RHS; the integer operand is the LHS and is the
	  // one protected by IsCompAssign.
	  return handleIntToFloatConversion(S, RHS, LHS, RHSType, LHSType,
	                                    /*ConvertFloat=*/ true,
	                                    /*ConvertInt=*/!IsCompAssign);
	}

	/// \brief Detect attempts to convert between __float128 and long double if
	/// there is no support for such conversion. Helper function of
	/// UsualArithmeticConversions().
	///
	/// \return true if \p LHSType and \p RHSType are (possibly complex)
	/// __float128 and long double with different representations and long
	/// double is not IEEE double; false otherwise. No diagnostic is emitted
	/// here.
	static bool unsupportedTypeConversion(const Sema &S, QualType LHSType,
	                                      QualType RHSType) {
	  /*  No issue converting if at least one of the types is not a floating point
	      type or the two types have the same rank.
	  */
	  if (!LHSType->isFloatingType() || !RHSType->isFloatingType() ||
	      S.Context.getFloatingTypeOrder(LHSType, RHSType) == 0)
	    return false;

	  assert(LHSType->isFloatingType() && RHSType->isFloatingType() &&
	         "The remaining types must be floating point types.");

	  // Compare element types so that complex operands are handled as well.
	  auto *LHSComplex = LHSType->getAs<ComplexType>();
	  auto *RHSComplex = RHSType->getAs<ComplexType>();

	  QualType LHSElemType = LHSComplex ?
	    LHSComplex->getElementType() : LHSType;
	  QualType RHSElemType = RHSComplex ?
	    RHSComplex->getElementType() : RHSType;

	  // No issue if the two types have the same representation
	  if (&S.Context.getFloatTypeSemantics(LHSElemType) ==
	      &S.Context.getFloatTypeSemantics(RHSElemType))
	    return false;

	  bool Float128AndLongDouble = (LHSElemType == S.Context.Float128Ty &&
	                                RHSElemType == S.Context.LongDoubleTy);
	  Float128AndLongDouble |= (LHSElemType == S.Context.LongDoubleTy &&
	                            RHSElemType == S.Context.Float128Ty);

	  /* We've handled the situation where __float128 and long double have the same
	     representation. The only other allowable conversion is if long double is
	     really just double.
	  */
	  return Float128AndLongDouble &&
	    (&S.Context.getFloatTypeSemantics(S.Context.LongDoubleTy) !=
	     &llvm::APFloat::IEEEdouble());
	}

	typedef ExprResult PerformCastFn(Sema &S, Expr *operand, QualType toType);

	namespace {
	/// These helper callbacks are placed in an anonymous namespace to
	/// permit their use as function template parameters.
	ExprResult doIntegralCast(Sema &S, Expr *op, QualType toType) {
	return S.ImpCastExprToType(op, toType, CK_IntegralCast);
	}

	ExprResult doComplexIntegralCast(Sema &S, Expr *op, QualType toType) {
	return S.ImpCastExprToType(op, S.Context.getComplexType(toType),
	CK_IntegralComplexCast);
	}
	}

	/// \brief Handle integer arithmetic conversions.  Helper function of
	/// UsualArithmeticConversions()
	///
	/// Picks the common type of two integer types following C99 6.3.1.8 and
	/// casts the operands to it through \p doLHSCast / \p doRHSCast. The LHS is
	/// never cast when \p IsCompAssign is set.
	template <PerformCastFn doLHSCast, PerformCastFn doRHSCast>
	static QualType handleIntegerConversion(Sema &S, ExprResult &LHS,
	                                        ExprResult &RHS, QualType LHSType,
	                                        QualType RHSType, bool IsCompAssign) {
	  // The rules for this case are in C99 6.3.1.8
	  const int Rank = S.Context.getIntegerTypeOrder(LHSType, RHSType);
	  const bool LHSIsSigned = LHSType->hasSignedIntegerRepresentation();
	  const bool RHSIsSigned = RHSType->hasSignedIntegerRepresentation();

	  // First decide whether the LHS type is the common type (PickLHS) or the
	  // RHS type is; the one case that needs a third type returns directly.
	  bool PickLHS;
	  if (LHSIsSigned == RHSIsSigned) {
	    // Same signedness; use the higher-ranked type
	    PickLHS = Rank >= 0;
	  } else if (Rank != (LHSIsSigned ? 1 : -1)) {
	    // The unsigned type has greater than or equal rank to the
	    // signed type, so use the unsigned type
	    PickLHS = RHSIsSigned;
	  } else if (S.Context.getIntWidth(LHSType) !=
	             S.Context.getIntWidth(RHSType)) {
	    // The two types are different widths; if we are here, that
	    // means the signed type is larger than the unsigned type, so
	    // use the signed type.
	    PickLHS = LHSIsSigned;
	  } else {
	    // The signed type is higher-ranked than the unsigned type,
	    // but isn't actually any bigger (like unsigned int and long
	    // on most 32-bit systems).  Use the unsigned type corresponding
	    // to the signed type.
	    QualType UnsignedTy = S.Context.getCorrespondingUnsignedType(
	        LHSIsSigned ? LHSType : RHSType);
	    RHS = (*doRHSCast)(S, RHS.get(), UnsignedTy);
	    if (!IsCompAssign)
	      LHS = (*doLHSCast)(S, LHS.get(), UnsignedTy);
	    return UnsignedTy;
	  }

	  if (PickLHS) {
	    RHS = (*doRHSCast)(S, RHS.get(), LHSType);
	    return LHSType;
	  }

	  if (!IsCompAssign)
	    LHS = (*doLHSCast)(S, LHS.get(), RHSType);
	  return RHSType;
	}

	/// \brief Handle conversions with GCC complex int extension.  Helper function
	/// of UsualArithmeticConversions()
	///
	/// At least one operand type is expected to be a complex integer type.
	/// \return the complex integer type the operation is performed in; the LHS
	/// is not cast when \p IsCompAssign is set.
	static QualType handleComplexIntConversion(Sema &S, ExprResult &LHS,
	                                           ExprResult &RHS, QualType LHSType,
	                                           QualType RHSType,
	                                           bool IsCompAssign) {
	  const ComplexType *LHSCplx = LHSType->getAsComplexIntegerType();
	  const ComplexType *RHSCplx = RHSType->getAsComplexIntegerType();

	  // Only the RHS is complex: convert on the element level, then lift the
	  // real LHS into the complex domain.
	  if (!LHSCplx) {
	    assert(RHSCplx);

	    QualType RHSElt = RHSCplx->getElementType();
	    QualType CommonElt =
	      handleIntegerConversion<doIntegralCast, doComplexIntegralCast>
	        (S, LHS, RHS, LHSType, RHSElt, IsCompAssign);
	    QualType ResultTy = S.Context.getComplexType(CommonElt);

	    if (!IsCompAssign)
	      LHS = S.ImpCastExprToType(LHS.get(), ResultTy,
	                                CK_IntegralRealToComplex);
	    return ResultTy;
	  }

	  // Only the LHS is complex: convert on the element level, then lift the
	  // real RHS into the complex domain.
	  if (!RHSCplx) {
	    QualType LHSElt = LHSCplx->getElementType();
	    QualType CommonElt =
	      handleIntegerConversion<doComplexIntegralCast, doIntegralCast>
	        (S, LHS, RHS, LHSElt, RHSType, IsCompAssign);
	    QualType ResultTy = S.Context.getComplexType(CommonElt);
	    RHS = S.ImpCastExprToType(RHS.get(), ResultTy,
	                              CK_IntegralRealToComplex);
	    return ResultTy;
	  }

	  // Both operands are complex integers.
	  QualType LHSElt = LHSCplx->getElementType();
	  QualType RHSElt = RHSCplx->getElementType();
	  QualType CommonElt =
	    handleIntegerConversion<doComplexIntegralCast, doComplexIntegralCast>
	      (S, LHS, RHS, LHSElt, RHSElt, IsCompAssign);
	  return S.Context.getComplexType(CommonElt);
	}

	/// UsualArithmeticConversions - Performs various conversions that are common to
	/// binary operators (C99 6.3.1.8). If either operand does not have
	/// arithmetic type (or a conversion fails, or the combination is an
	/// unsupported __float128 / long double mix), this routine returns a null
	/// QualType. The client is responsible for emitting appropriate error
	/// diagnostics.
	///
	/// \param IsCompAssign when set, the LHS expression is left unconverted and
	/// only its type participates in computing the result.
	/// \return the common type of the operation.
	QualType Sema::UsualArithmeticConversions(ExprResult &LHS, ExprResult &RHS,
	                                          bool IsCompAssign) {
	  if (!IsCompAssign) {
	    LHS = UsualUnaryConversions(LHS.get());
	    if (LHS.isInvalid())
	      return QualType();
	  }

	  RHS = UsualUnaryConversions(RHS.get());
	  if (RHS.isInvalid())
	    return QualType();

	  // For conversion purposes, we ignore any qualifiers.
	  // For example, "const float" and "float" are equivalent.
	  QualType LHSType =
	    Context.getCanonicalType(LHS.get()->getType()).getUnqualifiedType();
	  QualType RHSType =
	    Context.getCanonicalType(RHS.get()->getType()).getUnqualifiedType();

	  // For conversion purposes, we ignore any atomic qualifier on the LHS.
	  if (const AtomicType *AtomicLHS = LHSType->getAs<AtomicType>())
	    LHSType = AtomicLHS->getValueType();

	  // If both types are identical, no conversion is needed.
	  if (LHSType == RHSType)
	    return LHSType;

	  // If either side is a non-arithmetic type (e.g. a pointer), we are done.
	  // The caller can deal with this (e.g. pointer + int).
	  if (!LHSType->isArithmeticType() || !RHSType->isArithmeticType())
	    return QualType();

	  // Apply unary and bitfield promotions to the LHS's type.
	  QualType LHSUnpromotedType = LHSType;
	  if (LHSType->isPromotableIntegerType())
	    LHSType = Context.getPromotedIntegerType(LHSType);
	  QualType LHSBitfieldPromoteTy = Context.isPromotableBitField(LHS.get());
	  if (!LHSBitfieldPromoteTy.isNull())
	    LHSType = LHSBitfieldPromoteTy;
	  if (LHSType != LHSUnpromotedType && !IsCompAssign)
	    LHS = ImpCastExprToType(LHS.get(), LHSType, CK_IntegralCast);

	  // If both types are identical, no conversion is needed.
	  if (LHSType == RHSType)
	    return LHSType;

	  // At this point, we have two different arithmetic types.

	  // Diagnose attempts to convert between __float128 and long double where
	  // such conversions currently can't be handled.
	  if (unsupportedTypeConversion(*this, LHSType, RHSType))
	    return QualType();

	  // Handle complex types first (C99 6.3.1.8p1).
	  if (LHSType->isComplexType() || RHSType->isComplexType())
	    return handleComplexFloatConversion(*this, LHS, RHS, LHSType, RHSType,
	                                        IsCompAssign);

	  // Now handle "real" floating types (i.e. float, double, long double).
	  if (LHSType->isRealFloatingType() || RHSType->isRealFloatingType())
	    return handleFloatConversion(*this, LHS, RHS, LHSType, RHSType,
	                                 IsCompAssign);

	  // Handle GCC complex int extension.
	  if (LHSType->isComplexIntegerType() || RHSType->isComplexIntegerType())
	    return handleComplexIntConversion(*this, LHS, RHS, LHSType, RHSType,
	                                      IsCompAssign);

	  // Finally, we have two differing integer types.
	  return handleIntegerConversion<doIntegralCast, doIntegralCast>
	           (*this, LHS, RHS, LHSType, RHSType, IsCompAssign);
	}


	//===----------------------------------------------------------------------===//
	// Semantic Analysis for various Expression Types
	//===----------------------------------------------------------------------===//


	/// Parser entry point for a generic selection expression: resolve each
	/// parsed association type to a TypeSourceInfo and hand off to
	/// CreateGenericSelectionExpr().
	///
	/// \param ArgTypes one parsed type per association; an empty entry is passed
	/// on as a null TypeSourceInfo.
	/// \param ArgExprs the association expressions, parallel to \p ArgTypes.
	ExprResult
	Sema::ActOnGenericSelectionExpr(SourceLocation KeyLoc,
	                                SourceLocation DefaultLoc,
	                                SourceLocation RParenLoc,
	                                Expr *ControllingExpr,
	                                ArrayRef<ParsedType> ArgTypes,
	                                ArrayRef<Expr *> ArgExprs) {
	  unsigned NumAssocs = ArgTypes.size();
	  assert(NumAssocs == ArgExprs.size());

	  // One slot per association, null until resolved. The vector owns its
	  // storage, so no manual delete[] is needed on any return path.
	  SmallVector<TypeSourceInfo *, 4> Types(NumAssocs, nullptr);
	  for (unsigned i = 0; i < NumAssocs; ++i) {
	    if (ArgTypes[i])
	      (void) GetTypeFromParser(ArgTypes[i], &Types[i]);
	  }

	  return CreateGenericSelectionExpr(KeyLoc, DefaultLoc, RParenLoc,
	                                    ControllingExpr,
	                                    Types,
	                                    ArgExprs);
	}

	/// Build a generic selection expression (C11 6.5.1.1) from resolved
	/// association types.
	///
	/// \param Types one entry per association; a null entry marks the default
	/// association.
	/// \param Exprs the association expressions, parallel to \p Types.
	/// \returns the new GenericSelectionExpr, or ExprError() when converting the
	/// controlling expression fails, an association type is rejected or
	/// duplicated, more than one association matches, or none matches and there
	/// is no default.
	ExprResult
	Sema::CreateGenericSelectionExpr(SourceLocation KeyLoc,
	SourceLocation DefaultLoc,
	SourceLocation RParenLoc,
	Expr *ControllingExpr,
	ArrayRef<TypeSourceInfo *> Types,
	ArrayRef<Expr *> Exprs) {
	unsigned NumAssocs = Types.size();
	assert(NumAssocs == Exprs.size());

	// Decay and strip qualifiers for the controlling expression type, and handle
	// placeholder type replacement. See committee discussion from WG14 DR423.
	{
	// The conversion is performed inside an unevaluated context.
	EnterExpressionEvaluationContext Unevaluated(
	*this, Sema::ExpressionEvaluationContext::Unevaluated);
	ExprResult R = DefaultFunctionArrayLvalueConversion(ControllingExpr);
	if (R.isInvalid())
	return ExprError();
	ControllingExpr = R.get();
	}

	// The controlling expression is an unevaluated operand, so side effects are
	// likely unintended.
	if (!inTemplateInstantiation() &&
	ControllingExpr->HasSideEffects(Context, false))
	Diag(ControllingExpr->getExprLoc(),
	diag::warn_side_effects_unevaluated_context);

	bool TypeErrorFound = false,
	IsResultDependent = ControllingExpr->isTypeDependent(),
	ContainsUnexpandedParameterPack
	= ControllingExpr->containsUnexpandedParameterPack();

	// Validate every association and collect dependence / pack information.
	// All type errors are reported before bailing out below.
	for (unsigned i = 0; i < NumAssocs; ++i) {
	if (Exprs[i]->containsUnexpandedParameterPack())
	ContainsUnexpandedParameterPack = true;

	// A null entry is the default association and has no type to check.
	if (Types[i]) {
	if (Types[i]->getType()->containsUnexpandedParameterPack())
	ContainsUnexpandedParameterPack = true;

	if (Types[i]->getType()->isDependentType()) {
	IsResultDependent = true;
	} else {
	// C11 6.5.1.1p2 "The type name in a generic association shall specify a
	// complete object type other than a variably modified type."
	unsigned D = 0;
	if (Types[i]->getType()->isIncompleteType())
	D = diag::err_assoc_type_incomplete;
	else if (!Types[i]->getType()->isObjectType())
	D = diag::err_assoc_type_nonobject;
	else if (Types[i]->getType()->isVariablyModifiedType())
	D = diag::err_assoc_type_variably_modified;

	if (D != 0) {
	Diag(Types[i]->getTypeLoc().getBeginLoc(), D)
	<< Types[i]->getTypeLoc().getSourceRange()
	<< Types[i]->getType();
	TypeErrorFound = true;
	}

	// C11 6.5.1.1p2 "No two generic associations in the same generic
	// selection shall specify compatible types."
	// Only later, non-dependent associations are compared, so each
	// conflicting pair is reported once.
	for (unsigned j = i+1; j < NumAssocs; ++j)
	if (Types[j] && !Types[j]->getType()->isDependentType() &&
	Context.typesAreCompatible(Types[i]->getType(),
	Types[j]->getType())) {
	Diag(Types[j]->getTypeLoc().getBeginLoc(),
	diag::err_assoc_compatible_types)
	<< Types[j]->getTypeLoc().getSourceRange()
	<< Types[j]->getType()
	<< Types[i]->getType();
	Diag(Types[i]->getTypeLoc().getBeginLoc(),
	diag::note_compat_assoc)
	<< Types[i]->getTypeLoc().getSourceRange()
	<< Types[i]->getType();
	TypeErrorFound = true;
	}
	}
	}
	}
	if (TypeErrorFound)
	return ExprError();

	// If we determined that the generic selection is result-dependent, don't
	// try to compute the result expression.
	if (IsResultDependent)
	return new (Context) GenericSelectionExpr(
	Context, KeyLoc, ControllingExpr, Types, Exprs, DefaultLoc, RParenLoc,
	ContainsUnexpandedParameterPack);

	// Find the default association (if any) and every association whose type
	// is compatible with the controlling expression. -1U means "no default".
	SmallVector<unsigned, 1> CompatIndices;
	unsigned DefaultIndex = -1U;
	for (unsigned i = 0; i < NumAssocs; ++i) {
	if (!Types[i])
	DefaultIndex = i;
	else if (Context.typesAreCompatible(ControllingExpr->getType(),
	Types[i]->getType()))
	CompatIndices.push_back(i);
	}

	// C11 6.5.1.1p2 "The controlling expression of a generic selection shall have
	// type compatible with at most one of the types named in its generic
	// association list."
	if (CompatIndices.size() > 1) {
	// We strip parens here because the controlling expression is typically
	// parenthesized in macro definitions.
	ControllingExpr = ControllingExpr->IgnoreParens();
	Diag(ControllingExpr->getLocStart(), diag::err_generic_sel_multi_match)
	<< ControllingExpr->getSourceRange() << ControllingExpr->getType()
	<< (unsigned) CompatIndices.size();
	// Point at each of the matching associations.
	for (unsigned I : CompatIndices) {
	Diag(Types[I]->getTypeLoc().getBeginLoc(),
	diag::note_compat_assoc)
	<< Types[I]->getTypeLoc().getSourceRange()
	<< Types[I]->getType();
	}
	return ExprError();
	}

	// C11 6.5.1.1p2 "If a generic selection has no default generic association,
	// its controlling expression shall have type compatible with exactly one of
	// the types named in its generic association list."
	if (DefaultIndex == -1U && CompatIndices.size() == 0) {
	// We strip parens here because the controlling expression is typically
	// parenthesized in macro definitions.
	ControllingExpr = ControllingExpr->IgnoreParens();
	Diag(ControllingExpr->getLocStart(), diag::err_generic_sel_no_match)
	<< ControllingExpr->getSourceRange() << ControllingExpr->getType();
	return ExprError();
	}

	// C11 6.5.1.1p3 "If a generic selection has a generic association with a
	// type name that is compatible with the type of the controlling expression,
	// then the result expression of the generic selection is the expression
	// in that generic association. Otherwise, the result expression of the
	// generic selection is the expression in the default generic association."
	unsigned ResultIndex =
	CompatIndices.size() ? CompatIndices[0] : DefaultIndex;

	return new (Context) GenericSelectionExpr(
	Context, KeyLoc, ControllingExpr, Types, Exprs, DefaultLoc, RParenLoc,
	ContainsUnexpandedParameterPack, ResultIndex);
	}

	/// Compute the SourceLocation of a ud-suffix from the location of the token
	/// containing it and the character offset of the suffix inside that token.
	static SourceLocation getUDSuffixLoc(Sema &S, SourceLocation TokLoc,
	                                     unsigned Offset) {
	  auto &SrcMgr = S.getSourceManager();
	  const auto &Opts = S.getLangOpts();
	  return Lexer::AdvanceToTokenCharacter(TokLoc, Offset, SrcMgr, Opts);
	}

	/// BuildCookedLiteralOperatorCall - A user-defined literal was found.  Look up
	/// the corresponding cooked (non-raw) literal operator, and build a call to it.
	///
	/// \param Scope scope in which the literal operator is looked up.
	/// \param UDSuffix identifier of the ud-suffix naming the literal operator.
	/// \param UDSuffixLoc location used for the operator name and the lookup.
	/// \param Args at most two argument expressions; an argument of array type is
	/// looked up using its decayed type.
	/// \param LitEndLoc forwarded to Sema::BuildLiteralOperatorCall.
	/// \return ExprError() if the lookup yields LOLR_Error, otherwise the result
	/// of Sema::BuildLiteralOperatorCall.
	static ExprResult BuildCookedLiteralOperatorCall(Sema &S, Scope *Scope,
	                                                 IdentifierInfo *UDSuffix,
	                                                 SourceLocation UDSuffixLoc,
	                                                 ArrayRef<Expr*> Args,
	                                                 SourceLocation LitEndLoc) {
	  assert(Args.size() <= 2 && "too many arguments for literal operator");

	  // Collect the argument types, decaying arrays to pointers.
	  QualType ArgTy[2];
	  for (unsigned ArgIdx = 0; ArgIdx != Args.size(); ++ArgIdx) {
	    ArgTy[ArgIdx] = Args[ArgIdx]->getType();
	    if (ArgTy[ArgIdx]->isArrayType())
	      ArgTy[ArgIdx] = S.Context.getArrayDecayedType(ArgTy[ArgIdx]);
	  }

	  DeclarationName OpName =
	    S.Context.DeclarationNames.getCXXLiteralOperatorName(UDSuffix);
	  DeclarationNameInfo OpNameInfo(OpName, UDSuffixLoc);
	  OpNameInfo.setCXXLiteralOperatorNameLoc(UDSuffixLoc);

	  // Only cooked operators are acceptable here: raw and template forms are
	  // all disabled in the lookup.
	  LookupResult R(S, OpName, UDSuffixLoc, Sema::LookupOrdinaryName);
	  if (S.LookupLiteralOperator(Scope, R, llvm::makeArrayRef(ArgTy, Args.size()),
	                              /*AllowRaw*/false, /*AllowTemplate*/false,
	                              /*AllowStringTemplate*/false) == Sema::LOLR_Error)
	    return ExprError();

	  return S.BuildLiteralOperatorCall(R, OpNameInfo, Args, LitEndLoc);
	}

	/// ActOnStringLiteral - The specified tokens were lexed as pasted string
	/// fragments (e.g. "foo" "bar" L"baz").  The result string has to handle string
	/// concatenation ([C99 5.1.1.2, translation phase #6]), so it may come from
	/// multiple tokens.  However, the common case is that StringToks points to one
	/// string.
	///
	/// \param StringToks the string tokens; must not be empty.
	/// \param UDLScope scope used to look up the literal operator when the literal
	/// carries a ud-suffix.  If null, a ud-suffix is diagnosed with
	/// err_invalid_string_udl.
	/// \return the StringLiteral itself when there is no ud-suffix, the literal
	/// operator call when there is one, or ExprError() on failure.
	ExprResult
	Sema::ActOnStringLiteral(ArrayRef<Token> StringToks, Scope *UDLScope) {
	  assert(!StringToks.empty() && "Must have at least one string!");

	  StringLiteralParser Literal(StringToks, PP);
	  if (Literal.hadError)
	    return ExprError();

	  SmallVector<SourceLocation, 4> StringTokLocs;
	  for (const Token &Tok : StringToks)
	    StringTokLocs.push_back(Tok.getLocation());

	  // Pick the character type and string kind from the literal's encoding.
	  QualType CharTy = Context.CharTy;
	  StringLiteral::StringKind Kind = StringLiteral::Ascii;
	  if (Literal.isWide()) {
	    CharTy = Context.getWideCharType();
	    Kind = StringLiteral::Wide;
	  } else if (Literal.isUTF8()) {
	    Kind = StringLiteral::UTF8;
	  } else if (Literal.isUTF16()) {
	    CharTy = Context.Char16Ty;
	    Kind = StringLiteral::UTF16;
	  } else if (Literal.isUTF32()) {
	    CharTy = Context.Char32Ty;
	    Kind = StringLiteral::UTF32;
	  } else if (Literal.isPascal()) {
	    CharTy = Context.UnsignedCharTy;
	  }

	  QualType CharTyConst = CharTy;
	  // A C++ string literal has a const-qualified element type (C++ 2.13.4p1).
	  if (getLangOpts().CPlusPlus || getLangOpts().ConstStrings)
	    CharTyConst.addConst();

	  // Get an array type for the string, according to C99 6.4.5.  This includes
	  // the nul terminator character as well as the string length for pascal
	  // strings.
	  QualType StrTy = Context.getConstantArrayType(CharTyConst,
	                                 llvm::APInt(32, Literal.GetNumStringChars()+1),
	                                 ArrayType::Normal, 0);

	  // OpenCL v1.1 s6.5.3: a string literal is in the constant address space.
	  if (getLangOpts().OpenCL) {
	    StrTy = Context.getAddrSpaceQualType(StrTy, LangAS::opencl_constant);
	  }

	  // Pass &StringTokLocs[0], StringTokLocs.size() to factory!
	  StringLiteral *Lit = StringLiteral::Create(Context, Literal.GetString(),
	                                             Kind, Literal.Pascal, StrTy,
	                                             &StringTokLocs[0],
	                                             StringTokLocs.size());
	  if (Literal.getUDSuffix().empty())
	    return Lit;

	  // We're building a user-defined literal.
	  IdentifierInfo *UDSuffix = &Context.Idents.get(Literal.getUDSuffix());
	  SourceLocation UDSuffixLoc =
	    getUDSuffixLoc(*this, StringTokLocs[Literal.getUDSuffixToken()],
	                   Literal.getUDSuffixOffset());

	  // Make sure we're allowed user-defined literals here.
	  if (!UDLScope)
	    return ExprError(Diag(UDSuffixLoc, diag::err_invalid_string_udl));

	  // C++11 [lex.ext]p5: The literal L is treated as a call of the form
	  //   operator "" X (str, len)
	  QualType SizeType = Context.getSizeType();

	  DeclarationName OpName =
	    Context.DeclarationNames.getCXXLiteralOperatorName(UDSuffix);
	  DeclarationNameInfo OpNameInfo(OpName, UDSuffixLoc);
	  OpNameInfo.setCXXLiteralOperatorNameLoc(UDSuffixLoc);

	  QualType ArgTy[] = {
	    Context.getArrayDecayedType(StrTy), SizeType
	  };

	  LookupResult R(*this, OpName, UDSuffixLoc, LookupOrdinaryName);
	  switch (LookupLiteralOperator(UDLScope, R, ArgTy,
	                                /*AllowRaw*/false, /*AllowTemplate*/false,
	                                /*AllowStringTemplate*/true)) {

	  case LOLR_Cooked: {
	    // Call the operator with the string and its length as arguments.
	    llvm::APInt Len(Context.getIntWidth(SizeType), Literal.GetNumStringChars());
	    IntegerLiteral *LenArg = IntegerLiteral::Create(Context, Len, SizeType,
	                                                    StringTokLocs[0]);
	    Expr *Args[] = { Lit, LenArg };

	    return BuildLiteralOperatorCall(R, OpNameInfo, Args, StringTokLocs.back());
	  }

	  case LOLR_StringTemplate: {
	    // Pass the character type followed by every code unit of the literal as
	    // explicit template arguments; the call itself takes no arguments.
	    TemplateArgumentListInfo ExplicitArgs;

	    unsigned CharBits = Context.getIntWidth(CharTy);
	    bool CharIsUnsigned = CharTy->isUnsignedIntegerType();
	    llvm::APSInt Value(CharBits, CharIsUnsigned);

	    TemplateArgument TypeArg(CharTy);
	    TemplateArgumentLocInfo TypeArgInfo(Context.getTrivialTypeSourceInfo(CharTy));
	    ExplicitArgs.addArgument(TemplateArgumentLoc(TypeArg, TypeArgInfo));

	    for (unsigned I = 0, N = Lit->getLength(); I != N; ++I) {
	      Value = Lit->getCodeUnit(I);
	      TemplateArgument Arg(Context, Value, CharTy);
	      TemplateArgumentLocInfo ArgInfo;
	      ExplicitArgs.addArgument(TemplateArgumentLoc(Arg, ArgInfo));
	    }
	    return BuildLiteralOperatorCall(R, OpNameInfo, None, StringTokLocs.back(),
	                                    &ExplicitArgs);
	  }
	  // Raw and numeric-template operators were disabled in the lookup above.
	  case LOLR_Raw:
	  case LOLR_Template:
	    llvm_unreachable("unexpected literal operator lookup result");
	  case LOLR_Error:
	    return ExprError();
	  }
	  llvm_unreachable("unexpected literal operator lookup result");
	}

	/// Convenience overload: wraps the declaration's name and \p Loc in a
	/// DeclarationNameInfo and forwards to the DeclarationNameInfo-based
	/// BuildDeclRefExpr overload.
	ExprResult
	Sema::BuildDeclRefExpr(ValueDecl *D, QualType Ty, ExprValueKind VK,
	                       SourceLocation Loc,
	                       const CXXScopeSpec *SS) {
	  return BuildDeclRefExpr(D, Ty, VK,
	                          DeclarationNameInfo(D->getDeclName(), Loc), SS);
	}

	/// BuildDeclRefExpr - Build an expression that references a
	/// declaration that does not require a closure capture.
	///
	/// \param D the declaration being referenced.
	/// \param Ty type of the resulting expression.
	/// \param VK value kind of the resulting expression.
	/// \param NameInfo name and location of the reference.
	/// \param SS optional nested-name-specifier; may be null.
	/// \param FoundD forwarded to DeclRefExpr::Create.
	/// \param TemplateArgs explicit template arguments; must be null unless
	/// \p D is a VarTemplateSpecializationDecl.
	ExprResult
	Sema::BuildDeclRefExpr(ValueDecl *D, QualType Ty, ExprValueKind VK,
	                       const DeclarationNameInfo &NameInfo,
	                       const CXXScopeSpec *SS, NamedDecl *FoundD,
	                       const TemplateArgumentListInfo *TemplateArgs) {
	  bool RefersToCapturedVariable =
	      isa<VarDecl>(D) &&
	      NeedToCaptureVariable(cast<VarDecl>(D), NameInfo.getLoc());

	  DeclRefExpr *E;
	  if (isa<VarTemplateSpecializationDecl>(D)) {
	    VarTemplateSpecializationDecl *VarSpec =
	        cast<VarTemplateSpecializationDecl>(D);

	    E = DeclRefExpr::Create(Context, SS ? SS->getWithLocInContext(Context)
	                                        : NestedNameSpecifierLoc(),
	                            VarSpec->getTemplateKeywordLoc(), D,
	                            RefersToCapturedVariable, NameInfo.getLoc(), Ty, VK,
	                            FoundD, TemplateArgs);
	  } else {
	    assert(!TemplateArgs && "No template arguments for non-variable"
	                            " template specialization references");
	    E = DeclRefExpr::Create(Context, SS ? SS->getWithLocInContext(Context)
	                                        : NestedNameSpecifierLoc(),
	                            SourceLocation(), D, RefersToCapturedVariable,
	                            NameInfo, Ty, VK, FoundD);
	  }

	  MarkDeclRefReferenced(E);

	  // Record uses of __weak variables, unless the repeated-use warning is
	  // ignored at this location.
	  if (getLangOpts().ObjCWeak && isa<VarDecl>(D) &&
	      Ty.getObjCLifetime() == Qualifiers::OCL_Weak &&
	      !Diags.isIgnored(diag::warn_arc_repeated_use_of_weak, E->getLocStart()))
	      recordUseOfEvaluatedWeak(E);

	  // A referenced field (directly or through an indirect field) is no longer
	  // an unused private field.
	  FieldDecl *FD = dyn_cast<FieldDecl>(D);
	  if (IndirectFieldDecl *IFD = dyn_cast<IndirectFieldDecl>(D))
	    FD = IFD->getAnonField();
	  if (FD) {
	    UnusedPrivateFields.remove(FD);
	    // Just in case we're building an illegal pointer-to-member.
	    if (FD->isBitField())
	      E->setObjectKind(OK_BitField);
	  }

	  // C++ [expr.prim]/8: The expression [...] is a bit-field if the identifier
	  // designates a bit-field.
	  if (auto *BD = dyn_cast<BindingDecl>(D))
	    if (auto *BE = BD->getBinding())
	      E->setObjectKind(BE->getObjectKind());

	  return E;
	}

	/// Decomposes the given name into a DeclarationNameInfo, its location, and
	/// possibly a list of template arguments.
	///
	/// If this produces template arguments, it is permitted to call
	/// DecomposeTemplateName.
	///
	/// This actually loses a lot of source location information for
	/// non-standard name kinds; we should consider preserving that in
	/// some way.
	void
	Sema::DecomposeUnqualifiedId(const UnqualifiedId &Id,
	TemplateArgumentListInfo &Buffer,
	DeclarationNameInfo &NameInfo,
	const TemplateArgumentListInfo *&TemplateArgs) {
	if (Id.getKind() == UnqualifiedId::IK_TemplateId) {
	Buffer.setLAngleLoc(Id.TemplateId->LAngleLoc);
	Buffer.setRAngleLoc(Id.TemplateId->RAngleLoc);

	ASTTemplateArgsPtr TemplateArgsPtr(Id.TemplateId->getTemplateArgs(),
	Id.TemplateId->NumArgs);
	translateTemplateArguments(TemplateArgsPtr, Buffer);

	TemplateName TName = Id.TemplateId->Template.get();
	SourceLocation TNameLoc = Id.TemplateId->TemplateNameLoc;
	NameInfo = Context.getNameForTemplate(TName, TNameLoc);
	TemplateArgs = &Buffer;
	} else {
	NameInfo = GetNameFromUnqualifiedId(Id);
	TemplateArgs = nullptr;
	}
	}

	static void emitEmptyLookupTypoDiagnostic(
	const TypoCorrection &TC, Sema &SemaRef, const CXXScopeSpec &SS,
	DeclarationName Typo, SourceLocation TypoLoc, ArrayRef<Expr *> Args,
	unsigned DiagnosticID, unsigned DiagnosticSuggestID) {
	DeclContext *Ctx =
	SS.isEmpty() ? nullptr : SemaRef.computeDeclContext(SS, false);
	if (!TC) {
	// Emit a special diagnostic for failed member lookups.
	// FIXME: computing the declaration context might fail here (?)
	if (Ctx)
	SemaRef.Diag(TypoLoc, diag::err_no_member) << Typo << Ctx
	<< SS.getRange();
	else
	SemaRef.Diag(TypoLoc, DiagnosticID) << Typo;
	return;
	}

	std::string CorrectedStr = TC.getAsString(SemaRef.getLangOpts());
	bool DroppedSpecifier =
	TC.WillReplaceSpecifier() && Typo.getAsString() == CorrectedStr;
	unsigned NoteID = TC.getCorrectionDeclAs<ImplicitParamDecl>()
	? diag::note_implicit_param_decl
	: diag::note_previous_decl;
	if (!Ctx)
	SemaRef.diagnoseTypo(TC, SemaRef.PDiag(DiagnosticSuggestID) << Typo,
	SemaRef.PDiag(NoteID));
	else
	SemaRef.diagnoseTypo(TC, SemaRef.PDiag(diag::err_no_member_suggest)
	<< Typo << Ctx << DroppedSpecifier
	<< SS.getRange(),
	SemaRef.PDiag(NoteID));
	}

	/// Diagnose an empty lookup.
	///
	/// \return false if new lookup candidates were found
	bool
	Sema::DiagnoseEmptyLookup(Scope *S, CXXScopeSpec &SS, LookupResult &R,
	std::unique_ptr<CorrectionCandidateCallback> CCC,
	TemplateArgumentListInfo *ExplicitTemplateArgs,
	ArrayRef<Expr > Args, TypoExpr *Out) {
	DeclarationName Name = R.getLookupName();

	unsigned diagnostic = diag::err_undeclared_var_use;
	unsigned diagnostic_suggest = diag::err_undeclared_var_use_suggest;
	if (Name.getNameKind() == DeclarationName::CXXOperatorName \|\|
	Name.getNameKind() == DeclarationName::CXXLiteralOperatorName \|\|
	Name.getNameKind() == DeclarationName::CXXConversionFunctionName) {
	diagnostic = diag::err_undeclared_use;
	diagnostic_suggest = diag::err_undeclared_use_suggest;
	}

	// If the original lookup was an unqualified lookup, fake an
	// unqualified lookup. This is useful when (for example) the
	// original lookup would not have found something because it was a
	// dependent name.
	DeclContext *DC = SS.isEmpty() ? CurContext : nullptr;
	while (DC) {
	if (isa<CXXRecordDecl>(DC)) {
	LookupQualifiedName(R, DC);

	if (!R.empty()) {
	// Don't give errors about ambiguities in this lookup.
	R.suppressDiagnostics();

	// During a default argument instantiation the CurContext points
	// to a CXXMethodDecl; but we can't apply a this-> fixit inside a
	// function parameter list, hence add an explicit check.
	bool isDefaultArgument =
	!CodeSynthesisContexts.empty() &&
	CodeSynthesisContexts.back().Kind ==
	CodeSynthesisContext::DefaultFunctionArgumentInstantiation;
	CXXMethodDecl *CurMethod = dyn_cast<CXXMethodDecl>(CurContext);
	bool isInstance = CurMethod &&
	CurMethod->isInstance() &&
	DC == CurMethod->getParent() && !isDefaultArgument;

	// Give a code modification hint to insert 'this->'.
	// TODO: fixit for inserting 'Base<T>::' in the other cases.
	// Actually quite difficult!
	if (getLangOpts().MSVCCompat)
	diagnostic = diag::ext_found_via_dependent_bases_lookup;
	if (isInstance) {
	Diag(R.getNameLoc(), diagnostic) << Name
	<< FixItHint::CreateInsertion(R.getNameLoc(), "this->");
	CheckCXXThisCapture(R.getNameLoc());
	} else {
	Diag(R.getNameLoc(), diagnostic) << Name;
	}

	// Do we really want to note all of these?
	for (NamedDecl *D : R)
	Diag(D->getLocation(), diag::note_dependent_var_use);

	// Return true if we are inside a default argument instantiation
	// and the found name refers to an instance member function, otherwise
	// the function calling DiagnoseEmptyLookup will try to create an
	// implicit member call and this is wrong for default argument.
	if (isDefaultArgument && ((*R.begin())->isCXXInstanceMember())) {
	Diag(R.getNameLoc(), diag::err_member_call_without_object);
	return true;
	}

	// Tell the callee to try to recover.
	return false;
	}

	R.clear();
	}

	// In Microsoft mode, if we are performing lookup from within a friend
	// function definition declared at class scope then we must set
	// DC to the lexical parent to be able to search into the parent
	// class.
	if (getLangOpts().MSVCCompat && isa<FunctionDecl>(DC) &&
	cast<FunctionDecl>(DC)->getFriendObjectKind() &&
	DC->getLexicalParent()->isRecord())
	DC = DC->getLexicalParent();
	else
	DC = DC->getParent();
	}

	// We didn't find anything, so try to correct for a typo.
	TypoCorrection Corrected;
	if (S && Out) {
	SourceLocation TypoLoc = R.getNameLoc();
	assert(!ExplicitTemplateArgs &&
	"Diagnosing an empty lookup with explicit template args!");
	*Out = CorrectTypoDelayed(
	R.getLookupNameInfo(), R.getLookupKind(), S, &SS, std::move(CCC),
	[=](const TypoCorrection &TC) {
	emitEmptyLookupTypoDiagnostic(TC, *this, SS, Name, TypoLoc, Args,
	diagnostic, diagnostic_suggest);
	},
	nullptr, CTK_ErrorRecovery);
	if (*Out)
	return true;
	} else if (S && (Corrected =
	CorrectTypo(R.getLookupNameInfo(), R.getLookupKind(), S,
	&SS, std::move(CCC), CTK_ErrorRecovery))) {
	std::string CorrectedStr(Corrected.getAsString(getLangOpts()));
	bool DroppedSpecifier =
	Corrected.WillReplaceSpecifier() && Name.getAsString() == CorrectedStr;
	R.setLookupName(Corrected.getCorrection());

	bool AcceptableWithRecovery = false;
	bool AcceptableWithoutRecovery = false;
	NamedDecl *ND = Corrected.getFoundDecl();
	if (ND) {
	if (Corrected.isOverloaded()) {
	OverloadCandidateSet OCS(R.getNameLoc(),
	OverloadCandidateSet::CSK_Normal);
	OverloadCandidateSet::iterator Best;
	for (NamedDecl *CD : Corrected) {
	if (FunctionTemplateDecl *FTD =
	dyn_cast<FunctionTemplateDecl>(CD))
	AddTemplateOverloadCandidate(
	FTD, DeclAccessPair::make(FTD, AS_none), ExplicitTemplateArgs,
	Args, OCS);
	else if (FunctionDecl *FD = dyn_cast<FunctionDecl>(CD))
	if (!ExplicitTemplateArgs \|\| ExplicitTemplateArgs->size() == 0)
	AddOverloadCandidate(FD, DeclAccessPair::make(FD, AS_none),
	Args, OCS);
	}
	switch (OCS.BestViableFunction(*this, R.getNameLoc(), Best)) {
	case OR_Success:
	ND = Best->FoundDecl;
	Corrected.setCorrectionDecl(ND);
	break;
	default:
	// FIXME: Arbitrarily pick the first declaration for the note.
	Corrected.setCorrectionDecl(ND);
	break;
	}
	}
	R.addDecl(ND);
	if (getLangOpts().CPlusPlus && ND->isCXXClassMember()) {
	CXXRecordDecl *Record = nullptr;
	if (Corrected.getCorrectionSpecifier()) {
	const Type *Ty = Corrected.getCorrectionSpecifier()->getAsType();
	Record = Ty->getAsCXXRecordDecl();
	}
	if (!Record)
	Record = cast<CXXRecordDecl>(
	ND->getDeclContext()->getRedeclContext());
	R.setNamingClass(Record);
	}

	auto *UnderlyingND = ND->getUnderlyingDecl();
	AcceptableWithRecovery = isa<ValueDecl>(UnderlyingND) \|\|
	isa<FunctionTemplateDecl>(UnderlyingND);
	// FIXME: If we ended up with a typo for a type name or
	// Objective-C class name, we're in trouble because the parser
	// is in the wrong place to recover. Suggest the typo
	// correction, but don't make it a fix-it since we're not going
	// to recover well anyway.
	AcceptableWithoutRecovery =
	isa<TypeDecl>(UnderlyingND) \|\| isa<ObjCInterfaceDecl>(UnderlyingND);
	} else {
	// FIXME: We found a keyword. Suggest it, but don't provide a fix-it
	// because we aren't able to recover.
	AcceptableWithoutRecovery = true;
	}

	if (AcceptableWithRecovery \|\| AcceptableWithoutRecovery) {
	unsigned NoteID = Corrected.getCorrectionDeclAs<ImplicitParamDecl>()
	? diag::note_implicit_param_decl
	: diag::note_previous_decl;
	if (SS.isEmpty())
	diagnoseTypo(Corrected, PDiag(diagnostic_suggest) << Name,
	PDiag(NoteID), AcceptableWithRecovery);
	else
	diagnoseTypo(Corrected, PDiag(diag::err_no_member_suggest)
	<< Name << computeDeclContext(SS, false)
	<< DroppedSpecifier << SS.getRange(),
	PDiag(NoteID), AcceptableWithRecovery);

	// Tell the callee whether to try to recover.
	return !AcceptableWithRecovery;
	}
	}
	R.clear();

	// Emit a special diagnostic for failed member lookups.
	// FIXME: computing the declaration context might fail here (?)
	if (!SS.isEmpty()) {
	Diag(R.getNameLoc(), diag::err_no_member)
	<< Name << computeDeclContext(SS, false)
	<< SS.getRange();
	return true;
	}

	// Give up, we can't recover.
	Diag(R.getNameLoc(), diagnostic) << Name;
	return true;
	}

	/// In Microsoft mode, if we are inside a template class whose parent class has
	/// dependent base classes, and we can't resolve an unqualified identifier, then
	/// assume the identifier is a member of a dependent base class.  We can only
	/// recover successfully in static methods, instance methods, and other contexts
	/// where 'this' is available.  This doesn't precisely match MSVC's
	/// instantiation model, but it's close enough.
	///
	/// \return null if no recovery is possible (no enclosing class could be
	/// determined, or it has no dependent bases); otherwise a
	/// CXXDependentScopeMemberExpr when 'this' is available, or a
	/// DependentScopeDeclRefExpr qualified by the class otherwise.
	static Expr *
	recoverFromMSUnqualifiedLookup(Sema &S, ASTContext &Context,
	                               DeclarationNameInfo &NameInfo,
	                               SourceLocation TemplateKWLoc,
	                               const TemplateArgumentListInfo *TemplateArgs) {
	  // Only try to recover from lookup into dependent bases in static methods or
	  // contexts where 'this' is available.
	  QualType ThisType = S.getCurrentThisType();
	  const CXXRecordDecl *RD = nullptr;
	  if (!ThisType.isNull())
	    RD = ThisType->getPointeeType()->getAsCXXRecordDecl();
	  else if (auto *MD = dyn_cast<CXXMethodDecl>(S.CurContext))
	    RD = MD->getParent();
	  if (!RD || !RD->hasAnyDependentBases())
	    return nullptr;

	  // Diagnose this as unqualified lookup into a dependent base class.  If 'this'
	  // is available, suggest inserting 'this->' as a fixit.
	  SourceLocation Loc = NameInfo.getLoc();
	  auto DB = S.Diag(Loc, diag::ext_undeclared_unqual_id_with_dependent_base);
	  DB << NameInfo.getName() << RD;

	  if (!ThisType.isNull()) {
	    DB << FixItHint::CreateInsertion(Loc, "this->");
	    return CXXDependentScopeMemberExpr::Create(
	        Context, /*This=*/nullptr, ThisType, /*IsArrow=*/true,
	        /*Op=*/SourceLocation(), NestedNameSpecifierLoc(), TemplateKWLoc,
	        /*FirstQualifierInScope=*/nullptr, NameInfo, TemplateArgs);
	  }

	  // Synthesize a fake NNS that points to the derived class.  This will
	  // perform name lookup during template instantiation.
	  CXXScopeSpec SS;
	  auto *NNS =
	      NestedNameSpecifier::Create(Context, nullptr, true, RD->getTypeForDecl());
	  SS.MakeTrivial(Context, NNS, SourceRange(Loc, Loc));
	  return DependentScopeDeclRefExpr::Create(
	      Context, SS.getWithLocInContext(Context), TemplateKWLoc, NameInfo,
	      TemplateArgs);
	}

	/// Build an expression for an id-expression named by \p Id, optionally
	/// qualified by \p SS.
	///
	/// \param S scope in which the name is looked up.
	/// \param SS nested-name-specifier; an invalid specifier yields ExprError().
	/// \param TemplateKWLoc location of the 'template' keyword, if any.
	/// \param Id the unqualified-id to resolve.
	/// \param HasTrailingLParen whether the id is followed by '('; must not be
	/// set together with \p IsAddressOfOperand.
	/// \param IsAddressOfOperand whether the id is the direct operand of '&'.
	/// \param CCC optional typo-correction callback; a default validator is used
	/// when it is null.
	/// \param IsInlineAsmIdentifier if set, an empty lookup returns ExprError()
	/// without being diagnosed.
	/// \param KeywordReplacement if non-null and the best typo correction is a
	/// keyword, this token is rewritten to that keyword and a valid-but-null
	/// ExprResult is returned.
	ExprResult
	Sema::ActOnIdExpression(Scope *S, CXXScopeSpec &SS,
	                        SourceLocation TemplateKWLoc, UnqualifiedId &Id,
	                        bool HasTrailingLParen, bool IsAddressOfOperand,
	                        std::unique_ptr<CorrectionCandidateCallback> CCC,
	                        bool IsInlineAsmIdentifier, Token *KeywordReplacement) {
	  assert(!(IsAddressOfOperand && HasTrailingLParen) &&
	         "cannot be direct & operand and have a trailing lparen");
	  if (SS.isInvalid())
	    return ExprError();

	  TemplateArgumentListInfo TemplateArgsBuffer;

	  // Decompose the UnqualifiedId into the following data.
	  DeclarationNameInfo NameInfo;
	  const TemplateArgumentListInfo *TemplateArgs;
	  DecomposeUnqualifiedId(Id, TemplateArgsBuffer, NameInfo, TemplateArgs);

	  DeclarationName Name = NameInfo.getName();
	  IdentifierInfo *II = Name.getAsIdentifierInfo();
	  SourceLocation NameLoc = NameInfo.getLoc();

	  if (II && II->isEditorPlaceholder()) {
	    // FIXME: When typed placeholders are supported we can create a typed
	    // placeholder expression node.
	    return ExprError();
	  }

	  // C++ [temp.dep.expr]p3:
	  //   An id-expression is type-dependent if it contains:
	  //     -- an identifier that was declared with a dependent type,
	  //        (note: handled after lookup)
	  //     -- a template-id that is dependent,
	  //        (note: handled in BuildTemplateIdExpr)
	  //     -- a conversion-function-id that specifies a dependent type,
	  //     -- a nested-name-specifier that contains a class-name that
	  //        names a dependent type.
	  // Determine whether this is a member of an unknown specialization;
	  // we need to handle these differently.
	  bool DependentID = false;
	  if (Name.getNameKind() == DeclarationName::CXXConversionFunctionName &&
	      Name.getCXXNameType()->isDependentType()) {
	    DependentID = true;
	  } else if (SS.isSet()) {
	    if (DeclContext *DC = computeDeclContext(SS, false)) {
	      if (RequireCompleteDeclContext(SS, DC))
	        return ExprError();
	    } else {
	      DependentID = true;
	    }
	  }

	  if (DependentID)
	    return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
	                                      IsAddressOfOperand, TemplateArgs);

	  // Perform the required lookup.
	  LookupResult R(*this, NameInfo,
	                 (Id.getKind() == UnqualifiedId::IK_ImplicitSelfParam)
	                  ? LookupObjCImplicitSelfParam : LookupOrdinaryName);
	  if (TemplateArgs) {
	    // Lookup the template name again to correctly establish the context in
	    // which it was found. This is really unfortunate as we already did the
	    // lookup to determine that it was a template name in the first place. If
	    // this becomes a performance hit, we can work harder to preserve those
	    // results until we get here but it's likely not worth it.
	    bool MemberOfUnknownSpecialization;
	    LookupTemplateName(R, S, SS, QualType(), /*EnteringContext=*/false,
	                       MemberOfUnknownSpecialization);

	    if (MemberOfUnknownSpecialization ||
	        (R.getResultKind() == LookupResult::NotFoundInCurrentInstantiation))
	      return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
	                                        IsAddressOfOperand, TemplateArgs);
	  } else {
	    bool IvarLookupFollowUp = II && !SS.isSet() && getCurMethodDecl();
	    LookupParsedName(R, S, &SS, !IvarLookupFollowUp);

	    // If the result might be in a dependent base class, this is a dependent
	    // id-expression.
	    if (R.getResultKind() == LookupResult::NotFoundInCurrentInstantiation)
	      return ActOnDependentIdExpression(SS, TemplateKWLoc, NameInfo,
	                                        IsAddressOfOperand, TemplateArgs);

	    // If this reference is in an Objective-C method, then we need to do
	    // some special Objective-C lookup, too.
	    if (IvarLookupFollowUp) {
	      ExprResult E(LookupInObjCMethod(R, S, II, true));
	      if (E.isInvalid())
	        return ExprError();

	      if (Expr *Ex = E.getAs<Expr>())
	        return Ex;
	    }
	  }

	  if (R.isAmbiguous())
	    return ExprError();

	  // This could be an implicitly declared function reference (legal in C90,
	  // extension in C99, forbidden in C++).
	  if (R.empty() && HasTrailingLParen && II && !getLangOpts().CPlusPlus) {
	    NamedDecl *D = ImplicitlyDefineFunction(NameLoc, *II, S);
	    if (D) R.addDecl(D);
	  }

	  // Determine whether this name might be a candidate for
	  // argument-dependent lookup.
	  bool ADL = UseArgumentDependentLookup(SS, R, HasTrailingLParen);

	  if (R.empty() && !ADL) {
	    if (SS.isEmpty() && getLangOpts().MSVCCompat) {
	      if (Expr *E = recoverFromMSUnqualifiedLookup(*this, Context, NameInfo,
	                                                   TemplateKWLoc, TemplateArgs))
	        return E;
	    }

	    // Don't diagnose an empty lookup for inline assembly.
	    if (IsInlineAsmIdentifier)
	      return ExprError();

	    // If this name wasn't predeclared and if this is not a function
	    // call, diagnose the problem.
	    TypoExpr *TE = nullptr;
	    auto DefaultValidator = llvm::make_unique<CorrectionCandidateCallback>(
	        II, SS.isValid() ? SS.getScopeRep() : nullptr);
	    DefaultValidator->IsAddressOfOperand = IsAddressOfOperand;
	    assert((!CCC || CCC->IsAddressOfOperand == IsAddressOfOperand) &&
	           "Typo correction callback misconfigured");
	    if (CCC) {
	      // Make sure the callback knows what the typo being diagnosed is.
	      CCC->setTypoName(II);
	      if (SS.isValid())
	        CCC->setTypoNNS(SS.getScopeRep());
	    }
	    if (DiagnoseEmptyLookup(S, SS, R,
	                            CCC ? std::move(CCC) : std::move(DefaultValidator),
	                            nullptr, None, &TE)) {
	      if (TE && KeywordReplacement) {
	        auto &State = getTypoExprState(TE);
	        auto BestTC = State.Consumer->getNextCorrection();
	        if (BestTC.isKeyword()) {
	          auto *II = BestTC.getCorrectionAsIdentifierInfo();
	          if (State.DiagHandler)
	            State.DiagHandler(BestTC);
	          KeywordReplacement->startToken();
	          KeywordReplacement->setKind(II->getTokenID());
	          KeywordReplacement->setIdentifierInfo(II);
	          KeywordReplacement->setLocation(BestTC.getCorrectionRange().getBegin());
	          // Clean up the state associated with the TypoExpr, since it has
	          // now been diagnosed (without a call to CorrectDelayedTyposInExpr).
	          clearDelayedTypo(TE);
	          // Signal that a correction to a keyword was performed by returning a
	          // valid-but-null ExprResult.
	          return (Expr*)nullptr;
	        }
	        State.Consumer->resetCorrectionStream();
	      }
	      return TE ? TE : ExprError();
	    }

	    assert(!R.empty() &&
	           "DiagnoseEmptyLookup returned false but added no results");

	    // If we found an Objective-C instance variable, let
	    // LookupInObjCMethod build the appropriate expression to
	    // reference the ivar.
	    if (ObjCIvarDecl *Ivar = R.getAsSingle<ObjCIvarDecl>()) {
	      R.clear();
	      ExprResult E(LookupInObjCMethod(R, S, Ivar->getIdentifier()));
	      // In a hopelessly buggy code, Objective-C instance variable
	      // lookup fails and no expression will be built to reference it.
	      if (!E.isInvalid() && !E.get())
	        return ExprError();
	      return E;
	    }
	  }

	  // This is guaranteed from this point on.
	  assert(!R.empty() || ADL);

	  // Check whether this might be a C++ implicit instance member access.
	  // C++ [class.mfct.non-static]p3:
	  //   When an id-expression that is not part of a class member access
	  //   syntax and not used to form a pointer to member is used in the
	  //   body of a non-static member function of class X, if name lookup
	  //   resolves the name in the id-expression to a non-static non-type
	  //   member of some class C, the id-expression is transformed into a
	  //   class member access expression using (*this) as the
	  //   postfix-expression to the left of the . operator.
	  //
	  // But we don't actually need to do this for '&' operands if R
	  // resolved to a function or overloaded function set, because the
	  // expression is ill-formed if it actually works out to be a
	  // non-static member function:
	  //
	  // C++ [expr.ref]p4:
	  //   Otherwise, if E1.E2 refers to a non-static member function. . .
	  //   [t]he expression can be used only as the left-hand operand of a
	  //   member function call.
	  //
	  // There are other safeguards against such uses, but it's important
	  // to get this right here so that we don't end up making a
	  // spuriously dependent expression if we're inside a dependent
	  // instance method.
	  if (!R.empty() && (*R.begin())->isCXXClassMember()) {
	    bool MightBeImplicitMember;
	    if (!IsAddressOfOperand)
	      MightBeImplicitMember = true;
	    else if (!SS.isEmpty())
	      MightBeImplicitMember = false;
	    else if (R.isOverloadedResult())
	      MightBeImplicitMember = false;
	    else if (R.isUnresolvableResult())
	      MightBeImplicitMember = true;
	    else
	      MightBeImplicitMember = isa<FieldDecl>(R.getFoundDecl()) ||
	                              isa<IndirectFieldDecl>(R.getFoundDecl()) ||
	                              isa<MSPropertyDecl>(R.getFoundDecl());

	    if (MightBeImplicitMember)
	      return BuildPossibleImplicitMemberExpr(SS, TemplateKWLoc,
	                                             R, TemplateArgs, S);
	  }

	  if (TemplateArgs || TemplateKWLoc.isValid()) {

	    // In C++1y, if this is a variable template id, then check it
	    // in BuildTemplateIdExpr().
	    // The single lookup result must be a variable template declaration.
	    if (Id.getKind() == UnqualifiedId::IK_TemplateId && Id.TemplateId &&
	        Id.TemplateId->Kind == TNK_Var_template) {
	      assert(R.getAsSingle<VarTemplateDecl>() &&
	             "There should only be one declaration found.");
	    }

	    return BuildTemplateIdExpr(SS, TemplateKWLoc, R, ADL, TemplateArgs);
	  }

	  return BuildDeclarationNameExpr(SS, R, ADL);
	}

	/// BuildQualifiedDeclarationNameExpr - Build a C++ qualified
	/// declaration name, generally during template instantiation.
	/// There's a large number of things which don't need to be done along
	/// this path.
	///
	/// \param SS the nested-name-specifier qualifying the name.
	/// \param NameInfo the name being looked up in the context named by \p SS.
	/// \param IsAddressOfOperand whether the expression is the operand of '&';
	/// when set, a class member result is not turned into an implicit member
	/// access.
	/// \param S scope forwarded to BuildPossibleImplicitMemberExpr.
	/// \param RecoveryTSI if non-null and the name resolves to a type, receives
	/// type source info for that type and ExprEmpty() is returned; if null, that
	/// case returns ExprError() after the diagnostic.
	ExprResult Sema::BuildQualifiedDeclarationNameExpr(
	    CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo,
	    bool IsAddressOfOperand, const Scope *S, TypeSourceInfo **RecoveryTSI) {
	  DeclContext *DC = computeDeclContext(SS, false);
	  if (!DC)
	    return BuildDependentDeclRefExpr(SS, /*TemplateKWLoc=*/SourceLocation(),
	                                     NameInfo, /*TemplateArgs=*/nullptr);

	  if (RequireCompleteDeclContext(SS, DC))
	    return ExprError();

	  LookupResult R(*this, NameInfo, LookupOrdinaryName);
	  LookupQualifiedName(R, DC);

	  if (R.isAmbiguous())
	    return ExprError();

	  if (R.getResultKind() == LookupResult::NotFoundInCurrentInstantiation)
	    return BuildDependentDeclRefExpr(SS, /*TemplateKWLoc=*/SourceLocation(),
	                                     NameInfo, /*TemplateArgs=*/nullptr);

	  if (R.empty()) {
	    Diag(NameInfo.getLoc(), diag::err_no_member)
	      << NameInfo.getName() << DC << SS.getRange();
	    return ExprError();
	  }

	  if (const TypeDecl *TD = R.getAsSingle<TypeDecl>()) {
	    // Diagnose a missing typename if this resolved unambiguously to a type in
	    // a dependent context.  If we can recover with a type, downgrade this to
	    // a warning in Microsoft compatibility mode.
	    unsigned DiagID = diag::err_typename_missing;
	    if (RecoveryTSI && getLangOpts().MSVCCompat)
	      DiagID = diag::ext_typename_missing;
	    SourceLocation Loc = SS.getBeginLoc();
	    auto D = Diag(Loc, DiagID);
	    D << SS.getScopeRep() << NameInfo.getName().getAsString()
	      << SourceRange(Loc, NameInfo.getEndLoc());

	    // Don't recover if the caller isn't expecting us to or if we're in a SFINAE
	    // context.
	    if (!RecoveryTSI)
	      return ExprError();

	    // Only issue the fixit if we're prepared to recover.
	    D << FixItHint::CreateInsertion(Loc, "typename ");

	    // Recover by pretending this was an elaborated type.
	    QualType Ty = Context.getTypeDeclType(TD);
	    TypeLocBuilder TLB;
	    TLB.pushTypeSpec(Ty).setNameLoc(NameInfo.getLoc());

	    QualType ET = getElaboratedType(ETK_None, SS, Ty);
	    ElaboratedTypeLoc QTL = TLB.push<ElaboratedTypeLoc>(ET);
	    QTL.setElaboratedKeywordLoc(SourceLocation());
	    QTL.setQualifierLoc(SS.getWithLocInContext(Context));

	    *RecoveryTSI = TLB.getTypeSourceInfo(Context, ET);

	    return ExprEmpty();
	  }

	  // Defend against this resolving to an implicit member access. We usually
	  // won't get here if this might legitimately be a class member (we end up in
	  // BuildMemberReferenceExpr instead), but this can be valid if we're forming
	  // a pointer-to-member or in an unevaluated context in C++11.
	  if (!R.empty() && (*R.begin())->isCXXClassMember() && !IsAddressOfOperand)
	    return BuildPossibleImplicitMemberExpr(SS,
	                                           /*TemplateKWLoc=*/SourceLocation(),
	                                           R, /*TemplateArgs=*/nullptr, S);

	  return BuildDeclarationNameExpr(SS, R, /* ADL */ false);
	}

	/// LookupInObjCMethod - The parser has read a name in, and Sema has
	/// detected that we're currently inside an ObjC method. Perform some
	/// additional lookup.
	///
	/// Ideally, most of this would be done by lookup, but there's
	/// actually quite a lot of extra work involved.
	///
	/// \param Lookup the result of the ordinary lookup for the name; a lazily
	///        created builtin declaration may be added to it.
	/// \param S the scope forwarded to the implicit 'self' lookup and to
	///        builtin creation.
	/// \param II the identifier being looked up.
	/// \param AllowBuiltinCreation whether an unresolved builtin name may be
	///        declared on the fly.
	///
	/// \returns an ivar reference expression when the name resolves to an
	/// instance variable, an error result when the reference cannot be formed,
	/// or a null sentinel to indicate trivial success.
	ExprResult
	Sema::LookupInObjCMethod(LookupResult &Lookup, Scope *S,
	                         IdentifierInfo *II, bool AllowBuiltinCreation) {
	  SourceLocation Loc = Lookup.getNameLoc();
	  ObjCMethodDecl *CurMethod = getCurMethodDecl();

	  // Check for error condition which is already reported.
	  if (!CurMethod)
	    return ExprError();

	  // There are two cases to handle here. 1) scoped lookup could have failed,
	  // in which case we should look for an ivar. 2) scoped lookup could have
	  // found a decl, but that decl is outside the current instance method (i.e.
	  // a global variable). In these two cases, we do a lookup for an ivar with
	  // this name; if the lookup succeeds, the ivar replaces our current decl.

	  // If we're in a class method, we don't normally want to look for
	  // ivars. But if we don't find anything else, and there's an
	  // ivar, that's an error.
	  bool IsClassMethod = CurMethod->isClassMethod();

	  bool LookForIvars;
	  if (Lookup.empty())
	    LookForIvars = true;
	  else if (IsClassMethod)
	    LookForIvars = false;
	  else
	    LookForIvars = (Lookup.isSingleResult() &&
	                    Lookup.getFoundDecl()->isDefinedOutsideFunctionOrMethod());
	  ObjCInterfaceDecl *IFace = nullptr;
	  if (LookForIvars) {
	    IFace = CurMethod->getClassInterface();
	    ObjCInterfaceDecl *ClassDeclared;
	    ObjCIvarDecl *IV = nullptr;
	    if (IFace && (IV = IFace->lookupInstanceVariable(II, ClassDeclared))) {
	      // Diagnose using an ivar in a class method.
	      if (IsClassMethod)
	        return ExprError(Diag(Loc, diag::err_ivar_use_in_class_method)
	                         << IV->getDeclName());

	      // If we're referencing an invalid decl, just return this as a silent
	      // error node. The error diagnostic was already emitted on the decl.
	      if (IV->isInvalidDecl())
	        return ExprError();

	      // Check if referencing a field with __attribute__((deprecated)).
	      if (DiagnoseUseOfDecl(IV, Loc))
	        return ExprError();

	      // Diagnose the use of an ivar outside of the declaring class. This is
	      // reported but does not stop us from building the reference.
	      if (IV->getAccessControl() == ObjCIvarDecl::Private &&
	          !declaresSameEntity(ClassDeclared, IFace) &&
	          !getLangOpts().DebuggerSupport)
	        Diag(Loc, diag::err_private_ivar_access) << IV->getDeclName();

	      // FIXME: This should use a new expr for a direct reference, don't
	      // turn this into Self->ivar, just return a BareIVarExpr or something.
	      // Named SelfII so that it does not shadow the II parameter.
	      IdentifierInfo &SelfII = Context.Idents.get("self");
	      UnqualifiedId SelfName;
	      SelfName.setIdentifier(&SelfII, SourceLocation());
	      SelfName.setKind(UnqualifiedId::IK_ImplicitSelfParam);
	      CXXScopeSpec SelfScopeSpec;
	      SourceLocation TemplateKWLoc;
	      ExprResult SelfExpr = ActOnIdExpression(S, SelfScopeSpec, TemplateKWLoc,
	                                              SelfName, false, false);
	      if (SelfExpr.isInvalid())
	        return ExprError();

	      SelfExpr = DefaultLvalueConversion(SelfExpr.get());
	      if (SelfExpr.isInvalid())
	        return ExprError();

	      MarkAnyDeclReferenced(Loc, IV, true);

	      // Warn about direct ivar access, except in init/dealloc/finalize
	      // methods and in the accessor that the ivar backs.
	      ObjCMethodFamily MF = CurMethod->getMethodFamily();
	      if (MF != OMF_init && MF != OMF_dealloc && MF != OMF_finalize &&
	          !IvarBacksCurrentMethodAccessor(IFace, CurMethod, IV))
	        Diag(Loc, diag::warn_direct_ivar_access) << IV->getDeclName();

	      ObjCIvarRefExpr *Result = new (Context)
	          ObjCIvarRefExpr(IV, IV->getUsageType(SelfExpr.get()->getType()), Loc,
	                          IV->getLocation(), SelfExpr.get(), true, true);

	      if (IV->getType().getObjCLifetime() == Qualifiers::OCL_Weak) {
	        if (!Diags.isIgnored(diag::warn_arc_repeated_use_of_weak, Loc))
	          recordUseOfEvaluatedWeak(Result);
	      }
	      if (getLangOpts().ObjCAutoRefCount) {
	        if (CurContext->isClosure())
	          Diag(Loc, diag::warn_implicitly_retains_self)
	            << FixItHint::CreateInsertion(Loc, "self->");
	      }

	      return Result;
	    }
	  } else if (CurMethod->isInstanceMethod()) {
	    // We should warn if a local variable hides an ivar.
	    if (ObjCInterfaceDecl *IFace = CurMethod->getClassInterface()) {
	      ObjCInterfaceDecl *ClassDeclared;
	      if (ObjCIvarDecl *IV = IFace->lookupInstanceVariable(II, ClassDeclared)) {
	        if (IV->getAccessControl() != ObjCIvarDecl::Private ||
	            declaresSameEntity(IFace, ClassDeclared))
	          Diag(Loc, diag::warn_ivar_use_hidden) << IV->getDeclName();
	      }
	    }
	  } else if (Lookup.isSingleResult() &&
	             Lookup.getFoundDecl()->isDefinedOutsideFunctionOrMethod()) {
	    // If accessing a stand-alone ivar in a class method, this is an error.
	    if (const ObjCIvarDecl *IV = dyn_cast<ObjCIvarDecl>(Lookup.getFoundDecl()))
	      return ExprError(Diag(Loc, diag::err_ivar_use_in_class_method)
	                       << IV->getDeclName());
	  }

	  if (Lookup.empty() && II && AllowBuiltinCreation) {
	    // FIXME. Consolidate this with similar code in LookupName.
	    if (unsigned BuiltinID = II->getBuiltinID()) {
	      if (!(getLangOpts().CPlusPlus &&
	            Context.BuiltinInfo.isPredefinedLibFunction(BuiltinID))) {
	        NamedDecl *D = LazilyCreateBuiltin((IdentifierInfo *)II, BuiltinID,
	                                           S, Lookup.isForRedeclaration(),
	                                           Lookup.getNameLoc());
	        if (D) Lookup.addDecl(D);
	      }
	    }
	  }
	  // Sentinel value saying that we didn't do anything special.
	  return ExprResult((Expr *)nullptr);
	}

	/// \brief Cast a base object to a member's actual type.
	///
	/// Logically this happens in three phases:
	///
	/// * First we cast from the base type to the naming class.
	///   The naming class is the class into which we were looking
	///   when we found the member; it's the qualifier type if a
	///   qualifier was provided, and otherwise it's the base type.
	///
	/// * Next we cast from the naming class to the declaring class.
	///   If the member we found was brought into a class's scope by
	///   a using declaration, this is that class; otherwise it's
	///   the class declaring the member.
	///
	/// * Finally we cast from the declaring class to the "true"
	///   declaring class of the member. This conversion does not
	///   obey access control.
	///
	/// \param From the base object expression (an object or a pointer to one).
	/// \param Qualifier the nested-name-specifier used to name the member, or
	///        null if there was none.
	/// \param FoundDecl the declaration found by lookup; expected to be a
	///        using shadow declaration when its context differs from Member's.
	/// \param Member the member actually being referenced.
	///
	/// \returns \p From unchanged when no conversion is needed, the implicitly
	/// cast expression otherwise, or an error result if a derived-to-base
	/// conversion check fails.
	ExprResult
	Sema::PerformObjectMemberConversion(Expr *From,
	                                    NestedNameSpecifier *Qualifier,
	                                    NamedDecl *FoundDecl,
	                                    NamedDecl *Member) {
	  // Only members of C++ classes need a conversion.
	  CXXRecordDecl *RD = dyn_cast<CXXRecordDecl>(Member->getDeclContext());
	  if (!RD)
	    return From;

	  QualType DestRecordType;
	  QualType DestType;
	  QualType FromRecordType;
	  QualType FromType = From->getType();
	  bool PointerConversions = false;
	  if (isa<FieldDecl>(Member)) {
	    DestRecordType = Context.getCanonicalType(Context.getTypeDeclType(RD));

	    if (FromType->getAs<PointerType>()) {
	      DestType = Context.getPointerType(DestRecordType);
	      FromRecordType = FromType->getPointeeType();
	      PointerConversions = true;
	    } else {
	      DestType = DestRecordType;
	      FromRecordType = FromType;
	    }
	  } else if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(Member)) {
	    if (Method->isStatic())
	      return From;

	    DestType = Method->getThisType(Context);
	    DestRecordType = DestType->getPointeeType();

	    if (FromType->getAs<PointerType>()) {
	      FromRecordType = FromType->getPointeeType();
	      PointerConversions = true;
	    } else {
	      FromRecordType = FromType;
	      DestType = DestRecordType;
	    }
	  } else {
	    // No conversion necessary.
	    return From;
	  }

	  if (DestType->isDependentType() || FromType->isDependentType())
	    return From;

	  // If the unqualified types are the same, no conversion is necessary.
	  if (Context.hasSameUnqualifiedType(FromRecordType, DestRecordType))
	    return From;

	  SourceRange FromRange = From->getSourceRange();
	  SourceLocation FromLoc = FromRange.getBegin();

	  ExprValueKind VK = From->getValueKind();

	  // C++ [class.member.lookup]p8:
	  //   [...] Ambiguities can often be resolved by qualifying a name with its
	  //   class name.
	  //
	  // If the member was a qualified name and the qualifier referred to a
	  // specific base subobject type, we'll cast to that intermediate type
	  // first and then to the object in which the member is declared. That allows
	  // one to resolve ambiguities in, e.g., a diamond-shaped hierarchy such as:
	  //
	  //   class Base { public: int x; };
	  //   class Derived1 : public Base { };
	  //   class Derived2 : public Base { };
	  //   class VeryDerived : public Derived1, public Derived2 { void f(); };
	  //
	  //   void VeryDerived::f() {
	  //     x = 17; // error: ambiguous base subobjects
	  //     Derived1::x = 17; // okay, pick the Base subobject of Derived1
	  //   }
	  if (Qualifier && Qualifier->getAsType()) {
	    QualType QType = QualType(Qualifier->getAsType(), 0);
	    assert(QType->isRecordType() && "lookup done with non-record type");

	    QualType QRecordType = QualType(QType->getAs<RecordType>(), 0);

	    // In C++98, the qualifier type doesn't actually have to be a base
	    // type of the object type, in which case we just ignore it.
	    // Otherwise build the appropriate casts.
	    if (IsDerivedFrom(FromLoc, FromRecordType, QRecordType)) {
	      CXXCastPath BasePath;
	      if (CheckDerivedToBaseConversion(FromRecordType, QRecordType,
	                                       FromLoc, FromRange, &BasePath))
	        return ExprError();

	      if (PointerConversions)
	        QType = Context.getPointerType(QType);
	      From = ImpCastExprToType(From, QType, CK_UncheckedDerivedToBase,
	                               VK, &BasePath).get();

	      FromType = QType;
	      FromRecordType = QRecordType;

	      // If the qualifier type was the same as the destination type,
	      // we're done.
	      if (Context.hasSameUnqualifiedType(FromRecordType, DestRecordType))
	        return From;
	    }
	  }

	  bool IgnoreAccess = false;

	  // If we actually found the member through a using declaration, cast
	  // down to the using declaration's type.
	  //
	  // Pointer equality is fine here because only one declaration of a
	  // class ever has member declarations.
	  if (FoundDecl->getDeclContext() != Member->getDeclContext()) {
	    assert(isa<UsingShadowDecl>(FoundDecl));
	    QualType URecordType = Context.getTypeDeclType(
	                           cast<CXXRecordDecl>(FoundDecl->getDeclContext()));

	    // We only need to do this if the naming-class to declaring-class
	    // conversion is non-trivial.
	    if (!Context.hasSameUnqualifiedType(FromRecordType, URecordType)) {
	      assert(IsDerivedFrom(FromLoc, FromRecordType, URecordType));
	      CXXCastPath BasePath;
	      if (CheckDerivedToBaseConversion(FromRecordType, URecordType,
	                                       FromLoc, FromRange, &BasePath))
	        return ExprError();

	      QualType UType = URecordType;
	      if (PointerConversions)
	        UType = Context.getPointerType(UType);
	      From = ImpCastExprToType(From, UType, CK_UncheckedDerivedToBase,
	                               VK, &BasePath).get();
	      FromType = UType;
	      FromRecordType = URecordType;
	    }

	    // We don't do access control for the conversion from the
	    // declaring class to the true declaring class.
	    IgnoreAccess = true;
	  }

	  // Final phase: convert to the class that truly declares the member.
	  CXXCastPath BasePath;
	  if (CheckDerivedToBaseConversion(FromRecordType, DestRecordType,
	                                   FromLoc, FromRange, &BasePath,
	                                   IgnoreAccess))
	    return ExprError();

	  return ImpCastExprToType(From, DestType, CK_UncheckedDerivedToBase,
	                           VK, &BasePath);
	}

	bool Sema::UseArgumentDependentLookup(const CXXScopeSpec &SS,
	const LookupResult &R,
	bool HasTrailingLParen) {
	// Only when used directly as the postfix-expression of a call.
	if (!HasTrailingLParen)
	return false;

	// Never if a scope specifier was provided.
	if (SS.isSet())
	return false;

	// Only in C++ or ObjC++.
	if (!getLangOpts().CPlusPlus)
	return false;

	// Turn off ADL when we find certain kinds of declarations during
	// normal lookup:
	for (NamedDecl *D : R) {
	// C++0x [basic.lookup.argdep]p3:
	// -- a declaration of a class member
	// Since using decls preserve this property, we check this on the
	// original decl.
	if (D->isCXXClassMember())
	return false;

	// C++0x [basic.lookup.argdep]p3:
	// -- a block-scope function declaration that is not a
	// using-declaration
	// NOTE: we also trigger this for function templates (in fact, we
	// don't check the decl type at all, since all other decl types
	// turn off ADL anyway).
	if (isa<UsingShadowDecl>(D))
	D = cast<UsingShadowDecl>(D)->getTargetDecl();
	else if (D->getLexicalDeclContext()->isFunctionOrMethod())
	return false;

	// C++0x [basic.lookup.argdep]p3:
	// -- a declaration that is neither a function or a function
	// template
	// And also for builtin functions.
	if (isa<FunctionDecl>(D)) {
	FunctionDecl *FDecl = cast<FunctionDecl>(D);

	// But also builtin functions.
	if (FDecl->getBuiltinID() && FDecl->isImplicit())
	return false;
	} else if (!isa<FunctionTemplateDecl>(D))
	return false;
	}

	return true;
	}


	/// Reject declarations that plainly cannot be used as an expression.
	///
	/// This is only called for lookups that were not overloaded, and it makes
	/// no promise that the declaration will actually end up being used.
	///
	/// \returns true if \p D is unusable (it is invalid, or it is a typedef
	/// name, an Objective-C interface or a namespace, for which an error is
	/// emitted at \p Loc); false otherwise.
	static bool CheckDeclInExpr(Sema &S, SourceLocation Loc, NamedDecl *D) {
	  // Invalid declarations are rejected without a further diagnostic here.
	  if (D->isInvalidDecl())
	    return true;

	  unsigned DiagID;
	  if (isa<TypedefNameDecl>(D))
	    DiagID = diag::err_unexpected_typedef;
	  else if (isa<ObjCInterfaceDecl>(D))
	    DiagID = diag::err_unexpected_interface;
	  else if (isa<NamespaceDecl>(D))
	    DiagID = diag::err_unexpected_namespace;
	  else
	    return false;

	  S.Diag(Loc, DiagID) << D->getDeclName();
	  return true;
	}

	/// Build an expression referring to the declaration(s) found by \p R.
	///
	/// A single result that is not a function template and does not need ADL
	/// is forwarded to the single-declaration overload; everything else becomes
	/// an UnresolvedLookupExpr.
	ExprResult Sema::BuildDeclarationNameExpr(const CXXScopeSpec &SS,
	                                          LookupResult &R, bool NeedsADL,
	                                          bool AcceptInvalidDecl) {
	  if (R.isSingleResult()) {
	    // A single, fully-resolved result with no ADL needed: build an ordinary
	    // singleton decl ref.
	    if (!NeedsADL && !R.getAsSingle<FunctionTemplateDecl>())
	      return BuildDeclarationNameExpr(SS, R.getLookupNameInfo(),
	                                      R.getFoundDecl(),
	                                      R.getRepresentativeDecl(), nullptr,
	                                      AcceptInvalidDecl);

	    // The declaration only needs checking when there is exactly one
	    // result, because in the overloaded case the results can only be
	    // functions and function templates.
	    if (CheckDeclInExpr(*this, R.getNameLoc(), R.getFoundDecl()))
	      return ExprError();
	  }

	  // Otherwise, build an unresolved lookup expression. Lookup-related
	  // diagnostics are suppressed; they get hashed out later, once a target
	  // has been picked.
	  R.suppressDiagnostics();

	  return UnresolvedLookupExpr::Create(Context, R.getNamingClass(),
	                                      SS.getWithLocInContext(Context),
	                                      R.getLookupNameInfo(),
	                                      NeedsADL, R.isOverloadedResult(),
	                                      R.begin(), R.end());
	}

	static void
	diagnoseUncapturableValueReference(Sema &S, SourceLocation loc,
	ValueDecl var, DeclContext DC);

	/// \brief Complete semantic analysis for a reference to the given declaration.
	///
	/// \param SS the scope specifier written before the name.
	/// \param NameInfo the name and source location of the reference.
	/// \param D the declaration being referenced; must be non-null and must not
	///        be a function template.
	/// \param FoundD forwarded to BuildDeclRefExpr as the found declaration.
	/// \param TemplateArgs forwarded to BuildDeclRefExpr; may be null.
	/// \param AcceptInvalidDecl whether a reference to an invalid declaration
	///        may still be built.
	///
	/// \returns the reference expression, or an error result if \p D cannot be
	/// referenced as a value here.
	ExprResult Sema::BuildDeclarationNameExpr(
	    const CXXScopeSpec &SS, const DeclarationNameInfo &NameInfo, NamedDecl *D,
	    NamedDecl *FoundD, const TemplateArgumentListInfo *TemplateArgs,
	    bool AcceptInvalidDecl) {
	  assert(D && "Cannot refer to a NULL declaration");
	  assert(!isa<FunctionTemplateDecl>(D) &&
	         "Cannot refer unambiguously to a function template");

	  SourceLocation Loc = NameInfo.getLoc();
	  if (CheckDeclInExpr(*this, Loc, D))
	    return ExprError();

	  if (TemplateDecl *Template = dyn_cast<TemplateDecl>(D)) {
	    // Specifically diagnose references to class templates that are missing
	    // a template argument list.
	    Diag(Loc, diag::err_template_decl_ref) << (isa<VarTemplateDecl>(D) ? 1 : 0)
	                                           << Template << SS.getRange();
	    Diag(Template->getLocation(), diag::note_template_decl_here);
	    return ExprError();
	  }

	  // Make sure that we're referring to a value.
	  ValueDecl *VD = dyn_cast<ValueDecl>(D);
	  if (!VD) {
	    Diag(Loc, diag::err_ref_non_value)
	      << D << SS.getRange();
	    Diag(D->getLocation(), diag::note_declared_at);
	    return ExprError();
	  }

	  // Check whether this declaration can be used. Note that we suppress
	  // this check when we're going to perform argument-dependent lookup
	  // on this function name, because this might not be the function
	  // that overload resolution actually selects.
	  if (DiagnoseUseOfDecl(VD, Loc))
	    return ExprError();

	  // Only create DeclRefExpr's for valid Decl's.
	  if (VD->isInvalidDecl() && !AcceptInvalidDecl)
	    return ExprError();

	  // Handle members of anonymous structs and unions. If we got here,
	  // and the reference is to a class member indirect field, then this
	  // must be the subject of a pointer-to-member expression.
	  if (IndirectFieldDecl *indirectField = dyn_cast<IndirectFieldDecl>(VD))
	    if (!indirectField->isCXXClassMember())
	      return BuildAnonymousStructUnionMemberReference(SS, NameInfo.getLoc(),
	                                                      indirectField);

	  {
	    QualType type = VD->getType();
	    if (auto *FPT = type->getAs<FunctionProtoType>()) {
	      // C++ [except.spec]p17:
	      //   An exception-specification is considered to be needed when:
	      //   - in an expression, the function is the unique lookup result or
	      //     the selected member of a set of overloaded functions.
	      ResolveExceptionSpec(Loc, FPT);
	      // Re-read the type after resolving the exception specification.
	      type = VD->getType();
	    }
	    ExprValueKind valueKind = VK_RValue;

	    switch (D->getKind()) {
	    // Ignore all the non-ValueDecl kinds.
	#define ABSTRACT_DECL(kind)
	#define VALUE(type, base)
	#define DECL(type, base) \
	    case Decl::type:
	#include "clang/AST/DeclNodes.inc"
	      llvm_unreachable("invalid value decl kind");

	    // These shouldn't make it here.
	    case Decl::ObjCAtDefsField:
	    case Decl::ObjCIvar:
	      llvm_unreachable("forming non-member reference to ivar?");

	    // Enum constants are always r-values and never references.
	    // Unresolved using declarations are dependent.
	    case Decl::EnumConstant:
	    case Decl::UnresolvedUsingValue:
	    case Decl::OMPDeclareReduction:
	      valueKind = VK_RValue;
	      break;

	    // Fields and indirect fields that got here must be for
	    // pointer-to-member expressions; we just call them l-values for
	    // internal consistency, because this subexpression doesn't really
	    // exist in the high-level semantics.
	    case Decl::Field:
	    case Decl::IndirectField:
	      assert(getLangOpts().CPlusPlus &&
	             "building reference to field in C?");

	      // These can't have reference type in well-formed programs, but
	      // for internal consistency we do this anyway.
	      type = type.getNonReferenceType();
	      valueKind = VK_LValue;
	      break;

	    // Non-type template parameters are either l-values or r-values
	    // depending on the type.
	    case Decl::NonTypeTemplateParm: {
	      if (const ReferenceType *reftype = type->getAs<ReferenceType>()) {
	        type = reftype->getPointeeType();
	        valueKind = VK_LValue; // even if the parameter is an r-value reference
	        break;
	      }

	      // For non-references, we need to strip qualifiers just in case
	      // the template parameter was declared as 'const int' or whatever.
	      valueKind = VK_RValue;
	      type = type.getUnqualifiedType();
	      break;
	    }

	    case Decl::Var:
	    case Decl::VarTemplateSpecialization:
	    case Decl::VarTemplatePartialSpecialization:
	    case Decl::Decomposition:
	    case Decl::OMPCapturedExpr:
	      // In C, "extern void blah;" is valid and is an r-value.
	      if (!getLangOpts().CPlusPlus &&
	          !type.hasQualifiers() &&
	          type->isVoidType()) {
	        valueKind = VK_RValue;
	        break;
	      }
	      // fallthrough

	    case Decl::ImplicitParam:
	    case Decl::ParmVar: {
	      // These are always l-values.
	      valueKind = VK_LValue;
	      type = type.getNonReferenceType();

	      // FIXME: Does the addition of const really only apply in
	      // potentially-evaluated contexts? Since the variable isn't actually
	      // captured in an unevaluated context, it seems that the answer is no.
	      if (!isUnevaluatedContext()) {
	        QualType CapturedType = getCapturedDeclRefType(cast<VarDecl>(VD), Loc);
	        if (!CapturedType.isNull())
	          type = CapturedType;
	      }

	      break;
	    }

	    case Decl::Binding: {
	      // These are always lvalues.
	      valueKind = VK_LValue;
	      type = type.getNonReferenceType();
	      // FIXME: Support lambda-capture of BindingDecls, once CWG actually
	      // decides how that's supposed to work.
	      auto *BD = cast<BindingDecl>(VD);
	      if (BD->getDeclContext()->isFunctionOrMethod() &&
	          BD->getDeclContext() != CurContext)
	        diagnoseUncapturableValueReference(*this, Loc, BD, CurContext);
	      break;
	    }

	    case Decl::Function: {
	      if (unsigned BID = cast<FunctionDecl>(VD)->getBuiltinID()) {
	        if (!Context.BuiltinInfo.isPredefinedLibFunction(BID)) {
	          type = Context.BuiltinFnTy;
	          valueKind = VK_RValue;
	          break;
	        }
	      }

	      const FunctionType *fty = type->castAs<FunctionType>();

	      // If we're referring to a function with an __unknown_anytype
	      // result type, make the entire expression __unknown_anytype.
	      if (fty->getReturnType() == Context.UnknownAnyTy) {
	        type = Context.UnknownAnyTy;
	        valueKind = VK_RValue;
	        break;
	      }

	      // Functions are l-values in C++.
	      if (getLangOpts().CPlusPlus) {
	        valueKind = VK_LValue;
	        break;
	      }

	      // C99 DR 316 says that, if a function type comes from a
	      // function definition (without a prototype), that type is only
	      // used for checking compatibility. Therefore, when referencing
	      // the function, we pretend that we don't have the full function
	      // type.
	      if (!cast<FunctionDecl>(VD)->hasPrototype() &&
	          isa<FunctionProtoType>(fty))
	        type = Context.getFunctionNoProtoType(fty->getReturnType(),
	                                              fty->getExtInfo());

	      // Functions are r-values in C.
	      valueKind = VK_RValue;
	      break;
	    }

	    case Decl::CXXDeductionGuide:
	      llvm_unreachable("building reference to deduction guide");

	    case Decl::MSProperty:
	      valueKind = VK_LValue;
	      break;

	    case Decl::CXXMethod:
	      // If we're referring to a method with an __unknown_anytype
	      // result type, make the entire expression __unknown_anytype.
	      // This should only be possible with a type written directly.
	      if (const FunctionProtoType *proto
	            = dyn_cast<FunctionProtoType>(VD->getType()))
	        if (proto->getReturnType() == Context.UnknownAnyTy) {
	          type = Context.UnknownAnyTy;
	          valueKind = VK_RValue;
	          break;
	        }

	      // C++ methods are l-values if static, r-values if non-static.
	      if (cast<CXXMethodDecl>(VD)->isStatic()) {
	        valueKind = VK_LValue;
	        break;
	      }
	      // fallthrough

	    case Decl::CXXConversion:
	    case Decl::CXXDestructor:
	    case Decl::CXXConstructor:
	      valueKind = VK_RValue;
	      break;
	    }

	    return BuildDeclRefExpr(VD, type, valueKind, NameInfo, &SS, FoundD,
	                            TemplateArgs);
	  }
	}

	/// Convert the UTF-8 string \p Source into wide characters of
	/// \p CharByteWidth bytes each, storing the raw bytes in \p Target.
	///
	/// \p Target is first sized for one wide character per source byte plus one
	/// extra, then trimmed to the number of bytes actually written. The
	/// conversion is asserted to succeed.
	static void ConvertUTF8ToWideString(unsigned CharByteWidth, StringRef Source,
	                                    SmallString<32> &Target) {
	  Target.resize(CharByteWidth * (Source.size() + 1));

	  char *const BufferStart = &Target[0];
	  char *ResultPtr = BufferStart;
	  const llvm::UTF8 *ErrorPtr;
	  bool success =
	      llvm::ConvertUTF8toWide(CharByteWidth, Source, ResultPtr, ErrorPtr);
	  (void)success;
	  assert(success);

	  // Shrink the buffer to the bytes produced by the conversion.
	  Target.resize(ResultPtr - BufferStart);
	}

	/// Build a PredefinedExpr (such as __func__) of kind \p IT at \p Loc.
	///
	/// The name is computed from the current block, lambda, captured statement
	/// or function. Outside of any of these, an extension diagnostic is emitted
	/// and the translation unit is used instead. In a dependent context the
	/// expression gets a dependent type and no string literal.
	ExprResult Sema::BuildPredefinedExpr(SourceLocation Loc,
	                                     PredefinedExpr::IdentType IT) {
	  // Pick the current block, lambda, captured statement or function.
	  Decl *currentDecl = nullptr;
	  if (const BlockScopeInfo *BSI = getCurBlock())
	    currentDecl = BSI->TheDecl;
	  else if (const LambdaScopeInfo *LSI = getCurLambda())
	    currentDecl = LSI->CallOperator;
	  else if (const CapturedRegionScopeInfo *CSI = getCurCapturedRegion())
	    currentDecl = CSI->TheCapturedDecl;
	  else
	    currentDecl = getCurFunctionOrMethodDecl();

	  if (!currentDecl) {
	    Diag(Loc, diag::ext_predef_outside_function);
	    currentDecl = Context.getTranslationUnitDecl();
	  }

	  QualType ResTy;
	  StringLiteral *SL = nullptr;
	  if (cast<DeclContext>(currentDecl)->isDependentContext())
	    ResTy = Context.DependentTy;
	  else {
	    // Pre-defined identifiers are of type char[x], where x is the length of
	    // the string.
	    auto Str = PredefinedExpr::ComputeName(IT, currentDecl);
	    unsigned Length = Str.length();

	    // The array bound includes room for the terminating null character.
	    llvm::APInt LengthI(32, Length + 1);
	    if (IT == PredefinedExpr::LFunction) {
	      ResTy = Context.WideCharTy.withConst();
	      SmallString<32> RawChars;
	      ConvertUTF8ToWideString(Context.getTypeSizeInChars(ResTy).getQuantity(),
	                              Str, RawChars);
	      ResTy = Context.getConstantArrayType(ResTy, LengthI, ArrayType::Normal,
	                                           /*IndexTypeQuals*/ 0);
	      SL = StringLiteral::Create(Context, RawChars, StringLiteral::Wide,
	                                 /*Pascal*/ false, ResTy, Loc);
	    } else {
	      ResTy = Context.CharTy.withConst();
	      ResTy = Context.getConstantArrayType(ResTy, LengthI, ArrayType::Normal,
	                                           /*IndexTypeQuals*/ 0);
	      SL = StringLiteral::Create(Context, Str, StringLiteral::Ascii,
	                                 /*Pascal*/ false, ResTy, Loc);
	    }
	  }

	  return new (Context) PredefinedExpr(Loc, ResTy, IT, SL);
	}

	/// Map a predefined-identifier keyword token to the matching
	/// PredefinedExpr kind and build the expression at \p Loc.
	ExprResult Sema::ActOnPredefinedExpr(SourceLocation Loc, tok::TokenKind Kind) {
	  switch (Kind) {
	  case tok::kw___func__: // [C99 6.4.2.2]
	    return BuildPredefinedExpr(Loc, PredefinedExpr::Func);
	  case tok::kw___FUNCTION__:
	    return BuildPredefinedExpr(Loc, PredefinedExpr::Function);
	  case tok::kw___FUNCDNAME__: // [MS]
	    return BuildPredefinedExpr(Loc, PredefinedExpr::FuncDName);
	  case tok::kw___FUNCSIG__: // [MS]
	    return BuildPredefinedExpr(Loc, PredefinedExpr::FuncSig);
	  case tok::kw_L__FUNCTION__:
	    return BuildPredefinedExpr(Loc, PredefinedExpr::LFunction);
	  case tok::kw___PRETTY_FUNCTION__:
	    return BuildPredefinedExpr(Loc, PredefinedExpr::PrettyFunction);
	  default:
	    break;
	  }

	  llvm_unreachable("Unknown simple primary expr!");
	}

	/// Build the expression for the character-constant token \p Tok.
	///
	/// \param Tok the character literal token.
	/// \param UDLScope the scope used to look up a literal operator when the
	///        token has a user-defined suffix; null if user-defined literals
	///        are not permitted here.
	///
	/// \returns a CharacterLiteral, a call to a literal operator when a
	/// user-defined suffix is present, or an error result.
	ExprResult Sema::ActOnCharacterConstant(const Token &Tok, Scope *UDLScope) {
	  SmallString<16> CharBuffer;
	  bool Invalid = false;
	  StringRef ThisTok = PP.getSpelling(Tok, CharBuffer, &Invalid);
	  if (Invalid)
	    return ExprError();

	  CharLiteralParser Literal(ThisTok.begin(), ThisTok.end(), Tok.getLocation(),
	                            PP, Tok.getKind());
	  if (Literal.hadError())
	    return ExprError();

	  QualType Ty;
	  if (Literal.isWide())
	    Ty = Context.WideCharTy; // L'x' -> wchar_t in C and C++.
	  else if (Literal.isUTF16())
	    Ty = Context.Char16Ty; // u'x' -> char16_t in C11 and C++11.
	  else if (Literal.isUTF32())
	    Ty = Context.Char32Ty; // U'x' -> char32_t in C11 and C++11.
	  else if (!getLangOpts().CPlusPlus || Literal.isMultiChar())
	    Ty = Context.IntTy;   // 'x' -> int in C, 'wxyz' -> int in C++.
	  else
	    Ty = Context.CharTy;  // 'x' -> char in C++

	  CharacterLiteral::CharacterKind Kind = CharacterLiteral::Ascii;
	  if (Literal.isWide())
	    Kind = CharacterLiteral::Wide;
	  else if (Literal.isUTF16())
	    Kind = CharacterLiteral::UTF16;
	  else if (Literal.isUTF32())
	    Kind = CharacterLiteral::UTF32;
	  else if (Literal.isUTF8())
	    Kind = CharacterLiteral::UTF8;

	  Expr *Lit = new (Context) CharacterLiteral(Literal.getValue(), Kind, Ty,
	                                             Tok.getLocation());

	  // Without a user-defined suffix the plain literal is the result.
	  if (Literal.getUDSuffix().empty())
	    return Lit;

	  // We're building a user-defined literal.
	  IdentifierInfo *UDSuffix = &Context.Idents.get(Literal.getUDSuffix());
	  SourceLocation UDSuffixLoc =
	    getUDSuffixLoc(*this, Tok.getLocation(), Literal.getUDSuffixOffset());

	  // Make sure we're allowed user-defined literals here.
	  if (!UDLScope)
	    return ExprError(Diag(UDSuffixLoc, diag::err_invalid_character_udl));

	  // C++11 [lex.ext]p6: The literal L is treated as a call of the form
	  //   operator "" X (ch)
	  return BuildCookedLiteralOperatorCall(*this, UDLScope, UDSuffix, UDSuffixLoc,
	                                        Lit, Tok.getLocation());
	}

	/// Build an IntegerLiteral of type 'int' with value \p Val at \p Loc, using
	/// the target's int width for the literal's bit width.
	ExprResult Sema::ActOnIntegerConstant(SourceLocation Loc, uint64_t Val) {
	  llvm::APInt Value(Context.getTargetInfo().getIntWidth(), Val);
	  return IntegerLiteral::Create(Context, Value, Context.IntTy, Loc);
	}

	/// Build a FloatingLiteral of type \p Ty from the parsed numeric literal.
	///
	/// A warning is emitted at \p Loc when the value overflows, or when it
	/// underflows all the way to zero. The literal is marked exact only when
	/// the conversion reported no status flags at all.
	static Expr *BuildFloatingLiteral(Sema &S, NumericLiteralParser &Literal,
	                                  QualType Ty, SourceLocation Loc) {
	  const llvm::fltSemantics &Format = S.Context.getFloatTypeSemantics(Ty);

	  using llvm::APFloat;
	  APFloat Val(Format);

	  APFloat::opStatus result = Literal.GetFloatValue(Val);

	  // Overflow is always an error, but underflow is only an error if
	  // we underflowed to zero (APFloat reports denormals as underflow).
	  if ((result & APFloat::opOverflow) ||
	      ((result & APFloat::opUnderflow) && Val.isZero())) {
	    unsigned diagnostic;
	    SmallString<20> buffer;
	    // The diagnostic prints the largest or smallest value of the format.
	    if (result & APFloat::opOverflow) {
	      diagnostic = diag::warn_float_overflow;
	      APFloat::getLargest(Format).toString(buffer);
	    } else {
	      diagnostic = diag::warn_float_underflow;
	      APFloat::getSmallest(Format).toString(buffer);
	    }

	    S.Diag(Loc, diagnostic)
	      << Ty
	      << StringRef(buffer.data(), buffer.size());
	  }

	  bool isExact = (result == APFloat::opOK);
	  return FloatingLiteral::Create(S.Context, Val, isExact, Ty, Loc);
	}

	/// Validate the argument expression \p E of a loop hint pragma.
	///
	/// The argument must have a non-boolean, non-character integer type and be
	/// an integer constant expression with a strictly positive value that fits
	/// in 31 bits. Value-dependent expressions are accepted without checking.
	///
	/// \returns true if the expression is invalid (a diagnostic has been
	/// emitted or the constant-expression check failed), false otherwise.
	bool Sema::CheckLoopHintExpr(Expr *E, SourceLocation Loc) {
	  assert(E && "Invalid expression");

	  if (E->isValueDependent())
	    return false;

	  QualType QT = E->getType();
	  if (!QT->isIntegerType() || QT->isBooleanType() || QT->isCharType()) {
	    Diag(E->getExprLoc(), diag::err_pragma_loop_invalid_argument_type) << QT;
	    return true;
	  }

	  llvm::APSInt ValueAPS;
	  ExprResult R = VerifyIntegerConstantExpression(E, &ValueAPS);

	  if (R.isInvalid())
	    return true;

	  bool ValueIsPositive = ValueAPS.isStrictlyPositive();
	  if (!ValueIsPositive || ValueAPS.getActiveBits() > 31) {
	    Diag(E->getExprLoc(), diag::err_pragma_loop_invalid_argument_value)
	        << ValueAPS.toString(10) << ValueIsPositive;
	    return true;
	  }

	  return false;
	}

	ExprResult Sema::ActOnNumericConstant(const Token &Tok, Scope *UDLScope) {
	// Fast path for a single digit (which is quite common). A single digit
	// cannot have a trigraph, escaped newline, radix prefix, or suffix.
	if (Tok.getLength() == 1) {
	const char Val = PP.getSpellingOfSingleCharacterNumericConstant(Tok);
	return ActOnIntegerConstant(Tok.getLocation(), Val-'0');
	}

	SmallString<128> SpellingBuffer;
	// NumericLiteralParser wants to overread by one character. Add padding to
	// the buffer in case the token is copied to the buffer. If getSpelling()
	// returns a StringRef to the memory buffer, it should have a null char at
	// the EOF, so it is also safe.
	SpellingBuffer.resize(Tok.getLength() + 1);

	// Get the spelling of the token, which eliminates trigraphs, etc.
	bool Invalid = false;
	StringRef TokSpelling = PP.getSpelling(Tok, SpellingBuffer, &Invalid);
	if (Invalid)
	return ExprError();

	NumericLiteralParser Literal(TokSpelling, Tok.getLocation(), PP);
	if (Literal.hadError)
	return ExprError();

	if (Literal.hasUDSuffix()) {
	// We're building a user-defined literal.
	IdentifierInfo *UDSuffix = &Context.Idents.get(Literal.getUDSuffix());
	SourceLocation UDSuffixLoc =
	getUDSuffixLoc(*this, Tok.getLocation(), Literal.getUDSuffixOffset());

	// Make sure we're allowed user-defined literals here.
	if (!UDLScope)
	return ExprError(Diag(UDSuffixLoc, diag::err_invalid_numeric_udl));

	QualType CookedTy;
	if (Literal.isFloatingLiteral()) {
	// C++11 [lex.ext]p4: If S contains a literal operator with parameter type
	// long double, the literal is treated as a call of the form
	// operator "" X (f L)
	CookedTy = Context.LongDoubleTy;
	} else {
	// C++11 [lex.ext]p3: If S contains a literal operator with parameter type
	// unsigned long long, the literal is treated as a call of the form
	// operator "" X (n ULL)
	CookedTy = Context.UnsignedLongLongTy;
	}

	DeclarationName OpName =
	Context.DeclarationNames.getCXXLiteralOperatorName(UDSuffix);
	DeclarationNameInfo OpNameInfo(OpName, UDSuffixLoc);
	OpNameInfo.setCXXLiteralOperatorNameLoc(UDSuffixLoc);

	SourceLocation TokLoc = Tok.getLocation();

	// Perform literal operator lookup to determine if we're building a raw
	// literal or a cooked one.
	LookupResult R(*this, OpName, UDSuffixLoc, LookupOrdinaryName);
	switch (LookupLiteralOperator(UDLScope, R, CookedTy,
	/AllowRaw/true, /AllowTemplate/true,
	/AllowStringTemplate/false)) {
	case LOLR_Error:
	return ExprError();

	case LOLR_Cooked: {
	Expr *Lit;
	if (Literal.isFloatingLiteral()) {
	Lit = BuildFloatingLiteral(*this, Literal, CookedTy, Tok.getLocation());
	} else {
	llvm::APInt ResultVal(Context.getTargetInfo().getLongLongWidth(), 0);
	if (Literal.GetIntegerValue(ResultVal))
	Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
	<< /* Unsigned */ 1;
	Lit = IntegerLiteral::Create(Context, ResultVal, CookedTy,
	Tok.getLocation());
	}
	return BuildLiteralOperatorCall(R, OpNameInfo, Lit, TokLoc);
	}

	case LOLR_Raw: {
	// C++11 [lit.ext]p3, p4: If S contains a raw literal operator, the
	// literal is treated as a call of the form
	// operator "" X ("n")
	unsigned Length = Literal.getUDSuffixOffset();
	QualType StrTy = Context.getConstantArrayType(
	Context.CharTy.withConst(), llvm::APInt(32, Length + 1),
	ArrayType::Normal, 0);
	Expr *Lit = StringLiteral::Create(
	Context, StringRef(TokSpelling.data(), Length), StringLiteral::Ascii,
	/Pascal/false, StrTy, &TokLoc, 1);
	return BuildLiteralOperatorCall(R, OpNameInfo, Lit, TokLoc);
	}

	case LOLR_Template: {
	// C++11 [lit.ext]p3, p4: Otherwise (S contains a literal operator
	// template), L is treated as a call of the form
	// operator "" X <'c1', 'c2', ... 'ck'>()
	// where n is the source character sequence c1 c2 ... ck.
	TemplateArgumentListInfo ExplicitArgs;
	unsigned CharBits = Context.getIntWidth(Context.CharTy);
	bool CharIsUnsigned = Context.CharTy->isUnsignedIntegerType();
	llvm::APSInt Value(CharBits, CharIsUnsigned);
	for (unsigned I = 0, N = Literal.getUDSuffixOffset(); I != N; ++I) {
	Value = TokSpelling[I];
	TemplateArgument Arg(Context, Value, Context.CharTy);
	TemplateArgumentLocInfo ArgInfo;
	ExplicitArgs.addArgument(TemplateArgumentLoc(Arg, ArgInfo));
	}
	return BuildLiteralOperatorCall(R, OpNameInfo, None, TokLoc,
	&ExplicitArgs);
	}
	case LOLR_StringTemplate:
	llvm_unreachable("unexpected literal operator lookup result");
	}
	}

	Expr *Res;

	if (Literal.isFloatingLiteral()) {
	QualType Ty;
	if (Literal.isHalf){
	if (getOpenCLOptions().isEnabled("cl_khr_fp16"))
	Ty = Context.HalfTy;
	else {
	Diag(Tok.getLocation(), diag::err_half_const_requires_fp16);
	return ExprError();
	}
	} else if (Literal.isFloat)
	Ty = Context.FloatTy;
	else if (Literal.isLong)
	Ty = Context.LongDoubleTy;
	else if (Literal.isFloat128)
	Ty = Context.Float128Ty;
	else
	Ty = Context.DoubleTy;

	Res = BuildFloatingLiteral(*this, Literal, Ty, Tok.getLocation());

	if (Ty == Context.DoubleTy) {
	if (getLangOpts().SinglePrecisionConstants) {
	const BuiltinType *BTy = Ty->getAs<BuiltinType>();
	if (BTy->getKind() != BuiltinType::Float) {
	Res = ImpCastExprToType(Res, Context.FloatTy, CK_FloatingCast).get();
	}
	} else if (getLangOpts().OpenCL &&
	!getOpenCLOptions().isEnabled("cl_khr_fp64")) {
	// Impose single-precision float type when cl_khr_fp64 is not enabled.
	Diag(Tok.getLocation(), diag::warn_double_const_requires_fp64);
	Res = ImpCastExprToType(Res, Context.FloatTy, CK_FloatingCast).get();
	}
	}
	} else if (!Literal.isIntegerLiteral()) {
	return ExprError();
	} else {
	QualType Ty;

	// 'long long' is a C99 or C++11 feature.
	if (!getLangOpts().C99 && Literal.isLongLong) {
	if (getLangOpts().CPlusPlus)
	Diag(Tok.getLocation(),
	getLangOpts().CPlusPlus11 ?
	diag::warn_cxx98_compat_longlong : diag::ext_cxx11_longlong);
	else
	Diag(Tok.getLocation(), diag::ext_c99_longlong);
	}

	// Get the value in the widest-possible width.
	unsigned MaxWidth = Context.getTargetInfo().getIntMaxTWidth();
	llvm::APInt ResultVal(MaxWidth, 0);

	if (Literal.GetIntegerValue(ResultVal)) {
	// If this value didn't fit into uintmax_t, error and force to ull.
	Diag(Tok.getLocation(), diag::err_integer_literal_too_large)
	<< /* Unsigned */ 1;
	Ty = Context.UnsignedLongLongTy;
	assert(Context.getTypeSize(Ty) == ResultVal.getBitWidth() &&
	"long long is not intmax_t?");
	} else {
	// If this value fits into a ULL, try to figure out what else it fits into
	// according to the rules of C99 6.4.4.1p5.

	// Octal, Hexadecimal, and integers with a U suffix are allowed to
	// be an unsigned int.
	bool AllowUnsigned = Literal.isUnsigned \|\| Literal.getRadix() != 10;

	// Check from smallest to largest, picking the smallest type we can.
	unsigned Width = 0;

	// Microsoft specific integer suffixes are explicitly sized.
	if (Literal.MicrosoftInteger) {
	if (Literal.MicrosoftInteger == 8 && !Literal.isUnsigned) {
	Width = 8;
	Ty = Context.CharTy;
	} else {
	Width = Literal.MicrosoftInteger;
	Ty = Context.getIntTypeForBitwidth(Width,
	/Signed=/!Literal.isUnsigned);
	}
	}

	if (Ty.isNull() && !Literal.isLong && !Literal.isLongLong) {
	// Are int/unsigned possibilities?
	unsigned IntSize = Context.getTargetInfo().getIntWidth();

	// Does it fit in a unsigned int?
	if (ResultVal.isIntN(IntSize)) {
	// Does it fit in a signed int?
	if (!Literal.isUnsigned && ResultVal[IntSize-1] == 0)
	Ty = Context.IntTy;
	else if (AllowUnsigned)
	Ty = Context.UnsignedIntTy;
	Width = IntSize;
	}
	}

	// Are long/unsigned long possibilities?
	if (Ty.isNull() && !Literal.isLongLong) {
	unsigned LongSize = Context.getTargetInfo().getLongWidth();

	// Does it fit in a unsigned long?
	if (ResultVal.isIntN(LongSize)) {
	// Does it fit in a signed long?
	if (!Literal.isUnsigned && ResultVal[LongSize-1] == 0)
	Ty = Context.LongTy;
	else if (AllowUnsigned)
	Ty = Context.UnsignedLongTy;
	// Check according to the rules of C90 6.1.3.2p5. C++03 [lex.icon]p2
	// is compatible.
	else if (!getLangOpts().C99 && !getLangOpts().CPlusPlus11) {
	const unsigned LongLongSize =
	Context.getTargetInfo().getLongLongWidth();
	Diag(Tok.getLocation(),
	getLangOpts().CPlusPlus
	? Literal.isLong
	? diag::warn_old_implicitly_unsigned_long_cxx
	: /C++98 UB/ diag::
	ext_old_implicitly_unsigned_long_cxx
	: diag::warn_old_implicitly_unsigned_long)
	<< (LongLongSize > LongSize ? /will have type 'long long'/ 0
	: /will be ill-formed/ 1);
	Ty = Context.UnsignedLongTy;
	}
	Width = LongSize;
	}
	}

	// Check long long if needed.
	if (Ty.isNull()) {
	unsigned LongLongSize = Context.getTargetInfo().getLongLongWidth();

	// Does it fit in a unsigned long long?
	if (ResultVal.isIntN(LongLongSize)) {
	// Does it fit in a signed long long?
	// To be compatible with MSVC, hex integer literals ending with the
	// LL or i64 suffix are always signed in Microsoft mode.
	if (!Literal.isUnsigned && (ResultVal[LongLongSize-1] == 0 \|\|
	(getLangOpts().MSVCCompat && Literal.isLongLong)))
	Ty = Context.LongLongTy;
	else if (AllowUnsigned)
	Ty = Context.UnsignedLongLongTy;
	Width = LongLongSize;
	}
	}

	// If we still couldn't decide a type, we probably have something that
	// does not fit in a signed long long, but has no U suffix.
	if (Ty.isNull()) {
	Diag(Tok.getLocation(), diag::ext_integer_literal_too_large_for_signed);
	Ty = Context.UnsignedLongLongTy;
	Width = Context.getTargetInfo().getLongLongWidth();
	}

	if (ResultVal.getBitWidth() != Width)
	ResultVal = ResultVal.trunc(Width);
	}
	Res = IntegerLiteral::Create(Context, ResultVal, Ty, Tok.getLocation());
	}

	// If this is an imaginary literal, create the ImaginaryLiteral wrapper.
	if (Literal.isImaginary)
	Res = new (Context) ImaginaryLiteral(Res,
	Context.getComplexType(Res->getType()));

	return Res;
	}

	/// Wrap an expression in a newly allocated ParenExpr node.
	///
	/// \param L first location forwarded to the ParenExpr constructor
	///          (presumably the opening parenthesis -- confirm against ParenExpr).
	/// \param R second location forwarded to the ParenExpr constructor
	///          (presumably the closing parenthesis -- confirm against ParenExpr).
	/// \param E the parenthesized sub-expression; must not be null.
	/// \return the new ParenExpr.
	ExprResult Sema::ActOnParenExpr(SourceLocation L, SourceLocation R, Expr *E) {
	  assert(E && "ActOnParenExpr() missing expr");
	  ParenExpr *Wrapped = new (Context) ParenExpr(L, R, E);
	  return Wrapped;
	}

	/// Validate the operand type of the OpenCL vec_step trait.
	///
	/// \param S        Sema instance used to emit the diagnostic.
	/// \param T        operand type being checked.
	/// \param Loc      location at which the diagnostic is reported.
	/// \param ArgRange source range attached to the diagnostic.
	/// \return true if \p T was rejected (a diagnostic was emitted), false if it
	///         is an arithmetic, void, or vector type.
	static bool CheckVecStepTraitOperandType(Sema &S, QualType T,
	                                         SourceLocation Loc,
	                                         SourceRange ArgRange) {
	  // [OpenCL 1.1 6.11.12] "The vec_step built-in function takes a built-in
	  // scalar or vector data type argument..."
	  // Every built-in scalar type (OpenCL 1.1 6.1.1) is either an arithmetic
	  // type (C99 6.2.5p18) or void.
	  if (!(T->isArithmeticType() || T->isVoidType() || T->isVectorType())) {
	    S.Diag(Loc, diag::err_vecstep_non_scalar_vector_type)
	      << T << ArgRange;
	    return true;
	  }

	  assert((T->isVoidType() || !T->isIncompleteType()) &&
	         "Scalar types should always be complete");
	  return false;
	}

	/// Handle operand types that sizeof/alignof accept only as an extension.
	///
	/// In C++ nothing is handled here. Otherwise, function types (for sizeof and
	/// alignof only) and void types get a diagnostic and are reported as handled.
	///
	/// \param S         Sema instance used to emit diagnostics.
	/// \param T         operand type being checked.
	/// \param Loc       location at which a diagnostic is reported.
	/// \param ArgRange  source range attached to the diagnostic.
	/// \param TraitKind the trait being applied.
	/// \return false if \p T was handled here (a diagnostic was emitted; the
	///         callers in this file then stop checking and report no error),
	///         true if the regular operand checks should proceed.
	static bool CheckExtensionTraitOperandType(Sema &S, QualType T,
	                                           SourceLocation Loc,
	                                           SourceRange ArgRange,
	                                           UnaryExprOrTypeTrait TraitKind) {
	  // Invalid types must be hard errors for SFINAE in C++.
	  if (S.LangOpts.CPlusPlus)
	    return true;

	  // C99 6.5.3.4p1:
	  if (T->isFunctionType() &&
	      (TraitKind == UETT_SizeOf || TraitKind == UETT_AlignOf)) {
	    // sizeof(function)/alignof(function) is allowed as an extension.
	    S.Diag(Loc, diag::ext_sizeof_alignof_function_type)
	      << TraitKind << ArgRange;
	    return false;
	  }

	  // Allow sizeof(void)/alignof(void) as an extension, unless in OpenCL where
	  // this is an error (OpenCL v1.1 s6.3.k)
	  // Note that the OpenCL path uses an error diagnostic ID but still returns
	  // false, exactly like the extension path.
	  if (T->isVoidType()) {
	    unsigned DiagID = S.LangOpts.OpenCL ? diag::err_opencl_sizeof_alignof_type
	                                        : diag::ext_sizeof_alignof_void_type;
	    S.Diag(Loc, DiagID) << TraitKind << ArgRange;
	    return false;
	  }

	  return true;
	}

	/// Diagnose a trait applied to an Objective-C object type when the runtime
	/// does not allow sizeof/alignof on such types.
	///
	/// \param S         Sema instance used to emit the diagnostic.
	/// \param T         operand type being checked.
	/// \param Loc       location at which the diagnostic is reported.
	/// \param ArgRange  source range attached to the diagnostic.
	/// \param TraitKind the trait being applied; the diagnostic is told whether
	///                  it is UETT_SizeOf.
	/// \return true if the operand was rejected, false otherwise.
	static bool CheckObjCTraitOperandConstraints(Sema &S, QualType T,
	                                             SourceLocation Loc,
	                                             SourceRange ArgRange,
	                                             UnaryExprOrTypeTrait TraitKind) {
	  // Nothing to reject when the runtime permits it or the type is not an
	  // ObjC object type (sizeof(interface) / sizeof(interface<proto>)).
	  if (S.LangOpts.ObjCRuntime.allowsSizeofAlignof() || !T->isObjCObjectType())
	    return false;

	  const bool IsSizeOf = (TraitKind == UETT_SizeOf);
	  S.Diag(Loc, diag::err_sizeof_nonfragile_interface)
	    << T << IsSizeOf
	    << ArgRange;
	  return true;
	}

	/// \brief Check whether E is a pointer from a decayed array type (the decayed
	/// pointer type is equal to T) and emit a warning if it is.
	///
	/// \param S   Sema instance used to emit the warning.
	/// \param Loc location at which the warning is reported.
	/// \param T   type of the enclosing operation; the warning is only issued
	///            when it equals the type of \p E.
	/// \param E   operand to inspect; only an ImplicitCastExpr whose cast kind is
	///            CK_ArrayToPointerDecay triggers the warning.
	static void warnOnSizeofOnArrayDecay(Sema &S, SourceLocation Loc, QualType T,
	                                     Expr *E) {
	  // Don't warn if the operation changed the type.
	  if (T != E->getType())
	    return;

	  // Now look for array decays.
	  ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(E);
	  if (!ICE || ICE->getCastKind() != CK_ArrayToPointerDecay)
	    return;

	  S.Diag(Loc, diag::warn_sizeof_array_decay) << ICE->getSourceRange()
	                                             << ICE->getType()
	                                             << ICE->getSubExpr()->getType();
	}

	/// \brief Check the constraints on expression operands to unary type expression
	/// and type traits.
	///
	/// Completes any types necessary and validates the constraints on the operand
	/// expression. The logic mostly mirrors the type-based overload, but may modify
	/// the expression as it completes the type for that expression through template
	/// instantiation, etc.
	///
	/// \param E        operand expression; its type must not be a reference type.
	/// \param ExprKind the trait being applied.
	/// \return true if the operand was rejected, false if it is acceptable
	///         (warnings may still have been emitted).
	bool Sema::CheckUnaryExprOrTypeTraitOperand(Expr *E,
	                                            UnaryExprOrTypeTrait ExprKind) {
	  QualType ExprTy = E->getType();
	  assert(!ExprTy->isReferenceType());

	  if (ExprKind == UETT_VecStep)
	    return CheckVecStepTraitOperandType(*this, ExprTy, E->getExprLoc(),
	                                        E->getSourceRange());

	  // Whitelist some types as extensions
	  if (!CheckExtensionTraitOperandType(*this, ExprTy, E->getExprLoc(),
	                                      E->getSourceRange(), ExprKind))
	    return false;

	  // 'alignof' applied to an expression only requires the base element type of
	  // the expression to be complete. 'sizeof' requires the expression's type to
	  // be complete (and will attempt to complete it if it's an array of unknown
	  // bound).
	  if (ExprKind == UETT_AlignOf) {
	    if (RequireCompleteType(E->getExprLoc(),
	                            Context.getBaseElementType(E->getType()),
	                            diag::err_sizeof_alignof_incomplete_type, ExprKind,
	                            E->getSourceRange()))
	      return true;
	  } else {
	    if (RequireCompleteExprType(E, diag::err_sizeof_alignof_incomplete_type,
	                                ExprKind, E->getSourceRange()))
	      return true;
	  }

	  // Completing the expression's type may have changed it.
	  ExprTy = E->getType();
	  assert(!ExprTy->isReferenceType());

	  if (ExprTy->isFunctionType()) {
	    Diag(E->getExprLoc(), diag::err_sizeof_alignof_function_type)
	      << ExprKind << E->getSourceRange();
	    return true;
	  }

	  // The operand for sizeof and alignof is in an unevaluated expression context,
	  // so side effects could result in unintended consequences.
	  if ((ExprKind == UETT_SizeOf || ExprKind == UETT_AlignOf) &&
	      !inTemplateInstantiation() && E->HasSideEffects(Context, false))
	    Diag(E->getExprLoc(), diag::warn_side_effects_unevaluated_context);

	  if (CheckObjCTraitOperandConstraints(*this, ExprTy, E->getExprLoc(),
	                                       E->getSourceRange(), ExprKind))
	    return true;

	  if (ExprKind == UETT_SizeOf) {
	    // Warn when sizeof is applied to a parameter that was declared with an
	    // array type but actually has pointer type.
	    if (DeclRefExpr *DeclRef = dyn_cast<DeclRefExpr>(E->IgnoreParens())) {
	      if (ParmVarDecl *PVD = dyn_cast<ParmVarDecl>(DeclRef->getFoundDecl())) {
	        QualType OType = PVD->getOriginalType();
	        QualType Type = PVD->getType();
	        if (Type->isPointerType() && OType->isArrayType()) {
	          Diag(E->getExprLoc(), diag::warn_sizeof_array_param)
	            << Type << OType;
	          Diag(PVD->getLocation(), diag::note_declared_at);
	        }
	      }
	    }

	    // Warn on "sizeof(array op x)" and "sizeof(x op array)", where the array
	    // decays into a pointer and returns an unintended result. This is most
	    // likely a typo for "sizeof(array) op x".
	    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(E->IgnoreParens())) {
	      warnOnSizeofOnArrayDecay(*this, BO->getOperatorLoc(), BO->getType(),
	                               BO->getLHS());
	      warnOnSizeofOnArrayDecay(*this, BO->getOperatorLoc(), BO->getType(),
	                               BO->getRHS());
	    }
	  }

	  return false;
	}

	/// \brief Check the constraints on operands to unary expression and type
	/// traits.
	///
	/// This will complete any types necessary, and validate the various constraints
	/// on those operands.
	///
	/// The UsualUnaryConversions() function is not called by this routine.
	/// C99 6.3.2.1p[2-4] all state:
	/// Except when it is the operand of the sizeof operator ...
	///
	/// C++ [expr.sizeof]p4
	/// The lvalue-to-rvalue, array-to-pointer, and function-to-pointer
	/// standard conversions are not applied to the operand of sizeof.
	///
	/// This policy is followed for all of the unary trait expressions.
	///
	/// \param ExprType  operand type; dependent types are accepted unchecked.
	/// \param OpLoc     location at which diagnostics are reported.
	/// \param ExprRange source range attached to diagnostics.
	/// \param ExprKind  the trait being applied.
	/// \return true if the operand type was rejected, false otherwise.
	bool Sema::CheckUnaryExprOrTypeTraitOperand(QualType ExprType,
	                                            SourceLocation OpLoc,
	                                            SourceRange ExprRange,
	                                            UnaryExprOrTypeTrait ExprKind) {
	  if (ExprType->isDependentType())
	    return false;

	  // C++ [expr.sizeof]p2:
	  // When applied to a reference or a reference type, the result
	  // is the size of the referenced type.
	  // C++11 [expr.alignof]p3:
	  // When alignof is applied to a reference type, the result
	  // shall be the alignment of the referenced type.
	  if (const ReferenceType *Ref = ExprType->getAs<ReferenceType>())
	    ExprType = Ref->getPointeeType();

	  // C11 6.5.3.4/3, C++11 [expr.alignof]p3:
	  // When alignof or _Alignof is applied to an array type, the result
	  // is the alignment of the element type.
	  if (ExprKind == UETT_AlignOf || ExprKind == UETT_OpenMPRequiredSimdAlign)
	    ExprType = Context.getBaseElementType(ExprType);

	  if (ExprKind == UETT_VecStep)
	    return CheckVecStepTraitOperandType(*this, ExprType, OpLoc, ExprRange);

	  // Whitelist some types as extensions
	  if (!CheckExtensionTraitOperandType(*this, ExprType, OpLoc, ExprRange,
	                                      ExprKind))
	    return false;

	  if (RequireCompleteType(OpLoc, ExprType,
	                          diag::err_sizeof_alignof_incomplete_type,
	                          ExprKind, ExprRange))
	    return true;

	  if (ExprType->isFunctionType()) {
	    Diag(OpLoc, diag::err_sizeof_alignof_function_type)
	      << ExprKind << ExprRange;
	    return true;
	  }

	  if (CheckObjCTraitOperandConstraints(*this, ExprType, OpLoc, ExprRange,
	                                       ExprKind))
	    return true;

	  return false;
	}

	/// Check an expression operand of alignof.
	///
	/// Parentheses are stripped first. Type-dependent operands are accepted
	/// without further checks; bit-fields and members of incomplete records are
	/// rejected with a diagnostic; everything else is forwarded to
	/// Sema::CheckUnaryExprOrTypeTraitOperand with UETT_AlignOf.
	///
	/// \return true if the operand was rejected, false otherwise.
	static bool CheckAlignOfExpr(Sema &S, Expr *E) {
	  E = E->IgnoreParens();

	  // Nothing more can be said about a dependent expression.
	  if (E->isTypeDependent())
	    return false;

	  if (E->getObjectKind() == OK_BitField) {
	    S.Diag(E->getExprLoc(), diag::err_sizeof_alignof_typeof_bitfield)
	      << 1 << E->getSourceRange();
	    return true;
	  }

	  // Find the declaration named by the operand, if it names one directly.
	  ValueDecl *Referenced = nullptr;
	  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
	    Referenced = DRE->getDecl();
	  else if (auto *ME = dyn_cast<MemberExpr>(E))
	    Referenced = ME->getMemberDecl();

	  // A field needs its containing struct to be completely defined so that the
	  // layout can be computed.
	  //
	  // From C++11 on this situation can arise when the member is named in a way
	  // that is not transformed into a member access expression (for instance in
	  // an unevaluated operand), or when it is named in a trailing-return-type.
	  //
	  // For the record: __alignof__ on expressions is a GCC extension, and GCC
	  // seems to permit this but always gives the nonsensical answer 0.
	  //
	  // The layout is not strictly required here -- all the appropriate
	  // alignment-lowering attributes could be checked directly -- but that would
	  // duplicate a lot of logic for a marginal use-case.
	  if (auto *Field = dyn_cast_or_null<FieldDecl>(Referenced)) {
	    // Fast path: having found a member, we at least know the record has a
	    // definition.
	    if (!Field->getParent()->isCompleteDefinition()) {
	      S.Diag(E->getExprLoc(), diag::err_alignof_member_of_incomplete_type)
	        << E->getSourceRange();
	      return true;
	    }

	    // A field of non-reference type must have a complete type (or be a
	    // flexible array member, which is explicitly white-listed anyway), so
	    // the remaining checks are trivial.
	    const bool IsReferenceField = Field->getType()->isReferenceType();
	    if (!IsReferenceField)
	      return false;
	  }

	  return S.CheckUnaryExprOrTypeTraitOperand(E, UETT_AlignOf);
	}

	/// Check an expression operand of vec_step.
	///
	/// Parentheses are stripped first; a type-dependent operand is accepted
	/// without further checks.
	///
	/// \return true if the operand was rejected, false otherwise.
	bool Sema::CheckVecStepExpr(Expr *E) {
	  Expr *Operand = E->IgnoreParens();

	  // Nothing more can be said about a dependent expression, so only
	  // non-dependent operands reach the real check.
	  return !Operand->isTypeDependent() &&
	         CheckUnaryExprOrTypeTraitOperand(Operand, UETT_VecStep);
	}

	/// Walk a variably-modified type and record captures for the VLA size
	/// expressions found in it.
	///
	/// Starting at \p T, the loop repeatedly steps to the pointee, element, return,
	/// or desugared type until the type is null or no longer variably modified.
	/// For each VariableArrayType that has a size expression and is not yet
	/// captured by \p CSI, and only when \p CSI is a lambda or captured-region
	/// scope, an implicit private unnamed field of size_t type is added to the
	/// corresponding record and the capture is registered with \p CSI.
	///
	/// \param Context AST context used to create the field and query size_t.
	/// \param T       type to walk; must be variably modified.
	/// \param CSI     capturing scope that receives the captures; must be non-null.
	static void captureVariablyModifiedType(ASTContext &Context, QualType T,
	                                        CapturingScopeInfo *CSI) {
	  assert(T->isVariablyModifiedType());
	  assert(CSI != nullptr);

	  // We're going to walk down into the type and look for VLA expressions.
	  do {
	    const Type *Ty = T.getTypePtr();
	    switch (Ty->getTypeClass()) {
	#define TYPE(Class, Base)
	#define ABSTRACT_TYPE(Class, Base)
	#define NON_CANONICAL_TYPE(Class, Base)
	#define DEPENDENT_TYPE(Class, Base) case Type::Class:
	#define NON_CANONICAL_UNLESS_DEPENDENT_TYPE(Class, Base)
	#include "clang/AST/TypeNodes.def"
	      // Dependent type classes (the only cases the include expands to):
	      // stop walking.
	      T = QualType();
	      break;
	    // These types are never variably-modified.
	    case Type::Builtin:
	    case Type::Complex:
	    case Type::Vector:
	    case Type::ExtVector:
	    case Type::Record:
	    case Type::Enum:
	    case Type::Elaborated:
	    case Type::TemplateSpecialization:
	    case Type::ObjCObject:
	    case Type::ObjCInterface:
	    case Type::ObjCObjectPointer:
	    case Type::ObjCTypeParam:
	    case Type::Pipe:
	      llvm_unreachable("type class is never variably-modified!");
	    case Type::Adjusted:
	      T = cast<AdjustedType>(Ty)->getOriginalType();
	      break;
	    case Type::Decayed:
	      T = cast<DecayedType>(Ty)->getPointeeType();
	      break;
	    case Type::Pointer:
	      T = cast<PointerType>(Ty)->getPointeeType();
	      break;
	    case Type::BlockPointer:
	      T = cast<BlockPointerType>(Ty)->getPointeeType();
	      break;
	    case Type::LValueReference:
	    case Type::RValueReference:
	      T = cast<ReferenceType>(Ty)->getPointeeType();
	      break;
	    case Type::MemberPointer:
	      T = cast<MemberPointerType>(Ty)->getPointeeType();
	      break;
	    case Type::ConstantArray:
	    case Type::IncompleteArray:
	      // Losing element qualification here is fine.
	      T = cast<ArrayType>(Ty)->getElementType();
	      break;
	    case Type::VariableArray: {
	      // Losing element qualification here is fine.
	      const VariableArrayType *VAT = cast<VariableArrayType>(Ty);

	      // Unknown size indication requires no size computation.
	      // Otherwise, evaluate and record it.
	      if (auto Size = VAT->getSizeExpr()) {
	        if (!CSI->isVLATypeCaptured(VAT)) {
	          RecordDecl *CapRecord = nullptr;
	          if (auto LSI = dyn_cast<LambdaScopeInfo>(CSI)) {
	            CapRecord = LSI->Lambda;
	          } else if (auto CRSI = dyn_cast<CapturedRegionScopeInfo>(CSI)) {
	            CapRecord = CRSI->TheRecordDecl;
	          }
	          if (CapRecord) {
	            auto ExprLoc = Size->getExprLoc();
	            auto SizeType = Context.getSizeType();
	            // Build the non-static data member.
	            auto Field =
	                FieldDecl::Create(Context, CapRecord, ExprLoc, ExprLoc,
	                                  /*Id*/ nullptr, SizeType, /*TInfo*/ nullptr,
	                                  /*BW*/ nullptr, /*Mutable*/ false,
	                                  /*InitStyle*/ ICIS_NoInit);
	            Field->setImplicit(true);
	            Field->setAccess(AS_private);
	            Field->setCapturedVLAType(VAT);
	            CapRecord->addDecl(Field);

	            CSI->addVLATypeCapture(ExprLoc, SizeType);
	          }
	        }
	      }
	      T = VAT->getElementType();
	      break;
	    }
	    case Type::FunctionProto:
	    case Type::FunctionNoProto:
	      T = cast<FunctionType>(Ty)->getReturnType();
	      break;
	    case Type::Paren:
	    case Type::TypeOf:
	    case Type::UnaryTransform:
	    case Type::Attributed:
	    case Type::SubstTemplateTypeParm:
	    case Type::PackExpansion:
	      // Keep walking after single level desugaring.
	      T = T.getSingleStepDesugaredType(Context);
	      break;
	    case Type::Typedef:
	      T = cast<TypedefType>(Ty)->desugar();
	      break;
	    case Type::Decltype:
	      T = cast<DecltypeType>(Ty)->desugar();
	      break;
	    case Type::Auto:
	    case Type::DeducedTemplateSpecialization:
	      T = cast<DeducedType>(Ty)->getDeducedType();
	      break;
	    case Type::TypeOfExpr:
	      T = cast<TypeOfExprType>(Ty)->getUnderlyingExpr()->getType();
	      break;
	    case Type::Atomic:
	      T = cast<AtomicType>(Ty)->getValueType();
	      break;
	    }
	  } while (!T.isNull() && T->isVariablyModifiedType());
	}

	/// \brief Build a sizeof or alignof expression given a type operand.
	///
	/// \param TInfo    type operand; a null pointer yields ExprError().
	/// \param OpLoc    location of the operator, also used for diagnostics.
	/// \param ExprKind the trait being applied.
	/// \param R        source range of the operand; its end becomes the end
	///                 location passed to the resulting expression.
	/// \return the new UnaryExprOrTypeTraitExpr (of type size_t), or ExprError()
	///         if the operand type was rejected.
	ExprResult
	Sema::CreateUnaryExprOrTypeTraitExpr(TypeSourceInfo *TInfo,
	                                     SourceLocation OpLoc,
	                                     UnaryExprOrTypeTrait ExprKind,
	                                     SourceRange R) {
	  if (!TInfo)
	    return ExprError();

	  QualType T = TInfo->getType();

	  if (!T->isDependentType() &&
	      CheckUnaryExprOrTypeTraitOperand(T, OpLoc, R, ExprKind))
	    return ExprError();

	  // A variably-modified typedef used inside nested capturing scopes must have
	  // its VLA sizes captured by every enclosing capturing scope up to (but not
	  // including) the one that declares the typedef.
	  if (T->isVariablyModifiedType() && FunctionScopes.size() > 1) {
	    if (auto *TT = T->getAs<TypedefType>()) {
	      // Walk from the innermost scope outwards, excluding the outermost one.
	      for (auto I = FunctionScopes.rbegin(),
	                E = std::prev(FunctionScopes.rend());
	           I != E; ++I) {
	        // Dereference the iterator: dyn_cast operates on the scope pointer.
	        auto *CSI = dyn_cast<CapturingScopeInfo>(*I);
	        if (CSI == nullptr)
	          break;
	        DeclContext *DC = nullptr;
	        if (auto *LSI = dyn_cast<LambdaScopeInfo>(CSI))
	          DC = LSI->CallOperator;
	        else if (auto *CRSI = dyn_cast<CapturedRegionScopeInfo>(CSI))
	          DC = CRSI->TheCapturedDecl;
	        else if (auto *BSI = dyn_cast<BlockScopeInfo>(CSI))
	          DC = BSI->TheDecl;
	        if (DC) {
	          // Stop once we reach the scope that declares the typedef itself.
	          if (DC->containsDecl(TT->getDecl()))
	            break;
	          captureVariablyModifiedType(Context, T, CSI);
	        }
	      }
	    }
	  }

	  // C99 6.5.3.4p4: the type (an unsigned integer type) is size_t.
	  return new (Context) UnaryExprOrTypeTraitExpr(
	      ExprKind, TInfo, Context.getSizeType(), OpLoc, R.getEnd());
	}

	/// \brief Build a sizeof or alignof expression given an expression
	/// operand.
	///
	/// \param E        operand expression; placeholder types are resolved first.
	/// \param OpLoc    location of the operator.
	/// \param ExprKind the trait being applied.
	/// \return the new UnaryExprOrTypeTraitExpr (of type size_t), or ExprError()
	///         if the operand was rejected.
	ExprResult
	Sema::CreateUnaryExprOrTypeTraitExpr(Expr *E, SourceLocation OpLoc,
	                                     UnaryExprOrTypeTrait ExprKind) {
	  ExprResult Resolved = CheckPlaceholderExpr(E);
	  if (Resolved.isInvalid())
	    return ExprError();
	  E = Resolved.get();

	  // Validate the operand. Type-dependent operands are checked later, so they
	  // skip validation entirely.
	  bool Rejected = false;
	  if (!E->isTypeDependent()) {
	    switch (ExprKind) {
	    case UETT_AlignOf:
	      Rejected = CheckAlignOfExpr(*this, E);
	      break;
	    case UETT_VecStep:
	      Rejected = CheckVecStepExpr(E);
	      break;
	    case UETT_OpenMPRequiredSimdAlign:
	      Diag(E->getExprLoc(), diag::err_openmp_default_simd_align_expr);
	      Rejected = true;
	      break;
	    default:
	      if (E->refersToBitField()) { // C99 6.5.3.4p1.
	        Diag(E->getExprLoc(), diag::err_sizeof_alignof_typeof_bitfield) << 0;
	        Rejected = true;
	      } else {
	        Rejected = CheckUnaryExprOrTypeTraitOperand(E, UETT_SizeOf);
	      }
	      break;
	    }
	  }

	  if (Rejected)
	    return ExprError();

	  // sizeof applied to a variable array type needs its operand transformed
	  // into a potentially evaluated expression.
	  if (ExprKind == UETT_SizeOf && E->getType()->isVariableArrayType()) {
	    ExprResult Rebuilt = TransformToPotentiallyEvaluated(E);
	    if (Rebuilt.isInvalid())
	      return ExprError();
	    E = Rebuilt.get();
	  }

	  // C99 6.5.3.4p4: the type (an unsigned integer type) is size_t.
	  SourceLocation EndLoc = E->getSourceRange().getEnd();
	  return new (Context) UnaryExprOrTypeTraitExpr(
	      ExprKind, E, Context.getSizeType(), OpLoc, EndLoc);
	}

	/// ActOnUnaryExprOrTypeTraitExpr - Handle @c sizeof(type) and @c sizeof @c
	/// expr and the same for @c alignof and @c __alignof
	/// Note that the ArgRange is invalid if isType is false.
	///
	/// \param OpLoc    location of the operator.
	/// \param ExprKind the trait being applied.
	/// \param IsType   true if \p TyOrEx is an opaque parsed type, false if it
	///                 is an Expr pointer.
	/// \param TyOrEx   the operand; a null pointer (parse error) yields
	///                 ExprError().
	/// \param ArgRange source range of the type operand (only used when
	///                 \p IsType is true).
	ExprResult
	Sema::ActOnUnaryExprOrTypeTraitExpr(SourceLocation OpLoc,
	                                    UnaryExprOrTypeTrait ExprKind, bool IsType,
	                                    void *TyOrEx, SourceRange ArgRange) {
	  // If error parsing type, ignore.
	  if (!TyOrEx) return ExprError();

	  if (IsType) {
	    TypeSourceInfo *TInfo;
	    (void) GetTypeFromParser(ParsedType::getFromOpaquePtr(TyOrEx), &TInfo);
	    return CreateUnaryExprOrTypeTraitExpr(TInfo, OpLoc, ExprKind, ArgRange);
	  }

	  Expr *ArgEx = static_cast<Expr *>(TyOrEx);
	  ExprResult Result = CreateUnaryExprOrTypeTraitExpr(ArgEx, OpLoc, ExprKind);
	  return Result;
	}

	/// Compute the result type of a __real / __imag operand.
	///
	/// \param S      Sema instance used for conversions and diagnostics.
	/// \param V      the operand; may be replaced in place by its lvalue-converted
	///               or placeholder-resolved form.
	/// \param Loc    location at which the diagnostic is reported.
	/// \param IsReal selects "__real" (true) or "__imag" (false) in the diagnostic.
	/// \return the dependent type for type-dependent operands, the element type
	///         for complex operands, the operand's own type for other arithmetic
	///         operands, and a null QualType on error.
	static QualType CheckRealImagOperand(Sema &S, ExprResult &V, SourceLocation Loc,
	                                     bool IsReal) {
	  Expr *Operand = V.get();
	  if (Operand->isTypeDependent())
	    return S.Context.DependentTy;

	  // _Real and _Imag are only l-values for normal l-values.
	  if (Operand->getObjectKind() != OK_Ordinary) {
	    V = S.DefaultLvalueConversion(Operand);
	    if (V.isInvalid())
	      return QualType();
	    Operand = V.get();
	  }

	  QualType OperandTy = Operand->getType();

	  // Complex operands yield their element type.
	  if (const ComplexType *CT = OperandTy->getAs<ComplexType>())
	    return CT->getElementType();

	  // Real integer and floating point types pass straight through.
	  if (OperandTy->isArithmeticType())
	    return OperandTy;

	  // Resolve placeholders; if that produced a different expression, re-run
	  // the whole check on it.
	  ExprResult Resolved = S.CheckPlaceholderExpr(Operand);
	  if (Resolved.isInvalid())
	    return QualType();
	  if (Resolved.get() != Operand) {
	    V = Resolved;
	    return CheckRealImagOperand(S, V, Loc, IsReal);
	  }

	  // Everything else is rejected.
	  S.Diag(Loc, diag::err_realimag_invalid_type) << OperandTy
	    << (IsReal ? "__real" : "__imag");
	  return QualType();
	}



	/// Handle a postfix ++ or -- operator.
	///
	/// \param S     scope forwarded to the helpers.
	/// \param OpLoc location of the operator.
	/// \param Kind  tok::plusplus or tok::minusminus; anything else is
	///              unreachable.
	/// \param Input the operand expression.
	/// \return the result of BuildUnaryOp, or ExprError() if converting a
	///         ParenListExpr operand failed.
	ExprResult
	Sema::ActOnPostfixUnaryOp(Scope *S, SourceLocation OpLoc,
	                          tok::TokenKind Kind, Expr *Input) {
	  UnaryOperatorKind Opc;
	  switch (Kind) {
	  case tok::plusplus:
	    Opc = UO_PostInc;
	    break;
	  case tok::minusminus:
	    Opc = UO_PostDec;
	    break;
	  default:
	    llvm_unreachable("Unknown unary op!");
	  }

	  // Since this is a postfix expression, get rid of ParenListExprs.
	  ExprResult Converted = MaybeConvertParenListExprToParenExpr(S, Input);
	  if (Converted.isInvalid())
	    return ExprError();

	  return BuildUnaryOp(S, OpLoc, Opc, Converted.get());
	}

	/// \brief Diagnose if arithmetic on the given ObjC pointer is illegal.
	///
	/// \param S     Sema instance used to emit the diagnostic.
	/// \param opLoc location at which the diagnostic is reported.
	/// \param op    operand; its type must be an ObjC object pointer type.
	/// \return true on error
	static bool checkArithmeticOnObjCPointer(Sema &S,
	                                         SourceLocation opLoc,
	                                         Expr *op) {
	  assert(op->getType()->isObjCObjectPointerType());

	  // Arithmetic is fine only when the runtime allows it and the legacy
	  // subscripting runtime option is off.
	  const bool ArithmeticAllowed =
	      S.LangOpts.ObjCRuntime.allowsPointerArithmetic() &&
	      !S.LangOpts.ObjCSubscriptingLegacyRuntime;
	  if (!ArithmeticAllowed) {
	    S.Diag(opLoc, diag::err_arithmetic_nonfragile_interface)
	      << op->getType()->castAs<ObjCObjectPointerType>()->getPointeeType()
	      << op->getSourceRange();
	    return true;
	  }

	  return false;
	}

	/// Determine whether subscripting \p Base should build an MS property
	/// subscript expression.
	///
	/// After stripping parentheses, this is true when \p Base is an
	/// MSPropertyRefExpr whose property declaration has array type, or when it
	/// is already an MSPropertySubscriptExpr.
	static bool isMSPropertySubscriptExpr(Sema &S, Expr *Base) {
	  Expr *Stripped = Base->IgnoreParens();
	  auto *PropRef = dyn_cast<MSPropertyRefExpr>(Stripped);
	  if (!PropRef)
	    return isa<MSPropertySubscriptExpr>(Stripped);
	  return PropRef->getPropertyDecl()->getType()->isArrayType();
	}

	/// Handle an array subscript expression base[idx].
	///
	/// Dispatches to the OpenMP array-section, dependent, MS property,
	/// overloaded-operator, or builtin subscript builders depending on the
	/// operands.
	///
	/// \param S     scope, used when converting a ParenListExpr base.
	/// \param base  the expression being subscripted.
	/// \param lbLoc location of the left bracket.
	/// \param idx   the index expression.
	/// \param rbLoc location of the right bracket.
	ExprResult
	Sema::ActOnArraySubscriptExpr(Scope *S, Expr *base, SourceLocation lbLoc,
	                              Expr *idx, SourceLocation rbLoc) {
	  // A base that is already an OpenMP array section continues the section.
	  if (base && !base->getType().isNull() &&
	      base->getType()->isSpecificPlaceholderType(BuiltinType::OMPArraySection))
	    return ActOnOMPArraySectionExpr(base, lbLoc, idx, SourceLocation(),
	                                    /*Length=*/nullptr, rbLoc);

	  // Since this might be a postfix expression, get rid of ParenListExprs.
	  if (isa<ParenListExpr>(base)) {
	    ExprResult result = MaybeConvertParenListExprToParenExpr(S, base);
	    if (result.isInvalid()) return ExprError();
	    base = result.get();
	  }

	  // Handle any non-overload placeholder types in the base and index
	  // expressions. We can't handle overloads here because the other
	  // operand might be an overloadable type, in which case the overload
	  // resolution for the operator overload should get the first crack
	  // at the overload.
	  bool IsMSPropertySubscript = false;
	  if (base->getType()->isNonOverloadPlaceholderType()) {
	    IsMSPropertySubscript = isMSPropertySubscriptExpr(*this, base);
	    if (!IsMSPropertySubscript) {
	      ExprResult result = CheckPlaceholderExpr(base);
	      if (result.isInvalid())
	        return ExprError();
	      base = result.get();
	    }
	  }
	  if (idx->getType()->isNonOverloadPlaceholderType()) {
	    ExprResult result = CheckPlaceholderExpr(idx);
	    if (result.isInvalid()) return ExprError();
	    idx = result.get();
	  }

	  // Build an unanalyzed expression if either operand is type-dependent.
	  if (getLangOpts().CPlusPlus &&
	      (base->isTypeDependent() || idx->isTypeDependent())) {
	    return new (Context) ArraySubscriptExpr(base, idx, Context.DependentTy,
	                                            VK_LValue, OK_Ordinary, rbLoc);
	  }

	  // MSDN, property (C++)
	  // https://msdn.microsoft.com/en-us/library/yhfk0thd(v=vs.120).aspx
	  // This attribute can also be used in the declaration of an empty array in a
	  // class or structure definition. For example:
	  // __declspec(property(get=GetX, put=PutX)) int x[];
	  // The above statement indicates that x[] can be used with one or more array
	  // indices. In this case, i=p->x[a][b] will be turned into i=p->GetX(a, b),
	  // and p->x[a][b] = i will be turned into p->PutX(a, b, i);
	  if (IsMSPropertySubscript) {
	    // Build MS property subscript expression if base is MS property reference
	    // or MS property subscript.
	    return new (Context) MSPropertySubscriptExpr(
	        base, idx, Context.PseudoObjectTy, VK_LValue, OK_Ordinary, rbLoc);
	  }

	  // Use C++ overloaded-operator rules if either operand has record
	  // type. The spec says to do this if either type is overloadable,
	  // but enum types can't declare subscript operators or conversion
	  // operators, so there's nothing interesting for overload resolution
	  // to do if there aren't any record types involved.
	  //
	  // ObjC pointers have their own subscripting logic that is not tied
	  // to overload resolution and so should not take this path.
	  if (getLangOpts().CPlusPlus &&
	      (base->getType()->isRecordType() ||
	       (!base->getType()->isObjCObjectPointerType() &&
	        idx->getType()->isRecordType()))) {
	    return CreateOverloadedArraySubscriptExpr(lbLoc, rbLoc, base, idx);
	  }

	  return CreateBuiltinArraySubscriptExpr(base, lbLoc, idx, rbLoc);
	}

	/// Handle an OpenMP array section Base[LowerBound : Length].
	///
	/// Resolves placeholder operands, converts the bounds to integers, and checks
	/// the OpenMP 4.5 [2.4 Array Sections] constraints that can be verified when
	/// the bounds evaluate to constants.
	///
	/// \param Base       the expression being sectioned.
	/// \param LBLoc      location of the left bracket (not used in this body).
	/// \param LowerBound lower bound expression, or null if omitted.
	/// \param ColonLoc   location of the colon; when it is valid and \p Length is
	///                   null, the base must be a constant or variable array.
	/// \param Length     length expression, or null if omitted.
	/// \param RBLoc      location of the right bracket.
	/// \return the new OMPArraySectionExpr, or ExprError() after a diagnostic.
	ExprResult Sema::ActOnOMPArraySectionExpr(Expr *Base, SourceLocation LBLoc,
	                                          Expr *LowerBound,
	                                          SourceLocation ColonLoc, Expr *Length,
	                                          SourceLocation RBLoc) {
	  // Resolve placeholders, but leave a nested array section base untouched.
	  if (Base->getType()->isPlaceholderType() &&
	      !Base->getType()->isSpecificPlaceholderType(
	          BuiltinType::OMPArraySection)) {
	    ExprResult Result = CheckPlaceholderExpr(Base);
	    if (Result.isInvalid())
	      return ExprError();
	    Base = Result.get();
	  }
	  if (LowerBound && LowerBound->getType()->isNonOverloadPlaceholderType()) {
	    ExprResult Result = CheckPlaceholderExpr(LowerBound);
	    if (Result.isInvalid())
	      return ExprError();
	    Result = DefaultLvalueConversion(Result.get());
	    if (Result.isInvalid())
	      return ExprError();
	    LowerBound = Result.get();
	  }
	  if (Length && Length->getType()->isNonOverloadPlaceholderType()) {
	    ExprResult Result = CheckPlaceholderExpr(Length);
	    if (Result.isInvalid())
	      return ExprError();
	    Result = DefaultLvalueConversion(Result.get());
	    if (Result.isInvalid())
	      return ExprError();
	    Length = Result.get();
	  }

	  // Build an unanalyzed expression if either operand is type-dependent.
	  if (Base->isTypeDependent() ||
	      (LowerBound &&
	       (LowerBound->isTypeDependent() || LowerBound->isValueDependent())) ||
	      (Length && (Length->isTypeDependent() || Length->isValueDependent()))) {
	    return new (Context)
	        OMPArraySectionExpr(Base, LowerBound, Length, Context.DependentTy,
	                            VK_LValue, OK_Ordinary, ColonLoc, RBLoc);
	  }

	  // Perform default conversions.
	  QualType OriginalTy = OMPArraySectionExpr::getBaseOriginalType(Base);
	  QualType ResultTy;
	  if (OriginalTy->isAnyPointerType()) {
	    ResultTy = OriginalTy->getPointeeType();
	  } else if (OriginalTy->isArrayType()) {
	    ResultTy = OriginalTy->getAsArrayTypeUnsafe()->getElementType();
	  } else {
	    return ExprError(
	        Diag(Base->getExprLoc(), diag::err_omp_typecheck_section_value)
	        << Base->getSourceRange());
	  }
	  // C99 6.5.2.1p1
	  if (LowerBound) {
	    auto Res = PerformOpenMPImplicitIntegerConversion(LowerBound->getExprLoc(),
	                                                      LowerBound);
	    if (Res.isInvalid())
	      return ExprError(Diag(LowerBound->getExprLoc(),
	                            diag::err_omp_typecheck_section_not_integer)
	                       << 0 << LowerBound->getSourceRange());
	    LowerBound = Res.get();

	    if (LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_S) ||
	        LowerBound->getType()->isSpecificBuiltinType(BuiltinType::Char_U))
	      Diag(LowerBound->getExprLoc(), diag::warn_omp_section_is_char)
	          << 0 << LowerBound->getSourceRange();
	  }
	  if (Length) {
	    auto Res =
	        PerformOpenMPImplicitIntegerConversion(Length->getExprLoc(), Length);
	    if (Res.isInvalid())
	      return ExprError(Diag(Length->getExprLoc(),
	                            diag::err_omp_typecheck_section_not_integer)
	                       << 1 << Length->getSourceRange());
	    Length = Res.get();

	    if (Length->getType()->isSpecificBuiltinType(BuiltinType::Char_S) ||
	        Length->getType()->isSpecificBuiltinType(BuiltinType::Char_U))
	      Diag(Length->getExprLoc(), diag::warn_omp_section_is_char)
	          << 1 << Length->getSourceRange();
	  }

	  // C99 6.5.2.1p1: "shall have type "pointer to object type". Similarly,
	  // C++ [expr.sub]p1: The type "T" shall be a completely-defined object
	  // type. Note that functions are not objects, and that (in C99 parlance)
	  // incomplete types are not object types.
	  if (ResultTy->isFunctionType()) {
	    Diag(Base->getExprLoc(), diag::err_omp_section_function_type)
	        << ResultTy << Base->getSourceRange();
	    return ExprError();
	  }

	  if (RequireCompleteType(Base->getExprLoc(), ResultTy,
	                          diag::err_omp_section_incomplete_type, Base))
	    return ExprError();

	  // The lower bound is only range-checked for non-pointer (array) bases.
	  if (LowerBound && !OriginalTy->isAnyPointerType()) {
	    llvm::APSInt LowerBoundValue;
	    if (LowerBound->EvaluateAsInt(LowerBoundValue, Context)) {
	      // OpenMP 4.5, [2.4 Array Sections]
	      // The array section must be a subset of the original array.
	      if (LowerBoundValue.isNegative()) {
	        Diag(LowerBound->getExprLoc(), diag::err_omp_section_not_subset_of_array)
	            << LowerBound->getSourceRange();
	        return ExprError();
	      }
	    }
	  }

	  if (Length) {
	    llvm::APSInt LengthValue;
	    if (Length->EvaluateAsInt(LengthValue, Context)) {
	      // OpenMP 4.5, [2.4 Array Sections]
	      // The length must evaluate to non-negative integers.
	      if (LengthValue.isNegative()) {
	        Diag(Length->getExprLoc(), diag::err_omp_section_length_negative)
	            << LengthValue.toString(/*Radix=*/10, /*Signed=*/true)
	            << Length->getSourceRange();
	        return ExprError();
	      }
	    }
	  } else if (ColonLoc.isValid() &&
	             (OriginalTy.isNull() || (!OriginalTy->isConstantArrayType() &&
	                                      !OriginalTy->isVariableArrayType()))) {
	    // OpenMP 4.5, [2.4 Array Sections]
	    // When the size of the array dimension is not known, the length must be
	    // specified explicitly.
	    Diag(ColonLoc, diag::err_omp_section_length_undefined)
	        << (!OriginalTy.isNull() && OriginalTy->isArrayType());
	    return ExprError();
	  }

	  if (!Base->getType()->isSpecificPlaceholderType(
	          BuiltinType::OMPArraySection)) {
	    ExprResult Result = DefaultFunctionArrayLvalueConversion(Base);
	    if (Result.isInvalid())
	      return ExprError();
	    Base = Result.get();
	  }
	  return new (Context)
	      OMPArraySectionExpr(Base, LowerBound, Length, Context.OMPArraySectionTy,
	                          VK_LValue, OK_Ordinary, ColonLoc, RBLoc);
	}

	/// Build the built-in (non-overloaded) subscript expression Base[Idx].
	///
	/// Both operands receive the default function/array/lvalue conversions (a
	/// vector-typed left operand is left unconverted).  Because C99 6.5.2.1p2
	/// defines e1[e2] as *((e1)+(e2)), either operand may be the pointer; the
	/// base and index are therefore derived from the operand types.  The index
	/// must have integer type and the element type must be a complete object
	/// type (void is accepted as a GNU extension outside C++).
	///
	/// \param Base the expression written before the brackets.
	/// \param LLoc location used for most diagnostics emitted here.
	/// \param Idx  the expression written inside the brackets.
	/// \param RLoc location stored in the resulting ArraySubscriptExpr.
	/// \returns the new ArraySubscriptExpr; the result of
	///          BuildObjCSubscriptExpression when the left operand is an
	///          Objective-C object pointer and subscripting is not pointer
	///          arithmetic; or ExprError() after a failed conversion or a
	///          diagnosed error.
	ExprResult
	Sema::CreateBuiltinArraySubscriptExpr(Expr *Base, SourceLocation LLoc,
	                                      Expr *Idx, SourceLocation RLoc) {
	  Expr *LHSExp = Base;
	  Expr *RHSExp = Idx;

	  ExprValueKind VK = VK_LValue;
	  ExprObjectKind OK = OK_Ordinary;

	  // Per C++ core issue 1213, the result is an xvalue if either operand is
	  // a non-lvalue array, and an lvalue otherwise.
	  if (getLangOpts().CPlusPlus11 &&
	      ((LHSExp->getType()->isArrayType() && !LHSExp->isLValue()) ||
	       (RHSExp->getType()->isArrayType() && !RHSExp->isLValue())))
	    VK = VK_XValue;

	  // Perform default conversions.
	  if (!LHSExp->getType()->getAs<VectorType>()) {
	    ExprResult Result = DefaultFunctionArrayLvalueConversion(LHSExp);
	    if (Result.isInvalid())
	      return ExprError();
	    LHSExp = Result.get();
	  }
	  ExprResult Result = DefaultFunctionArrayLvalueConversion(RHSExp);
	  if (Result.isInvalid())
	    return ExprError();
	  RHSExp = Result.get();

	  QualType LHSTy = LHSExp->getType(), RHSTy = RHSExp->getType();

	  // C99 6.5.2.1p2: the expression e1[e2] is by definition precisely equivalent
	  // to the expression *((e1)+(e2)). This means the array "Base" may actually be
	  // in the subscript position. As a result, we need to derive the array base
	  // and index from the expression types.
	  Expr *BaseExpr, *IndexExpr;
	  QualType ResultType;
	  if (LHSTy->isDependentType() || RHSTy->isDependentType()) {
	    BaseExpr = LHSExp;
	    IndexExpr = RHSExp;
	    ResultType = Context.DependentTy;
	  } else if (const PointerType *PTy = LHSTy->getAs<PointerType>()) {
	    BaseExpr = LHSExp;
	    IndexExpr = RHSExp;
	    ResultType = PTy->getPointeeType();
	  } else if (const ObjCObjectPointerType *PTy =
	               LHSTy->getAs<ObjCObjectPointerType>()) {
	    BaseExpr = LHSExp;
	    IndexExpr = RHSExp;

	    // Use custom logic if this should be the pseudo-object subscript
	    // expression.
	    if (!LangOpts.isSubscriptPointerArithmetic())
	      return BuildObjCSubscriptExpression(RLoc, BaseExpr, IndexExpr, nullptr,
	                                          nullptr);

	    ResultType = PTy->getPointeeType();
	  } else if (const PointerType *PTy = RHSTy->getAs<PointerType>()) {
	    // Handle the uncommon case of "123[Ptr]".
	    BaseExpr = RHSExp;
	    IndexExpr = LHSExp;
	    ResultType = PTy->getPointeeType();
	  } else if (const ObjCObjectPointerType *PTy =
	               RHSTy->getAs<ObjCObjectPointerType>()) {
	    // Handle the uncommon case of "123[Ptr]".
	    BaseExpr = RHSExp;
	    IndexExpr = LHSExp;
	    ResultType = PTy->getPointeeType();
	    if (!LangOpts.isSubscriptPointerArithmetic()) {
	      Diag(LLoc, diag::err_subscript_nonfragile_interface)
	        << ResultType << BaseExpr->getSourceRange();
	      return ExprError();
	    }
	  } else if (const VectorType *VTy = LHSTy->getAs<VectorType>()) {
	    BaseExpr = LHSExp;    // vectors: V[123]
	    IndexExpr = RHSExp;
	    // The element inherits the value kind of the vector operand; anything
	    // that is not an rvalue is tagged as a vector component.
	    VK = LHSExp->getValueKind();
	    if (VK != VK_RValue)
	      OK = OK_VectorComponent;

	    // FIXME: need to deal with const...
	    ResultType = VTy->getElementType();
	  } else if (LHSTy->isArrayType()) {
	    // If we see an array that wasn't promoted by
	    // DefaultFunctionArrayLvalueConversion, it must be an array that
	    // wasn't promoted because of the C90 rule that doesn't
	    // allow promoting non-lvalue arrays.  Warn, then
	    // force the promotion here.
	    Diag(LHSExp->getLocStart(), diag::ext_subscript_non_lvalue) <<
	        LHSExp->getSourceRange();
	    LHSExp = ImpCastExprToType(LHSExp, Context.getArrayDecayedType(LHSTy),
	                               CK_ArrayToPointerDecay).get();
	    LHSTy = LHSExp->getType();

	    BaseExpr = LHSExp;
	    IndexExpr = RHSExp;
	    ResultType = LHSTy->getAs<PointerType>()->getPointeeType();
	  } else if (RHSTy->isArrayType()) {
	    // Same as previous, except for 123[f().a] case
	    Diag(RHSExp->getLocStart(), diag::ext_subscript_non_lvalue) <<
	        RHSExp->getSourceRange();
	    RHSExp = ImpCastExprToType(RHSExp, Context.getArrayDecayedType(RHSTy),
	                               CK_ArrayToPointerDecay).get();
	    RHSTy = RHSExp->getType();

	    BaseExpr = RHSExp;
	    IndexExpr = LHSExp;
	    ResultType = RHSTy->getAs<PointerType>()->getPointeeType();
	  } else {
	    return ExprError(Diag(LLoc, diag::err_typecheck_subscript_value)
	       << LHSExp->getSourceRange() << RHSExp->getSourceRange());
	  }
	  // C99 6.5.2.1p1
	  if (!IndexExpr->getType()->isIntegerType() && !IndexExpr->isTypeDependent())
	    return ExprError(Diag(LLoc, diag::err_typecheck_subscript_not_integer)
	                     << IndexExpr->getSourceRange());

	  if ((IndexExpr->getType()->isSpecificBuiltinType(BuiltinType::Char_S) ||
	       IndexExpr->getType()->isSpecificBuiltinType(BuiltinType::Char_U))
	         && !IndexExpr->isTypeDependent())
	    Diag(LLoc, diag::warn_subscript_is_char) << IndexExpr->getSourceRange();

	  // C99 6.5.2.1p1: "shall have type "pointer to *object* type". Similarly,
	  // C++ [expr.sub]p1: The type "T" shall be a completely-defined object
	  // type. Note that Functions are not objects, and that (in C99 parlance)
	  // incomplete types are not object types.
	  if (ResultType->isFunctionType()) {
	    Diag(BaseExpr->getLocStart(), diag::err_subscript_function_type)
	      << ResultType << BaseExpr->getSourceRange();
	    return ExprError();
	  }

	  if (ResultType->isVoidType() && !getLangOpts().CPlusPlus) {
	    // GNU extension: subscripting on pointer to void
	    Diag(LLoc, diag::ext_gnu_subscript_void_type)
	      << BaseExpr->getSourceRange();

	    // C forbids expressions of unqualified void type from being l-values.
	    // See IsCForbiddenLValueType.
	    if (!ResultType.hasQualifiers()) VK = VK_RValue;
	  } else if (!ResultType->isDependentType() &&
	      RequireCompleteType(LLoc, ResultType,
	                          diag::err_subscript_incomplete_type, BaseExpr))
	    return ExprError();

	  assert(VK == VK_RValue || LangOpts.CPlusPlus ||
	         !ResultType.isCForbiddenLValueType());

	  // The node keeps the operands in their written order (LHS, RHS), not in
	  // base/index order.
	  return new (Context)
	      ArraySubscriptExpr(LHSExp, RHSExp, ResultType, VK, OK, RLoc);
	}

	/// Make sure the default argument of \p Param is ready to be used by a call
	/// to \p FD at \p CallLoc.
	///
	/// An unparsed default argument is diagnosed as a use before declaration.  An
	/// uninstantiated default argument is instantiated here, checked as a copy
	/// initializer for the parameter, and stored on the parameter.  Finally the
	/// declarations the default argument refers to are marked as referenced.
	///
	/// \returns true if an error occurred (a diagnostic has been emitted or the
	///          instantiation was invalid), false if the default argument can
	///          be used.
	bool Sema::CheckCXXDefaultArgExpr(SourceLocation CallLoc, FunctionDecl *FD,
	                                  ParmVarDecl *Param) {
	  if (Param->hasUnparsedDefaultArg()) {
	    Diag(CallLoc,
	         diag::err_use_of_default_argument_to_function_declared_later) <<
	      FD << cast<CXXRecordDecl>(FD->getDeclContext())->getDeclName();
	    Diag(UnparsedDefaultArgLocs[Param],
	         diag::note_default_argument_declared_here);
	    return true;
	  }

	  if (Param->hasUninstantiatedDefaultArg()) {
	    Expr *UninstExpr = Param->getUninstantiatedDefaultArg();

	    EnterExpressionEvaluationContext EvalContext(
	        *this, ExpressionEvaluationContext::PotentiallyEvaluated, Param);

	    // Instantiate the expression.
	    MultiLevelTemplateArgumentList MutiLevelArgList
	      = getTemplateInstantiationArgs(FD, nullptr, /*RelativeToPrimary=*/true);

	    InstantiatingTemplate Inst(*this, CallLoc, Param,
	                               MutiLevelArgList.getInnermost());
	    if (Inst.isInvalid())
	      return true;
	    if (Inst.isAlreadyInstantiating()) {
	      Diag(Param->getLocStart(), diag::err_recursive_default_argument) << FD;
	      Param->setInvalidDecl();
	      return true;
	    }

	    ExprResult Result;
	    {
	      // C++ [dcl.fct.default]p5:
	      //   The names in the [default argument] expression are bound, and
	      //   the semantic constraints are checked, at the point where the
	      //   default argument expression appears.
	      ContextRAII SavedContext(*this, FD);
	      LocalInstantiationScope Local(*this);
	      Result = SubstInitializer(UninstExpr, MutiLevelArgList,
	                                /*DirectInit*/false);
	    }
	    if (Result.isInvalid())
	      return true;

	    // Check the expression as an initializer for the parameter.
	    InitializedEntity Entity
	      = InitializedEntity::InitializeParameter(Context, Param);
	    InitializationKind Kind
	      = InitializationKind::CreateCopy(Param->getLocation(),
	             /*FIXME:EqualLoc*/UninstExpr->getLocStart());
	    Expr *ResultE = Result.getAs<Expr>();

	    InitializationSequence InitSeq(*this, Entity, Kind, ResultE);
	    Result = InitSeq.Perform(*this, Entity, Kind, ResultE);
	    if (Result.isInvalid())
	      return true;

	    Result = ActOnFinishFullExpr(Result.getAs<Expr>(),
	                                 Param->getOuterLocStart());
	    if (Result.isInvalid())
	      return true;

	    // Remember the instantiated default argument.
	    Param->setDefaultArg(Result.getAs<Expr>());
	    if (ASTMutationListener *L = getASTMutationListener()) {
	      L->DefaultArgumentInstantiated(Param);
	    }
	  }

	  // If the default argument expression is not set yet, we are building it now.
	  if (!Param->hasInit()) {
	    Diag(Param->getLocStart(), diag::err_recursive_default_argument) << FD;
	    Param->setInvalidDecl();
	    return true;
	  }

	  // If the default expression creates temporaries, we need to
	  // push them to the current stack of expression temporaries so they'll
	  // be properly destroyed.
	  // FIXME: We should really be rebuilding the default argument with new
	  // bound temporaries; see the comment in PR5810.
	  // We don't need to do that with block decls, though, because
	  // blocks in default argument expression can never capture anything.
	  if (auto Init = dyn_cast<ExprWithCleanups>(Param->getInit())) {
	    // Set the "needs cleanups" bit regardless of whether there are
	    // any explicit objects.
	    Cleanup.setExprNeedsCleanups(Init->cleanupsHaveSideEffects());

	    // Append all the objects to the cleanup list.  Right now, this
	    // should always be a no-op, because blocks in default argument
	    // expressions should never be able to capture anything.
	    assert(!Init->getNumObjects() &&
	           "default argument expression has capturing blocks?");
	  }

	  // We already type-checked the argument, so we know it works.
	  // Just mark all of the declarations in this potentially-evaluated expression
	  // as being "referenced".
	  MarkDeclarationsReferencedInExpr(Param->getDefaultArg(),
	                                   /*SkipLocalVariables=*/true);
	  return false;
	}

	/// Build the expression that stands for the use of \p Param's default
	/// argument in a call to \p FD at \p CallLoc.
	///
	/// \returns ExprError() if CheckCXXDefaultArgExpr reports a problem with the
	///          default argument, otherwise a new CXXDefaultArgExpr.
	ExprResult Sema::BuildCXXDefaultArgExpr(SourceLocation CallLoc,
	                                        FunctionDecl *FD, ParmVarDecl *Param) {
	  if (CheckCXXDefaultArgExpr(CallLoc, FD, Param))
	    return ExprError();
	  return CXXDefaultArgExpr::Create(Context, CallLoc, Param);
	}

	/// Classify a call for the purposes of variadic-argument handling.
	///
	/// \param FDecl the callee declaration; may be null.
	/// \param Proto the callee prototype; may be null.
	/// \param Fn    the callee expression; may be null.
	/// \returns VariadicDoesNotApply when there is no prototype or it is not
	///          variadic.  Otherwise, checked in this order: VariadicConstructor
	///          for a constructor, VariadicBlock for a block-pointer callee,
	///          VariadicMethod for an instance method (or, with no declaration,
	///          a bound-member callee), and VariadicFunction for everything
	///          else.
	Sema::VariadicCallType
	Sema::getVariadicCallType(FunctionDecl *FDecl, const FunctionProtoType *Proto,
	                          Expr *Fn) {
	  if (Proto && Proto->isVariadic()) {
	    if (dyn_cast_or_null<CXXConstructorDecl>(FDecl))
	      return VariadicConstructor;
	    else if (Fn && Fn->getType()->isBlockPointerType())
	      return VariadicBlock;
	    else if (FDecl) {
	      // A known declaration that is not an instance method falls through to
	      // VariadicFunction below.
	      if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(FDecl))
	        if (Method->isInstance())
	          return VariadicMethod;
	    } else if (Fn && Fn->getType() == Context.BoundMemberTy)
	      return VariadicMethod;
	    return VariadicFunction;
	  }
	  return VariadicDoesNotApply;
	}

	namespace {
	/// Typo-correction filter used when a call has the wrong number of
	/// arguments.  On top of the FunctionCallFilterCCC checks it only accepts
	/// candidates that carry a correction specifier and whose corrected
	/// identifier is exactly \c FunctionName.
	class FunctionCallCCC : public FunctionCallFilterCCC {
	public:
	  FunctionCallCCC(Sema &SemaRef, const IdentifierInfo *FuncName,
	                  unsigned NumArgs, MemberExpr *ME)
	      : FunctionCallFilterCCC(SemaRef, NumArgs, false, ME),
	        FunctionName(FuncName) {}

	  /// \returns true only if \p candidate has a correction specifier, names
	  ///          the same identifier as the original callee, and passes the
	  ///          base-class filter.
	  bool ValidateCandidate(const TypoCorrection &candidate) override {
	    if (!candidate.getCorrectionSpecifier() ||
	        candidate.getCorrectionAsIdentifierInfo() != FunctionName) {
	      return false;
	    }

	    return FunctionCallFilterCCC::ValidateCandidate(candidate);
	  }

	private:
	  /// Identifier every accepted candidate must match.
	  const IdentifierInfo *const FunctionName;
	};
	}

	/// Look for another declaration with the same name as \p FDecl that could
	/// have been the intended callee of \p Fn given \p Args.
	///
	/// The lookup goes through Sema::CorrectTypo with a FunctionCallCCC filter.
	/// When the correction is overloaded, overload resolution over its function
	/// candidates picks the declaration, if it succeeds.
	///
	/// \returns the correction when its underlying declaration is a ValueDecl or
	///          a FunctionTemplateDecl; an empty TypoCorrection otherwise.
	static TypoCorrection TryTypoCorrectionForCall(Sema &S, Expr *Fn,
	                                               FunctionDecl *FDecl,
	                                               ArrayRef<Expr *> Args) {
	  MemberExpr *ME = dyn_cast<MemberExpr>(Fn);
	  DeclarationName FuncName = FDecl->getDeclName();
	  // For member calls point at the member name, otherwise at the callee start.
	  SourceLocation NameLoc = ME ? ME->getMemberLoc() : Fn->getLocStart();

	  if (TypoCorrection Corrected = S.CorrectTypo(
	          DeclarationNameInfo(FuncName, NameLoc), Sema::LookupOrdinaryName,
	          S.getScopeForContext(S.CurContext), nullptr,
	          llvm::make_unique<FunctionCallCCC>(S, FuncName.getAsIdentifierInfo(),
	                                             Args.size(), ME),
	          Sema::CTK_ErrorRecovery)) {
	    if (NamedDecl *ND = Corrected.getFoundDecl()) {
	      if (Corrected.isOverloaded()) {
	        OverloadCandidateSet OCS(NameLoc, OverloadCandidateSet::CSK_Normal);
	        OverloadCandidateSet::iterator Best;
	        for (NamedDecl *CD : Corrected) {
	          if (FunctionDecl *FD = dyn_cast<FunctionDecl>(CD))
	            S.AddOverloadCandidate(FD, DeclAccessPair::make(FD, AS_none), Args,
	                                   OCS);
	        }
	        switch (OCS.BestViableFunction(S, NameLoc, Best)) {
	        case OR_Success:
	          ND = Best->FoundDecl;
	          Corrected.setCorrectionDecl(ND);
	          break;
	        default:
	          // No unique best candidate: keep the declaration found above.
	          break;
	        }
	      }
	      ND = ND->getUnderlyingDecl();
	      if (isa<ValueDecl>(ND) || isa<FunctionTemplateDecl>(ND))
	        return Corrected;
	    }
	  }
	  return TypoCorrection();
	}

	/// ConvertArgumentsForCall - Converts the arguments specified in
	/// Args/NumArgs to the parameter types of the function FDecl with
	/// function prototype Proto. Call is the call expression itself, and
	/// Fn is the function expression. For a C++ member function, this
	/// routine does not attempt to convert the object argument. Returns
	/// true if the call is ill-formed.
	bool
	Sema::ConvertArgumentsForCall(CallExpr Call, Expr Fn,
	FunctionDecl *FDecl,
	const FunctionProtoType *Proto,
	ArrayRef<Expr *> Args,
	SourceLocation RParenLoc,
	bool IsExecConfig) {
	// Bail out early if calling a builtin with custom typechecking.
	if (FDecl)
	if (unsigned ID = FDecl->getBuiltinID())
	if (Context.BuiltinInfo.hasCustomTypechecking(ID))
	return false;

	// C99 6.5.2.2p7 - the arguments are implicitly converted, as if by
	// assignment, to the types of the corresponding parameter, ...
	unsigned NumParams = Proto->getNumParams();
	bool Invalid = false;
	unsigned MinArgs = FDecl ? FDecl->getMinRequiredArguments() : NumParams;
	unsigned FnKind = Fn->getType()->isBlockPointerType()
	? 1 /* block */
	: (IsExecConfig ? 3 /* kernel function (exec config) */
	: 0 /* function */);

	// If too few arguments are available (and we don't have default
	// arguments for the remaining parameters), don't make the call.
	if (Args.size() < NumParams) {
	if (Args.size() < MinArgs) {
	TypoCorrection TC;
	if (FDecl && (TC = TryTypoCorrectionForCall(*this, Fn, FDecl, Args))) {
	unsigned diag_id =
	MinArgs == NumParams && !Proto->isVariadic()
	? diag::err_typecheck_call_too_few_args_suggest
	: diag::err_typecheck_call_too_few_args_at_least_suggest;
	diagnoseTypo(TC, PDiag(diag_id) << FnKind << MinArgs
	<< static_cast<unsigned>(Args.size())
	<< TC.getCorrectionRange());
	} else if (MinArgs == 1 && FDecl && FDecl->getParamDecl(0)->getDeclName())
	Diag(RParenLoc,
	MinArgs == NumParams && !Proto->isVariadic()
	? diag::err_typecheck_call_too_few_args_one
	: diag::err_typecheck_call_too_few_args_at_least_one)
	<< FnKind << FDecl->getParamDecl(0) << Fn->getSourceRange();
	else
	Diag(RParenLoc, MinArgs == NumParams && !Proto->isVariadic()
	? diag::err_typecheck_call_too_few_args
	: diag::err_typecheck_call_too_few_args_at_least)
	<< FnKind << MinArgs << static_cast<unsigned>(Args.size())
	<< Fn->getSourceRange();

	// Emit the location of the prototype.
	if (!TC && FDecl && !FDecl->getBuiltinID() && !IsExecConfig)
	Diag(FDecl->getLocStart(), diag::note_callee_decl)
	<< FDecl;

	return true;
	}
	Call->setNumArgs(Context, NumParams);
	}

	// If too many are passed and not variadic, error on the extras and drop
	// them.
	if (Args.size() > NumParams) {
	if (!Proto->isVariadic()) {
	TypoCorrection TC;
	if (FDecl && (TC = TryTypoCorrectionForCall(*this, Fn, FDecl, Args))) {
	unsigned diag_id =
	MinArgs == NumParams && !Proto->isVariadic()
	? diag::err_typecheck_call_too_many_args_suggest
	: diag::err_typecheck_call_too_many_args_at_most_suggest;
	diagnoseTypo(TC, PDiag(diag_id) << FnKind << NumParams
	<< static_cast<unsigned>(Args.size())
	<< TC.getCorrectionRange());
	} else if (NumParams == 1 && FDecl &&
	FDecl->getParamDecl(0)->getDeclName())
	Diag(Args[NumParams]->getLocStart(),
	MinArgs == NumParams
	? diag::err_typecheck_call_too_many_args_one
	: diag::err_typecheck_call_too_many_args_at_most_one)
	<< FnKind << FDecl->getParamDecl(0)
	<< static_cast<unsigned>(Args.size()) << Fn->getSourceRange()
	<< SourceRange(Args[NumParams]->getLocStart(),
	Args.back()->getLocEnd());
	else
	Diag(Args[NumParams]->getLocStart(),
	MinArgs == NumParams
	? diag::err_typecheck_call_too_many_args
	: diag::err_typecheck_call_too_many_args_at_most)
	<< FnKind << NumParams << static_cast<unsigned>(Args.size())
	<< Fn->getSourceRange()
	<< SourceRange(Args[NumParams]->getLocStart(),
	Args.back()->getLocEnd());

	// Emit the location of the prototype.
	if (!TC && FDecl && !FDecl->getBuiltinID() && !IsExecConfig)
	Diag(FDecl->getLocStart(), diag::note_callee_decl)
	<< FDecl;

	// This deletes the extra arguments.
	Call->setNumArgs(Context, NumParams);
	return true;
	}
	}
	SmallVector<Expr *, 8> AllArgs;
	VariadicCallType CallType = getVariadicCallType(FDecl, Proto, Fn);

	Invalid = GatherArgumentsForCall(Call->getLocStart(), FDecl,
	Proto, 0, Args, AllArgs, CallType);
	if (Invalid)
	return true;
	unsigned TotalNumArgs = AllArgs.size();
	for (unsigned i = 0; i < TotalNumArgs; ++i)
	Call->setArg(i, AllArgs[i]);

	return false;
	}

	/// Build the final argument list for a call into \p AllArgs.
	///
	/// For each prototype parameter starting at \p FirstParam, the next entry of
	/// \p Args is copy-initialized into the parameter type; once \p Args is
	/// exhausted the parameter's default argument is used instead, which requires
	/// a known callee.  Every resulting argument is checked for array-bounds and
	/// C99 static-array violations.  For variadic calls the remaining entries of
	/// \p Args are then processed: through checkUnknownAnyArg for extern "C"
	/// callees returning __unknown_anytype, otherwise through the default
	/// variadic argument promotion.
	///
	/// \param FDecl     the callee declaration; may be null.
	/// \param Proto     the callee prototype; must not be null.
	/// \param CallType  result of getVariadicCallType for this call.
	/// \returns true if any argument was invalid.  Failures among the named
	///          parameters return immediately; failures among the variadic
	///          arguments are accumulated so that all of them are processed.
	bool Sema::GatherArgumentsForCall(SourceLocation CallLoc, FunctionDecl *FDecl,
	                                  const FunctionProtoType *Proto,
	                                  unsigned FirstParam, ArrayRef<Expr *> Args,
	                                  SmallVectorImpl<Expr *> &AllArgs,
	                                  VariadicCallType CallType, bool AllowExplicit,
	                                  bool IsListInitialization) {
	  unsigned NumParams = Proto->getNumParams();
	  bool Invalid = false;
	  size_t ArgIx = 0;
	  // Continue to check argument types (even if we have too few/many args).
	  for (unsigned i = FirstParam; i < NumParams; i++) {
	    QualType ProtoArgType = Proto->getParamType(i);

	    Expr *Arg;
	    ParmVarDecl *Param = FDecl ? FDecl->getParamDecl(i) : nullptr;
	    if (ArgIx < Args.size()) {
	      Arg = Args[ArgIx++];

	      if (RequireCompleteType(Arg->getLocStart(),
	                              ProtoArgType,
	                              diag::err_call_incomplete_argument, Arg))
	        return true;

	      // Strip the unbridged-cast placeholder expression off, if applicable.
	      bool CFAudited = false;
	      if (Arg->getType() == Context.ARCUnbridgedCastTy &&
	          FDecl && FDecl->hasAttr<CFAuditedTransferAttr>() &&
	          (!Param || !Param->hasAttr<CFConsumedAttr>()))
	        Arg = stripARCUnbridgedCast(Arg);
	      else if (getLangOpts().ObjCAutoRefCount &&
	               FDecl && FDecl->hasAttr<CFAuditedTransferAttr>() &&
	               (!Param || !Param->hasAttr<CFConsumedAttr>()))
	        CFAudited = true;

	      InitializedEntity Entity =
	          Param ? InitializedEntity::InitializeParameter(Context, Param,
	                                                         ProtoArgType)
	                : InitializedEntity::InitializeParameter(
	                      Context, ProtoArgType, Proto->isParamConsumed(i));

	      // Remember that parameter belongs to a CF audited API.
	      if (CFAudited)
	        Entity.setParameterCFAudited();

	      ExprResult ArgE = PerformCopyInitialization(
	          Entity, SourceLocation(), Arg, IsListInitialization, AllowExplicit);
	      if (ArgE.isInvalid())
	        return true;

	      Arg = ArgE.getAs<Expr>();
	    } else {
	      assert(Param && "can't use default arguments without a known callee");

	      ExprResult ArgExpr =
	        BuildCXXDefaultArgExpr(CallLoc, FDecl, Param);
	      if (ArgExpr.isInvalid())
	        return true;

	      Arg = ArgExpr.getAs<Expr>();
	    }

	    // Check for array bounds violations for each argument to the call. This
	    // check only triggers warnings when the argument isn't a more complex Expr
	    // with its own checking, such as a BinaryOperator.
	    CheckArrayAccess(Arg);

	    // Check for violations of C99 static array rules (C99 6.7.5.3p7).
	    CheckStaticArrayArgument(CallLoc, Param, Arg);

	    AllArgs.push_back(Arg);
	  }

	  // If this is a variadic call, handle args passed through "...".
	  if (CallType != VariadicDoesNotApply) {
	    // Assume that extern "C" functions with variadic arguments that
	    // return __unknown_anytype aren't *really* variadic.
	    if (Proto->getReturnType() == Context.UnknownAnyTy && FDecl &&
	        FDecl->isExternC()) {
	      for (Expr *A : Args.slice(ArgIx)) {
	        QualType paramType; // ignored
	        ExprResult arg = checkUnknownAnyArg(CallLoc, A, paramType);
	        Invalid |= arg.isInvalid();
	        AllArgs.push_back(arg.get());
	      }

	    // Otherwise do argument promotion, (C99 6.5.2.2p7).
	    } else {
	      for (Expr *A : Args.slice(ArgIx)) {
	        ExprResult Arg = DefaultVariadicArgumentPromotion(A, CallType, FDecl);
	        Invalid |= Arg.isInvalid();
	        AllArgs.push_back(Arg.get());
	      }
	    }

	    // Check for array bounds violations.
	    for (Expr *A : Args.slice(ArgIx))
	      CheckArrayAccess(A);
	  }
	  return Invalid;
	}

	/// Emit note_callee_static_array at the location of \p PVD, highlighting the
	/// array part of the parameter's written type.  Nothing is emitted when the
	/// written type, after looking through a decayed type, is not an array.
	static void DiagnoseCalleeStaticArrayParam(Sema &S, ParmVarDecl *PVD) {
	  TypeLoc WrittenLoc = PVD->getTypeSourceInfo()->getTypeLoc();

	  // Look through the decay so the declarator as written is inspected.
	  if (DecayedTypeLoc Decayed = WrittenLoc.getAs<DecayedTypeLoc>())
	    WrittenLoc = Decayed.getOriginalLoc();

	  ArrayTypeLoc ArrayLoc = WrittenLoc.getAs<ArrayTypeLoc>();
	  if (!ArrayLoc)
	    return;

	  S.Diag(PVD->getLocation(), diag::note_callee_static_array)
	      << ArrayLoc.getLocalSourceRange();
	}

	/// CheckStaticArrayArgument - If the given argument corresponds to a static
	/// array parameter, check that it is non-null, and that if it is formed by
	/// array-to-pointer decay, the underlying array is sufficiently large.
	///
	/// C99 6.7.5.3p7: If the keyword static also appears within the [ and ] of the
	/// array type derivation, then for each call to the function, the value of the
	/// corresponding actual argument shall provide access to the first element of
	/// an array with at least as many elements as specified by the size expression.
	///
	/// \param CallLoc where the warnings are reported.
	/// \param Param   the parameter being initialized; may be null, in which case
	///                nothing is checked.
	/// \param ArgExpr the argument passed for \p Param.
	///
	/// Only warnings are produced.  The size comparison is performed only when
	/// both the parameter's original type and the argument (ignoring parentheses
	/// and implicit casts) are constant-size arrays.
	void
	Sema::CheckStaticArrayArgument(SourceLocation CallLoc,
	                               ParmVarDecl *Param,
	                               const Expr *ArgExpr) {
	  // Static array parameters are not supported in C++.
	  if (!Param || getLangOpts().CPlusPlus)
	    return;

	  QualType OrigTy = Param->getOriginalType();

	  const ArrayType *AT = Context.getAsArrayType(OrigTy);
	  if (!AT || AT->getSizeModifier() != ArrayType::Static)
	    return;

	  if (ArgExpr->isNullPointerConstant(Context,
	                                     Expr::NPC_NeverValueDependent)) {
	    Diag(CallLoc, diag::warn_null_arg) << ArgExpr->getSourceRange();
	    DiagnoseCalleeStaticArrayParam(*this, Param);
	    return;
	  }

	  const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(AT);
	  if (!CAT)
	    return;

	  const ConstantArrayType *ArgCAT =
	    Context.getAsConstantArrayType(ArgExpr->IgnoreParenImpCasts()->getType());
	  if (!ArgCAT)
	    return;

	  if (ArgCAT->getSize().ult(CAT->getSize())) {
	    Diag(CallLoc, diag::warn_static_array_too_small)
	      << ArgExpr->getSourceRange()
	      << (unsigned) ArgCAT->getSize().getZExtValue()
	      << (unsigned) CAT->getSize().getZExtValue();
	    DiagnoseCalleeStaticArrayParam(*this, Param);
	  }
	}

	/// Given a function expression of unknown-any type, try to rebuild it
	/// to have a function type.
	static ExprResult rebuildUnknownAnyFunction(Sema &S, Expr *fn);

	/// Decide whether a call argument of the given type is a placeholder that
	/// has to be lowered right away during argument processing.
	///
	/// \returns false for every non-builtin and non-placeholder type, and for
	///          the Overload and ARCUnbridgedCast placeholders; true for the
	///          PseudoObject, UnknownAny, BoundMember, BuiltinFn and
	///          OMPArraySection placeholders.
	static bool isPlaceholderToRemoveAsArg(QualType type) {
	  // Placeholders are never sugared, so a plain dyn_cast is sufficient.
	  const BuiltinType *BT = dyn_cast<BuiltinType>(type);
	  if (!BT)
	    return false;

	  switch (BT->getKind()) {
	  // Every ordinary (non-placeholder) builtin type, OpenCL image types
	  // included, is left alone.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	  case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	#define PLACEHOLDER_TYPE(ID, SINGLETON_ID)
	#define BUILTIN_TYPE(ID, SINGLETON_ID) case BuiltinType::ID:
	#include "clang/AST/BuiltinTypes.def"
	  // Overload sets cannot be lowered out; they might validly be resolved by
	  // the call machinery.
	  case BuiltinType::Overload:
	  // Unbridged casts in ARC can be handled in some call positions and are
	  // left in place.
	  case BuiltinType::ARCUnbridgedCast:
	    return false;

	  // Pseudo-objects should be converted as soon as possible.
	  case BuiltinType::PseudoObject:
	  // The debugger mode could theoretically but currently does not try to
	  // resolve unknown-typed arguments based on known parameter types.
	  case BuiltinType::UnknownAny:
	  // These are always invalid as call arguments and should be reported.
	  case BuiltinType::BoundMember:
	  case BuiltinType::BuiltinFn:
	  case BuiltinType::OMPArraySection:
	    return true;
	  }
	  llvm_unreachable("bad builtin type kind");
	}

	/// Lower, in place, every argument whose type is a placeholder that will not
	/// be handled later (see isPlaceholderToRemoveAsArg).
	///
	/// All arguments are visited even after a failure.  Once a failure has been
	/// seen, delayed typos in the remaining non-placeholder arguments are
	/// corrected and the result discarded.
	///
	/// \returns true if lowering any placeholder failed.
	static bool checkArgsForPlaceholders(Sema &S, MultiExprArg args) {
	  bool SawInvalid = false;
	  for (Expr *&Arg : args) {
	    if (!isPlaceholderToRemoveAsArg(Arg->getType())) {
	      if (SawInvalid)
	        (void)S.CorrectDelayedTyposInExpr(Arg);
	      continue;
	    }

	    ExprResult Lowered = S.CheckPlaceholderExpr(Arg);
	    if (Lowered.isInvalid())
	      SawInvalid = true;
	    else
	      Arg = Lowered.get();
	  }
	  return SawInvalid;
	}

	/// If a builtin function has a pointer argument with no explicit address
	/// space, then it should be able to accept a pointer to any address
	/// space as input.  In order to do this, we need to replace the
	/// standard builtin declaration with one that uses the same address space
	/// as the call.
	///
	/// \param Sema     used to apply the default function/array/lvalue
	///                 conversions to each argument.
	/// \param Context  the AST context in which the new declaration is created.
	/// \param FDecl    the builtin's declaration.
	/// \param ArgExprs the call arguments; their count must equal the number of
	///                 prototype parameters.
	///
	/// \returns nullptr If this builtin is not a candidate for a rewrite i.e.
	///                  it does not contain any pointer arguments without
	///                  an address space qualifier, or if converting an
	///                  argument fails.  Otherwise the rewritten
	///                  FunctionDecl is returned.
	/// TODO: Handle pointer return types.
	static FunctionDecl *rewriteBuiltinFunctionDecl(Sema *Sema, ASTContext &Context,
	                                                const FunctionDecl *FDecl,
	                                                MultiExprArg ArgExprs) {

	  QualType DeclType = FDecl->getType();
	  const FunctionProtoType *FT = dyn_cast<FunctionProtoType>(DeclType);

	  if (!Context.BuiltinInfo.hasPtrArgsOrResult(FDecl->getBuiltinID()) ||
	      !FT || FT->isVariadic() || ArgExprs.size() != FT->getNumParams())
	    return nullptr;

	  bool NeedsNewDecl = false;
	  unsigned i = 0;
	  SmallVector<QualType, 8> OverloadParams;

	  for (QualType ParamType : FT->param_types()) {

	    // Convert array arguments to pointer to simplify type lookup.
	    ExprResult ArgRes =
	        Sema->DefaultFunctionArrayLvalueConversion(ArgExprs[i++]);
	    if (ArgRes.isInvalid())
	      return nullptr;
	    Expr *Arg = ArgRes.get();
	    QualType ArgType = Arg->getType();
	    // Keep the declared parameter type unless it is an unqualified pointer
	    // and the argument is a pointer into some address space.
	    if (!ParamType->isPointerType() ||
	        ParamType.getQualifiers().hasAddressSpace() ||
	        !ArgType->isPointerType() ||
	        !ArgType->getPointeeType().getQualifiers().hasAddressSpace()) {
	      OverloadParams.push_back(ParamType);
	      continue;
	    }

	    NeedsNewDecl = true;
	    unsigned AS = ArgType->getPointeeType().getQualifiers().getAddressSpace();

	    QualType PointeeType = ParamType->getPointeeType();
	    PointeeType = Context.getAddrSpaceQualType(PointeeType, AS);
	    OverloadParams.push_back(Context.getPointerType(PointeeType));
	  }

	  if (!NeedsNewDecl)
	    return nullptr;

	  FunctionProtoType::ExtProtoInfo EPI;
	  QualType OverloadTy = Context.getFunctionType(FT->getReturnType(),
	                                                OverloadParams, EPI);
	  DeclContext *Parent = Context.getTranslationUnitDecl();
	  FunctionDecl *OverloadDecl = FunctionDecl::Create(Context, Parent,
	                                                    FDecl->getLocation(),
	                                                    FDecl->getLocation(),
	                                                    FDecl->getIdentifier(),
	                                                    OverloadTy,
	                                                    /*TInfo=*/nullptr,
	                                                    SC_Extern, false,
	                                                    /*hasPrototype=*/true);
	  SmallVector<ParmVarDecl*, 16> Params;
	  FT = cast<FunctionProtoType>(OverloadTy);
	  for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
	    QualType ParamType = FT->getParamType(i);
	    ParmVarDecl *Parm =
	        ParmVarDecl::Create(Context, OverloadDecl, SourceLocation(),
	                                SourceLocation(), nullptr, ParamType,
	                                /*TInfo=*/nullptr, SC_None, nullptr);
	    Parm->setScopeInfo(0, i);
	    Params.push_back(Parm);
	  }
	  OverloadDecl->setParams(Params);
	  return OverloadDecl;
	}

	/// Diagnose a direct call to \p Callee that is disabled by an enable_if
	/// attribute for the given arguments.
	///
	/// Nothing is checked when the argument count does not fit the callee (too
	/// many for a non-variadic callee, or fewer than the minimum required); the
	/// attribute is only evaluated for calls with a plausible number of
	/// arguments.
	///
	/// \param Fn       the callee expression; its start is the error location.
	/// \param Callee   the function being called directly.
	/// \param ArgExprs the call arguments.
	static void checkDirectCallValidity(Sema &S, const Expr *Fn,
	                                    FunctionDecl *Callee,
	                                    MultiExprArg ArgExprs) {
	  // `Callee` (when called with ArgExprs) may be ill-formed. enable_if (and
	  // similar attributes) really don't like it when functions are called with an
	  // invalid number of args.
	  if (S.TooManyArguments(Callee->getNumParams(), ArgExprs.size(),
	                         /*PartialOverloading=*/false) &&
	      !Callee->isVariadic())
	    return;
	  if (Callee->getMinRequiredArguments() > ArgExprs.size())
	    return;

	  if (const EnableIfAttr *Attr = S.CheckEnableIf(Callee, ArgExprs, true)) {
	    S.Diag(Fn->getLocStart(),
	           isa<CXXMethodDecl>(Callee)
	               ? diag::err_ovl_no_viable_member_function_in_call
	               : diag::err_ovl_no_viable_function_in_call)
	        << Callee << Callee->getSourceRange();
	    S.Diag(Callee->getLocation(),
	           diag::note_ovl_candidate_disabled_by_function_cond_attr)
	        << Attr->getCond()->getSourceRange() << Attr->getMessage();
	    return;
	  }
	}

	/// ActOnCallExpr - Handle a call to Fn with the specified array of arguments.
	/// This provides the location of the left/right parens and a list of comma
	/// locations.
	///
	/// The call is dispatched as follows:
	///  - in C++: pseudo-destructor calls, type-dependent calls, calls to objects
	///    of class type and bound member calls are built by their dedicated
	///    paths;
	///  - an overloaded callee (possible even in C through extensions) goes
	///    through overload resolution unless it has the form of a member pointer;
	///  - otherwise the directly named declaration, if any, is looked up (with
	///    builtins possibly rewritten for the argument address spaces) and the
	///    call is finished by BuildResolvedCallExpr.
	///
	/// \param ExecConfig   forwarded to the call builders; when set, a dependent
	///                     call becomes a CUDAKernelCallExpr.
	/// \param IsExecConfig forwarded to BuildResolvedCallExpr.
	/// \returns the call expression, or ExprError() on failure.
	ExprResult Sema::ActOnCallExpr(Scope *Scope, Expr *Fn, SourceLocation LParenLoc,
	                               MultiExprArg ArgExprs, SourceLocation RParenLoc,
	                               Expr *ExecConfig, bool IsExecConfig) {
	  // Since this might be a postfix expression, get rid of ParenListExprs.
	  ExprResult Result = MaybeConvertParenListExprToParenExpr(Scope, Fn);
	  if (Result.isInvalid()) return ExprError();
	  Fn = Result.get();

	  if (checkArgsForPlaceholders(*this, ArgExprs))
	    return ExprError();

	  if (getLangOpts().CPlusPlus) {
	    // If this is a pseudo-destructor expression, build the call immediately.
	    if (isa<CXXPseudoDestructorExpr>(Fn)) {
	      if (!ArgExprs.empty()) {
	        // Pseudo-destructor calls should not have any arguments.
	        Diag(Fn->getLocStart(), diag::err_pseudo_dtor_call_with_args)
	            << FixItHint::CreateRemoval(
	                   SourceRange(ArgExprs.front()->getLocStart(),
	                               ArgExprs.back()->getLocEnd()));
	      }

	      return new (Context)
	          CallExpr(Context, Fn, None, Context.VoidTy, VK_RValue, RParenLoc);
	    }
	    if (Fn->getType() == Context.PseudoObjectTy) {
	      ExprResult result = CheckPlaceholderExpr(Fn);
	      if (result.isInvalid()) return ExprError();
	      Fn = result.get();
	    }

	    // Determine whether this is a dependent call inside a C++ template,
	    // in which case we won't do any semantic analysis now.
	    bool Dependent = false;
	    if (Fn->isTypeDependent())
	      Dependent = true;
	    else if (Expr::hasAnyTypeDependentArguments(ArgExprs))
	      Dependent = true;

	    if (Dependent) {
	      if (ExecConfig) {
	        return new (Context) CUDAKernelCallExpr(
	            Context, Fn, cast<CallExpr>(ExecConfig), ArgExprs,
	            Context.DependentTy, VK_RValue, RParenLoc);
	      } else {
	        return new (Context) CallExpr(
	            Context, Fn, ArgExprs, Context.DependentTy, VK_RValue, RParenLoc);
	      }
	    }

	    // Determine whether this is a call to an object (C++ [over.call.object]).
	    if (Fn->getType()->isRecordType())
	      return BuildCallToObjectOfClassType(Scope, Fn, LParenLoc, ArgExprs,
	                                          RParenLoc);

	    if (Fn->getType() == Context.UnknownAnyTy) {
	      ExprResult result = rebuildUnknownAnyFunction(*this, Fn);
	      if (result.isInvalid()) return ExprError();
	      Fn = result.get();
	    }

	    if (Fn->getType() == Context.BoundMemberTy) {
	      return BuildCallToMemberFunction(Scope, Fn, LParenLoc, ArgExprs,
	                                       RParenLoc);
	    }
	  }

	  // Check for overloaded calls.  This can happen even in C due to extensions.
	  if (Fn->getType() == Context.OverloadTy) {
	    OverloadExpr::FindResult find = OverloadExpr::find(Fn);

	    // We aren't supposed to apply this logic if there's an '&' involved.
	    if (!find.HasFormOfMemberPointer) {
	      if (Expr::hasAnyTypeDependentArguments(ArgExprs))
	        return new (Context) CallExpr(
	            Context, Fn, ArgExprs, Context.DependentTy, VK_RValue, RParenLoc);
	      OverloadExpr *ovl = find.Expression;
	      if (UnresolvedLookupExpr *ULE = dyn_cast<UnresolvedLookupExpr>(ovl))
	        return BuildOverloadedCallExpr(
	            Scope, Fn, ULE, LParenLoc, ArgExprs, RParenLoc, ExecConfig,
	            /*AllowTypoCorrection=*/true, find.IsAddressOfOperand);
	      return BuildCallToMemberFunction(Scope, Fn, LParenLoc, ArgExprs,
	                                       RParenLoc);
	    }
	  }

	  // If we're directly calling a function, get the appropriate declaration.
	  if (Fn->getType() == Context.UnknownAnyTy) {
	    ExprResult result = rebuildUnknownAnyFunction(*this, Fn);
	    if (result.isInvalid()) return ExprError();
	    Fn = result.get();
	  }

	  Expr *NakedFn = Fn->IgnoreParens();

	  // Look through a single address-of operator, remembering that the callee
	  // was named through '&'.
	  bool CallingNDeclIndirectly = false;
	  NamedDecl *NDecl = nullptr;
	  if (UnaryOperator *UnOp = dyn_cast<UnaryOperator>(NakedFn)) {
	    if (UnOp->getOpcode() == UO_AddrOf) {
	      CallingNDeclIndirectly = true;
	      NakedFn = UnOp->getSubExpr()->IgnoreParens();
	    }
	  }

	  if (isa<DeclRefExpr>(NakedFn)) {
	    NDecl = cast<DeclRefExpr>(NakedFn)->getDecl();

	    FunctionDecl *FDecl = dyn_cast<FunctionDecl>(NDecl);
	    if (FDecl && FDecl->getBuiltinID()) {
	      // Rewrite the function decl for this builtin by replacing parameters
	      // with no explicit address space with the address space of the arguments
	      // in ArgExprs.
	      if ((FDecl =
	               rewriteBuiltinFunctionDecl(this, Context, FDecl, ArgExprs))) {
	        NDecl = FDecl;
	        Fn = DeclRefExpr::Create(
	            Context, FDecl->getQualifierLoc(), SourceLocation(), FDecl, false,
	            SourceLocation(), FDecl->getType(), Fn->getValueKind(), FDecl);
	      }
	    }
	  } else if (isa<MemberExpr>(NakedFn))
	    NDecl = cast<MemberExpr>(NakedFn)->getMemberDecl();

	  if (FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(NDecl)) {
	    if (CallingNDeclIndirectly &&
	        !checkAddressOfFunctionIsAvailable(FD, /*Complain=*/true,
	                                           Fn->getLocStart()))
	      return ExprError();

	    if (getLangOpts().OpenCL && checkOpenCLDisabledDecl(*FD, *Fn))
	      return ExprError();

	    checkDirectCallValidity(*this, Fn, FD, ArgExprs);
	  }

	  return BuildResolvedCallExpr(Fn, NDecl, LParenLoc, ArgExprs, RParenLoc,
	                               ExecConfig, IsExecConfig);
	}

	/// Build the expression for `__builtin_astype(value, dst type)`, which
	/// re-types \p E as the type named by \p ParsedDestTy.
	///
	/// The source and destination types must have the same size as reported by
	/// ASTContext::getTypeSize; otherwise err_invalid_astype_of_different_size
	/// is diagnosed at \p BuiltinLoc and an invalid result is returned.
	ExprResult Sema::ActOnAsTypeExpr(Expr *E, ParsedType ParsedDestTy,
	                                 SourceLocation BuiltinLoc,
	                                 SourceLocation RParenLoc) {
	  QualType DstTy = GetTypeFromParser(ParsedDestTy);
	  QualType SrcTy = E->getType();

	  const bool SameSize =
	      Context.getTypeSize(DstTy) == Context.getTypeSize(SrcTy);
	  if (!SameSize) {
	    Diag(BuiltinLoc, diag::err_invalid_astype_of_different_size)
	        << DstTy << SrcTy << E->getSourceRange();
	    return ExprError();
	  }

	  // The result is always an ordinary r-value.
	  return new (Context)
	      AsTypeExpr(E, DstTy, VK_RValue, OK_Ordinary, BuiltinLoc, RParenLoc);
	}

	/// Parser entry point for `__builtin_convertvector(value, dst type)`.
	///
	/// Resolves the parsed destination type to its TypeSourceInfo and forwards
	/// everything to SemaConvertVectorExpr, which builds the expression.
	ExprResult Sema::ActOnConvertVectorExpr(Expr *E, ParsedType ParsedDestTy,
	                                        SourceLocation BuiltinLoc,
	                                        SourceLocation RParenLoc) {
	  // Only the type-source info is needed here; the returned QualType is
	  // intentionally ignored.
	  TypeSourceInfo *DestTInfo;
	  GetTypeFromParser(ParsedDestTy, &DestTInfo);

	  return SemaConvertVectorExpr(E, DestTInfo, BuiltinLoc, RParenLoc);
	}

	/// BuildResolvedCallExpr - Build a call to a resolved expression,
	/// i.e. an expression not of \p OverloadTy.  The expression should
	/// unary-convert to an expression of function-pointer or
	/// block-pointer type.
	///
	/// \param Fn the callee expression
	/// \param NDecl the declaration being called, if available
	/// \param LParenLoc location of the '('; most diagnostics are reported here
	/// \param Args the call arguments
	/// \param RParenLoc location of the ')'
	/// \param Config if non-null, the CUDA execution configuration; the call is
	///        then built as a CUDAKernelCallExpr
	/// \param IsExecConfig forwarded unchanged to ConvertArgumentsForCall
	/// \returns the checked call expression, or an invalid result after a
	///          diagnostic has been emitted
	ExprResult
	Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl,
	                            SourceLocation LParenLoc,
	                            ArrayRef<Expr *> Args,
	                            SourceLocation RParenLoc,
	                            Expr *Config, bool IsExecConfig) {
	  FunctionDecl *FDecl = dyn_cast_or_null<FunctionDecl>(NDecl);
	  unsigned BuiltinID = (FDecl ? FDecl->getBuiltinID() : 0);

	  // Functions with 'interrupt' attribute cannot be called directly.
	  if (FDecl && FDecl->hasAttr<AnyX86InterruptAttr>()) {
	    Diag(Fn->getExprLoc(), diag::err_anyx86_interrupt_called);
	    return ExprError();
	  }

	  // Interrupt handlers don't save off the VFP regs automatically on ARM,
	  // so there's some risk when calling out to non-interrupt handler functions
	  // that the callee might not preserve them. This is easy to diagnose here,
	  // but can be very challenging to debug.
	  if (auto *Caller = getCurFunctionDecl())
	    if (Caller->hasAttr<ARMInterruptAttr>()) {
	      bool VFP = Context.getTargetInfo().hasFeature("vfp");
	      if (VFP && (!FDecl || !FDecl->hasAttr<ARMInterruptAttr>()))
	        Diag(Fn->getExprLoc(), diag::warn_arm_interrupt_calling_convention);
	    }

	  // Promote the function operand.
	  // We special-case function promotion here because we only allow promoting
	  // builtin functions to function pointers in the callee of a call.
	  ExprResult Result;
	  if (BuiltinID &&
	      Fn->getType()->isSpecificBuiltinType(BuiltinType::BuiltinFn)) {
	    Result = ImpCastExprToType(Fn, Context.getPointerType(FDecl->getType()),
	                               CK_BuiltinFnToFnPtr).get();
	  } else {
	    Result = CallExprUnaryConversions(Fn);
	  }
	  if (Result.isInvalid())
	    return ExprError();
	  Fn = Result.get();

	  // Make the call expr early, before semantic checks.  This guarantees cleanup
	  // of arguments and function on error.  The BoolTy result type is only a
	  // placeholder; the real type is set once the callee's function type is
	  // known (see setType below).
	  CallExpr *TheCall;
	  if (Config)
	    TheCall = new (Context) CUDAKernelCallExpr(Context, Fn,
	                                               cast<CallExpr>(Config), Args,
	                                               Context.BoolTy, VK_RValue,
	                                               RParenLoc);
	  else
	    TheCall = new (Context) CallExpr(Context, Fn, Args, Context.BoolTy,
	                                     VK_RValue, RParenLoc);

	  if (!getLangOpts().CPlusPlus) {
	    // C cannot always handle TypoExpr nodes in builtin calls and direct
	    // function calls as their argument checking don't necessarily handle
	    // dependent types properly, so make sure any TypoExprs have been
	    // dealt with.
	    ExprResult TypoResult = CorrectDelayedTyposInExpr(TheCall);
	    if (!TypoResult.isUsable()) return ExprError();
	    TheCall = dyn_cast<CallExpr>(TypoResult.get());
	    // Typo correction may have produced something that is no longer a call;
	    // hand that back as-is.
	    if (!TheCall) return TypoResult;
	    Args = llvm::makeArrayRef(TheCall->getArgs(), TheCall->getNumArgs());
	  }

	  // Bail out early if calling a builtin with custom typechecking.
	  if (BuiltinID && Context.BuiltinInfo.hasCustomTypechecking(BuiltinID))
	    return CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall);

	 retry:
	  const FunctionType *FuncT;
	  if (const PointerType *PT = Fn->getType()->getAs<PointerType>()) {
	    // C99 6.5.2.2p1 - "The expression that denotes the called function shall
	    // have type pointer to function".
	    FuncT = PT->getPointeeType()->getAs<FunctionType>();
	    if (!FuncT)
	      return ExprError(Diag(LParenLoc, diag::err_typecheck_call_not_function)
	                         << Fn->getType() << Fn->getSourceRange());
	  } else if (const BlockPointerType *BPT =
	               Fn->getType()->getAs<BlockPointerType>()) {
	    FuncT = BPT->getPointeeType()->castAs<FunctionType>();
	  } else {
	    // Handle calls to expressions of unknown-any type.
	    if (Fn->getType() == Context.UnknownAnyTy) {
	      ExprResult rewrite = rebuildUnknownAnyFunction(*this, Fn);
	      if (rewrite.isInvalid()) return ExprError();
	      Fn = rewrite.get();
	      TheCall->setCallee(Fn);
	      // Re-classify the rebuilt callee.
	      goto retry;
	    }

	    return ExprError(Diag(LParenLoc, diag::err_typecheck_call_not_function)
	      << Fn->getType() << Fn->getSourceRange());
	  }

	  if (getLangOpts().CUDA) {
	    if (Config) {
	      // CUDA: Kernel calls must be to global functions
	      if (FDecl && !FDecl->hasAttr<CUDAGlobalAttr>())
	        return ExprError(Diag(LParenLoc,diag::err_kern_call_not_global_function)
	            << FDecl->getName() << Fn->getSourceRange());

	      // CUDA: Kernel function must have 'void' return type
	      if (!FuncT->getReturnType()->isVoidType())
	        return ExprError(Diag(LParenLoc, diag::err_kern_type_not_void_return)
	            << Fn->getType() << Fn->getSourceRange());
	    } else {
	      // CUDA: Calls to global functions must be configured
	      if (FDecl && FDecl->hasAttr<CUDAGlobalAttr>())
	        return ExprError(Diag(LParenLoc, diag::err_global_call_not_config)
	            << FDecl->getName() << Fn->getSourceRange());
	    }
	  }

	  // Check for a valid return type
	  if (CheckCallReturnType(FuncT->getReturnType(), Fn->getLocStart(), TheCall,
	                          FDecl))
	    return ExprError();

	  // We know the result type of the call, set it.
	  TheCall->setType(FuncT->getCallResultType(Context));
	  TheCall->setValueKind(Expr::getValueKindForType(FuncT->getReturnType()));

	  const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FuncT);
	  if (Proto) {
	    if (ConvertArgumentsForCall(TheCall, Fn, FDecl, Proto, Args, RParenLoc,
	                                IsExecConfig))
	      return ExprError();
	  } else {
	    assert(isa<FunctionNoProtoType>(FuncT) && "Unknown FunctionType!");

	    if (FDecl) {
	      // Check if we have too few/too many arguments, based
	      // on our knowledge of the function definition.
	      const FunctionDecl *Def = nullptr;
	      if (FDecl->hasBody(Def) && Args.size() != Def->param_size()) {
	        Proto = Def->getType()->getAs<FunctionProtoType>();
	        if (!Proto || !(Proto->isVariadic() && Args.size() >= Def->param_size()))
	          Diag(RParenLoc, diag::warn_call_wrong_number_of_arguments)
	            << (Args.size() > Def->param_size()) << FDecl << Fn->getSourceRange();
	      }

	      // If the function we're calling isn't a function prototype, but we have
	      // a function prototype from a prior declaration, use that prototype.
	      if (!FDecl->hasPrototype())
	        Proto = FDecl->getType()->getAs<FunctionProtoType>();
	    }

	    // Promote the arguments (C99 6.5.2.2p6).
	    for (unsigned i = 0, e = Args.size(); i != e; i++) {
	      Expr *Arg = Args[i];

	      if (Proto && i < Proto->getNumParams()) {
	        // A prototype is known for this position: initialize the parameter
	        // from the argument.
	        InitializedEntity Entity = InitializedEntity::InitializeParameter(
	            Context, Proto->getParamType(i), Proto->isParamConsumed(i));
	        ExprResult ArgE =
	            PerformCopyInitialization(Entity, SourceLocation(), Arg);
	        if (ArgE.isInvalid())
	          return ExprError();

	        Arg = ArgE.getAs<Expr>();

	      } else {
	        ExprResult ArgE = DefaultArgumentPromotion(Arg);

	        if (ArgE.isInvalid())
	          return ExprError();

	        Arg = ArgE.getAs<Expr>();
	      }

	      if (RequireCompleteType(Arg->getLocStart(),
	                              Arg->getType(),
	                              diag::err_call_incomplete_argument, Arg))
	        return ExprError();

	      TheCall->setArg(i, Arg);
	    }
	  }

	  if (CXXMethodDecl *Method = dyn_cast_or_null<CXXMethodDecl>(FDecl))
	    if (!Method->isStatic())
	      return ExprError(Diag(LParenLoc, diag::err_member_call_without_object)
	        << Fn->getSourceRange());

	  // Check for sentinels
	  if (NDecl)
	    DiagnoseSentinelCalls(NDecl, LParenLoc, Args);

	  // Do special checking on direct calls to functions.
	  if (FDecl) {
	    if (CheckFunctionCall(FDecl, TheCall, Proto))
	      return ExprError();

	    if (BuiltinID)
	      return CheckBuiltinFunctionCall(FDecl, BuiltinID, TheCall);
	  } else if (NDecl) {
	    if (CheckPointerCall(NDecl, TheCall, Proto))
	      return ExprError();
	  } else {
	    if (CheckOtherCall(TheCall, Proto))
	      return ExprError();
	  }

	  return MaybeBindToTemporary(TheCall);
	}

	/// Parser entry point for a compound literal `(type){ init }`.
	///
	/// Resolves the parsed type \p Ty, synthesizes trivial type-source info when
	/// the parser supplied none, and delegates to BuildCompoundLiteralExpr.
	ExprResult
	Sema::ActOnCompoundLiteral(SourceLocation LParenLoc, ParsedType Ty,
	                           SourceLocation RParenLoc, Expr *InitExpr) {
	  assert(Ty && "ActOnCompoundLiteral(): missing type");
	  assert(InitExpr && "ActOnCompoundLiteral(): missing expression");

	  TypeSourceInfo *LiteralTInfo;
	  QualType LiteralTy = GetTypeFromParser(Ty, &LiteralTInfo);

	  // Fall back to location-less type-source info built from the type itself.
	  if (LiteralTInfo == nullptr)
	    LiteralTInfo = Context.getTrivialTypeSourceInfo(LiteralTy);

	  return BuildCompoundLiteralExpr(LParenLoc, LiteralTInfo, RParenLoc,
	                                  InitExpr);
	}

	/// Build a CompoundLiteralExpr of the type described by \p TInfo,
	/// initialized from \p LiteralExpr.
	///
	/// Steps, in order:
	///  - require a complete type (the base element type for arrays) and reject
	///    variable-length array types;
	///  - run the initialization sequence as a C-style-cast list-initialization;
	///  - at file scope, require a constant initializer (C99 6.5.2.5p3) unless
	///    the literal is dependent;
	///  - pick the value kind and wrap the result with MaybeBindToTemporary.
	///
	/// \returns the literal expression, or an invalid result after a diagnostic.
	ExprResult
	Sema::BuildCompoundLiteralExpr(SourceLocation LParenLoc, TypeSourceInfo *TInfo,
	                               SourceLocation RParenLoc, Expr *LiteralExpr) {
	  QualType literalType = TInfo->getType();

	  if (literalType->isArrayType()) {
	    if (RequireCompleteType(LParenLoc, Context.getBaseElementType(literalType),
	          diag::err_illegal_decl_array_incomplete_type,
	          SourceRange(LParenLoc,
	                      LiteralExpr->getSourceRange().getEnd())))
	      return ExprError();
	    if (literalType->isVariableArrayType())
	      return ExprError(Diag(LParenLoc, diag::err_variable_object_no_init)
	        << SourceRange(LParenLoc, LiteralExpr->getSourceRange().getEnd()));
	  } else if (!literalType->isDependentType() &&
	             RequireCompleteType(LParenLoc, literalType,
	               diag::err_typecheck_decl_incomplete_type,
	               SourceRange(LParenLoc, LiteralExpr->getSourceRange().getEnd())))
	    return ExprError();

	  InitializedEntity Entity
	    = InitializedEntity::InitializeCompoundLiteralInit(TInfo);
	  InitializationKind Kind
	    = InitializationKind::CreateCStyleCast(LParenLoc,
	                                           SourceRange(LParenLoc, RParenLoc),
	                                           /*InitList=*/true);
	  InitializationSequence InitSeq(*this, Entity, Kind, LiteralExpr);
	  // literalType is passed by address, so Perform may update it; the value
	  // used below is whatever Perform leaves there.
	  ExprResult Result = InitSeq.Perform(*this, Entity, Kind, LiteralExpr,
	                                      &literalType);
	  if (Result.isInvalid())
	    return ExprError();
	  LiteralExpr = Result.get();

	  bool isFileScope = !CurContext->isFunctionOrMethod();
	  if (isFileScope &&
	      !LiteralExpr->isTypeDependent() &&
	      !LiteralExpr->isValueDependent() &&
	      !literalType->isDependentType()) { // 6.5.2.5p3
	    if (CheckForConstantInitializer(LiteralExpr, literalType))
	      return ExprError();
	  }

	  // In C, compound literals are l-values for some reason.
	  // For GCC compatibility, in C++, file-scope array compound literals with
	  // constant initializers are also l-values, and compound literals are
	  // otherwise prvalues.
	  //
	  // (GCC also treats C++ list-initialized file-scope array prvalues with
	  // constant initializers as l-values, but that's non-conforming, so we don't
	  // follow it there.)
	  //
	  // FIXME: It would be better to handle the lvalue cases as materializing and
	  // lifetime-extending a temporary object, but our materialized temporaries
	  // representation only supports lifetime extension from a variable, not "out
	  // of thin air".
	  // FIXME: For C++, we might want to instead lifetime-extend only if a pointer
	  // is bound to the result of applying array-to-pointer decay to the compound
	  // literal.
	  // FIXME: GCC supports compound literals of reference type, which should
	  // obviously have a value kind derived from the kind of reference involved.
	  ExprValueKind VK =
	      (getLangOpts().CPlusPlus && !(isFileScope && literalType->isArrayType()))
	          ? VK_RValue
	          : VK_LValue;

	  return MaybeBindToTemporary(
	      new (Context) CompoundLiteralExpr(LParenLoc, TInfo, literalType,
	                                        VK, LiteralExpr, isFileScope));
	}

	/// Build an InitListExpr for a braced initializer list `{ a, b, ... }`.
	///
	/// Non-overload placeholder expressions among the initializers are resolved
	/// right away; an initializer whose placeholder cannot be resolved is left
	/// untouched.  The list is given the type `void` as a stand-in.
	ExprResult
	Sema::ActOnInitList(SourceLocation LBraceLoc, MultiExprArg InitArgList,
	                    SourceLocation RBraceLoc) {
	  // Overloads can be resolved contextually later on; every other kind of
	  // placeholder has to be handled immediately.
	  for (Expr *&Init : InitArgList) {
	    if (!Init->getType()->isNonOverloadPlaceholderType())
	      continue;

	    ExprResult Resolved = CheckPlaceholderExpr(Init);

	    // A failure is deliberately ignored: throwing away the whole initializer
	    // list because of one bad element would be terrible for indexing/etc.
	    if (!Resolved.isInvalid())
	      Init = Resolved.get();
	  }

	  // The real semantic analysis of the initializers happens in
	  // ActOnDeclarator() and CheckInitializer(), because it requires knowledge
	  // of the object being initialized.
	  InitListExpr *List =
	      new (Context) InitListExpr(Context, LBraceLoc, InitArgList, RBraceLoc);
	  List->setType(Context.VoidTy); // FIXME: just a place holder for now.
	  return List;
	}

	/// Do an explicit extend of the given block pointer if we're in ARC.
	///
	/// \param E in/out: an r-value expression of block-pointer type.  Under ARC
	///        it is replaced by an implicit CK_ARCExtendBlockObject cast of
	///        itself and the enclosing expression is marked as needing
	///        cleanups; otherwise it is left unchanged.
	void Sema::maybeExtendBlockObject(ExprResult &E) {
	  assert(E.get()->getType()->isBlockPointerType());
	  // Only do this in an r-value context.
	  assert(E.get()->isRValue());

	  // Nothing to do unless ARC is enabled.
	  if (!getLangOpts().ObjCAutoRefCount) return;

	  E = ImplicitCastExpr::Create(Context, E.get()->getType(),
	                               CK_ARCExtendBlockObject, E.get(),
	                               /*base path*/ nullptr, VK_RValue);
	  Cleanup.setExprNeedsCleanups(true);
	}

	/// Work out which cast kind converts \p E to an Objective-C object pointer
	/// type, performing any preparation the source expression needs.
	///
	/// \param E in/out: the source expression; a block-pointer source is passed
	///        through maybeExtendBlockObject first.
	/// \returns CK_BitCast for an ObjC object pointer source,
	///          CK_BlockPointerToObjCPointerCast for a block pointer source, and
	///          CK_CPointerToObjCPointerCast otherwise (the source is asserted
	///          to be a pointer).
	CastKind Sema::PrepareCastToObjCObjectPointer(ExprResult &E) {
	  QualType type = E.get()->getType();

	  if (type->isObjCObjectPointerType())
	    return CK_BitCast;

	  if (type->isBlockPointerType()) {
	    maybeExtendBlockObject(E);
	    return CK_BlockPointerToObjCPointerCast;
	  }

	  assert(type->isPointerType());
	  return CK_CPointerToObjCPointerCast;
	}

	/// Prepares for a scalar cast, performing all the necessary stages
	/// except the final cast and returning the kind required.
	///
	/// \param Src in/out: the source expression.  For conversions that need an
	///        intermediate step (real <-> complex with a differing element
	///        type, or a block pointer converted to an ObjC object pointer) it
	///        is replaced by the intermediate expression.
	/// \param DestTy the destination scalar type.
	/// \returns the cast kind the caller must use for the final cast from
	///          (the possibly updated) \p Src to \p DestTy.
	CastKind Sema::PrepareScalarCast(ExprResult &Src, QualType DestTy) {
	// Both Src and Dest are scalar types, i.e. arithmetic or pointer.
	// Also, callers should have filtered out the invalid cases with
	// pointers. Everything else should be possible.

	QualType SrcTy = Src.get()->getType();
	// Types that differ at most in qualifiers need no conversion at all.
	if (Context.hasSameUnqualifiedType(SrcTy, DestTy))
	return CK_NoOp;

	// Dispatch on the source scalar kind first, then on the destination kind.
	// Every inner switch returns or is unreachable on each of its cases; the
	// llvm_unreachable after it guards against falling out of the switch.
	switch (Type::ScalarTypeKind SrcKind = SrcTy->getScalarTypeKind()) {
	case Type::STK_MemberPointer:
	llvm_unreachable("member pointer type in C");

	// Source is some kind of pointer.
	case Type::STK_CPointer:
	case Type::STK_BlockPointer:
	case Type::STK_ObjCObjectPointer:
	switch (DestTy->getScalarTypeKind()) {
	case Type::STK_CPointer: {
	// A change of pointee address space takes precedence over a plain bitcast.
	unsigned SrcAS = SrcTy->getPointeeType().getAddressSpace();
	unsigned DestAS = DestTy->getPointeeType().getAddressSpace();
	if (SrcAS != DestAS)
	return CK_AddressSpaceConversion;
	return CK_BitCast;
	}
	case Type::STK_BlockPointer:
	return (SrcKind == Type::STK_BlockPointer
	? CK_BitCast : CK_AnyPointerToBlockPointerCast);
	case Type::STK_ObjCObjectPointer:
	if (SrcKind == Type::STK_ObjCObjectPointer)
	return CK_BitCast;
	if (SrcKind == Type::STK_CPointer)
	return CK_CPointerToObjCPointerCast;
	// The only remaining source kind here is a block pointer.
	maybeExtendBlockObject(Src);
	return CK_BlockPointerToObjCPointerCast;
	case Type::STK_Bool:
	return CK_PointerToBoolean;
	case Type::STK_Integral:
	return CK_PointerToIntegral;
	case Type::STK_Floating:
	case Type::STK_FloatingComplex:
	case Type::STK_IntegralComplex:
	case Type::STK_MemberPointer:
	llvm_unreachable("illegal cast from pointer");
	}
	llvm_unreachable("Should have returned before this");

	case Type::STK_Bool: // casting from bool is like casting from an integer
	case Type::STK_Integral:
	switch (DestTy->getScalarTypeKind()) {
	case Type::STK_CPointer:
	case Type::STK_ObjCObjectPointer:
	case Type::STK_BlockPointer:
	// A null pointer constant gets its own cast kind.
	if (Src.get()->isNullPointerConstant(Context,
	Expr::NPC_ValueDependentIsNull))
	return CK_NullToPointer;
	return CK_IntegralToPointer;
	case Type::STK_Bool:
	return CK_IntegralToBoolean;
	case Type::STK_Integral:
	return CK_IntegralCast;
	case Type::STK_Floating:
	return CK_IntegralToFloating;
	// Real -> complex: first convert Src to the complex element type, then
	// return the real-to-complex kind for the final cast.
	case Type::STK_IntegralComplex:
	Src = ImpCastExprToType(Src.get(),
	DestTy->castAs<ComplexType>()->getElementType(),
	CK_IntegralCast);
	return CK_IntegralRealToComplex;
	case Type::STK_FloatingComplex:
	Src = ImpCastExprToType(Src.get(),
	DestTy->castAs<ComplexType>()->getElementType(),
	CK_IntegralToFloating);
	return CK_FloatingRealToComplex;
	case Type::STK_MemberPointer:
	llvm_unreachable("member pointer type in C");
	}
	llvm_unreachable("Should have returned before this");

	case Type::STK_Floating:
	switch (DestTy->getScalarTypeKind()) {
	case Type::STK_Floating:
	return CK_FloatingCast;
	case Type::STK_Bool:
	return CK_FloatingToBoolean;
	case Type::STK_Integral:
	return CK_FloatingToIntegral;
	// Real -> complex: convert to the element type first, as above.
	case Type::STK_FloatingComplex:
	Src = ImpCastExprToType(Src.get(),
	DestTy->castAs<ComplexType>()->getElementType(),
	CK_FloatingCast);
	return CK_FloatingRealToComplex;
	case Type::STK_IntegralComplex:
	Src = ImpCastExprToType(Src.get(),
	DestTy->castAs<ComplexType>()->getElementType(),
	CK_FloatingToIntegral);
	return CK_IntegralRealToComplex;
	case Type::STK_CPointer:
	case Type::STK_ObjCObjectPointer:
	case Type::STK_BlockPointer:
	llvm_unreachable("valid float->pointer cast?");
	case Type::STK_MemberPointer:
	llvm_unreachable("member pointer type in C");
	}
	llvm_unreachable("Should have returned before this");

	case Type::STK_FloatingComplex:
	switch (DestTy->getScalarTypeKind()) {
	case Type::STK_FloatingComplex:
	return CK_FloatingComplexCast;
	case Type::STK_IntegralComplex:
	return CK_FloatingComplexToIntegralComplex;
	case Type::STK_Floating: {
	// Complex -> real: if the element type already is the destination, the
	// complex-to-real step is the whole cast; otherwise take the real part
	// first and let the caller finish with a floating cast.
	QualType ET = SrcTy->castAs<ComplexType>()->getElementType();
	if (Context.hasSameType(ET, DestTy))
	return CK_FloatingComplexToReal;
	Src = ImpCastExprToType(Src.get(), ET, CK_FloatingComplexToReal);
	return CK_FloatingCast;
	}
	case Type::STK_Bool:
	return CK_FloatingComplexToBoolean;
	case Type::STK_Integral:
	// Take the real part, then convert floating -> integral.
	Src = ImpCastExprToType(Src.get(),
	SrcTy->castAs<ComplexType>()->getElementType(),
	CK_FloatingComplexToReal);
	return CK_FloatingToIntegral;
	case Type::STK_CPointer:
	case Type::STK_ObjCObjectPointer:
	case Type::STK_BlockPointer:
	llvm_unreachable("valid complex float->pointer cast?");
	case Type::STK_MemberPointer:
	llvm_unreachable("member pointer type in C");
	}
	llvm_unreachable("Should have returned before this");

	case Type::STK_IntegralComplex:
	switch (DestTy->getScalarTypeKind()) {
	case Type::STK_FloatingComplex:
	return CK_IntegralComplexToFloatingComplex;
	case Type::STK_IntegralComplex:
	return CK_IntegralComplexCast;
	case Type::STK_Integral: {
	// Mirrors the floating-complex -> floating case above.
	QualType ET = SrcTy->castAs<ComplexType>()->getElementType();
	if (Context.hasSameType(ET, DestTy))
	return CK_IntegralComplexToReal;
	Src = ImpCastExprToType(Src.get(), ET, CK_IntegralComplexToReal);
	return CK_IntegralCast;
	}
	case Type::STK_Bool:
	return CK_IntegralComplexToBoolean;
	case Type::STK_Floating:
	// Take the real part, then convert integral -> floating.
	Src = ImpCastExprToType(Src.get(),
	SrcTy->castAs<ComplexType>()->getElementType(),
	CK_IntegralComplexToReal);
	return CK_IntegralToFloating;
	case Type::STK_CPointer:
	case Type::STK_ObjCObjectPointer:
	case Type::STK_BlockPointer:
	llvm_unreachable("valid complex int->pointer cast?");
	case Type::STK_MemberPointer:
	llvm_unreachable("member pointer type in C");
	}
	llvm_unreachable("Should have returned before this");
	}

	llvm_unreachable("Unhandled scalar cast");
	}

	/// Split \p type into an element count and an element type for the purpose
	/// of lax vector conversions.
	///
	/// \param type the type to decompose
	/// \param len out: the number of elements (1 for a non-vector real type)
	/// \param eltType out: the element type (the type itself for a non-vector)
	/// \returns true on success; false, leaving the outputs untouched, if
	///          \p type is neither a vector nor a real type.
	static bool breakDownVectorType(QualType type, uint64_t &len,
	                                QualType &eltType) {
	  const VectorType *vecType = type->getAs<VectorType>();

	  if (!vecType) {
	    // Lax conversion to and from non-vector types is allowed only for real
	    // types (i.e. non-complex, non-pointer scalar types); those behave like
	    // a one-element vector of themselves.
	    if (!type->isRealType())
	      return false;

	    len = 1;
	    eltType = type;
	    return true;
	  }

	  // Vectors are simple: read the count and element type straight off.
	  len = vecType->getNumElements();
	  eltType = vecType->getElementType();
	  assert(eltType->isScalarType());
	  return true;
	}

	/// Are the two types lax-compatible vector types?  That is, given
	/// that one of them is a vector, do they have equal storage sizes,
	/// where the storage size is the number of elements times the element
	/// size?
	///
	/// This will also return false if either of the types is neither a
	/// vector nor a real type.
	///
	/// \param srcTy the source type
	/// \param destTy the destination type; at least one of the two must be a
	///        vector type (asserted)
	bool Sema::areLaxCompatibleVectorTypes(QualType srcTy, QualType destTy) {
	  assert(destTy->isVectorType() || srcTy->isVectorType());

	  // Disallow lax conversions between scalars and ExtVectors (these
	  // conversions are allowed for other vector types because common headers
	  // depend on them).  Most scalar OP ExtVector cases are handled by the
	  // splat path anyway, which does what we want (convert, not bitcast).
	  // What this rules out for ExtVectors is crazy things like char4*float.
	  if (srcTy->isScalarType() && destTy->isExtVectorType()) return false;
	  if (destTy->isScalarType() && srcTy->isExtVectorType()) return false;

	  uint64_t srcLen, destLen;
	  QualType srcEltTy, destEltTy;
	  if (!breakDownVectorType(srcTy, srcLen, srcEltTy)) return false;
	  if (!breakDownVectorType(destTy, destLen, destEltTy)) return false;

	  // ASTContext::getTypeSize will return the size rounded up to a
	  // power of 2, so instead of using that, we need to use the raw
	  // element size multiplied by the element count.
	  uint64_t srcEltSize = Context.getTypeSize(srcEltTy);
	  uint64_t destEltSize = Context.getTypeSize(destEltTy);

	  return (srcLen * srcEltSize == destLen * destEltSize);
	}

	/// Is this a legal conversion between two types, one of which is
	/// known to be a vector type?
	///
	/// Always false when the LaxVectorConversions language option is off;
	/// otherwise the answer is that of areLaxCompatibleVectorTypes.
	bool Sema::isLaxVectorConversion(QualType srcTy, QualType destTy) {
	  assert(destTy->isVectorType() || srcTy->isVectorType());

	  if (!Context.getLangOpts().LaxVectorConversions)
	    return false;
	  return areLaxCompatibleVectorTypes(srcTy, destTy);
	}

	/// Check a cast between the vector type \p VectorTy and the type \p Ty.
	///
	/// \p Ty must be a vector or integral type whose storage size matches
	/// \p VectorTy (see areLaxCompatibleVectorTypes); anything else is
	/// diagnosed at the start of \p R.
	///
	/// \param Kind out: set to CK_BitCast on success, untouched on failure.
	/// \returns true if a diagnostic was emitted, false if the cast is valid.
	bool Sema::CheckVectorCast(SourceRange R, QualType VectorTy, QualType Ty,
	                           CastKind &Kind) {
	  assert(VectorTy->isVectorType() && "Not a vector type!");

	  if (Ty->isVectorType() || Ty->isIntegralType(Context)) {
	    if (!areLaxCompatibleVectorTypes(Ty, VectorTy))
	      return Diag(R.getBegin(),
	                  Ty->isVectorType() ?
	                  diag::err_invalid_conversion_between_vectors :
	                  diag::err_invalid_conversion_between_vector_and_integer)
	        << VectorTy << Ty << R;
	  } else
	    return Diag(R.getBegin(),
	                diag::err_invalid_conversion_between_vector_and_scalar)
	      << VectorTy << Ty << R;

	  Kind = CK_BitCast;
	  return false;
	}

	/// Convert \p SplattedExpr to the element type of the vector type
	/// \p VectorTy, so that the caller can splat it across the vector.
	///
	/// \returns \p SplattedExpr itself when it already has the element type,
	///          otherwise an implicit cast of it to the element type; an
	///          invalid result if preparing the scalar cast failed.
	ExprResult Sema::prepareVectorSplat(QualType VectorTy, Expr *SplattedExpr) {
	  QualType DestElemTy = VectorTy->castAs<VectorType>()->getElementType();

	  if (DestElemTy == SplattedExpr->getType())
	    return SplattedExpr;

	  assert(DestElemTy->isFloatingType() ||
	         DestElemTy->isIntegralOrEnumerationType());

	  CastKind CK;
	  if (VectorTy->isExtVectorType() && SplattedExpr->getType()->isBooleanType()) {
	    // OpenCL requires that we convert `true` boolean expressions to -1, but
	    // only when splatting vectors.
	    if (DestElemTy->isFloatingType()) {
	      // To avoid having to have a CK_BooleanToSignedFloating cast kind, we cast
	      // in two steps: boolean to signed integral, then to floating.
	      ExprResult CastExprRes = ImpCastExprToType(SplattedExpr, Context.IntTy,
	                                                 CK_BooleanToSignedIntegral);
	      SplattedExpr = CastExprRes.get();
	      CK = CK_IntegralToFloating;
	    } else {
	      CK = CK_BooleanToSignedIntegral;
	    }
	  } else {
	    // Ordinary scalar conversion; PrepareScalarCast may replace the
	    // expression with an intermediate cast.
	    ExprResult CastExprRes = SplattedExpr;
	    CK = PrepareScalarCast(CastExprRes, DestElemTy);
	    if (CastExprRes.isInvalid())
	      return ExprError();
	    SplattedExpr = CastExprRes.get();
	  }
	  return ImpCastExprToType(SplattedExpr, DestElemTy, CK);
	}

	/// Check an explicit cast of \p CastExpr to the extended vector type
	/// \p DestTy.
	///
	/// \param R source range used for diagnostics
	/// \param Kind out: CK_BitCast for a vector source, CK_VectorSplat for a
	///        scalar source; untouched on failure
	/// \returns the (possibly converted) operand, or an invalid result after a
	///          diagnostic.
	ExprResult Sema::CheckExtVectorCast(SourceRange R, QualType DestTy,
	                                    Expr *CastExpr, CastKind &Kind) {
	  assert(DestTy->isExtVectorType() && "Not an extended vector type!");

	  QualType SrcTy = CastExpr->getType();

	  // If SrcTy is a VectorType, the total size must match to explicitly cast to
	  // an ExtVectorType.
	  // In OpenCL, casts between vectors of different types are not allowed.
	  // (See OpenCL 6.2).
	  if (SrcTy->isVectorType()) {
	    if (!areLaxCompatibleVectorTypes(SrcTy, DestTy)
	        || (getLangOpts().OpenCL &&
	            (DestTy.getCanonicalType() != SrcTy.getCanonicalType()))) {
	      Diag(R.getBegin(),diag::err_invalid_conversion_between_ext_vectors)
	        << DestTy << SrcTy << R;
	      return ExprError();
	    }
	    Kind = CK_BitCast;
	    return CastExpr;
	  }

	  // All non-pointer scalars can be cast to ExtVector type.  The appropriate
	  // conversion will take place first from scalar to elt type, and then
	  // splat from elt type to vector.
	  if (SrcTy->isPointerType())
	    return ExprError(Diag(R.getBegin(),
	                          diag::err_invalid_conversion_between_vector_and_scalar)
	                       << DestTy << SrcTy << R);

	  Kind = CK_VectorSplat;
	  return prepareVectorSplat(DestTy, CastExpr);
	}

	/// Parser entry point for a C-style cast `(type) expr`.
	///
	/// Resolves the declarator \p D to the cast type, stores that type back
	/// into \p Ty, recognizes AltiVec/ZVector/OpenCL vector literals of the
	/// form `(vector-type)(init, ...)`, and otherwise builds an ordinary
	/// C-style cast via BuildCStyleCastExpr.
	///
	/// \returns the cast (or vector literal) expression, or an invalid result.
	ExprResult
	Sema::ActOnCastExpr(Scope *S, SourceLocation LParenLoc,
	                    Declarator &D, ParsedType &Ty,
	                    SourceLocation RParenLoc, Expr *CastExpr) {
	  assert(!D.isInvalidType() && (CastExpr != nullptr) &&
	         "ActOnCastExpr(): missing type or expr");

	  TypeSourceInfo *castTInfo = GetTypeForDeclaratorCast(D, CastExpr->getType());
	  if (D.isInvalidType())
	    return ExprError();

	  if (getLangOpts().CPlusPlus) {
	    // Check that there are no default arguments (C++ only).
	    CheckExtraCXXDefaultArguments(D);
	  } else {
	    // Make sure any TypoExprs have been dealt with.
	    ExprResult Res = CorrectDelayedTyposInExpr(CastExpr);
	    if (!Res.isUsable())
	      return ExprError();
	    CastExpr = Res.get();
	  }

	  checkUnusedDeclAttributes(D);

	  QualType castType = castTInfo->getType();
	  Ty = CreateParsedType(castType, castTInfo);

	  bool isVectorLiteral = false;

	  // Check for an altivec or OpenCL literal,
	  // i.e. all the elements are integer constants.
	  ParenExpr *PE = dyn_cast<ParenExpr>(CastExpr);
	  ParenListExpr *PLE = dyn_cast<ParenListExpr>(CastExpr);
	  if ((getLangOpts().AltiVec || getLangOpts().ZVector || getLangOpts().OpenCL)
	       && castType->isVectorType() && (PE || PLE)) {
	    if (PLE && PLE->getNumExprs() == 0) {
	      Diag(PLE->getExprLoc(), diag::err_altivec_empty_initializer);
	      return ExprError();
	    }
	    // PLE is known to be non-null whenever PE is null here, because the
	    // enclosing condition requires one of the two.
	    if (PE || PLE->getNumExprs() == 1) {
	      // A single parenthesized operand is a literal only if it is not
	      // itself a vector; a vector operand is an ordinary cast.
	      Expr *E = (PE ? PE->getSubExpr() : PLE->getExpr(0));
	      if (!E->getType()->isVectorType())
	        isVectorLiteral = true;
	    }
	    else
	      isVectorLiteral = true;
	  }

	  // If this is a vector initializer, '(' type ')' '(' init, ..., init ')'
	  // then handle it as such.
	  if (isVectorLiteral)
	    return BuildVectorLiteral(LParenLoc, RParenLoc, CastExpr, castTInfo);

	  // If the Expr being casted is a ParenListExpr, handle it specially.
	  // This is not an AltiVec-style cast, so turn the ParenListExpr into a
	  // sequence of BinOp comma operators.
	  if (isa<ParenListExpr>(CastExpr)) {
	    ExprResult Result = MaybeConvertParenListExprToParenExpr(S, CastExpr);
	    if (Result.isInvalid()) return ExprError();
	    CastExpr = Result.get();
	  }

	  // In C++, warn about old-style casts, except casts to void and casts
	  // whose '(' comes from a system macro.
	  if (getLangOpts().CPlusPlus && !castType->isVoidType() &&
	      !getSourceManager().isInSystemMacro(LParenLoc))
	    Diag(LParenLoc, diag::warn_old_style_cast) << CastExpr->getSourceRange();

	  CheckTollFreeBridgeCast(castType, CastExpr);

	  CheckObjCBridgeRelatedCast(castType, CastExpr);

	  DiscardMisalignedMemberAddress(castType.getTypePtr(), CastExpr);

	  return BuildCStyleCastExpr(LParenLoc, castTInfo, RParenLoc, CastExpr);
	}

	/// Build a vector literal of the form `(vector-type)(init, ..., init)`.
	///
	/// \param LParenLoc location of the '(' that opens the type
	/// \param RParenLoc location of the ')' that closes the type
	/// \param E the parenthesized initializer: a ParenExpr or ParenListExpr
	/// \param TInfo type-source info of the vector type
	///
	/// A single initializer is converted to the element type and cast to the
	/// vector type (so it is replicated to all components) for AltiVec vectors,
	/// and for generic vectors under OpenCL.  For AltiVec, any other count
	/// smaller than the vector size is an error.  In every remaining case the
	/// initializers are wrapped in an InitListExpr and handed to
	/// BuildCompoundLiteralExpr.
	ExprResult Sema::BuildVectorLiteral(SourceLocation LParenLoc,
	                                    SourceLocation RParenLoc, Expr *E,
	                                    TypeSourceInfo *TInfo) {
	  assert((isa<ParenListExpr>(E) || isa<ParenExpr>(E)) &&
	         "Expected paren or paren list expression");

	  Expr **exprs;
	  unsigned numExprs;
	  // Backing storage for the one-element case; must outlive 'exprs'.
	  Expr *subExpr = nullptr;
	  SourceLocation LiteralLParenLoc, LiteralRParenLoc;
	  if (ParenListExpr *PE = dyn_cast<ParenListExpr>(E)) {
	    LiteralLParenLoc = PE->getLParenLoc();
	    LiteralRParenLoc = PE->getRParenLoc();
	    exprs = PE->getExprs();
	    numExprs = PE->getNumExprs();
	  } else { // isa<ParenExpr> by assertion at function entrance
	    LiteralLParenLoc = cast<ParenExpr>(E)->getLParen();
	    LiteralRParenLoc = cast<ParenExpr>(E)->getRParen();
	    subExpr = cast<ParenExpr>(E)->getSubExpr();
	    exprs = &subExpr;
	    numExprs = 1;
	  }

	  QualType Ty = TInfo->getType();
	  assert(Ty->isVectorType() && "Expected vector type");

	  SmallVector<Expr *, 8> initExprs;
	  const VectorType *VTy = Ty->getAs<VectorType>();
	  unsigned numElems = VTy->getNumElements();

	  const bool IsAltiVec = VTy->getVectorKind() == VectorType::AltiVecVector;

	  // '(...)' form of vector initialization in AltiVec: the number of
	  // initializers must be one or must match the size of the vector.
	  // For OpenCL generic vectors a single initializer is accepted as well.
	  // If a single value is specified in the initializer then it will be
	  // replicated to all the components of the vector.
	  const bool SplatSingleValue =
	      numExprs == 1 &&
	      (IsAltiVec || (getLangOpts().OpenCL &&
	                     VTy->getVectorKind() == VectorType::GenericVector));
	  if (SplatSingleValue) {
	    QualType ElemTy = VTy->getElementType();
	    ExprResult Literal = DefaultLvalueConversion(exprs[0]);
	    if (Literal.isInvalid())
	      return ExprError();
	    // PrepareScalarCast may replace Literal, so it has to run before
	    // Literal.get() is read; as two arguments of one call their evaluation
	    // order would be unspecified.
	    CastKind CK = PrepareScalarCast(Literal, ElemTy);
	    Literal = ImpCastExprToType(Literal.get(), ElemTy, CK);
	    return BuildCStyleCastExpr(LParenLoc, TInfo, RParenLoc, Literal.get());
	  }

	  if (IsAltiVec && numExprs < numElems) {
	    Diag(E->getExprLoc(),
	         diag::err_incorrect_number_of_vector_initializers);
	    return ExprError();
	  }

	  initExprs.append(exprs, exprs + numExprs);

	  // FIXME: This means that pretty-printing the final AST will produce curly
	  // braces instead of the original commas.
	  InitListExpr *initE = new (Context) InitListExpr(Context, LiteralLParenLoc,
	                                                   initExprs, LiteralRParenLoc);
	  initE->setType(Ty);
	  return BuildCompoundLiteralExpr(LParenLoc, TInfo, RParenLoc, initE);
	}

	/// This is not an AltiVec-style cast or C++ direct-initialization, so turn
	/// the ParenListExpr into a sequence of comma binary operators.
	///
	/// \param S the scope passed to ActOnBinOp for each comma operator
	/// \param OrigExpr the expression to convert
	/// \returns \p OrigExpr unchanged if it is not a ParenListExpr; otherwise a
	///          ParenExpr wrapping the left-to-right chain of comma operators,
	///          or an invalid result if building any operator failed.
	ExprResult
	Sema::MaybeConvertParenListExprToParenExpr(Scope *S, Expr *OrigExpr) {
	  ParenListExpr *E = dyn_cast<ParenListExpr>(OrigExpr);
	  if (!E)
	    return OrigExpr;

	  ExprResult Result(E->getExpr(0));

	  // Fold the remaining expressions in with ',' and stop at the first failure.
	  for (unsigned i = 1, e = E->getNumExprs(); i != e && !Result.isInvalid(); ++i)
	    Result = ActOnBinOp(S, E->getExprLoc(), tok::comma, Result.get(),
	                        E->getExpr(i));

	  if (Result.isInvalid()) return ExprError();

	  return ActOnParenExpr(E->getLParenLoc(), E->getRParenLoc(), Result.get());
	}

	/// Create a ParenListExpr for the parenthesized expression list `( Val... )`
	/// spanning from \p L to \p R.  No semantic checking is done here.
	ExprResult Sema::ActOnParenListExpr(SourceLocation L,
	                                    SourceLocation R,
	                                    MultiExprArg Val) {
	  return new (Context) ParenListExpr(Context, L, Val, R);
	}

	/// \brief Emit a specialized diagnostic when one expression is a null pointer
	/// constant and the other is not a pointer. Returns true if a diagnostic is
	/// emitted.
	bool Sema::DiagnoseConditionalForNull(Expr LHSExpr, Expr RHSExpr,
	SourceLocation QuestionLoc) {
	Expr *NullExpr = LHSExpr;
	Expr *NonPointerExpr = RHSExpr;
	Expr::NullPointerConstantKind NullKind =
	NullExpr->isNullPointerConstant(Context,
	Expr::NPC_ValueDependentIsNotNull);

	if (NullKind == Expr::NPCK_NotNull) {
	NullExpr = RHSExpr;
	NonPointerExpr = LHSExpr;
	NullKind =
	NullExpr->isNullPointerConstant(Context,
	Expr::NPC_ValueDependentIsNotNull);
	}

	if (NullKind == Expr::NPCK_NotNull)
	return false;

	if (NullKind == Expr::NPCK_ZeroExpression)
	return false;

	if (NullKind == Expr::NPCK_ZeroLiteral) {
	// In this case, check to make sure that we got here from a "NULL"
	// string in the source code.
	NullExpr = NullExpr->IgnoreParenImpCasts();
	SourceLocation loc = NullExpr->getExprLoc();
	if (!findMacroSpelling(loc, "NULL"))
	return false;
	}

	int DiagType = (NullKind == Expr::NPCK_CXX11_nullptr);
	Diag(QuestionLoc, diag::err_typecheck_cond_incompatible_operands_null)
	<< NonPointerExpr->getType() << DiagType
	<< NonPointerExpr->getSourceRange();
	return true;
	}

	/// \brief Validate the first operand of a conditional operator.
	///
	/// Returns false when \p Cond is acceptable. Otherwise a diagnostic naming
	/// the condition's type is emitted at \p QuestionLoc and true is returned:
	/// either because the type is floating point under OpenCL, or because the
	/// type is not scalar.
	static bool checkCondition(Sema &S, Expr *Cond, SourceLocation QuestionLoc) {
	  const QualType Ty = Cond->getType();

	  unsigned DiagID;
	  if (S.getLangOpts().OpenCL && Ty->isFloatingType()) {
	    // OpenCL v1.1 s6.3.i says the condition cannot be a floating point type.
	    DiagID = diag::err_typecheck_cond_expect_nonfloat;
	  } else if (Ty->isScalarType()) {
	    // C99 6.5.15p2
	    return false;
	  } else {
	    DiagID = diag::err_typecheck_cond_expect_scalar;
	  }

	  S.Diag(QuestionLoc, DiagID) << Ty << Cond->getSourceRange();
	  return true;
	}

	/// \brief Handle a conditional operator where at least one operand is void.
	///
	/// If only one operand has void type, ext_typecheck_cond_one_void is reported
	/// on the operand opposite the non-void one being tested (a non-void LHS is
	/// reported at the RHS and vice versa). Both operands are then cast to void
	/// with CK_ToVoid and the void type is returned.
	static QualType checkConditionalVoidType(Sema &S, ExprResult &LHS,
	                                         ExprResult &RHS) {
	  Expr *const Left = LHS.get();
	  Expr *const Right = RHS.get();
	  const QualType VoidTy = S.Context.VoidTy;

	  if (!Left->getType()->isVoidType())
	    S.Diag(Right->getLocStart(), diag::ext_typecheck_cond_one_void)
	        << Right->getSourceRange();
	  if (!Right->getType()->isVoidType())
	    S.Diag(Left->getLocStart(), diag::ext_typecheck_cond_one_void)
	        << Left->getSourceRange();

	  LHS = S.ImpCastExprToType(Left, VoidTy, CK_ToVoid);
	  RHS = S.ImpCastExprToType(Right, VoidTy, CK_ToVoid);
	  return VoidTy;
	}

	/// \brief Return false if the NullExpr can be promoted to PointerTy,
	/// true otherwise.
	///
	/// Promotion requires that \p PointerTy is a pointer or block pointer type
	/// and that \p NullExpr is a null pointer constant (value-dependent
	/// expressions are treated as null here). On success \p NullExpr is replaced
	/// by an implicit CK_NullToPointer cast to \p PointerTy; on failure it is left
	/// untouched.
	static bool checkConditionalNullPointer(Sema &S, ExprResult &NullExpr,
	                                        QualType PointerTy) {
	  if ((!PointerTy->isAnyPointerType() && !PointerTy->isBlockPointerType()) ||
	      !NullExpr.get()->isNullPointerConstant(S.Context,
	                                             Expr::NPC_ValueDependentIsNull))
	    return true;

	  NullExpr = S.ImpCastExprToType(NullExpr.get(), PointerTy, CK_NullToPointer);
	  return false;
	}

	/// \brief Checks compatibility between two pointers and return the resulting
	/// type.
	///
	/// Both operands must be pointers of the same flavour (both block pointers or
	/// both ordinary pointers); this is enforced with castAs. On return \p LHS and
	/// \p RHS have been implicitly cast to the result type.
	///
	/// \returns the common pointer type; a (possibly address-space qualified)
	///          void pointer after an ext_typecheck_cond_incompatible_pointers
	///          diagnostic when the pointees cannot be merged; or a null QualType
	///          when, under OpenCL, neither address space is a superset of the
	///          other.
	static QualType checkConditionalPointerCompatibility(Sema &S, ExprResult &LHS,
	                                                     ExprResult &RHS,
	                                                     SourceLocation Loc) {
	  QualType LHSTy = LHS.get()->getType();
	  QualType RHSTy = RHS.get()->getType();

	  if (S.Context.hasSameType(LHSTy, RHSTy)) {
	    // Two identical pointers types are always compatible.
	    return LHSTy;
	  }

	  QualType lhptee, rhptee;

	  // Get the pointee types.
	  bool IsBlockPointer = false;
	  if (const BlockPointerType *LHSBTy = LHSTy->getAs<BlockPointerType>()) {
	    lhptee = LHSBTy->getPointeeType();
	    rhptee = RHSTy->castAs<BlockPointerType>()->getPointeeType();
	    IsBlockPointer = true;
	  } else {
	    lhptee = LHSTy->castAs<PointerType>()->getPointeeType();
	    rhptee = RHSTy->castAs<PointerType>()->getPointeeType();
	  }

	  // C99 6.5.15p6: If both operands are pointers to compatible types or to
	  // differently qualified versions of compatible types, the result type is
	  // a pointer to an appropriately qualified version of the composite
	  // type.

	  // Only CVR-qualifiers exist in the standard, and the differently-qualified
	  // clause doesn't make sense for our extensions. E.g. address space 2 should
	  // be incompatible with address space 3: they may live on different devices or
	  // anything.
	  Qualifiers lhQual = lhptee.getQualifiers();
	  Qualifiers rhQual = rhptee.getQualifiers();

	  unsigned ResultAddrSpace = 0;
	  unsigned LAddrSpace = lhQual.getAddressSpace();
	  unsigned RAddrSpace = rhQual.getAddressSpace();
	  if (S.getLangOpts().OpenCL) {
	    // OpenCL v1.1 s6.5 - Conversion between pointers to distinct address
	    // spaces is disallowed.
	    if (lhQual.isAddressSpaceSupersetOf(rhQual))
	      ResultAddrSpace = LAddrSpace;
	    else if (rhQual.isAddressSpaceSupersetOf(lhQual))
	      ResultAddrSpace = RAddrSpace;
	    else {
	      S.Diag(Loc,
	             diag::err_typecheck_op_on_nonoverlapping_address_space_pointers)
	          << LHSTy << RHSTy << 2 << LHS.get()->getSourceRange()
	          << RHS.get()->getSourceRange();
	      return QualType();
	    }
	  }

	  // The result carries the union of both pointees' CVR qualifiers; they are
	  // stripped here so that mergeTypes below compares the unqualified pointees.
	  unsigned MergedCVRQual = lhQual.getCVRQualifiers() | rhQual.getCVRQualifiers();
	  auto LHSCastKind = CK_BitCast, RHSCastKind = CK_BitCast;
	  lhQual.removeCVRQualifiers();
	  rhQual.removeCVRQualifiers();

	  // OpenCL v2.0 specification doesn't extend compatibility of type qualifiers
	  // (C99 6.7.3) for address spaces. We assume that the check should behave in
	  // the same manner as it's defined for CVR qualifiers, so for OpenCL two
	  // qual types are compatible iff
	  // * corresponded types are compatible
	  // * CVR qualifiers are equal
	  // * address spaces are equal
	  // Thus for conditional operator we merge CVR and address space unqualified
	  // pointees and if there is a composite type we return a pointer to it with
	  // merged qualifiers.
	  if (S.getLangOpts().OpenCL) {
	    // An operand whose address space differs from the result needs an
	    // address space conversion rather than a plain bitcast.
	    LHSCastKind = LAddrSpace == ResultAddrSpace
	                      ? CK_BitCast
	                      : CK_AddressSpaceConversion;
	    RHSCastKind = RAddrSpace == ResultAddrSpace
	                      ? CK_BitCast
	                      : CK_AddressSpaceConversion;
	    lhQual.removeAddressSpace();
	    rhQual.removeAddressSpace();
	  }

	  lhptee = S.Context.getQualifiedType(lhptee.getUnqualifiedType(), lhQual);
	  rhptee = S.Context.getQualifiedType(rhptee.getUnqualifiedType(), rhQual);

	  QualType CompositeTy = S.Context.mergeTypes(lhptee, rhptee);

	  if (CompositeTy.isNull()) {
	    // In this situation, we assume void* type. No especially good
	    // reason, but this is what gcc does, and we do have to pick
	    // to get a consistent AST.
	    QualType incompatTy;
	    incompatTy = S.Context.getPointerType(
	        S.Context.getAddrSpaceQualType(S.Context.VoidTy, ResultAddrSpace));
	    LHS = S.ImpCastExprToType(LHS.get(), incompatTy, LHSCastKind);
	    RHS = S.ImpCastExprToType(RHS.get(), incompatTy, RHSCastKind);
	    // FIXME: For OpenCL the warning emission and cast to void* leaves a room
	    // for casts between types with incompatible address space qualifiers.
	    // For the following code the compiler produces casts between global and
	    // local address spaces of the corresponded innermost pointees:
	    // local int *global *a;
	    // global int *global *b;
	    // a = (0 ? a : b); // see C99 6.5.16.1.p1.
	    S.Diag(Loc, diag::ext_typecheck_cond_incompatible_pointers)
	        << LHSTy << RHSTy << LHS.get()->getSourceRange()
	        << RHS.get()->getSourceRange();
	    return incompatTy;
	  }

	  // The pointer types are compatible.
	  // In case of OpenCL ResultTy should have the address space qualifier
	  // which is a superset of address spaces of both the 2nd and the 3rd
	  // operands of the conditional operator.
	  QualType ResultTy = [&, ResultAddrSpace]() {
	    if (S.getLangOpts().OpenCL) {
	      Qualifiers CompositeQuals = CompositeTy.getQualifiers();
	      CompositeQuals.setAddressSpace(ResultAddrSpace);
	      return S.Context
	          .getQualifiedType(CompositeTy.getUnqualifiedType(), CompositeQuals)
	          .withCVRQualifiers(MergedCVRQual);
	    }
	    return CompositeTy.withCVRQualifiers(MergedCVRQual);
	  }();
	  if (IsBlockPointer)
	    ResultTy = S.Context.getBlockPointerType(ResultTy);
	  else
	    ResultTy = S.Context.getPointerType(ResultTy);

	  LHS = S.ImpCastExprToType(LHS.get(), ResultTy, LHSCastKind);
	  RHS = S.ImpCastExprToType(RHS.get(), ResultTy, RHSCastKind);
	  return ResultTy;
	}

	/// \brief Return the resulting type when at least one operand is a block
	/// pointer.
	///
	/// If exactly one operand is a block pointer and either operand is a void
	/// pointer, both are bitcast to 'void *' and that type is returned. If the
	/// operands are otherwise mismatched,
	/// err_typecheck_cond_incompatible_operands is reported at \p Loc and a null
	/// QualType is returned. Two block pointers are delegated to
	/// checkConditionalPointerCompatibility.
	static QualType checkConditionalBlockPointerCompatibility(Sema &S,
	                                                          ExprResult &LHS,
	                                                          ExprResult &RHS,
	                                                          SourceLocation Loc) {
	  QualType LHSTy = LHS.get()->getType();
	  QualType RHSTy = RHS.get()->getType();

	  if (!LHSTy->isBlockPointerType() || !RHSTy->isBlockPointerType()) {
	    if (LHSTy->isVoidPointerType() || RHSTy->isVoidPointerType()) {
	      QualType destType = S.Context.getPointerType(S.Context.VoidTy);
	      LHS = S.ImpCastExprToType(LHS.get(), destType, CK_BitCast);
	      RHS = S.ImpCastExprToType(RHS.get(), destType, CK_BitCast);
	      return destType;
	    }
	    S.Diag(Loc, diag::err_typecheck_cond_incompatible_operands)
	        << LHSTy << RHSTy << LHS.get()->getSourceRange()
	        << RHS.get()->getSourceRange();
	    return QualType();
	  }

	  // We have 2 block pointer types.
	  return checkConditionalPointerCompatibility(S, LHS, RHS, Loc);
	}

	/// \brief Compute the result type of a conditional whose operands are both
	/// ordinary pointers.
	///
	/// When one pointee is void and the other is an incomplete or object type,
	/// the result is a pointer to void carrying the other pointee's qualifiers:
	/// the void-side operand gets a CK_NoOp cast and the other a CK_BitCast.
	/// The LHS-is-void case is tested first. All remaining cases are handled by
	/// checkConditionalPointerCompatibility.
	static QualType
	checkConditionalObjectPointersCompatibility(Sema &S, ExprResult &LHS,
	                                            ExprResult &RHS,
	                                            SourceLocation Loc) {
	  // Pointee types of the two operands.
	  const QualType LeftPointee =
	      LHS.get()->getType()->getAs<PointerType>()->getPointeeType();
	  const QualType RightPointee =
	      RHS.get()->getType()->getAs<PointerType>()->getPointeeType();

	  // Shared handling for "void * vs. T *": qualifiers on void are ignored
	  // (C99 6.5.15p3, clause 6) and the necessary qualifiers come from the
	  // other pointee (C99 6.5.15p6).
	  auto PromoteToVoidPointer = [&S](ExprResult &VoidSide, QualType VoidPointee,
	                                   ExprResult &OtherSide,
	                                   QualType OtherPointee) {
	    QualType QualifiedVoid =
	        S.Context.getQualifiedType(VoidPointee, OtherPointee.getQualifiers());
	    QualType Dest = S.Context.getPointerType(QualifiedVoid);
	    // Add qualifiers if necessary.
	    VoidSide = S.ImpCastExprToType(VoidSide.get(), Dest, CK_NoOp);
	    // Promote to void*.
	    OtherSide = S.ImpCastExprToType(OtherSide.get(), Dest, CK_BitCast);
	    return Dest;
	  };

	  if (LeftPointee->isVoidType() && RightPointee->isIncompleteOrObjectType())
	    return PromoteToVoidPointer(LHS, LeftPointee, RHS, RightPointee);

	  if (RightPointee->isVoidType() && LeftPointee->isIncompleteOrObjectType())
	    return PromoteToVoidPointer(RHS, RightPointee, LHS, LeftPointee);

	  return checkConditionalPointerCompatibility(S, LHS, RHS, Loc);
	}

	/// \brief Diagnose and repair a pointer/integer mismatch between the two
	/// result operands of a conditional operator.
	///
	/// Returns false, with nothing emitted or changed, unless \p PointerExpr has
	/// pointer type and \p Int has integer type. In that case
	/// ext_typecheck_cond_pointer_integer_mismatch is reported at \p Loc,
	/// \p Int is cast to the pointer's type with CK_IntegralToPointer, and true
	/// is returned.
	///
	/// \param IsIntFirstExpr Whether \p Int is the first of the two operands; it
	///                       only controls the order in which the operand types
	///                       and source ranges are streamed into the diagnostic.
	static bool checkPointerIntegerMismatch(Sema &S, ExprResult &Int,
	                                        Expr* PointerExpr, SourceLocation Loc,
	                                        bool IsIntFirstExpr) {
	  if (!PointerExpr->getType()->isPointerType() ||
	      !Int.get()->getType()->isIntegerType())
	    return false;

	  Expr *Expr1 = IsIntFirstExpr ? Int.get() : PointerExpr;
	  Expr *Expr2 = IsIntFirstExpr ? PointerExpr : Int.get();

	  S.Diag(Loc, diag::ext_typecheck_cond_pointer_integer_mismatch)
	      << Expr1->getType() << Expr2->getType()
	      << Expr1->getSourceRange() << Expr2->getSourceRange();
	  Int = S.ImpCastExprToType(Int.get(), PointerExpr->getType(),
	                            CK_IntegralToPointer);
	  return true;
	}

	/// \brief Simple conversion between integer and floating point types.
	///
	/// Used when handling the OpenCL conditional operator where the
	/// condition is a vector while the other operands are scalar.
	///
	/// OpenCL v1.1 s6.3.i and s6.11.6 together require that the scalar
	/// types are either integer or floating type. Between the two
	/// operands, the type with the higher rank is defined as the "result
	/// type". The other operand needs to be promoted to the same type. No
	/// other type promotion is allowed. We cannot use
	/// UsualArithmeticConversions() for this purpose, since it always
	/// promotes promotable types.
	///
	/// \returns the common scalar type, or a null QualType if a default
	///          conversion failed or an operand is neither integer nor real
	///          floating (the latter is diagnosed at \p QuestionLoc).
	static QualType OpenCLArithmeticConversions(Sema &S, ExprResult &LHS,
	                                            ExprResult &RHS,
	                                            SourceLocation QuestionLoc) {
	  LHS = S.DefaultFunctionArrayLvalueConversion(LHS.get());
	  if (LHS.isInvalid())
	    return QualType();
	  RHS = S.DefaultFunctionArrayLvalueConversion(RHS.get());
	  if (RHS.isInvalid())
	    return QualType();

	  // For conversion purposes, we ignore any qualifiers.
	  // For example, "const float" and "float" are equivalent.
	  QualType LHSType =
	      S.Context.getCanonicalType(LHS.get()->getType()).getUnqualifiedType();
	  QualType RHSType =
	      S.Context.getCanonicalType(RHS.get()->getType()).getUnqualifiedType();

	  if (!LHSType->isIntegerType() && !LHSType->isRealFloatingType()) {
	    S.Diag(QuestionLoc, diag::err_typecheck_cond_expect_int_float)
	        << LHSType << LHS.get()->getSourceRange();
	    return QualType();
	  }

	  if (!RHSType->isIntegerType() && !RHSType->isRealFloatingType()) {
	    S.Diag(QuestionLoc, diag::err_typecheck_cond_expect_int_float)
	        << RHSType << RHS.get()->getSourceRange();
	    return QualType();
	  }

	  // If both types are identical, no conversion is needed.
	  if (LHSType == RHSType)
	    return LHSType;

	  // Now handle "real" floating types (i.e. float, double, long double).
	  if (LHSType->isRealFloatingType() || RHSType->isRealFloatingType())
	    return handleFloatConversion(S, LHS, RHS, LHSType, RHSType,
	                                 /*IsCompAssign = */ false);

	  // Finally, we have two differing integer types.
	  return handleIntegerConversion<doIntegralCast, doIntegralCast>
	      (S, LHS, RHS, LHSType, RHSType, /*IsCompAssign = */ false);
	}

	/// \brief Splat two scalar operands into vectors as wide as the condition.
	///
	/// Used for the OpenCL conditional operator when the condition is a
	/// vector and both result operands are scalars.
	///
	/// The scalars are first brought to a common type following OpenCL v1.1
	/// s6.3.i. That type must be as many bits wide as the condition's element
	/// type (s6.11.6); if it is not, err_conditional_vector_element_size is
	/// reported and a null type is returned. Otherwise both operands are cast
	/// with CK_VectorSplat to an ext-vector of the common type whose element
	/// count equals the condition's, and that vector type is returned.
	static QualType
	OpenCLConvertScalarsToVectors(Sema &S, ExprResult &LHS, ExprResult &RHS,
	                              QualType CondTy, SourceLocation QuestionLoc) {
	  QualType ScalarTy = OpenCLArithmeticConversions(S, LHS, RHS, QuestionLoc);
	  if (ScalarTy.isNull())
	    return QualType();

	  const VectorType *CondVec = CondTy->getAs<VectorType>();
	  assert(CondVec);

	  // Build the vector type the operands will be splatted to.
	  const unsigned Width = CondVec->getNumElements();
	  QualType SplatTy = S.Context.getExtVectorType(ScalarTy, Width);

	  // The condition's elements and the result's elements must be equally wide.
	  const bool SameElementBits =
	      S.Context.getTypeSize(CondVec->getElementType()) ==
	      S.Context.getTypeSize(ScalarTy);
	  if (!SameElementBits) {
	    // SplatTy was synthesized here and has no OpenCL spelling to print, so
	    // describe it in words instead.
	    std::string ElementName = ScalarTy.getUnqualifiedType().getAsString();
	    SmallString<64> Description;
	    llvm::raw_svector_ostream Out(Description);
	    Out << "(vector of " << Width << " '" << ElementName << "' values)";
	    S.Diag(QuestionLoc, diag::err_conditional_vector_element_size)
	        << CondTy << Out.str();
	    return QualType();
	  }

	  // Splat both scalars to the vector result type.
	  LHS = S.ImpCastExprToType(LHS.get(), SplatTy, CK_VectorSplat);
	  RHS = S.ImpCastExprToType(RHS.get(), SplatTy, CK_VectorSplat);
	  return SplatTy;
	}

	/// \brief Check that a vector condition has integer elements.
	///
	/// \p Cond must have vector type (asserted). Returns false when its element
	/// type is an integer type; otherwise reports
	/// err_typecheck_cond_expect_nonfloat at \p QuestionLoc and returns true.
	static bool checkOpenCLConditionVector(Sema &S, Expr *Cond,
	                                       SourceLocation QuestionLoc) {
	  const VectorType *VecTy = Cond->getType()->getAs<VectorType>();
	  assert(VecTy);

	  // OpenCL v1.1 s6.11.6 says the elements of the vector must be of
	  // integral type.
	  if (!VecTy->getElementType()->isIntegerType()) {
	    S.Diag(QuestionLoc, diag::err_typecheck_cond_expect_nonfloat)
	        << Cond->getType() << Cond->getSourceRange();
	    return true;
	  }
	  return false;
	}

	/// \brief Check a vector condition type against a vector result type.
	///
	/// OpenCL v1.1 s6.11.6 requires the two vector types to have the same
	/// number of elements and element types of the same bit width. Both
	/// arguments must be vector types (asserted).
	///
	/// \returns false when both requirements hold. Otherwise the first violated
	///          requirement is diagnosed at \p QuestionLoc (element count before
	///          element size) and true is returned.
	static bool checkVectorResult(Sema &S, QualType CondTy, QualType VecResTy,
	                              SourceLocation QuestionLoc) {
	  const VectorType *CondVec = CondTy->getAs<VectorType>();
	  const VectorType *ResVec = VecResTy->getAs<VectorType>();
	  assert(CondVec && ResVec);

	  const bool SameLength =
	      CondVec->getNumElements() == ResVec->getNumElements();
	  if (!SameLength) {
	    S.Diag(QuestionLoc, diag::err_conditional_vector_size)
	        << CondTy << VecResTy;
	    return true;
	  }

	  const bool SameElementBits =
	      S.Context.getTypeSize(CondVec->getElementType()) ==
	      S.Context.getTypeSize(ResVec->getElementType());
	  if (SameElementBits)
	    return false;

	  S.Diag(QuestionLoc, diag::err_conditional_vector_element_size)
	      << CondTy << VecResTy;
	  return true;
	}

	/// \brief Return the resulting type for the conditional operator in
	/// OpenCL (aka "ternary selection operator", OpenCL v1.1
	/// s6.3.i) when the condition is a vector type.
	///
	/// \p Cond is updated with its default conversion. A null QualType is
	/// returned when that conversion fails, when the condition's elements are
	/// not integers, or when the operands cannot be brought to a vector type
	/// that matches the condition.
	static QualType
	OpenCLCheckVectorConditional(Sema &S, ExprResult &Cond,
	                             ExprResult &LHS, ExprResult &RHS,
	                             SourceLocation QuestionLoc) {
	  Cond = S.DefaultFunctionArrayLvalueConversion(Cond.get());
	  if (Cond.isInvalid())
	    return QualType();
	  QualType CondTy = Cond.get()->getType();

	  if (checkOpenCLConditionVector(S, Cond.get(), QuestionLoc))
	    return QualType();

	  // If either operand is a vector then find the vector type of the
	  // result as specified in OpenCL v1.1 s6.3.i.
	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType()) {
	    QualType VecResTy = S.CheckVectorOperands(LHS, RHS, QuestionLoc,
	                                              /*isCompAssign*/false,
	                                              /*AllowBothBool*/true,
	                                              /*AllowBoolConversions*/false);
	    if (VecResTy.isNull()) return QualType();
	    // The result type must match the condition type as specified in
	    // OpenCL v1.1 s6.11.6.
	    if (checkVectorResult(S, CondTy, VecResTy, QuestionLoc))
	      return QualType();
	    return VecResTy;
	  }

	  // Both operands are scalar.
	  return OpenCLConvertScalarsToVectors(S, LHS, RHS, CondTy, QuestionLoc);
	}

	/// \brief Diagnose a call through a block pointer used as a ternary operand.
	///
	/// Returns true, after reporting err_opencl_ternary_with_block at the
	/// expression's location, when \p E is a CallExpr whose callee has block
	/// pointer type. Returns false without diagnosing otherwise.
	static bool checkBlockType(Sema &S, const Expr *E) {
	  const CallExpr *Call = dyn_cast<CallExpr>(E);
	  if (!Call || !Call->getCallee()->getType()->isBlockPointerType())
	    return false;

	  S.Diag(E->getExprLoc(), diag::err_opencl_ternary_with_block);
	  return true;
	}

	/// Type-check the operands of a conditional operator and compute its result
	/// type (C99 6.5.15).
	///
	/// Note that LHS is not null here, even if this is the gnu "x ?: y" extension.
	/// In that case, LHS = cond.
	///
	/// C++ is delegated to CXXCheckConditionalOperands, and an OpenCL vector
	/// condition to OpenCLCheckVectorConditional. On the remaining paths \p VK and
	/// \p OK are set to VK_RValue / OK_Ordinary and the operands may be replaced
	/// by implicitly converted expressions.
	///
	/// \returns the result type, or a null QualType when the operands are
	///          invalid or incompatible.
	QualType Sema::CheckConditionalOperands(ExprResult &Cond, ExprResult &LHS,
	                                        ExprResult &RHS, ExprValueKind &VK,
	                                        ExprObjectKind &OK,
	                                        SourceLocation QuestionLoc) {

	  ExprResult LHSResult = CheckPlaceholderExpr(LHS.get());
	  if (!LHSResult.isUsable()) return QualType();
	  LHS = LHSResult;

	  ExprResult RHSResult = CheckPlaceholderExpr(RHS.get());
	  if (!RHSResult.isUsable()) return QualType();
	  RHS = RHSResult;

	  // C++ is sufficiently different to merit its own checker.
	  if (getLangOpts().CPlusPlus)
	    return CXXCheckConditionalOperands(Cond, LHS, RHS, VK, OK, QuestionLoc);

	  VK = VK_RValue;
	  OK = OK_Ordinary;

	  // The OpenCL operator with a vector condition is sufficiently
	  // different to merit its own checker.
	  if (getLangOpts().OpenCL && Cond.get()->getType()->isVectorType())
	    return OpenCLCheckVectorConditional(*this, Cond, LHS, RHS, QuestionLoc);

	  // First, check the condition.
	  Cond = UsualUnaryConversions(Cond.get());
	  if (Cond.isInvalid())
	    return QualType();
	  if (checkCondition(*this, Cond.get(), QuestionLoc))
	    return QualType();

	  // Now check the two expressions.
	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType())
	    return CheckVectorOperands(LHS, RHS, QuestionLoc, /*isCompAssign*/false,
	                               /*AllowBothBool*/true,
	                               /*AllowBoolConversions*/false);

	  QualType ResTy = UsualArithmeticConversions(LHS, RHS);
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();

	  QualType LHSTy = LHS.get()->getType();
	  QualType RHSTy = RHS.get()->getType();

	  // Diagnose attempts to convert between __float128 and long double where
	  // such conversions currently can't be handled.
	  if (unsupportedTypeConversion(*this, LHSTy, RHSTy)) {
	    Diag(QuestionLoc,
	         diag::err_typecheck_cond_incompatible_operands) << LHSTy << RHSTy
	      << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	    return QualType();
	  }

	  // OpenCL v2.0 s6.12.5 - Blocks cannot be used as expressions of the ternary
	  // selection operator (?:).
	  // The non-short-circuiting '|' is deliberate: both operands are checked so
	  // that each offending operand gets its own diagnostic.
	  if (getLangOpts().OpenCL &&
	      (checkBlockType(*this, LHS.get()) | checkBlockType(*this, RHS.get()))) {
	    return QualType();
	  }

	  // If both operands have arithmetic type, do the usual arithmetic conversions
	  // to find a common type: C99 6.5.15p3,5.
	  if (LHSTy->isArithmeticType() && RHSTy->isArithmeticType()) {
	    LHS = ImpCastExprToType(LHS.get(), ResTy, PrepareScalarCast(LHS, ResTy));
	    RHS = ImpCastExprToType(RHS.get(), ResTy, PrepareScalarCast(RHS, ResTy));

	    return ResTy;
	  }

	  // If both operands are the same structure or union type, the result is that
	  // type.
	  if (const RecordType *LHSRT = LHSTy->getAs<RecordType>()) {    // C99 6.5.15p3
	    if (const RecordType *RHSRT = RHSTy->getAs<RecordType>())
	      if (LHSRT->getDecl() == RHSRT->getDecl())
	        // "If both the operands have structure or union type, the result has
	        // that type."  This implies that CV qualifiers are dropped.
	        return LHSTy.getUnqualifiedType();
	    // FIXME: Type of conditional expression must be complete in C mode.
	  }

	  // C99 6.5.15p5: "If both operands have void type, the result has void type."
	  // The following || allows only one side to be void (a GCC-ism).
	  if (LHSTy->isVoidType() || RHSTy->isVoidType()) {
	    return checkConditionalVoidType(*this, LHS, RHS);
	  }

	  // C99 6.5.15p6 - "if one operand is a null pointer constant, the result has
	  // the type of the other operand."
	  if (!checkConditionalNullPointer(*this, RHS, LHSTy)) return LHSTy;
	  if (!checkConditionalNullPointer(*this, LHS, RHSTy)) return RHSTy;

	  // All objective-c pointer type analysis is done here.
	  QualType compositeType = FindCompositeObjCPointerType(LHS, RHS,
	                                                        QuestionLoc);
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();
	  if (!compositeType.isNull())
	    return compositeType;


	  // Handle block pointer types.
	  if (LHSTy->isBlockPointerType() || RHSTy->isBlockPointerType())
	    return checkConditionalBlockPointerCompatibility(*this, LHS, RHS,
	                                                     QuestionLoc);

	  // Check constraints for C object pointers types (C99 6.5.15p3,6).
	  if (LHSTy->isPointerType() && RHSTy->isPointerType())
	    return checkConditionalObjectPointersCompatibility(*this, LHS, RHS,
	                                                       QuestionLoc);

	  // GCC compatibility: soften pointer/integer mismatch.  Note that
	  // null pointers have been filtered out by this point.
	  if (checkPointerIntegerMismatch(*this, LHS, RHS.get(), QuestionLoc,
	      /*isIntFirstExpr=*/true))
	    return RHSTy;
	  if (checkPointerIntegerMismatch(*this, RHS, LHS.get(), QuestionLoc,
	      /*isIntFirstExpr=*/false))
	    return LHSTy;

	  // Emit a better diagnostic if one of the expressions is a null pointer
	  // constant and the other is not a pointer type. In this case, the user most
	  // likely forgot to take the address of the other expression.
	  if (DiagnoseConditionalForNull(LHS.get(), RHS.get(), QuestionLoc))
	    return QualType();

	  // Otherwise, the operands are not compatible.
	  Diag(QuestionLoc, diag::err_typecheck_cond_incompatible_operands)
	    << LHSTy << RHSTy << LHS.get()->getSourceRange()
	    << RHS.get()->getSourceRange();
	  return QualType();
	}

	/// FindCompositeObjCPointerType - Helper method to find composite type of
	/// two objective-c pointer types of the two input expressions.
	///
	/// On success the operands are implicitly cast to the composite type and that
	/// type is returned. A null QualType is returned when none of the Objective-C
	/// cases applies, and also after the ARC 'void *' error, in which case both
	/// \p LHS and \p RHS are additionally assigned 'true' (the caller in this file
	/// checks isInvalid() on them right after the call).
	QualType Sema::FindCompositeObjCPointerType(ExprResult &LHS, ExprResult &RHS,
	                                            SourceLocation QuestionLoc) {
	  QualType LHSTy = LHS.get()->getType();
	  QualType RHSTy = RHS.get()->getType();

	  // Handle things like Class and struct objc_class*.  Here we case the result
	  // to the pseudo-builtin, because that will be implicitly cast back to the
	  // redefinition type if an attempt is made to access its fields.
	  if (LHSTy->isObjCClassType() &&
	      (Context.hasSameType(RHSTy, Context.getObjCClassRedefinitionType()))) {
	    RHS = ImpCastExprToType(RHS.get(), LHSTy, CK_CPointerToObjCPointerCast);
	    return LHSTy;
	  }
	  if (RHSTy->isObjCClassType() &&
	      (Context.hasSameType(LHSTy, Context.getObjCClassRedefinitionType()))) {
	    LHS = ImpCastExprToType(LHS.get(), RHSTy, CK_CPointerToObjCPointerCast);
	    return RHSTy;
	  }
	  // And the same for struct objc_object* / id
	  if (LHSTy->isObjCIdType() &&
	      (Context.hasSameType(RHSTy, Context.getObjCIdRedefinitionType()))) {
	    RHS = ImpCastExprToType(RHS.get(), LHSTy, CK_CPointerToObjCPointerCast);
	    return LHSTy;
	  }
	  if (RHSTy->isObjCIdType() &&
	      (Context.hasSameType(LHSTy, Context.getObjCIdRedefinitionType()))) {
	    LHS = ImpCastExprToType(LHS.get(), RHSTy, CK_CPointerToObjCPointerCast);
	    return RHSTy;
	  }
	  // And the same for struct objc_selector* / SEL
	  if (Context.isObjCSelType(LHSTy) &&
	      (Context.hasSameType(RHSTy, Context.getObjCSelRedefinitionType()))) {
	    RHS = ImpCastExprToType(RHS.get(), LHSTy, CK_BitCast);
	    return LHSTy;
	  }
	  if (Context.isObjCSelType(RHSTy) &&
	      (Context.hasSameType(LHSTy, Context.getObjCSelRedefinitionType()))) {
	    LHS = ImpCastExprToType(LHS.get(), RHSTy, CK_BitCast);
	    return RHSTy;
	  }
	  // Check constraints for Objective-C object pointers types.
	  if (LHSTy->isObjCObjectPointerType() && RHSTy->isObjCObjectPointerType()) {

	    if (Context.getCanonicalType(LHSTy) == Context.getCanonicalType(RHSTy)) {
	      // Two identical object pointer types are always compatible.
	      return LHSTy;
	    }
	    const ObjCObjectPointerType *LHSOPT = LHSTy->castAs<ObjCObjectPointerType>();
	    const ObjCObjectPointerType *RHSOPT = RHSTy->castAs<ObjCObjectPointerType>();
	    QualType compositeType = LHSTy;

	    // If both operands are interfaces and either operand can be
	    // assigned to the other, use that type as the composite
	    // type. This allows
	    //   xxx ? (A*) a : (B*) b
	    // where B is a subclass of A.
	    //
	    // Additionally, as for assignment, if either type is 'id'
	    // allow silent coercion. Finally, if the types are
	    // incompatible then make sure to use 'id' as the composite
	    // type so the result is acceptable for sending messages to.

	    // FIXME: Consider unifying with 'areComparableObjCPointerTypes'.
	    // It could return the composite type.
	    if (!(compositeType =
	          Context.areCommonBaseCompatible(LHSOPT, RHSOPT)).isNull()) {
	      // Nothing more to do.
	    } else if (Context.canAssignObjCInterfaces(LHSOPT, RHSOPT)) {
	      compositeType = RHSOPT->isObjCBuiltinType() ? RHSTy : LHSTy;
	    } else if (Context.canAssignObjCInterfaces(RHSOPT, LHSOPT)) {
	      compositeType = LHSOPT->isObjCBuiltinType() ? LHSTy : RHSTy;
	    } else if ((LHSTy->isObjCQualifiedIdType() ||
	                RHSTy->isObjCQualifiedIdType()) &&
	               Context.ObjCQualifiedIdTypesAreCompatible(LHSTy, RHSTy, true)) {
	      // Need to handle "id<xx>" explicitly.
	      // GCC allows qualified id and any Objective-C type to devolve to
	      // id. Currently localizing to here until clear this should be
	      // part of ObjCQualifiedIdTypesAreCompatible.
	      compositeType = Context.getObjCIdType();
	    } else if (LHSTy->isObjCIdType() || RHSTy->isObjCIdType()) {
	      compositeType = Context.getObjCIdType();
	    } else {
	      Diag(QuestionLoc, diag::ext_typecheck_cond_incompatible_operands)
	      << LHSTy << RHSTy
	      << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	      QualType incompatTy = Context.getObjCIdType();
	      LHS = ImpCastExprToType(LHS.get(), incompatTy, CK_BitCast);
	      RHS = ImpCastExprToType(RHS.get(), incompatTy, CK_BitCast);
	      return incompatTy;
	    }
	    // The object pointer types are compatible.
	    LHS = ImpCastExprToType(LHS.get(), compositeType, CK_BitCast);
	    RHS = ImpCastExprToType(RHS.get(), compositeType, CK_BitCast);
	    return compositeType;
	  }
	  // Check Objective-C object pointer types and 'void *'
	  if (LHSTy->isVoidPointerType() && RHSTy->isObjCObjectPointerType()) {
	    if (getLangOpts().ObjCAutoRefCount) {
	      // ARC forbids the implicit conversion of object pointers to 'void *',
	      // so these types are not compatible.
	      Diag(QuestionLoc, diag::err_cond_voidptr_arc) << LHSTy << RHSTy
	          << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	      LHS = RHS = true;
	      return QualType();
	    }
	    QualType lhptee = LHSTy->getAs<PointerType>()->getPointeeType();
	    QualType rhptee = RHSTy->getAs<ObjCObjectPointerType>()->getPointeeType();
	    QualType destPointee
	    = Context.getQualifiedType(lhptee, rhptee.getQualifiers());
	    QualType destType = Context.getPointerType(destPointee);
	    // Add qualifiers if necessary.
	    LHS = ImpCastExprToType(LHS.get(), destType, CK_NoOp);
	    // Promote to void*.
	    RHS = ImpCastExprToType(RHS.get(), destType, CK_BitCast);
	    return destType;
	  }
	  if (LHSTy->isObjCObjectPointerType() && RHSTy->isVoidPointerType()) {
	    if (getLangOpts().ObjCAutoRefCount) {
	      // ARC forbids the implicit conversion of object pointers to 'void *',
	      // so these types are not compatible.
	      Diag(QuestionLoc, diag::err_cond_voidptr_arc) << LHSTy << RHSTy
	          << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	      LHS = RHS = true;
	      return QualType();
	    }
	    QualType lhptee = LHSTy->getAs<ObjCObjectPointerType>()->getPointeeType();
	    QualType rhptee = RHSTy->getAs<PointerType>()->getPointeeType();
	    QualType destPointee
	    = Context.getQualifiedType(rhptee, lhptee.getQualifiers());
	    QualType destType = Context.getPointerType(destPointee);
	    // Add qualifiers if necessary.
	    RHS = ImpCastExprToType(RHS.get(), destType, CK_NoOp);
	    // Promote to void*.
	    LHS = ImpCastExprToType(LHS.get(), destType, CK_BitCast);
	    return destType;
	  }
	  return QualType();
	}

	/// SuggestParentheses - Emit \p Note at \p Loc, attaching fix-it hints that
	/// insert "(" at the start of \p ParenRange and ")" after its last token.
	///
	/// The fix-its are only attached when both ends of the range are file
	/// locations and the end-of-token location is valid; otherwise the note is
	/// emitted with just the range highlighted.
	static void SuggestParentheses(Sema &Self, SourceLocation Loc,
	                               const PartialDiagnostic &Note,
	                               SourceRange ParenRange) {
	  const SourceLocation CloseLoc =
	      Self.getLocForEndOfToken(ParenRange.getEnd());
	  const bool CanInsert = ParenRange.getBegin().isFileID() &&
	                         ParenRange.getEnd().isFileID() &&
	                         CloseLoc.isValid();

	  if (!CanInsert) {
	    // We can't display the parentheses, so just show the bare note.
	    Self.Diag(Loc, Note) << ParenRange;
	    return;
	  }

	  Self.Diag(Loc, Note)
	      << FixItHint::CreateInsertion(ParenRange.getBegin(), "(")
	      << FixItHint::CreateInsertion(CloseLoc, ")");
	}

	/// Returns true if \p Opc is an additive, multiplicative, or shift
	/// operator, as classified by the corresponding BinaryOperator predicates.
	static bool IsArithmeticOp(BinaryOperatorKind Opc) {
	  return BinaryOperator::isAdditiveOp(Opc) ||
	         BinaryOperator::isMultiplicativeOp(Opc) ||
	         BinaryOperator::isShiftOp(Opc);
	}

	/// IsArithmeticBinaryExpr - Returns true if E is an arithmetic binary
	/// expression, either using a built-in or overloaded operator,
	/// and sets OpCode to the opcode and RHSExprs to the right-hand side
	/// expression.
	static bool IsArithmeticBinaryExpr(Expr E, BinaryOperatorKind Opcode,
	Expr **RHSExprs) {
	// Don't strip parenthesis: we should not warn if E is in parenthesis.
	E = E->IgnoreImpCasts();
	E = E->IgnoreConversionOperator();
	E = E->IgnoreImpCasts();

	// Built-in binary operator.
	if (BinaryOperator *OP = dyn_cast<BinaryOperator>(E)) {
	if (IsArithmeticOp(OP->getOpcode())) {
	*Opcode = OP->getOpcode();
	*RHSExprs = OP->getRHS();
	return true;
	}
	}

	// Overloaded operator.
	if (CXXOperatorCallExpr *Call = dyn_cast<CXXOperatorCallExpr>(E)) {
	if (Call->getNumArgs() != 2)
	return false;

	// Make sure this is really a binary operator that is safe to pass into
	// BinaryOperator::getOverloadedOpcode(), e.g. it's not a subscript op.
	OverloadedOperatorKind OO = Call->getOperator();
	if (OO < OO_Plus \|\| OO > OO_Arrow \|\|
	OO == OO_PlusPlus \|\| OO == OO_MinusMinus)
	return false;

	BinaryOperatorKind OpKind = BinaryOperator::getOverloadedOpcode(OO);
	if (IsArithmeticOp(OpKind)) {
	*Opcode = OpKind;
	*RHSExprs = Call->getArg(1);
	return true;
	}
	}

	return false;
	}

	/// ExprLooksBoolean - Returns true if E looks boolean, i.e. it has boolean type
	/// or is a logical expression such as (x==y) which has int type, but is
	/// commonly interpreted as boolean.
	///
	/// After stripping parentheses and implicit casts, the following count as
	/// boolean-looking: an expression of boolean type, a comparison or logical
	/// binary operator, a logical-not unary operator, and an expression of
	/// pointer type.
	static bool ExprLooksBoolean(Expr *E) {
	  E = E->IgnoreParenImpCasts();

	  if (E->getType()->isBooleanType())
	    return true;
	  // Note: any other binary or unary operator returns false here and never
	  // reaches the pointer-type check below.
	  if (BinaryOperator *OP = dyn_cast<BinaryOperator>(E))
	    return OP->isComparisonOp() || OP->isLogicalOp();
	  if (UnaryOperator *OP = dyn_cast<UnaryOperator>(E))
	    return OP->getOpcode() == UO_LNot;
	  if (E->getType()->isPointerType())
	    return true;

	  return false;
	}

	/// DiagnoseConditionalPrecedence - Warn when the condition of a conditional
	/// operator is an arithmetic binary expression whose right-hand side looks
	/// boolean, which suggests the programmer assumed '?:' binds tighter than
	/// the arithmetic operator, for example:
	///   "int x = a + someBinaryCondition ? 1 : 2".
	///
	/// The warning is followed by two notes with parenthesization fix-its: one
	/// around the whole condition (to silence the warning) and one spanning from
	/// the arithmetic operator's right-hand side to the end of \p RHSExpr (to
	/// evaluate the conditional first). \p LHSExpr is not used.
	static void DiagnoseConditionalPrecedence(Sema &Self,
	                                          SourceLocation OpLoc,
	                                          Expr *Condition,
	                                          Expr *LHSExpr,
	                                          Expr *RHSExpr) {
	  BinaryOperatorKind ArithOpcode;
	  Expr *ArithRHS;

	  // Only an arithmetic condition with a boolean-looking right operand is
	  // suspicious; anything else is left alone.
	  const bool Suspicious =
	      IsArithmeticBinaryExpr(Condition, &ArithOpcode, &ArithRHS) &&
	      ExprLooksBoolean(ArithRHS);
	  if (!Suspicious)
	    return;

	  Self.Diag(OpLoc, diag::warn_precedence_conditional)
	      << Condition->getSourceRange()
	      << BinaryOperator::getOpcodeStr(ArithOpcode);

	  // Note 1: parenthesize the whole condition to keep the current meaning.
	  const SourceRange WholeCondition(Condition->getLocStart(),
	                                   Condition->getLocEnd());
	  SuggestParentheses(Self, OpLoc,
	                     Self.PDiag(diag::note_precedence_silence)
	                         << BinaryOperator::getOpcodeStr(ArithOpcode),
	                     WholeCondition);

	  // Note 2: parenthesize "rhs ? a : b" so the conditional is evaluated first.
	  const SourceRange ConditionalFirst(ArithRHS->getLocStart(),
	                                     RHSExpr->getLocEnd());
	  SuggestParentheses(Self, OpLoc,
	                     Self.PDiag(diag::note_precedence_conditional_first),
	                     ConditionalFirst);
	}

	/// Compute the nullability of a conditional expression.
	///
	/// \param ResTy the already-computed result type of the conditional.
	/// \param IsBin true for the binary form of the conditional, false for the
	///        normal three-operand form.
	/// \param LHSTy type of the left (true) operand.
	/// \param RHSTy type of the right (false) operand.
	/// \param Ctx the AST context used to query and build types.
	/// \return ResTy unchanged if it is not a pointer type or already carries the
	///         merged nullability; otherwise ResTy with its nullability sugar
	///         stripped and re-wrapped in an AttributedType of the merged kind.
	static QualType computeConditionalNullability(QualType ResTy, bool IsBin,
	                                              QualType LHSTy, QualType RHSTy,
	                                              ASTContext &Ctx) {
	  if (!ResTy->isAnyPointerType())
	    return ResTy;

	  // Treat "no nullability annotation" the same as Unspecified.
	  auto GetNullability = [&Ctx](QualType Ty) {
	    Optional<NullabilityKind> Kind = Ty->getNullability(Ctx);
	    if (Kind)
	      return *Kind;
	    return NullabilityKind::Unspecified;
	  };

	  auto LHSKind = GetNullability(LHSTy), RHSKind = GetNullability(RHSTy);
	  NullabilityKind MergedKind;

	  // Compute nullability of a binary conditional expression.
	  if (IsBin) {
	    if (LHSKind == NullabilityKind::NonNull)
	      MergedKind = NullabilityKind::NonNull;
	    else
	      MergedKind = RHSKind;
	  // Compute nullability of a normal conditional expression.
	  } else {
	    if (LHSKind == NullabilityKind::Nullable ||
	        RHSKind == NullabilityKind::Nullable)
	      MergedKind = NullabilityKind::Nullable;
	    else if (LHSKind == NullabilityKind::NonNull)
	      MergedKind = RHSKind;
	    else if (RHSKind == NullabilityKind::NonNull)
	      MergedKind = LHSKind;
	    else
	      MergedKind = NullabilityKind::Unspecified;
	  }

	  // Return if ResTy already has the correct nullability.
	  if (GetNullability(ResTy) == MergedKind)
	    return ResTy;

	  // Strip all nullability from ResTy.
	  while (ResTy->getNullability(Ctx))
	    ResTy = ResTy.getSingleStepDesugaredType(Ctx);

	  // Create a new AttributedType with the new nullability kind.
	  auto NewAttr = AttributedType::getNullabilityAttrKind(MergedKind);
	  return Ctx.getAttributedType(NewAttr, ResTy, ResTy);
	}

	/// ActOnConditionalOp - Parse a ?: operation. Note that 'LHS' may be null
	/// in the case of the GNU conditional expr extension ("x ?: y").
	///
	/// \param QuestionLoc location of the '?' token.
	/// \param ColonLoc location of the ':' token.
	/// \param CondExpr the condition operand.
	/// \param LHSExpr the true operand, or null for the GNU binary form.
	/// \param RHSExpr the false operand.
	/// \return a ConditionalOperator (three-operand form) or a
	///         BinaryConditionalOperator (GNU form), or ExprError() if operand
	///         checking or typo correction failed.
	ExprResult Sema::ActOnConditionalOp(SourceLocation QuestionLoc,
	                                    SourceLocation ColonLoc,
	                                    Expr *CondExpr, Expr *LHSExpr,
	                                    Expr *RHSExpr) {
	  if (!getLangOpts().CPlusPlus) {
	    // C cannot handle TypoExpr nodes in the condition because it
	    // doesn't handle dependent types properly, so make sure any TypoExprs have
	    // been dealt with before checking the operands.
	    ExprResult CondResult = CorrectDelayedTyposInExpr(CondExpr);
	    ExprResult LHSResult = CorrectDelayedTyposInExpr(LHSExpr);
	    ExprResult RHSResult = CorrectDelayedTyposInExpr(RHSExpr);

	    if (!CondResult.isUsable())
	      return ExprError();

	    // LHSExpr is legitimately null for the GNU form, so only treat an
	    // unusable LHS result as an error when there was an LHS to begin with.
	    if (LHSExpr) {
	      if (!LHSResult.isUsable())
	        return ExprError();
	    }

	    if (!RHSResult.isUsable())
	      return ExprError();

	    CondExpr = CondResult.get();
	    LHSExpr = LHSResult.get();
	    RHSExpr = RHSResult.get();
	  }

	  // If this is the gnu "x ?: y" extension, analyze the types as though the LHS
	  // was the condition.
	  OpaqueValueExpr *opaqueValue = nullptr;
	  Expr *commonExpr = nullptr;
	  if (!LHSExpr) {
	    commonExpr = CondExpr;
	    // Lower out placeholder types first. This is important so that we don't
	    // try to capture a placeholder. This happens in few cases in C++; such
	    // as Objective-C++'s dictionary subscripting syntax.
	    if (commonExpr->hasPlaceholderType()) {
	      ExprResult result = CheckPlaceholderExpr(commonExpr);
	      if (!result.isUsable()) return ExprError();
	      commonExpr = result.get();
	    }
	    // We usually want to apply unary conversions before saving, except
	    // in the special case of a C++ l-value conditional.
	    if (!(getLangOpts().CPlusPlus
	          && !commonExpr->isTypeDependent()
	          && commonExpr->getValueKind() == RHSExpr->getValueKind()
	          && commonExpr->isGLValue()
	          && commonExpr->isOrdinaryOrBitFieldObject()
	          && RHSExpr->isOrdinaryOrBitFieldObject()
	          && Context.hasSameType(commonExpr->getType(), RHSExpr->getType()))) {
	      ExprResult commonRes = UsualUnaryConversions(commonExpr);
	      if (commonRes.isInvalid())
	        return ExprError();
	      commonExpr = commonRes.get();
	    }

	    // The opaque value stands in for the shared operand as both the
	    // condition and the LHS.
	    opaqueValue = new (Context) OpaqueValueExpr(commonExpr->getExprLoc(),
	                                                commonExpr->getType(),
	                                                commonExpr->getValueKind(),
	                                                commonExpr->getObjectKind(),
	                                                commonExpr);
	    LHSExpr = CondExpr = opaqueValue;
	  }

	  // Capture the operand types before CheckConditionalOperands may rewrite the
	  // operands; they feed the nullability computation below.
	  QualType LHSTy = LHSExpr->getType(), RHSTy = RHSExpr->getType();
	  ExprValueKind VK = VK_RValue;
	  ExprObjectKind OK = OK_Ordinary;
	  ExprResult Cond = CondExpr, LHS = LHSExpr, RHS = RHSExpr;
	  QualType result = CheckConditionalOperands(Cond, LHS, RHS,
	                                             VK, OK, QuestionLoc);
	  if (result.isNull() || Cond.isInvalid() || LHS.isInvalid() ||
	      RHS.isInvalid())
	    return ExprError();

	  DiagnoseConditionalPrecedence(*this, QuestionLoc, Cond.get(), LHS.get(),
	                                RHS.get());

	  CheckBoolLikeConversion(Cond.get(), QuestionLoc);

	  // commonExpr is non-null exactly for the GNU binary form.
	  result = computeConditionalNullability(result, commonExpr, LHSTy, RHSTy,
	                                         Context);

	  if (!commonExpr)
	    return new (Context)
	        ConditionalOperator(Cond.get(), QuestionLoc, LHS.get(), ColonLoc,
	                            RHS.get(), result, VK, OK);

	  return new (Context) BinaryConditionalOperator(
	      commonExpr, opaqueValue, Cond.get(), LHS.get(), RHS.get(), QuestionLoc,
	      ColonLoc, result, VK, OK);
	}

	// checkPointerTypesForAssignment - This is a very tricky routine (despite
	// being closely modeled after the C99 spec:-). The odd characteristic of this
	// routine is it effectively ignores the qualifiers on the top level pointee.
	// This circumvents the usual type rules specified in 6.2.7p1 & 6.7.5.[1-3].
	// FIXME: add a couple examples in this comment.
	//
	// Both LHSType and RHSType must be canonical PointerTypes (asserted / cast
	// below). Returns the AssignConvertType describing how a value of RHSType
	// may be assigned to an object of LHSType.
	static Sema::AssignConvertType
	checkPointerTypesForAssignment(Sema &S, QualType LHSType, QualType RHSType) {
	  assert(LHSType.isCanonical() && "LHS not canonicalized!");
	  assert(RHSType.isCanonical() && "RHS not canonicalized!");

	  // get the "pointed to" type (ignoring qualifiers at the top level)
	  const Type *lhptee, *rhptee;
	  Qualifiers lhq, rhq;
	  std::tie(lhptee, lhq) =
	      cast<PointerType>(LHSType)->getPointeeType().split().asPair();
	  std::tie(rhptee, rhq) =
	      cast<PointerType>(RHSType)->getPointeeType().split().asPair();

	  Sema::AssignConvertType ConvTy = Sema::Compatible;

	  // C99 6.5.16.1p1: This following citation is common to constraints
	  // 3 & 4 (below). ...and the type pointed to by the left has all the
	  // qualifiers of the type pointed to by the right;

	  // As a special case, 'non-__weak A *' -> 'non-__weak const *' is okay.
	  if (lhq.getObjCLifetime() != rhq.getObjCLifetime() &&
	      lhq.compatiblyIncludesObjCLifetime(rhq)) {
	    // Ignore lifetime for further calculation.
	    lhq.removeObjCLifetime();
	    rhq.removeObjCLifetime();
	  }

	  if (!lhq.compatiblyIncludes(rhq)) {
	    // Treat address-space mismatches as fatal. TODO: address subspaces
	    if (!lhq.isAddressSpaceSupersetOf(rhq))
	      ConvTy = Sema::IncompatiblePointerDiscardsQualifiers;

	    // It's okay to add or remove GC or lifetime qualifiers when converting to
	    // and from void*.
	    else if (lhq.withoutObjCGCAttr().withoutObjCLifetime()
	                 .compatiblyIncludes(
	                     rhq.withoutObjCGCAttr().withoutObjCLifetime())
	             && (lhptee->isVoidType() || rhptee->isVoidType()))
	      ; // keep old

	    // Treat lifetime mismatches as fatal.
	    else if (lhq.getObjCLifetime() != rhq.getObjCLifetime())
	      ConvTy = Sema::IncompatiblePointerDiscardsQualifiers;

	    // For GCC/MS compatibility, other qualifier mismatches are treated
	    // as still compatible in C.
	    else ConvTy = Sema::CompatiblePointerDiscardsQualifiers;
	  }

	  // C99 6.5.16.1p1 (constraint 4): If one operand is a pointer to an object or
	  // incomplete type and the other is a pointer to a qualified or unqualified
	  // version of void...
	  if (lhptee->isVoidType()) {
	    if (rhptee->isIncompleteOrObjectType())
	      return ConvTy;

	    // As an extension, we allow cast to/from void* to function pointer.
	    assert(rhptee->isFunctionType());
	    return Sema::FunctionVoidPointer;
	  }

	  if (rhptee->isVoidType()) {
	    if (lhptee->isIncompleteOrObjectType())
	      return ConvTy;

	    // As an extension, we allow cast to/from void* to function pointer.
	    assert(lhptee->isFunctionType());
	    return Sema::FunctionVoidPointer;
	  }

	  // C99 6.5.16.1p1 (constraint 3): both operands are pointers to qualified or
	  // unqualified versions of compatible types, ...
	  QualType ltrans = QualType(lhptee, 0), rtrans = QualType(rhptee, 0);
	  if (!S.Context.typesAreCompatible(ltrans, rtrans)) {
	    // Check if the pointee types are compatible ignoring the sign.
	    // We explicitly check for char so that we catch "char" vs
	    // "unsigned char" on systems where "char" is unsigned.
	    if (lhptee->isCharType())
	      ltrans = S.Context.UnsignedCharTy;
	    else if (lhptee->hasSignedIntegerRepresentation())
	      ltrans = S.Context.getCorrespondingUnsignedType(ltrans);

	    if (rhptee->isCharType())
	      rtrans = S.Context.UnsignedCharTy;
	    else if (rhptee->hasSignedIntegerRepresentation())
	      rtrans = S.Context.getCorrespondingUnsignedType(rtrans);

	    if (ltrans == rtrans) {
	      // Types are compatible ignoring the sign. Qualifier incompatibility
	      // takes priority over sign incompatibility because the sign
	      // warning can be disabled.
	      if (ConvTy != Sema::Compatible)
	        return ConvTy;

	      return Sema::IncompatiblePointerSign;
	    }

	    // If we are a multi-level pointer, it's possible that our issue is simply
	    // one of qualification - e.g. char ** -> const char ** is not allowed. If
	    // the eventual target type is the same and the pointers have the same
	    // level of indirection, this must be the issue.
	    if (isa<PointerType>(lhptee) && isa<PointerType>(rhptee)) {
	      do {
	        lhptee = cast<PointerType>(lhptee)->getPointeeType().getTypePtr();
	        rhptee = cast<PointerType>(rhptee)->getPointeeType().getTypePtr();
	      } while (isa<PointerType>(lhptee) && isa<PointerType>(rhptee));

	      if (lhptee == rhptee)
	        return Sema::IncompatibleNestedPointerQualifiers;
	    }

	    // General pointer incompatibility takes priority over qualifiers.
	    return Sema::IncompatiblePointer;
	  }
	  // In C, pointee types that are compatible but still differ by a function
	  // conversion are reported as incompatible pointers.
	  if (!S.getLangOpts().CPlusPlus &&
	      S.IsFunctionConversion(ltrans, rtrans, ltrans))
	    return Sema::IncompatiblePointer;
	  return ConvTy;
	}

	/// checkBlockPointerTypesForAssignment - Decide whether a value of block
	/// pointer type RHSType may be assigned to an object of block pointer type
	/// LHSType. This is stricter than the corresponding check for function
	/// pointers: in C++ it always reports IncompatibleBlockPointer, and elsewhere
	/// any difference in the pointees' local qualifiers is reported.
	///
	/// Both types must be canonical BlockPointerTypes.
	static Sema::AssignConvertType
	checkBlockPointerTypesForAssignment(Sema &S, QualType LHSType,
	                                    QualType RHSType) {
	  assert(LHSType.isCanonical() && "LHS not canonicalized!");
	  assert(RHSType.isCanonical() && "RHS not canonicalized!");

	  // Fetch the pointee types (top-level qualifiers on the block pointers
	  // themselves are ignored).
	  QualType LPointee = cast<BlockPointerType>(LHSType)->getPointeeType();
	  QualType RPointee = cast<BlockPointerType>(RHSType)->getPointeeType();

	  // In C++, the types have to match exactly.
	  if (S.getLangOpts().CPlusPlus)
	    return Sema::IncompatibleBlockPointer;

	  const bool IsOpenCL = S.getLangOpts().OpenCL;

	  // For blocks we enforce that qualifiers are identical; OpenCL disregards
	  // the address space for this comparison.
	  Qualifiers LQuals = LPointee.getLocalQualifiers();
	  Qualifiers RQuals = RPointee.getLocalQualifiers();
	  if (IsOpenCL) {
	    LQuals.removeAddressSpace();
	    RQuals.removeAddressSpace();
	  }

	  // FIXME: OpenCL doesn't define the exact compile time semantics for a block
	  // assignment.
	  // The current behavior is similar to C++ lambdas. A block might be
	  // assigned to a variable iff its return type and parameters are compatible
	  // (C99 6.2.7) with the corresponding return type and parameters of the LHS of
	  // an assignment. Presumably it should behave in way that a function pointer
	  // assignment does in C, so for each parameter and return type:
	  //  * CVR and address space of LHS should be a superset of CVR and address
	  //    space of RHS.
	  //  * unqualified types should be compatible.
	  QualType LCompare = LHSType;
	  QualType RCompare = RHSType;
	  if (IsOpenCL) {
	    LCompare =
	        S.Context.getQualifiedType(LHSType.getUnqualifiedType(), LQuals);
	    RCompare =
	        S.Context.getQualifiedType(RHSType.getUnqualifiedType(), RQuals);
	  }
	  if (!S.Context.typesAreBlockPointerCompatible(LCompare, RCompare))
	    return Sema::IncompatibleBlockPointer;

	  // Compatible block types: the only thing left to report is a qualifier
	  // mismatch on the pointees.
	  return LQuals != RQuals ? Sema::CompatiblePointerDiscardsQualifiers
	                          : Sema::Compatible;
	}

	/// checkObjCPointerTypesForAssignment - Compares two objective-c pointer types
	/// for assignment compatibility.
	///
	/// Both LHSType and RHSType must be canonical. Builtin ObjC types on either
	/// side are handled first; otherwise the pointees' qualifiers and then the
	/// full types are compared.
	static Sema::AssignConvertType
	checkObjCPointerTypesForAssignment(Sema &S, QualType LHSType,
	                                   QualType RHSType) {
	  assert(LHSType.isCanonical() && "LHS was not canonicalized!");
	  assert(RHSType.isCanonical() && "RHS was not canonicalized!");

	  if (LHSType->isObjCBuiltinType()) {
	    // Class is not compatible with ObjC object pointers.
	    if (LHSType->isObjCClassType() && !RHSType->isObjCBuiltinType() &&
	        !RHSType->isObjCQualifiedClassType())
	      return Sema::IncompatiblePointer;
	    return Sema::Compatible;
	  }
	  if (RHSType->isObjCBuiltinType()) {
	    // Same rule as above with the roles of LHS and RHS exchanged.
	    if (RHSType->isObjCClassType() && !LHSType->isObjCBuiltinType() &&
	        !LHSType->isObjCQualifiedClassType())
	      return Sema::IncompatiblePointer;
	    return Sema::Compatible;
	  }
	  QualType lhptee = LHSType->getAs<ObjCObjectPointerType>()->getPointeeType();
	  QualType rhptee = RHSType->getAs<ObjCObjectPointerType>()->getPointeeType();

	  if (!lhptee.isAtLeastAsQualifiedAs(rhptee) &&
	      // make an exception for id<P>
	      !LHSType->isObjCQualifiedIdType())
	    return Sema::CompatiblePointerDiscardsQualifiers;

	  if (S.Context.typesAreCompatible(LHSType, RHSType))
	    return Sema::Compatible;
	  if (LHSType->isObjCQualifiedIdType() || RHSType->isObjCQualifiedIdType())
	    return Sema::IncompatibleObjCQualifiedId;
	  return Sema::IncompatiblePointer;
	}

	/// Type-only convenience overload: checks whether a value of type RHSType
	/// could be assigned to an object of type LHSType, without a real RHS
	/// expression. Loc is used as the location of the placeholder expression.
	Sema::AssignConvertType
	Sema::CheckAssignmentConstraints(SourceLocation Loc,
	                                 QualType LHSType, QualType RHSType) {
	  // Fake up an opaque expression.  We don't actually care about what
	  // cast operations are required, so if CheckAssignmentConstraints
	  // adds casts to this they'll be wasted, but fortunately that doesn't
	  // usually happen on valid code.
	  OpaqueValueExpr RHSExpr(Loc, RHSType, VK_RValue);
	  ExprResult RHSPtr = &RHSExpr;
	  CastKind K = CK_Invalid;

	  return CheckAssignmentConstraints(LHSType, RHSPtr, K, /*ConvertRHS=*/false);
	}

	/// CheckAssignmentConstraints (C99 6.5.16) - This routine currently
	/// has code to accommodate several GCC extensions when type checking
	/// pointers. Here are some objectionable examples that GCC considers warnings:
	///
	///  int a, *pint;
	///  short *pshort;
	///  struct foo *pfoo;
	///
	///  pint = pshort; // warning: assignment from incompatible pointer type
	///  a = pint; // warning: assignment makes integer from pointer without a cast
	///  pint = a; // warning: assignment makes pointer from integer without a cast
	///  pint = pfoo; // warning: assignment from incompatible pointer type
	///
	/// As a result, the code for dealing with pointers is more complex than the
	/// C99 spec dictates.
	///
	/// Sets 'Kind' for any result kind except Incompatible. (Note that the plain
	/// arithmetic path only assigns 'Kind' when ConvertRHS is true.)
	///
	/// \param LHSType the type being assigned to; canonicalized and unqualified
	///        internally before comparison.
	/// \param RHS the value being assigned; may be replaced by a converted
	///        expression on some paths.
	/// \param Kind out-parameter receiving the cast kind needed for the
	///        conversion.
	/// \param ConvertRHS when false, most paths leave RHS untouched; the
	///        single-element-vector-to-scalar lax conversion below still
	///        rewrites RHS regardless.
	Sema::AssignConvertType
	Sema::CheckAssignmentConstraints(QualType LHSType, ExprResult &RHS,
	                                 CastKind &Kind, bool ConvertRHS) {
	  QualType RHSType = RHS.get()->getType();
	  QualType OrigLHSType = LHSType;

	  // Get canonical types.  We're not formatting these types, just comparing
	  // them.
	  LHSType = Context.getCanonicalType(LHSType).getUnqualifiedType();
	  RHSType = Context.getCanonicalType(RHSType).getUnqualifiedType();

	  // Common case: no conversion required.
	  if (LHSType == RHSType) {
	    Kind = CK_NoOp;
	    return Compatible;
	  }

	  // If we have an atomic type, try a non-atomic assignment, then just add an
	  // atomic qualification step.
	  if (const AtomicType *AtomicTy = dyn_cast<AtomicType>(LHSType)) {
	    Sema::AssignConvertType result =
	        CheckAssignmentConstraints(AtomicTy->getValueType(), RHS, Kind);
	    if (result != Compatible)
	      return result;
	    if (Kind != CK_NoOp && ConvertRHS)
	      RHS = ImpCastExprToType(RHS.get(), AtomicTy->getValueType(), Kind);
	    Kind = CK_NonAtomicToAtomic;
	    return Compatible;
	  }

	  // If the left-hand side is a reference type, then we are in a
	  // (rare!) case where we've allowed the use of references in C,
	  // e.g., as a parameter type in a built-in function. In this case,
	  // just make sure that the type referenced is compatible with the
	  // right-hand side type. The caller is responsible for adjusting
	  // LHSType so that the resulting expression does not have reference
	  // type.
	  if (const ReferenceType *LHSTypeRef = LHSType->getAs<ReferenceType>()) {
	    if (Context.typesAreCompatible(LHSTypeRef->getPointeeType(), RHSType)) {
	      Kind = CK_LValueBitCast;
	      return Compatible;
	    }
	    return Incompatible;
	  }

	  // Allow scalar to ExtVector assignments, and assignments of an ExtVector type
	  // to the same ExtVector type.
	  if (LHSType->isExtVectorType()) {
	    // Identical ExtVector types were already accepted by the common case
	    // above, so reaching here with an ExtVector RHS means the types differ.
	    if (RHSType->isExtVectorType())
	      return Incompatible;
	    if (RHSType->isArithmeticType()) {
	      // CK_VectorSplat does T -> vector T, so first cast to the element type.
	      if (ConvertRHS)
	        RHS = prepareVectorSplat(LHSType, RHS.get());
	      Kind = CK_VectorSplat;
	      return Compatible;
	    }
	  }

	  // Conversions to or from vector type.
	  if (LHSType->isVectorType() || RHSType->isVectorType()) {
	    if (LHSType->isVectorType() && RHSType->isVectorType()) {
	      // Allow assignments of an AltiVec vector type to an equivalent GCC
	      // vector type and vice versa
	      if (Context.areCompatibleVectorTypes(LHSType, RHSType)) {
	        Kind = CK_BitCast;
	        return Compatible;
	      }

	      // If we are allowing lax vector conversions, and LHS and RHS are both
	      // vectors, the total size only needs to be the same. This is a bitcast;
	      // no bits are changed but the result type is different.
	      if (isLaxVectorConversion(RHSType, LHSType)) {
	        Kind = CK_BitCast;
	        return IncompatibleVectors;
	      }
	    }

	    // When the RHS comes from another lax conversion (e.g. binops between
	    // scalars and vectors) the result is canonicalized as a vector. When the
	    // LHS is also a vector, the lax is allowed by the condition above. Handle
	    // the case where LHS is a scalar.
	    if (LHSType->isScalarType()) {
	      const VectorType *VecType = RHSType->getAs<VectorType>();
	      if (VecType && VecType->getNumElements() == 1 &&
	          isLaxVectorConversion(RHSType, LHSType)) {
	        ExprResult *VecExpr = &RHS;
	        *VecExpr = ImpCastExprToType(VecExpr->get(), LHSType, CK_BitCast);
	        Kind = CK_BitCast;
	        return Compatible;
	      }
	    }

	    return Incompatible;
	  }

	  // Diagnose attempts to convert between __float128 and long double where
	  // such conversions currently can't be handled.
	  if (unsupportedTypeConversion(*this, LHSType, RHSType))
	    return Incompatible;

	  // Arithmetic conversions.
	  if (LHSType->isArithmeticType() && RHSType->isArithmeticType() &&
	      !(getLangOpts().CPlusPlus && LHSType->isEnumeralType())) {
	    if (ConvertRHS)
	      Kind = PrepareScalarCast(RHS, LHSType);
	    return Compatible;
	  }

	  // Conversions to normal pointers.
	  if (const PointerType *LHSPointer = dyn_cast<PointerType>(LHSType)) {
	    // U* -> T*
	    if (isa<PointerType>(RHSType)) {
	      unsigned AddrSpaceL = LHSPointer->getPointeeType().getAddressSpace();
	      unsigned AddrSpaceR = RHSType->getPointeeType().getAddressSpace();
	      Kind = AddrSpaceL != AddrSpaceR ? CK_AddressSpaceConversion : CK_BitCast;
	      return checkPointerTypesForAssignment(*this, LHSType, RHSType);
	    }

	    // int -> T*
	    if (RHSType->isIntegerType()) {
	      Kind = CK_IntegralToPointer; // FIXME: null?
	      return IntToPointer;
	    }

	    // C pointers are not compatible with ObjC object pointers,
	    // with two exceptions:
	    if (isa<ObjCObjectPointerType>(RHSType)) {
	      //  - conversions to void*
	      if (LHSPointer->getPointeeType()->isVoidType()) {
	        Kind = CK_BitCast;
	        return Compatible;
	      }

	      //  - conversions from 'Class' to the redefinition type
	      if (RHSType->isObjCClassType() &&
	          Context.hasSameType(LHSType,
	                              Context.getObjCClassRedefinitionType())) {
	        Kind = CK_BitCast;
	        return Compatible;
	      }

	      Kind = CK_BitCast;
	      return IncompatiblePointer;
	    }

	    // U^ -> void*
	    if (RHSType->getAs<BlockPointerType>()) {
	      if (LHSPointer->getPointeeType()->isVoidType()) {
	        unsigned AddrSpaceL = LHSPointer->getPointeeType().getAddressSpace();
	        unsigned AddrSpaceR = RHSType->getAs<BlockPointerType>()
	                                  ->getPointeeType()
	                                  .getAddressSpace();
	        Kind =
	            AddrSpaceL != AddrSpaceR ? CK_AddressSpaceConversion : CK_BitCast;
	        return Compatible;
	      }
	    }

	    return Incompatible;
	  }

	  // Conversions to block pointers.
	  if (isa<BlockPointerType>(LHSType)) {
	    // U^ -> T^
	    if (RHSType->isBlockPointerType()) {
	      unsigned AddrSpaceL = LHSType->getAs<BlockPointerType>()
	                                ->getPointeeType()
	                                .getAddressSpace();
	      unsigned AddrSpaceR = RHSType->getAs<BlockPointerType>()
	                                ->getPointeeType()
	                                .getAddressSpace();
	      Kind = AddrSpaceL != AddrSpaceR ? CK_AddressSpaceConversion : CK_BitCast;
	      return checkBlockPointerTypesForAssignment(*this, LHSType, RHSType);
	    }

	    // int or null -> T^
	    if (RHSType->isIntegerType()) {
	      Kind = CK_IntegralToPointer; // FIXME: null
	      return IntToBlockPointer;
	    }

	    // id -> T^
	    if (getLangOpts().ObjC1 && RHSType->isObjCIdType()) {
	      Kind = CK_AnyPointerToBlockPointerCast;
	      return Compatible;
	    }

	    // void* -> T^
	    if (const PointerType *RHSPT = RHSType->getAs<PointerType>())
	      if (RHSPT->getPointeeType()->isVoidType()) {
	        Kind = CK_AnyPointerToBlockPointerCast;
	        return Compatible;
	      }

	    return Incompatible;
	  }

	  // Conversions to Objective-C pointers.
	  if (isa<ObjCObjectPointerType>(LHSType)) {
	    // A* -> B*
	    if (RHSType->isObjCObjectPointerType()) {
	      Kind = CK_BitCast;
	      Sema::AssignConvertType result =
	          checkObjCPointerTypesForAssignment(*this, LHSType, RHSType);
	      // The weak-conversion check needs the caller's original (qualified)
	      // LHS type, not the canonical unqualified one.
	      if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() &&
	          result == Compatible &&
	          !CheckObjCARCUnavailableWeakConversion(OrigLHSType, RHSType))
	        result = IncompatibleObjCWeakRef;
	      return result;
	    }

	    // int or null -> A*
	    if (RHSType->isIntegerType()) {
	      Kind = CK_IntegralToPointer; // FIXME: null
	      return IntToPointer;
	    }

	    // In general, C pointers are not compatible with ObjC object pointers,
	    // with two exceptions:
	    if (isa<PointerType>(RHSType)) {
	      Kind = CK_CPointerToObjCPointerCast;

	      //  - conversions from 'void*'
	      if (RHSType->isVoidPointerType()) {
	        return Compatible;
	      }

	      //  - conversions to 'Class' from its redefinition type
	      if (LHSType->isObjCClassType() &&
	          Context.hasSameType(RHSType,
	                              Context.getObjCClassRedefinitionType())) {
	        return Compatible;
	      }

	      return IncompatiblePointer;
	    }

	    // Only under strict condition T^ is compatible with an Objective-C pointer.
	    if (RHSType->isBlockPointerType() &&
	        LHSType->isBlockCompatibleObjCPointerType(Context)) {
	      if (ConvertRHS)
	        maybeExtendBlockObject(RHS);
	      Kind = CK_BlockPointerToObjCPointerCast;
	      return Compatible;
	    }

	    return Incompatible;
	  }

	  // Conversions from pointers that are not covered by the above.
	  if (isa<PointerType>(RHSType)) {
	    // T* -> _Bool
	    if (LHSType == Context.BoolTy) {
	      Kind = CK_PointerToBoolean;
	      return Compatible;
	    }

	    // T* -> int
	    if (LHSType->isIntegerType()) {
	      Kind = CK_PointerToIntegral;
	      return PointerToInt;
	    }

	    return Incompatible;
	  }

	  // Conversions from Objective-C pointers that are not covered by the above.
	  if (isa<ObjCObjectPointerType>(RHSType)) {
	    // T* -> _Bool
	    if (LHSType == Context.BoolTy) {
	      Kind = CK_PointerToBoolean;
	      return Compatible;
	    }

	    // T* -> int
	    if (LHSType->isIntegerType()) {
	      Kind = CK_PointerToIntegral;
	      return PointerToInt;
	    }

	    return Incompatible;
	  }

	  // struct A -> struct B
	  if (isa<TagType>(LHSType) && isa<TagType>(RHSType)) {
	    if (Context.typesAreCompatible(LHSType, RHSType)) {
	      Kind = CK_NoOp;
	      return Compatible;
	    }
	  }

	  if (LHSType->isSamplerT() && RHSType->isIntegerType()) {
	    Kind = CK_IntToOCLSampler;
	    return Compatible;
	  }

	  return Incompatible;
	}

	/// \brief Wraps the expression held in \p EResult in a value of the
	/// transparent union type \p UnionType, initializing member \p Field.
	///
	/// On return \p EResult holds a compound literal of \p UnionType whose
	/// initializer list contains the original expression. \p S is not used by
	/// the body; all nodes are allocated in \p C.
	static void ConstructTransparentUnion(Sema &S, ASTContext &C,
	                                      ExprResult &EResult, QualType UnionType,
	                                      FieldDecl *Field) {
	  // Step 1: a one-element initializer list that designates the chosen
	  // member of the transparent union.
	  Expr *Init = EResult.get();
	  InitListExpr *UnionInit =
	      new (C) InitListExpr(C, SourceLocation(), Init, SourceLocation());
	  UnionInit->setType(UnionType);
	  UnionInit->setInitializedFieldInUnion(Field);

	  // Step 2: a compound literal of the union type built from that list,
	  // which replaces the caller's expression.
	  TypeSourceInfo *TInfo = C.getTrivialTypeSourceInfo(UnionType);
	  EResult = new (C) CompoundLiteralExpr(SourceLocation(), TInfo, UnionType,
	                                        VK_RValue, UnionInit, false);
	}

	/// Checks whether \p RHS can be passed where a transparent union of type
	/// \p ArgType is expected (the GCC transparent_union extension).
	///
	/// The union's fields are tried in declaration order and the first one that
	/// accepts \p RHS wins. On success \p RHS is rewritten to a compound literal
	/// of the union type and Compatible is returned; otherwise Incompatible is
	/// returned. Note that \p RHS may already have been cast to a field type by
	/// the time a match is found.
	Sema::AssignConvertType
	Sema::CheckTransparentUnionArgumentConstraints(QualType ArgType,
	                                               ExprResult &RHS) {
	  QualType RHSType = RHS.get()->getType();

	  // If the ArgType is a Union type, we want to handle a potential
	  // transparent_union GCC extension.
	  const RecordType *UT = ArgType->getAsUnionType();
	  if (!UT || !UT->getDecl()->hasAttr<TransparentUnionAttr>())
	    return Incompatible;

	  // The field to initialize within the transparent union.
	  RecordDecl *UD = UT->getDecl();
	  FieldDecl *InitField = nullptr;
	  // It's compatible if the expression matches any of the fields.
	  for (auto *it : UD->fields()) {
	    if (it->getType()->isPointerType()) {
	      // If the transparent union contains a pointer type, we allow:
	      // 1) void pointer
	      // 2) null pointer constant
	      if (RHSType->isPointerType())
	        if (RHSType->castAs<PointerType>()->getPointeeType()->isVoidType()) {
	          RHS = ImpCastExprToType(RHS.get(), it->getType(), CK_BitCast);
	          InitField = it;
	          break;
	        }

	      if (RHS.get()->isNullPointerConstant(Context,
	                                           Expr::NPC_ValueDependentIsNull)) {
	        RHS = ImpCastExprToType(RHS.get(), it->getType(),
	                                CK_NullToPointer);
	        InitField = it;
	        break;
	      }
	    }

	    // Otherwise fall back to the ordinary assignment rules for this field;
	    // only a fully Compatible result selects it.
	    CastKind Kind = CK_Invalid;
	    if (CheckAssignmentConstraints(it->getType(), RHS, Kind)
	          == Compatible) {
	      RHS = ImpCastExprToType(RHS.get(), it->getType(), Kind);
	      InitField = it;
	      break;
	    }
	  }

	  if (!InitField)
	    return Incompatible;

	  ConstructTransparentUnion(*this, Context, RHS, ArgType, InitField);
	  return Compatible;
	}

	/// Checks the constraints for assigning \p CallerRHS to an object of type
	/// \p LHSType and, when requested, converts the right-hand side.
	///
	/// \param LHSType the type being assigned to.
	/// \param CallerRHS the right-hand side; only updated in place when
	///        \p ConvertRHS is true.
	/// \param Diagnose whether diagnostics may be emitted. Requires
	///        \p ConvertRHS (asserted below).
	/// \param DiagnoseCFAudited forwarded to CheckObjCConversion.
	/// \param ConvertRHS whether the caller's RHS should be replaced by the
	///        converted expression. When false, all modifications go to a local
	///        copy instead.
	/// \return the AssignConvertType describing the conversion, or Incompatible
	///         if no conversion is possible.
	Sema::AssignConvertType
	Sema::CheckSingleAssignmentConstraints(QualType LHSType, ExprResult &CallerRHS,
	                                       bool Diagnose,
	                                       bool DiagnoseCFAudited,
	                                       bool ConvertRHS) {
	  // We need to be able to tell the caller whether we diagnosed a problem, if
	  // they ask us to issue diagnostics.
	  assert((ConvertRHS || !Diagnose) && "can't indicate whether we diagnosed");

	  // If ConvertRHS is false, we want to leave the caller's RHS untouched. Sadly,
	  // we can't avoid all modifications at the moment, so we need somewhere
	  // to put the updated value.
	  ExprResult LocalRHS = CallerRHS;
	  ExprResult &RHS = ConvertRHS ? CallerRHS : LocalRHS;

	  if (getLangOpts().CPlusPlus) {
	    if (!LHSType->isRecordType() && !LHSType->isAtomicType()) {
	      // C++ 5.17p3: If the left operand is not of class type, the
	      // expression is implicitly converted (C++ 4) to the
	      // cv-unqualified type of the left operand.
	      QualType RHSType = RHS.get()->getType();
	      if (Diagnose) {
	        RHS = PerformImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(),
	                                        AA_Assigning);
	      } else {
	        // Probe for a conversion first so that a failure is reported as
	        // Incompatible rather than through PerformImplicitConversion.
	        ImplicitConversionSequence ICS =
	            TryImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(),
	                                  /*SuppressUserConversions=*/false,
	                                  /*AllowExplicit=*/false,
	                                  /*InOverloadResolution=*/false,
	                                  /*CStyle=*/false,
	                                  /*AllowObjCWritebackConversion=*/false);
	        if (ICS.isFailure())
	          return Incompatible;
	        RHS = PerformImplicitConversion(RHS.get(), LHSType.getUnqualifiedType(),
	                                        ICS, AA_Assigning);
	      }
	      if (RHS.isInvalid())
	        return Incompatible;
	      Sema::AssignConvertType result = Compatible;
	      if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() &&
	          !CheckObjCARCUnavailableWeakConversion(LHSType, RHSType))
	        result = IncompatibleObjCWeakRef;
	      return result;
	    }

	    // FIXME: Currently, we fall through and treat C++ classes like C
	    // structures.
	    // FIXME: We also fall through for atomics; not sure what should
	    // happen there, though.
	  } else if (RHS.get()->getType() == Context.OverloadTy) {
	    // As a set of extensions to C, we support overloading on functions. These
	    // functions need to be resolved here.
	    DeclAccessPair DAP;
	    if (FunctionDecl *FD = ResolveAddressOfOverloadedFunction(
	            RHS.get(), LHSType, /*Complain=*/false, DAP))
	      RHS = FixOverloadedFunctionReference(RHS.get(), DAP, FD);
	    else
	      return Incompatible;
	  }

	  // C99 6.5.16.1p1: the left operand is a pointer and the right is
	  // a null pointer constant.
	  if ((LHSType->isPointerType() || LHSType->isObjCObjectPointerType() ||
	       LHSType->isBlockPointerType()) &&
	      RHS.get()->isNullPointerConstant(Context,
	                                       Expr::NPC_ValueDependentIsNull)) {
	    if (Diagnose || ConvertRHS) {
	      CastKind Kind;
	      CXXCastPath Path;
	      CheckPointerConversion(RHS.get(), LHSType, Kind, Path,
	                             /*IgnoreBaseAccess=*/false, Diagnose);
	      if (ConvertRHS)
	        RHS = ImpCastExprToType(RHS.get(), LHSType, Kind, VK_RValue, &Path);
	    }
	    return Compatible;
	  }

	  // This check seems unnatural, however it is necessary to ensure the proper
	  // conversion of functions/arrays. If the conversion were done for all
	  // DeclExpr's (created by ActOnIdExpression), it would mess up the unary
	  // expressions that suppress this implicit conversion (&, sizeof).
	  //
	  // Suppress this for references: C++ 8.5.3p5.
	  if (!LHSType->isReferenceType()) {
	    // FIXME: We potentially allocate here even if ConvertRHS is false.
	    RHS = DefaultFunctionArrayLvalueConversion(RHS.get(), Diagnose);
	    if (RHS.isInvalid())
	      return Incompatible;
	  }

	  // Warn when the RHS is an @protocol expression naming a protocol that has
	  // no definition.
	  Expr *PRE = RHS.get()->IgnoreParenCasts();
	  if (Diagnose && isa<ObjCProtocolExpr>(PRE)) {
	    ObjCProtocolDecl *PDecl = cast<ObjCProtocolExpr>(PRE)->getProtocol();
	    if (PDecl && !PDecl->hasDefinition()) {
	      Diag(PRE->getExprLoc(), diag::warn_atprotocol_protocol) << PDecl->getName();
	      Diag(PDecl->getLocation(), diag::note_entity_declared_at) << PDecl;
	    }
	  }

	  CastKind Kind = CK_Invalid;
	  Sema::AssignConvertType result =
	      CheckAssignmentConstraints(LHSType, RHS, Kind, ConvertRHS);

	  // C99 6.5.16.1p2: The value of the right operand is converted to the
	  // type of the assignment expression.
	  // CheckAssignmentConstraints allows the left-hand side to be a reference,
	  // so that we can use references in built-in functions even in C.
	  // The getNonReferenceType() call makes sure that the resulting expression
	  // does not have reference type.
	  if (result != Incompatible && RHS.get()->getType() != LHSType) {
	    QualType Ty = LHSType.getNonLValueExprType(Context);
	    Expr *E = RHS.get();

	    // Check for various Objective-C errors. If we are not reporting
	    // diagnostics and just checking for errors, e.g., during overload
	    // resolution, return Incompatible to indicate the failure.
	    if (getLangOpts().allowsNonTrivialObjCLifetimeQualifiers() &&
	        CheckObjCConversion(SourceRange(), Ty, E, CCK_ImplicitConversion,
	                            Diagnose, DiagnoseCFAudited) != ACR_okay) {
	      if (!Diagnose)
	        return Incompatible;
	    }
	    if (getLangOpts().ObjC1 &&
	        (CheckObjCBridgeRelatedConversions(E->getLocStart(), LHSType,
	                                           E->getType(), E, Diagnose) ||
	         ConversionToObjCStringLiteralCheck(LHSType, E, Diagnose))) {
	      if (!Diagnose)
	        return Incompatible;
	      // Replace the expression with a corrected version and continue so we
	      // can find further errors.
	      RHS = E;
	      return Compatible;
	    }

	    if (ConvertRHS)
	      RHS = ImpCastExprToType(E, Ty, Kind);
	  }
	  return result;
	}

	/// Emits err_typecheck_invalid_operands at \p Loc, reporting the types and
	/// source ranges of both operands, and returns a null QualType.
	QualType Sema::InvalidOperands(SourceLocation Loc, ExprResult &LHS,
	                               ExprResult &RHS) {
	  Expr *LHSExpr = LHS.get();
	  Expr *RHSExpr = RHS.get();

	  Diag(Loc, diag::err_typecheck_invalid_operands)
	      << LHSExpr->getType() << RHSExpr->getType()
	      << LHSExpr->getSourceRange() << RHSExpr->getSourceRange();

	  return QualType();
	}

	// Diagnose invalid operands of a logical operator involving vectors.
	//
	// Operand types are examined with implicit casts stripped. If only one
	// operand is natively a vector (the other was implicitly converted), the
	// diagnostic uses selector 0 and names the vector type and the underlying
	// non-vector type. If both operands are natively vectors, selector 1 is
	// used with both types. Always returns a null QualType.
	QualType Sema::InvalidLogicalVectorOperands(SourceLocation Loc, ExprResult &LHS,
	                                            ExprResult &RHS) {
	  QualType LHSType = LHS.get()->IgnoreImpCasts()->getType();
	  QualType RHSType = RHS.get()->IgnoreImpCasts()->getType();

	  const bool LHSIsNativeVec = LHSType->isVectorType();
	  const bool RHSIsNativeVec = RHSType->isVectorType();

	  // Both operands were written as vectors.
	  if (LHSIsNativeVec && RHSIsNativeVec) {
	    Diag(Loc, diag::err_typecheck_logical_vector_expr_gnu_cpp_restrict)
	        << 1 << LHSType << RHSType << LHS.get()->getSourceRange()
	        << RHS.get()->getSourceRange();
	    return QualType();
	  }

	  // At most one operand is natively a vector; when the LHS is not, the RHS
	  // is treated as the vector side.
	  Expr *Vector;
	  Expr *NonVector;
	  if (LHSIsNativeVec) {
	    Vector = LHS.get();
	    NonVector = RHS.get();
	  } else {
	    Vector = RHS.get();
	    NonVector = LHS.get();
	  }

	  Diag(Loc, diag::err_typecheck_logical_vector_expr_gnu_cpp_restrict)
	      << 0 << Vector->getType() << NonVector->IgnoreImpCasts()->getType()
	      << Vector->getSourceRange();
	  return QualType();
	}

	/// Try to convert a value of non-vector type to a vector type by converting
	/// the type to the element type of the vector and then performing a splat.
	/// If the language is OpenCL, we only use conversions that promote scalar
	/// rank; for C, Obj-C, and C++ we allow any real scalar conversion except
	/// for float->int.
	///
	/// OpenCL V2.0 6.2.6.p2:
	/// An error shall occur if any scalar operand type has greater rank
	/// than the type of the vector element.
	///
	/// \param S the semantic analyzer used for type queries and casts.
	/// \param scalar if non-null, actually perform the conversions on this
	///        expression; if null, only check whether they would be allowed.
	/// \param scalarTy type of the scalar operand.
	/// \param vectorEltTy element type of the vector operand.
	/// \param vectorTy the vector type to splat to.
	/// \param DiagID set to
	///        err_opencl_scalar_type_rank_greater_than_vector_type when the
	///        OpenCL rank rule rejects the conversion; otherwise left untouched.
	/// \return true if the operation fails (but without diagnosing the failure)
	static bool tryVectorConvertAndSplat(Sema &S, ExprResult *scalar,
	                                     QualType scalarTy,
	                                     QualType vectorEltTy,
	                                     QualType vectorTy,
	                                     unsigned &DiagID) {
	  // The conversion to apply to the scalar before splatting it,
	  // if necessary.
	  CastKind scalarCast = CK_Invalid;

	  if (vectorEltTy->isIntegralType(S.Context)) {
	    // OpenCL: a floating scalar, or an integer scalar of greater rank than
	    // the element type, may not be converted.
	    if (S.getLangOpts().OpenCL &&
	        (scalarTy->isRealFloatingType() ||
	         (scalarTy->isIntegerType() &&
	          S.Context.getIntegerTypeOrder(vectorEltTy, scalarTy) < 0))) {
	      DiagID = diag::err_opencl_scalar_type_rank_greater_than_vector_type;
	      return true;
	    }
	    if (!scalarTy->isIntegralType(S.Context))
	      return true;
	    scalarCast = CK_IntegralCast;
	  } else if (vectorEltTy->isRealFloatingType()) {
	    if (scalarTy->isRealFloatingType()) {
	      if (S.getLangOpts().OpenCL &&
	          S.Context.getFloatingTypeOrder(vectorEltTy, scalarTy) < 0) {
	        DiagID = diag::err_opencl_scalar_type_rank_greater_than_vector_type;
	        return true;
	      }
	      scalarCast = CK_FloatingCast;
	    } else if (scalarTy->isIntegralType(S.Context)) {
	      scalarCast = CK_IntegralToFloating;
	    } else {
	      return true;
	    }
	  } else {
	    // Neither an integral nor a real floating element type.
	    return true;
	  }

	  // Adjust scalar if desired: first to the element type, then splat.
	  if (scalar) {
	    if (scalarCast != CK_Invalid)
	      *scalar = S.ImpCastExprToType(scalar->get(), vectorEltTy, scalarCast);
	    *scalar = S.ImpCastExprToType(scalar->get(), vectorTy, CK_VectorSplat);
	  }
	  return false;
	}

	/// Decide whether converting the integer expression \p Int to the integer
	/// type \p OtherIntTy has to be rejected because it could lose precision.
	///
	/// Note the polarity: despite the name, the result is true when the
	/// conversion must be REJECTED and false when it is acceptable.
	///
	/// \param S the semantic analyzer, used for constant evaluation and type
	///        queries.
	/// \param Int the integer expression being converted.
	/// \param OtherIntTy the destination integer type.
	/// \return true if the conversion is rejected, false otherwise.
	static bool canConvertIntToOtherIntTy(Sema &S, ExprResult *Int,
	                                      QualType OtherIntTy) {
	  QualType IntTy = Int->get()->getType().getUnqualifiedType();

	  llvm::APSInt Value;
	  const bool IsConstant = Int->get()->EvaluateAsInt(Value, S.Context);
	  const int Order = S.Context.getIntegerTypeOrder(OtherIntTy, IntTy);
	  const bool IntSigned = IntTy->hasSignedIntegerRepresentation();
	  const bool OtherIntSigned = OtherIntTy->hasSignedIntegerRepresentation();

	  // An unknown value might be truncated, so a non-constant is rejected
	  // whenever its type orders above the destination type.
	  if (!IsConstant)
	    return Order < 0;

	  // Number of bits the constant actually needs: a negative value of signed
	  // type is measured including its sign bit, anything else by active bits.
	  unsigned NumBits;
	  if (IntSigned && Value.isNegative())
	    NumBits = Value.getMinSignedBits();
	  else
	    NumBits = Value.getActiveBits();

	  const unsigned OtherWidth = S.Context.getIntWidth(OtherIntTy);

	  // Higher-order source type whose constant does not fit the destination.
	  if (Order < 0 && OtherWidth < NumBits)
	    return true;

	  // Differing signedness and the constant does not fit the destination.
	  return IntSigned != OtherIntSigned && NumBits > OtherWidth;
	}

	/// Decide whether converting the integer expression \p Int to the floating
	/// point type \p FloatTy has to be rejected because it could lose precision.
	///
	/// Note the polarity: despite the name, the result is true when the
	/// conversion must be REJECTED and false when it is acceptable.
	///
	/// \param S the semantic analyzer, used for constant evaluation and type
	///        queries.
	/// \param Int the integer expression being converted.
	/// \param FloatTy the destination floating point type.
	/// \return true if the conversion is rejected, false otherwise.
	static bool canConvertIntTyToFloatTy(Sema &S, ExprResult *Int,
	                                     QualType FloatTy) {
	  QualType IntTy = Int->get()->getType().getUnqualifiedType();

	  llvm::APSInt Value;
	  if (!Int->get()->EvaluateAsInt(Value, S.Context)) {
	    // Not a constant: reject integer types whose size exceeds the precision
	    // (mantissa width) of the floating point type.
	    uint64_t Bits = S.Context.getTypeSize(IntTy);
	    unsigned FloatPrec = llvm::APFloat::semanticsPrecision(
	        S.Context.getFloatTypeSemantics(FloatTy));
	    return Bits > FloatPrec;
	  }

	  // Constant: reject it if a round trip through the floating point type does
	  // not reproduce the original value.
	  // FIXME: Ideally the conversion to an APFloat and from an APFloat
	  // could be avoided if there was a convertFromAPInt method
	  // which could signal back if implicit truncation occurred.
	  const bool IsSigned = IntTy->hasSignedIntegerRepresentation();

	  llvm::APFloat Float(S.Context.getFloatTypeSemantics(FloatTy));
	  Float.convertFromAPInt(Value, IsSigned, llvm::APFloat::rmTowardZero);

	  llvm::APSInt RoundTrip(S.Context.getIntWidth(IntTy), !IsSigned);
	  bool Ignored = false;
	  Float.convertToInteger(RoundTrip, llvm::APFloat::rmNearestTiesToEven,
	                         &Ignored);

	  return Value != RoundTrip;
	}

	/// Attempt to convert and splat Scalar into a vector whose types matches
	/// Vector following GCC conversion rules. The rule is that implicit
	/// conversion can occur when Scalar can be casted to match Vector's element
	/// type without causing truncation of Scalar.
	///
	/// \param S the semantic analyzer used for type queries and casts.
	/// \param Scalar the scalar operand; must be non-null. On success it is
	///        replaced by the converted and splatted expression.
	/// \param Vector the vector operand; must be non-null and must not have an
	///        ExtVectorType (asserted).
	/// \return true if the conversion is rejected (nothing is diagnosed here),
	///         false if Scalar was converted.
	static bool tryGCCVectorConvertAndSplat(Sema &S, ExprResult *Scalar,
	                                        ExprResult *Vector) {
	  QualType ScalarTy = Scalar->get()->getType().getUnqualifiedType();
	  QualType VectorTy = Vector->get()->getType().getUnqualifiedType();
	  const VectorType *VT = VectorTy->getAs<VectorType>();

	  assert(!isa<ExtVectorType>(VT) &&
	         "ExtVectorTypes should not be handled here!");

	  QualType VectorEltTy = VT->getElementType();

	  // Reject cases where the vector element type or the scalar element type are
	  // not integral or floating point types.
	  if (!VectorEltTy->isArithmeticType() || !ScalarTy->isArithmeticType())
	    return true;

	  // The conversion to apply to the scalar before splatting it,
	  // if necessary.
	  CastKind ScalarCast = CK_NoOp;

	  // Accept cases where the vector elements are integers and the scalar is
	  // an integer.
	  // FIXME: Notionally if the scalar was a floating point value with a precise
	  // integral representation, we could cast it to an appropriate integer
	  // type and then perform the rest of the checks here. GCC will perform
	  // this conversion in some cases as determined by the input language.
	  // We should accept it on a language independent basis.
	  if (VectorEltTy->isIntegralType(S.Context) &&
	      ScalarTy->isIntegralType(S.Context) &&
	      S.Context.getIntegerTypeOrder(VectorEltTy, ScalarTy)) {
	    // The helper returns true when the conversion has to be rejected.
	    if (canConvertIntToOtherIntTy(S, Scalar, VectorEltTy))
	      return true;

	    ScalarCast = CK_IntegralCast;
	  } else if (VectorEltTy->isRealFloatingType()) {
	    if (ScalarTy->isRealFloatingType()) {
	      // Reject cases where the scalar type is not a constant and has a higher
	      // Order than the vector element type.
	      llvm::APFloat Result(0.0);
	      bool CstScalar = Scalar->get()->EvaluateAsFloat(Result, S.Context);
	      int Order = S.Context.getFloatingTypeOrder(VectorEltTy, ScalarTy);
	      if (!CstScalar && Order < 0)
	        return true;

	      // If the scalar cannot be safely casted to the vector element type,
	      // reject it.
	      if (CstScalar) {
	        bool Truncated = false;
	        Result.convert(S.Context.getFloatTypeSemantics(VectorEltTy),
	                       llvm::APFloat::rmNearestTiesToEven, &Truncated);
	        if (Truncated)
	          return true;
	      }

	      ScalarCast = CK_FloatingCast;
	    } else if (ScalarTy->isIntegralType(S.Context)) {
	      // The helper returns true when the conversion has to be rejected.
	      if (canConvertIntTyToFloatTy(S, Scalar, VectorEltTy))
	        return true;

	      ScalarCast = CK_IntegralToFloating;
	    } else
	      return true;
	  }

	  // Adjust the scalar: convert to the element type if a cast was selected,
	  // then splat. Scalar has already been dereferenced above, so no null test
	  // is needed here.
	  if (ScalarCast != CK_NoOp)
	    *Scalar = S.ImpCastExprToType(Scalar->get(), VectorEltTy, ScalarCast);
	  *Scalar = S.ImpCastExprToType(Scalar->get(), VectorTy, CK_VectorSplat);
	  return false;
	}

	/// Type-check a binary operator where at least one operand has vector type
	/// (asserted below) and determine the result type.
	///
	/// The checks are tried in this order: identical types, compatible vector
	/// types, AltiVec bool/non-bool mixing, scalar-to-vector conversion with
	/// splat, and lax vector conversion. If none of them applies, a diagnostic
	/// is emitted.
	///
	/// \param LHS left operand; may be replaced by an implicitly cast expression.
	/// \param RHS right operand; may be replaced by an implicitly cast expression.
	/// \param Loc location used for diagnostics.
	/// \param IsCompAssign when true, LHS skips
	///        DefaultFunctionArrayLvalueConversion and the bit-cast paths below
	///        leave LHS untouched.
	/// \param AllowBothBool when false, two AltiVecBool vector operands are
	///        rejected through InvalidOperands.
	/// \param AllowBoolConversions when true, an AltiVecBool vector may be mixed
	///        with an AltiVecVector of integer elements that has the same
	///        element count and element size.
	/// \return the resulting vector type, or a default-constructed QualType
	///         after a diagnostic has been emitted.
	QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
	SourceLocation Loc, bool IsCompAssign,
	bool AllowBothBool,
	bool AllowBoolConversions) {
	if (!IsCompAssign) {
	LHS = DefaultFunctionArrayLvalueConversion(LHS.get());
	if (LHS.isInvalid())
	return QualType();
	}
	RHS = DefaultFunctionArrayLvalueConversion(RHS.get());
	if (RHS.isInvalid())
	return QualType();

	// For conversion purposes, we ignore any qualifiers.
	// For example, "const float" and "float" are equivalent.
	QualType LHSType = LHS.get()->getType().getUnqualifiedType();
	QualType RHSType = RHS.get()->getType().getUnqualifiedType();

	// Either pointer is null when the corresponding operand is not a vector.
	const VectorType *LHSVecType = LHSType->getAs<VectorType>();
	const VectorType *RHSVecType = RHSType->getAs<VectorType>();
	assert(LHSVecType \|\| RHSVecType);

	// AltiVec-style "vector bool op vector bool" combinations are allowed
	// for some operators but not others.
	if (!AllowBothBool &&
	LHSVecType && LHSVecType->getVectorKind() == VectorType::AltiVecBool &&
	RHSVecType && RHSVecType->getVectorKind() == VectorType::AltiVecBool)
	return InvalidOperands(Loc, LHS, RHS);

	// If the vector types are identical, return.
	if (Context.hasSameType(LHSType, RHSType))
	return LHSType;

	// If we have compatible AltiVec and GCC vector types, use the AltiVec type.
	if (LHSVecType && RHSVecType &&
	Context.areCompatibleVectorTypes(LHSType, RHSType)) {
	if (isa<ExtVectorType>(LHSVecType)) {
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
	return LHSType;
	}

	// The LHS is only rewritten outside of compound assignment; the result
	// type is the RHS type either way.
	if (!IsCompAssign)
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_BitCast);
	return RHSType;
	}

	// AllowBoolConversions says that bool and non-bool AltiVec vectors
	// can be mixed, with the result being the non-bool type. The non-bool
	// operand must have integer element type.
	if (AllowBoolConversions && LHSVecType && RHSVecType &&
	LHSVecType->getNumElements() == RHSVecType->getNumElements() &&
	(Context.getTypeSize(LHSVecType->getElementType()) ==
	Context.getTypeSize(RHSVecType->getElementType()))) {
	if (LHSVecType->getVectorKind() == VectorType::AltiVecVector &&
	LHSVecType->getElementType()->isIntegerType() &&
	RHSVecType->getVectorKind() == VectorType::AltiVecBool) {
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
	return LHSType;
	}
	if (!IsCompAssign &&
	LHSVecType->getVectorKind() == VectorType::AltiVecBool &&
	RHSVecType->getVectorKind() == VectorType::AltiVecVector &&
	RHSVecType->getElementType()->isIntegerType()) {
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_BitCast);
	return RHSType;
	}
	}

	// If there's a vector type and a scalar, try to convert the scalar to
	// the vector element type and splat.
	// Both helpers return false on success, hence the negated calls below.
	unsigned DiagID = diag::err_typecheck_vector_not_convertable;
	if (!RHSVecType) {
	if (isa<ExtVectorType>(LHSVecType)) {
	if (!tryVectorConvertAndSplat(*this, &RHS, RHSType,
	LHSVecType->getElementType(), LHSType,
	DiagID))
	return LHSType;
	} else {
	if (!tryGCCVectorConvertAndSplat(*this, &RHS, &LHS))
	return LHSType;
	}
	}
	if (!LHSVecType) {
	if (isa<ExtVectorType>(RHSVecType)) {
	// In a compound assignment a null pointer is passed so the LHS is only
	// checked, not rewritten.
	if (!tryVectorConvertAndSplat(*this, (IsCompAssign ? nullptr : &LHS),
	LHSType, RHSVecType->getElementType(),
	RHSType, DiagID))
	return RHSType;
	} else {
	if (LHS.get()->getValueKind() == VK_LValue \|\|
	!tryGCCVectorConvertAndSplat(*this, &LHS, &RHS))
	return RHSType;
	}
	}

	// FIXME: The code below also handles conversion between vectors and
	// non-scalars, we should break this down into fine grained specific checks
	// and emit proper diagnostics.
	QualType VecType = LHSVecType ? LHSType : RHSType;
	const VectorType *VT = LHSVecType ? LHSVecType : RHSVecType;
	QualType OtherType = LHSVecType ? RHSType : LHSType;
	ExprResult *OtherExpr = LHSVecType ? &RHS : &LHS;
	if (isLaxVectorConversion(OtherType, VecType)) {
	// If we're allowing lax vector conversions, only the total (data) size
	// needs to be the same. For non compound assignment, if one of the types is
	// scalar, the result is always the vector type.
	if (!IsCompAssign) {
	*OtherExpr = ImpCastExprToType(OtherExpr->get(), VecType, CK_BitCast);
	return VecType;
	// In a compound assignment, lhs += rhs, 'lhs' is a lvalue src, forbidding
	// any implicit cast. Here, the 'rhs' should be implicit casted to 'lhs'
	// type. Note that this is already done by non-compound assignments in
	// CheckAssignmentConstraints. If it's a scalar type, only bitcast for
	// <1 x T> -> T. The result is also a vector type.
	// NOTE(review): the next two lines carry the '-' (removed) and '+' (added)
	// markers of this diff view; presumably only the '+' line belongs to the
	// new revision - confirm against the checked-in file.
	- } else if (OtherType->isExtVectorType() \|\|
	+ } else if (OtherType->isExtVectorType() \|\| OtherType->isVectorType() \|\|
	(OtherType->isScalarType() && VT->getNumElements() == 1)) {
	ExprResult *RHSExpr = &RHS;
	*RHSExpr = ImpCastExprToType(RHSExpr->get(), LHSType, CK_BitCast);
	return VecType;
	}
	}

	// Okay, the expression is invalid.

	// If there's a non-vector, non-real operand, diagnose that.
	if ((!RHSVecType && !RHSType->isRealType()) \|\|
	(!LHSVecType && !LHSType->isRealType())) {
	Diag(Loc, diag::err_typecheck_vector_not_convertable_non_scalar)
	<< LHSType << RHSType
	<< LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	return QualType();
	}

	// OpenCL V1.1 6.2.6.p1:
	// If the operands are of more than one vector type, then an error shall
	// occur. Implicit conversions between vector types are not permitted, per
	// section 6.2.1.
	if (getLangOpts().OpenCL &&
	RHSVecType && isa<ExtVectorType>(RHSVecType) &&
	LHSVecType && isa<ExtVectorType>(LHSVecType)) {
	Diag(Loc, diag::err_opencl_implicit_vector_conversion) << LHSType
	<< RHSType;
	return QualType();
	}


	// If there is a vector type that is not a ExtVector and a scalar, we reach
	// this point if scalar could not be converted to the vector's element type
	// without truncation.
	if ((RHSVecType && !isa<ExtVectorType>(RHSVecType)) \|\|
	(LHSVecType && !isa<ExtVectorType>(LHSVecType))) {
	QualType Scalar = LHSVecType ? RHSType : LHSType;
	QualType Vector = LHSVecType ? LHSType : RHSType;
	// Selector is 1 when both operands are vectors, 0 for the scalar/vector mix.
	unsigned ScalarOrVector = LHSVecType && RHSVecType ? 1 : 0;
	Diag(Loc,
	diag::err_typecheck_vector_not_convertable_implict_truncation)
	<< ScalarOrVector << Scalar << Vector;

	return QualType();
	}

	// Otherwise, use the generic diagnostic.
	// DiagID may have been replaced by tryVectorConvertAndSplat above.
	Diag(Loc, DiagID)
	<< LHSType << RHSType
	<< LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	return QualType();
	}

	// checkArithmeticNull - Detect when a NULL constant is used improperly in an
	// expression. These are mainly cases where the null pointer is used as an
	// integer instead of a pointer.
	//
	// S         - the semantic analyzer used to emit warnings.
	// LHS, RHS  - the operands of the binary operator; they are only inspected.
	// Loc       - location the warnings are attached to.
	// IsCompare - true for comparison operators, false for arithmetic ones.
	static void checkArithmeticNull(Sema &S, ExprResult &LHS, ExprResult &RHS,
	                                SourceLocation Loc, bool IsCompare) {
	  // The canonical way to check for a GNU null is with isNullPointerConstant,
	  // but we use a bit of a hack here for speed; this is a relatively
	  // hot path, and isNullPointerConstant is slow.
	  bool LHSNull = isa<GNUNullExpr>(LHS.get()->IgnoreParenImpCasts());
	  bool RHSNull = isa<GNUNullExpr>(RHS.get()->IgnoreParenImpCasts());

	  // Type of the operand opposite the null; when both or neither are null
	  // this is simply one of the two operand types.
	  QualType NonNullType = LHSNull ? RHS.get()->getType() : LHS.get()->getType();

	  // Avoid analyzing cases where the result will either be invalid (and
	  // diagnosed as such) or entirely valid and not something to warn about.
	  if ((!LHSNull && !RHSNull) || NonNullType->isBlockPointerType() ||
	      NonNullType->isMemberPointerType() || NonNullType->isFunctionType())
	    return;

	  // Comparison operations would not make sense with a null pointer no matter
	  // what the other expression is.
	  if (!IsCompare) {
	    S.Diag(Loc, diag::warn_null_in_arithmetic_operation)
	        << (LHSNull ? LHS.get()->getSourceRange() : SourceRange())
	        << (RHSNull ? RHS.get()->getSourceRange() : SourceRange());
	    return;
	  }

	  // The rest of the operations only make sense with a null pointer
	  // if the other expression is a pointer.
	  if (LHSNull == RHSNull || NonNullType->isAnyPointerType() ||
	      NonNullType->canDecayToPointerType())
	    return;

	  S.Diag(Loc, diag::warn_null_in_comparison_operation)
	      << LHSNull /* LHS is NULL */ << NonNullType
	      << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	}

	/// Warn when the right operand of a division or remainder is a constant
	/// zero.
	///
	/// Nothing is reported when the divisor is value-dependent or does not
	/// evaluate to an integer constant.
	///
	/// \param S the semantic analyzer used for evaluation and diagnostics.
	/// \param LHS the dividend (not inspected).
	/// \param RHS the divisor.
	/// \param Loc location the warning is attached to.
	/// \param IsDiv streamed into the diagnostic to select its wording.
	static void DiagnoseBadDivideOrRemainderValues(Sema& S, ExprResult &LHS,
	                                               ExprResult &RHS,
	                                               SourceLocation Loc, bool IsDiv) {
	  Expr *Divisor = RHS.get();
	  if (Divisor->isValueDependent())
	    return;

	  llvm::APSInt DivisorValue;
	  const bool IsConstantZero =
	      Divisor->EvaluateAsInt(DivisorValue, S.Context) && DivisorValue == 0;
	  if (!IsConstantZero)
	    return;

	  S.DiagRuntimeBehavior(Loc, Divisor,
	                        S.PDiag(diag::warn_remainder_division_by_zero)
	                            << IsDiv << Divisor->getSourceRange());
	}

	/// Type-check the operands of a multiplication or division.
	///
	/// Vector operands are forwarded to CheckVectorOperands. Otherwise the usual
	/// arithmetic conversions are applied and the common type must be an
	/// arithmetic type.
	///
	/// \param LHS, RHS the operands; may be rewritten by the conversions.
	/// \param Loc location used for diagnostics.
	/// \param IsCompAssign true when checking a compound assignment.
	/// \param IsDiv true for division; enables the division-by-zero warning.
	/// \return the computation type, or a default-constructed QualType on error.
	QualType Sema::CheckMultiplyDivideOperands(ExprResult &LHS, ExprResult &RHS,
	                                           SourceLocation Loc,
	                                           bool IsCompAssign, bool IsDiv) {
	  checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/false);

	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType())
	    return CheckVectorOperands(LHS, RHS, Loc, IsCompAssign,
	                               /*AllowBothBool*/getLangOpts().AltiVec,
	                               /*AllowBoolConversions*/false);

	  QualType compType = UsualArithmeticConversions(LHS, RHS, IsCompAssign);
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();

	  if (compType.isNull() || !compType->isArithmeticType())
	    return InvalidOperands(Loc, LHS, RHS);
	  if (IsDiv)
	    DiagnoseBadDivideOrRemainderValues(*this, LHS, RHS, Loc, IsDiv);
	  return compType;
	}

	/// Type-check the operands of the remainder operator.
	///
	/// Vector operands are accepted only when both sides have an integer
	/// representation, in which case they are forwarded to CheckVectorOperands.
	/// Otherwise the usual arithmetic conversions are applied and the common
	/// type must be an integer type.
	///
	/// \param LHS, RHS the operands; may be rewritten by the conversions.
	/// \param Loc location used for diagnostics.
	/// \param IsCompAssign true when checking a compound assignment.
	/// \return the computation type, or a default-constructed QualType on error.
	QualType Sema::CheckRemainderOperands(
	    ExprResult &LHS, ExprResult &RHS, SourceLocation Loc, bool IsCompAssign) {
	  checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/false);

	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType()) {
	    if (LHS.get()->getType()->hasIntegerRepresentation() &&
	        RHS.get()->getType()->hasIntegerRepresentation())
	      return CheckVectorOperands(LHS, RHS, Loc, IsCompAssign,
	                                 /*AllowBothBool*/getLangOpts().AltiVec,
	                                 /*AllowBoolConversions*/false);
	    return InvalidOperands(Loc, LHS, RHS);
	  }

	  QualType compType = UsualArithmeticConversions(LHS, RHS, IsCompAssign);
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();

	  if (compType.isNull() || !compType->isIntegerType())
	    return InvalidOperands(Loc, LHS, RHS);
	  DiagnoseBadDivideOrRemainderValues(*this, LHS, RHS, Loc, false /* IsDiv */);
	  return compType;
	}

	/// \brief Diagnose invalid arithmetic on two void pointers.
	static void diagnoseArithmeticOnTwoVoidPointers(Sema &S, SourceLocation Loc,
	Expr LHSExpr, Expr RHSExpr) {
	S.Diag(Loc, S.getLangOpts().CPlusPlus
	? diag::err_typecheck_pointer_arith_void_type
	: diag::ext_gnu_void_ptr)
	<< 1 /* two pointers */ << LHSExpr->getSourceRange()
	<< RHSExpr->getSourceRange();
	}

	/// \brief Diagnose invalid arithmetic on a void pointer.
	///
	/// Emits err_typecheck_pointer_arith_void_type in C++ and ext_gnu_void_ptr
	/// otherwise, highlighting the offending operand.
	///
	/// \param S the semantic analyzer used to emit the diagnostic.
	/// \param Loc location the diagnostic is attached to.
	/// \param Pointer the void pointer operand.
	static void diagnoseArithmeticOnVoidPointer(Sema &S, SourceLocation Loc,
	                                            Expr *Pointer) {
	  const unsigned DiagID = S.getLangOpts().CPlusPlus
	                              ? diag::err_typecheck_pointer_arith_void_type
	                              : diag::ext_gnu_void_ptr;
	  S.Diag(Loc, DiagID) << 0 /* one pointer */ << Pointer->getSourceRange();
	}

	/// \brief Diagnose invalid arithmetic on two function pointers.
	static void diagnoseArithmeticOnTwoFunctionPointers(Sema &S, SourceLocation Loc,
	Expr LHS, Expr RHS) {
	assert(LHS->getType()->isAnyPointerType());
	assert(RHS->getType()->isAnyPointerType());
	S.Diag(Loc, S.getLangOpts().CPlusPlus
	? diag::err_typecheck_pointer_arith_function_type
	: diag::ext_gnu_ptr_func_arith)
	<< 1 /* two pointers */ << LHS->getType()->getPointeeType()
	// We only show the second type if it differs from the first.
	<< (unsigned)!S.Context.hasSameUnqualifiedType(LHS->getType(),
	RHS->getType())
	<< RHS->getType()->getPointeeType()
	<< LHS->getSourceRange() << RHS->getSourceRange();
	}

	/// \brief Diagnose invalid arithmetic on a function pointer.
	///
	/// Emits err_typecheck_pointer_arith_function_type in C++ and
	/// ext_gnu_ptr_func_arith otherwise. The operand must have pointer type
	/// (asserted).
	///
	/// \param S the semantic analyzer used to emit the diagnostic.
	/// \param Loc location the diagnostic is attached to.
	/// \param Pointer the function pointer operand.
	static void diagnoseArithmeticOnFunctionPointer(Sema &S, SourceLocation Loc,
	                                                Expr *Pointer) {
	  assert(Pointer->getType()->isAnyPointerType());

	  const unsigned DiagID = S.getLangOpts().CPlusPlus
	                              ? diag::err_typecheck_pointer_arith_function_type
	                              : diag::ext_gnu_ptr_func_arith;
	  S.Diag(Loc, DiagID)
	      << 0 /* one pointer */ << Pointer->getType()->getPointeeType()
	      << 0 /* one pointer, so only one type */
	      << Pointer->getSourceRange();
	}

	/// \brief Require the pointee type of a pointer operand to be complete.
	///
	/// An atomic operand is looked through to its value type first. The
	/// resulting type must be a non-dependent pointer type (asserted).
	///
	/// \param S the semantic analyzer used for the completeness check.
	/// \param Loc location used for the diagnostic.
	/// \param Operand the pointer operand.
	/// \returns the result of RequireCompleteType on the pointee type, issued
	/// with err_typecheck_arithmetic_incomplete_type.
	static bool checkArithmeticIncompletePointerType(Sema &S, SourceLocation Loc,
	                                                 Expr *Operand) {
	  QualType OperandTy = Operand->getType();
	  const AtomicType *AsAtomic = OperandTy->getAs<AtomicType>();
	  if (AsAtomic)
	    OperandTy = AsAtomic->getValueType();

	  assert(OperandTy->isAnyPointerType() && !OperandTy->isDependentType());

	  QualType Pointee = OperandTy->getPointeeType();
	  return S.RequireCompleteType(Loc, Pointee,
	                               diag::err_typecheck_arithmetic_incomplete_type,
	                               Pointee, Operand->getSourceRange());
	}

	/// \brief Check the validity of an arithmetic pointer operand.
	///
	/// Non-pointer operands are accepted unchanged. For pointers, arithmetic on
	/// void pointers and on function pointers is diagnosed and accepted only
	/// outside of C++; any other pointee type has to be complete. An atomic
	/// operand is looked through to its value type first.
	///
	/// \param S the semantic analyzer used for checks and diagnostics.
	/// \param Loc location used for diagnostics.
	/// \param Operand the operand to check.
	/// \returns True when the operand is valid to use (even if as an extension).
	static bool checkArithmeticOpPointerOperand(Sema &S, SourceLocation Loc,
	                                            Expr *Operand) {
	  QualType OperandTy = Operand->getType();
	  const AtomicType *AsAtomic = OperandTy->getAs<AtomicType>();
	  if (AsAtomic)
	    OperandTy = AsAtomic->getValueType();

	  // Nothing to check for non-pointers.
	  if (!OperandTy->isAnyPointerType())
	    return true;

	  QualType Pointee = OperandTy->getPointeeType();

	  if (Pointee->isVoidType()) {
	    diagnoseArithmeticOnVoidPointer(S, Loc, Operand);
	    return !S.getLangOpts().CPlusPlus;
	  }

	  if (Pointee->isFunctionType()) {
	    diagnoseArithmeticOnFunctionPointer(S, Loc, Operand);
	    return !S.getLangOpts().CPlusPlus;
	  }

	  // The helper returns true when the pointee type is incomplete.
	  return !checkArithmeticIncompletePointerType(S, Loc, Operand);
	}

	/// \brief Check the validity of a binary arithmetic operation w.r.t. pointer
	/// operands.
	///
	/// This routine will diagnose any invalid arithmetic on pointer operands much
	/// like \see checkArithmeticOpPointerOperand. However, it has special logic
	/// for emitting a single diagnostic even for operations where both LHS and RHS
	/// are (potentially problematic) pointers.
	///
	/// \returns True when the operand is valid to use (even if as an extension).
	static bool checkArithmeticBinOpPointerOperands(Sema &S, SourceLocation Loc,
	Expr LHSExpr, Expr RHSExpr) {
	bool isLHSPointer = LHSExpr->getType()->isAnyPointerType();
	bool isRHSPointer = RHSExpr->getType()->isAnyPointerType();
	if (!isLHSPointer && !isRHSPointer) return true;

	QualType LHSPointeeTy, RHSPointeeTy;
	if (isLHSPointer) LHSPointeeTy = LHSExpr->getType()->getPointeeType();
	if (isRHSPointer) RHSPointeeTy = RHSExpr->getType()->getPointeeType();

	// if both are pointers check if operation is valid wrt address spaces
	if (S.getLangOpts().OpenCL && isLHSPointer && isRHSPointer) {
	const PointerType *lhsPtr = LHSExpr->getType()->getAs<PointerType>();
	const PointerType *rhsPtr = RHSExpr->getType()->getAs<PointerType>();
	if (!lhsPtr->isAddressSpaceOverlapping(*rhsPtr)) {
	S.Diag(Loc,
	diag::err_typecheck_op_on_nonoverlapping_address_space_pointers)
	<< LHSExpr->getType() << RHSExpr->getType() << 1 /arithmetic op/
	<< LHSExpr->getSourceRange() << RHSExpr->getSourceRange();
	return false;
	}
	}

	// Check for arithmetic on pointers to incomplete types.
	bool isLHSVoidPtr = isLHSPointer && LHSPointeeTy->isVoidType();
	bool isRHSVoidPtr = isRHSPointer && RHSPointeeTy->isVoidType();
	if (isLHSVoidPtr \|\| isRHSVoidPtr) {
	if (!isRHSVoidPtr) diagnoseArithmeticOnVoidPointer(S, Loc, LHSExpr);
	else if (!isLHSVoidPtr) diagnoseArithmeticOnVoidPointer(S, Loc, RHSExpr);
	else diagnoseArithmeticOnTwoVoidPointers(S, Loc, LHSExpr, RHSExpr);

	return !S.getLangOpts().CPlusPlus;
	}

	bool isLHSFuncPtr = isLHSPointer && LHSPointeeTy->isFunctionType();
	bool isRHSFuncPtr = isRHSPointer && RHSPointeeTy->isFunctionType();
	if (isLHSFuncPtr \|\| isRHSFuncPtr) {
	if (!isRHSFuncPtr) diagnoseArithmeticOnFunctionPointer(S, Loc, LHSExpr);
	else if (!isLHSFuncPtr) diagnoseArithmeticOnFunctionPointer(S, Loc,
	RHSExpr);
	else diagnoseArithmeticOnTwoFunctionPointers(S, Loc, LHSExpr, RHSExpr);

	return !S.getLangOpts().CPlusPlus;
	}

	if (isLHSPointer && checkArithmeticIncompletePointerType(S, Loc, LHSExpr))
	return false;
	if (isRHSPointer && checkArithmeticIncompletePointerType(S, Loc, RHSExpr))
	return false;

	return true;
	}

	/// diagnoseStringPlusInt - Emit a warning when adding an integer to a string
	/// literal.
	static void diagnoseStringPlusInt(Sema &Self, SourceLocation OpLoc,
	Expr LHSExpr, Expr RHSExpr) {
	StringLiteral* StrExpr = dyn_cast<StringLiteral>(LHSExpr->IgnoreImpCasts());
	Expr* IndexExpr = RHSExpr;
	if (!StrExpr) {
	StrExpr = dyn_cast<StringLiteral>(RHSExpr->IgnoreImpCasts());
	IndexExpr = LHSExpr;
	}

	bool IsStringPlusInt = StrExpr &&
	IndexExpr->getType()->isIntegralOrUnscopedEnumerationType();
	if (!IsStringPlusInt \|\| IndexExpr->isValueDependent())
	return;

	llvm::APSInt index;
	if (IndexExpr->EvaluateAsInt(index, Self.getASTContext())) {
	unsigned StrLenWithNull = StrExpr->getLength() + 1;
	if (index.isNonNegative() &&
	index <= llvm::APSInt(llvm::APInt(index.getBitWidth(), StrLenWithNull),
	index.isUnsigned()))
	return;
	}

	SourceRange DiagRange(LHSExpr->getLocStart(), RHSExpr->getLocEnd());
	Self.Diag(OpLoc, diag::warn_string_plus_int)
	<< DiagRange << IndexExpr->IgnoreImpCasts()->getType();

	// Only print a fixit for "str" + int, not for int + "str".
	if (IndexExpr == RHSExpr) {
	SourceLocation EndLoc = Self.getLocForEndOfToken(RHSExpr->getLocEnd());
	Self.Diag(OpLoc, diag::note_string_plus_scalar_silence)
	<< FixItHint::CreateInsertion(LHSExpr->getLocStart(), "&")
	<< FixItHint::CreateReplacement(SourceRange(OpLoc), "[")
	<< FixItHint::CreateInsertion(EndLoc, "]");
	} else
	Self.Diag(OpLoc, diag::note_string_plus_scalar_silence);
	}

	/// \brief Emit a warning when adding a char literal to a string.
	static void diagnoseStringPlusChar(Sema &Self, SourceLocation OpLoc,
	Expr LHSExpr, Expr RHSExpr) {
	const Expr *StringRefExpr = LHSExpr;
	const CharacterLiteral *CharExpr =
	dyn_cast<CharacterLiteral>(RHSExpr->IgnoreImpCasts());

	if (!CharExpr) {
	CharExpr = dyn_cast<CharacterLiteral>(LHSExpr->IgnoreImpCasts());
	StringRefExpr = RHSExpr;
	}

	if (!CharExpr \|\| !StringRefExpr)
	return;

	const QualType StringType = StringRefExpr->getType();

	// Return if not a PointerType.
	if (!StringType->isAnyPointerType())
	return;

	// Return if not a CharacterType.
	if (!StringType->getPointeeType()->isAnyCharacterType())
	return;

	ASTContext &Ctx = Self.getASTContext();
	SourceRange DiagRange(LHSExpr->getLocStart(), RHSExpr->getLocEnd());

	const QualType CharType = CharExpr->getType();
	if (!CharType->isAnyCharacterType() &&
	CharType->isIntegerType() &&
	llvm::isUIntN(Ctx.getCharWidth(), CharExpr->getValue())) {
	Self.Diag(OpLoc, diag::warn_string_plus_char)
	<< DiagRange << Ctx.CharTy;
	} else {
	Self.Diag(OpLoc, diag::warn_string_plus_char)
	<< DiagRange << CharExpr->getType();
	}

	// Only print a fixit for str + char, not for char + str.
	if (isa<CharacterLiteral>(RHSExpr->IgnoreImpCasts())) {
	SourceLocation EndLoc = Self.getLocForEndOfToken(RHSExpr->getLocEnd());
	Self.Diag(OpLoc, diag::note_string_plus_scalar_silence)
	<< FixItHint::CreateInsertion(LHSExpr->getLocStart(), "&")
	<< FixItHint::CreateReplacement(SourceRange(OpLoc), "[")
	<< FixItHint::CreateInsertion(EndLoc, "]");
	} else {
	Self.Diag(OpLoc, diag::note_string_plus_scalar_silence);
	}
	}

	/// \brief Emit error when two pointers are incompatible.
	static void diagnosePointerIncompatibility(Sema &S, SourceLocation Loc,
	Expr LHSExpr, Expr RHSExpr) {
	assert(LHSExpr->getType()->isAnyPointerType());
	assert(RHSExpr->getType()->isAnyPointerType());
	S.Diag(Loc, diag::err_typecheck_sub_ptr_compatible)
	<< LHSExpr->getType() << RHSExpr->getType() << LHSExpr->getSourceRange()
	<< RHSExpr->getSourceRange();
	}

	// C99 6.5.6
	//
	// Type-check the operands of an addition (or compound addition).
	//
	// Handles vector operands, plain arithmetic operands, and pointer + integer
	// in either operand order.
	//
	// LHS, RHS  - the operands; may be rewritten by the conversions.
	// Loc       - location used for diagnostics.
	// Opc       - the operator kind; the string-literal warnings only run for
	//             BO_Add.
	// CompLHSTy - when non-null this is a compound assignment and the pointee
	//             receives the type computed for the left-hand side.
	//
	// Returns the result type, or a default-constructed QualType on error.
	QualType Sema::CheckAdditionOperands(ExprResult &LHS, ExprResult &RHS,
	                                     SourceLocation Loc, BinaryOperatorKind Opc,
	                                     QualType* CompLHSTy) {
	  checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/false);

	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType()) {
	    // A non-null CompLHSTy converts to true for the IsCompAssign parameter.
	    QualType compType = CheckVectorOperands(
	        LHS, RHS, Loc, CompLHSTy,
	        /*AllowBothBool*/getLangOpts().AltiVec,
	        /*AllowBoolConversions*/getLangOpts().ZVector);
	    if (CompLHSTy) *CompLHSTy = compType;
	    return compType;
	  }

	  QualType compType = UsualArithmeticConversions(LHS, RHS, CompLHSTy);
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();

	  // Diagnose "string literal" '+' int and string '+' "char literal".
	  if (Opc == BO_Add) {
	    diagnoseStringPlusInt(*this, Loc, LHS.get(), RHS.get());
	    diagnoseStringPlusChar(*this, Loc, LHS.get(), RHS.get());
	  }

	  // handle the common case first (both operands are arithmetic).
	  if (!compType.isNull() && compType->isArithmeticType()) {
	    if (CompLHSTy) *CompLHSTy = compType;
	    return compType;
	  }

	  // Type-checking. Ultimately the pointer's going to be in PExp;
	  // note that we bias towards the LHS being the pointer.
	  Expr *PExp = LHS.get(), *IExp = RHS.get();

	  bool isObjCPointer;
	  if (PExp->getType()->isPointerType()) {
	    isObjCPointer = false;
	  } else if (PExp->getType()->isObjCObjectPointerType()) {
	    isObjCPointer = true;
	  } else {
	    // The LHS is not a pointer: try the RHS as the pointer instead.
	    std::swap(PExp, IExp);
	    if (PExp->getType()->isPointerType()) {
	      isObjCPointer = false;
	    } else if (PExp->getType()->isObjCObjectPointerType()) {
	      isObjCPointer = true;
	    } else {
	      return InvalidOperands(Loc, LHS, RHS);
	    }
	  }
	  assert(PExp->getType()->isAnyPointerType());

	  if (!IExp->getType()->isIntegerType())
	    return InvalidOperands(Loc, LHS, RHS);

	  if (!checkArithmeticOpPointerOperand(*this, Loc, PExp))
	    return QualType();

	  if (isObjCPointer && checkArithmeticOnObjCPointer(*this, Loc, PExp))
	    return QualType();

	  // Check array bounds for pointer arithmetic
	  CheckArrayAccess(PExp, IExp);

	  if (CompLHSTy) {
	    // Report the LHS type after bit-field / integer promotion, if any.
	    QualType LHSTy = Context.isPromotableBitField(LHS.get());
	    if (LHSTy.isNull()) {
	      LHSTy = LHS.get()->getType();
	      if (LHSTy->isPromotableIntegerType())
	        LHSTy = Context.getPromotedIntegerType(LHSTy);
	    }
	    *CompLHSTy = LHSTy;
	  }

	  return PExp->getType();
	}

	// C99 6.5.6
	//
	// Type-check the operands of a subtraction (or compound subtraction).
	//
	// Handles vector operands, plain arithmetic operands, pointer - integer and
	// pointer - pointer.
	//
	// LHS, RHS  - the operands; may be rewritten by the conversions.
	// Loc       - location used for diagnostics.
	// CompLHSTy - when non-null this is a compound assignment and the pointee
	//             receives the type computed for the left-hand side.
	//
	// Returns the result type, or a default-constructed QualType on error.
	QualType Sema::CheckSubtractionOperands(ExprResult &LHS, ExprResult &RHS,
	                                        SourceLocation Loc,
	                                        QualType* CompLHSTy) {
	  checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/false);

	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType()) {
	    // A non-null CompLHSTy converts to true for the IsCompAssign parameter.
	    QualType compType = CheckVectorOperands(
	        LHS, RHS, Loc, CompLHSTy,
	        /*AllowBothBool*/getLangOpts().AltiVec,
	        /*AllowBoolConversions*/getLangOpts().ZVector);
	    if (CompLHSTy) *CompLHSTy = compType;
	    return compType;
	  }

	  QualType compType = UsualArithmeticConversions(LHS, RHS, CompLHSTy);
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();

	  // Enforce type constraints: C99 6.5.6p3.

	  // Handle the common case first (both operands are arithmetic).
	  if (!compType.isNull() && compType->isArithmeticType()) {
	    if (CompLHSTy) *CompLHSTy = compType;
	    return compType;
	  }

	  // Either ptr - int or ptr - ptr.
	  if (LHS.get()->getType()->isAnyPointerType()) {
	    QualType lpointee = LHS.get()->getType()->getPointeeType();

	    // Diagnose bad cases where we step over interface counts.
	    if (LHS.get()->getType()->isObjCObjectPointerType() &&
	        checkArithmeticOnObjCPointer(*this, Loc, LHS.get()))
	      return QualType();

	    // The result type of a pointer-int computation is the pointer type.
	    if (RHS.get()->getType()->isIntegerType()) {
	      if (!checkArithmeticOpPointerOperand(*this, Loc, LHS.get()))
	        return QualType();

	      // Check array bounds for pointer arithmetic
	      CheckArrayAccess(LHS.get(), RHS.get(), /*ArraySubscriptExpr*/nullptr,
	                       /*AllowOnePastEnd*/true, /*IndexNegated*/true);

	      if (CompLHSTy) *CompLHSTy = LHS.get()->getType();
	      return LHS.get()->getType();
	    }

	    // Handle pointer-pointer subtractions.
	    if (const PointerType *RHSPTy
	          = RHS.get()->getType()->getAs<PointerType>()) {
	      QualType rpointee = RHSPTy->getPointeeType();

	      if (getLangOpts().CPlusPlus) {
	        // Pointee types must be the same: C++ [expr.add]
	        // Note: unlike the C branch below, checking continues after the
	        // diagnostic has been emitted.
	        if (!Context.hasSameUnqualifiedType(lpointee, rpointee)) {
	          diagnosePointerIncompatibility(*this, Loc, LHS.get(), RHS.get());
	        }
	      } else {
	        // Pointee types must be compatible C99 6.5.6p3
	        if (!Context.typesAreCompatible(
	                Context.getCanonicalType(lpointee).getUnqualifiedType(),
	                Context.getCanonicalType(rpointee).getUnqualifiedType())) {
	          diagnosePointerIncompatibility(*this, Loc, LHS.get(), RHS.get());
	          return QualType();
	        }
	      }

	      if (!checkArithmeticBinOpPointerOperands(*this, Loc,
	                                               LHS.get(), RHS.get()))
	        return QualType();

	      // The pointee type may have zero size. As an extension, a structure or
	      // union may have zero size or an array may have zero length. In this
	      // case subtraction does not make sense.
	      if (!rpointee->isVoidType() && !rpointee->isFunctionType()) {
	        CharUnits ElementSize = Context.getTypeSizeInChars(rpointee);
	        if (ElementSize.isZero()) {
	          Diag(Loc,diag::warn_sub_ptr_zero_size_types)
	              << rpointee.getUnqualifiedType()
	              << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	        }
	      }

	      if (CompLHSTy) *CompLHSTy = LHS.get()->getType();
	      return Context.getPointerDiffType();
	    }
	  }

	  return InvalidOperands(Loc, LHS, RHS);
	}

	/// \brief Returns true when \p T is an enumeration type whose declaration is
	/// scoped; false for every other type.
	static bool isScopedEnumerationType(QualType T) {
	  const EnumType *EnumTy = T->getAs<EnumType>();
	  return EnumTy && EnumTy->getDecl()->isScoped();
	}

	/// \brief Warn about shifts whose constant operands give undefined or
	/// surprising results.
	///
	/// Nothing is diagnosed under OpenCL, or when the right operand is
	/// value-dependent or cannot be evaluated as an integer. Otherwise a negative
	/// shift count, or a count at least as large as the bit width of the left
	/// operand's type, is reported. For left shifts (BO_Shl) with a constant left
	/// operand of signed type, a negative left operand and overflow of the result
	/// are reported as well.
	///
	/// \param S        Sema instance used to evaluate operands and emit warnings.
	/// \param LHS, RHS The shift operands.
	/// \param Loc      Location used for the diagnostics.
	/// \param Opc      The shift opcode; only BO_Shl gets the overflow checks.
	/// \param LHSType  Type of the left operand as computed by the caller; used
	///                 for the signedness test and in the diagnostic text.
	static void DiagnoseBadShiftValues(Sema& S, ExprResult &LHS, ExprResult &RHS,
	                                   SourceLocation Loc, BinaryOperatorKind Opc,
	                                   QualType LHSType) {
	  // OpenCL 6.3j: shift values are effectively % word size of LHS (more defined),
	  // so skip remaining warnings as we don't want to modify values within Sema.
	  if (S.getLangOpts().OpenCL)
	    return;

	  llvm::APSInt Right;
	  // Check right/shifter operand
	  if (RHS.get()->isValueDependent() ||
	      !RHS.get()->EvaluateAsInt(Right, S.Context))
	    return;

	  if (Right.isNegative()) {
	    S.DiagRuntimeBehavior(Loc, RHS.get(),
	                          S.PDiag(diag::warn_shift_negative)
	                            << RHS.get()->getSourceRange());
	    return;
	  }
	  // Bit width of the left operand's type, expressed at Right's width so the
	  // two values can be compared directly.
	  llvm::APInt LeftBits(Right.getBitWidth(),
	                       S.Context.getTypeSize(LHS.get()->getType()));
	  if (Right.uge(LeftBits)) {
	    S.DiagRuntimeBehavior(Loc, RHS.get(),
	                          S.PDiag(diag::warn_shift_gt_typewidth)
	                            << RHS.get()->getSourceRange());
	    return;
	  }
	  if (Opc != BO_Shl)
	    return;

	  // When left shifting an ICE which is signed, we can check for overflow which
	  // according to C++ has undefined behavior ([expr.shift] 5.8/2). Unsigned
	  // integers have defined behavior modulo one more than the maximum value
	  // representable in the result type, so never warn for those.
	  llvm::APSInt Left;
	  if (LHS.get()->isValueDependent() ||
	      LHSType->hasUnsignedIntegerRepresentation() ||
	      !LHS.get()->EvaluateAsInt(Left, S.Context))
	    return;

	  // Left-shifting a negative value is undefined unless signed overflow is
	  // defined for this translation unit. Warn about it.
	  if (Left.isNegative() && !S.getLangOpts().isSignedOverflowDefined()) {
	    S.DiagRuntimeBehavior(Loc, LHS.get(),
	                          S.PDiag(diag::warn_shift_lhs_negative)
	                            << LHS.get()->getSourceRange());
	    return;
	  }

	  // Bits needed for the shifted value: the shift count plus the minimum
	  // number of signed bits needed for Left.
	  llvm::APInt ResultBits =
	      static_cast<llvm::APInt&>(Right) + Left.getMinSignedBits();
	  if (LeftBits.uge(ResultBits))
	    return;
	  llvm::APSInt Result = Left.extend(ResultBits.getLimitedValue());
	  Result = Result.shl(Right);

	  // Print the bit representation of the signed integer as an unsigned
	  // hexadecimal number.
	  SmallString<40> HexResult;
	  Result.toString(HexResult, 16, /*Signed =*/false, /*Literal =*/true);

	  // If we are only missing a sign bit, this is less likely to result in actual
	  // bugs -- if the result is cast back to an unsigned type, it will have the
	  // expected value. Thus we place this behind a different warning that can be
	  // turned off separately if needed.
	  if (LeftBits == ResultBits - 1) {
	    S.Diag(Loc, diag::warn_shift_result_sets_sign_bit)
	        << HexResult << LHSType
	        << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	    return;
	  }

	  S.Diag(Loc, diag::warn_shift_result_gt_typewidth)
	    << HexResult.str() << Result.getMinSignedBits() << LHSType
	    << Left.getBitWidth() << LHS.get()->getSourceRange()
	    << RHS.get()->getSourceRange();
	}

	/// \brief Return the resulting type when a vector is shifted
	/// by a scalar or vector shift amount.
	///
	/// \param S            Sema instance used for conversions and diagnostics.
	/// \param LHS, RHS     The shift operands; they may be replaced by converted
	///                     or splatted expressions.
	/// \param Loc          Location used for the diagnostics.
	/// \param IsCompAssign True for a compound assignment; the left operand is
	///                     then left unconverted.
	/// \returns the type of the (possibly splatted) left operand, the type of the
	///          right operand for a compound assignment whose left operand is a
	///          scalar, or a null QualType after an error.
	static QualType checkVectorShift(Sema &S, ExprResult &LHS, ExprResult &RHS,
	                                 SourceLocation Loc, bool IsCompAssign) {
	  // OpenCL v1.1 s6.3.j says RHS can be a vector only if LHS is a vector.
	  if ((S.LangOpts.OpenCL || S.LangOpts.ZVector) &&
	      !LHS.get()->getType()->isVectorType()) {
	    S.Diag(Loc, diag::err_shift_rhs_only_vector)
	      << RHS.get()->getType() << LHS.get()->getType()
	      << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	    return QualType();
	  }

	  if (!IsCompAssign) {
	    LHS = S.UsualUnaryConversions(LHS.get());
	    if (LHS.isInvalid()) return QualType();
	  }

	  RHS = S.UsualUnaryConversions(RHS.get());
	  if (RHS.isInvalid()) return QualType();

	  QualType LHSType = LHS.get()->getType();
	  // Note that LHS may still be a scalar here: the check above only rejects
	  // that when OpenCL or ZVector is enabled.
	  const VectorType *LHSVecTy = LHSType->getAs<VectorType>();
	  QualType LHSEleType = LHSVecTy ? LHSVecTy->getElementType() : LHSType;

	  // Note that RHS might not be a vector.
	  QualType RHSType = RHS.get()->getType();
	  const VectorType *RHSVecTy = RHSType->getAs<VectorType>();
	  QualType RHSEleType = RHSVecTy ? RHSVecTy->getElementType() : RHSType;

	  // The operands need to be integers.
	  if (!LHSEleType->isIntegerType()) {
	    S.Diag(Loc, diag::err_typecheck_expect_int)
	      << LHS.get()->getType() << LHS.get()->getSourceRange();
	    return QualType();
	  }

	  if (!RHSEleType->isIntegerType()) {
	    S.Diag(Loc, diag::err_typecheck_expect_int)
	      << RHS.get()->getType() << RHS.get()->getSourceRange();
	    return QualType();
	  }

	  if (!LHSVecTy) {
	    // Scalar LHS with vector RHS: splat the LHS to a vector with the RHS's
	    // element type and element count.
	    assert(RHSVecTy);
	    if (IsCompAssign)
	      return RHSType;
	    if (LHSEleType != RHSEleType) {
	      LHS = S.ImpCastExprToType(LHS.get(),RHSEleType, CK_IntegralCast);
	      LHSEleType = RHSEleType;
	    }
	    QualType VecTy =
	        S.Context.getExtVectorType(LHSEleType, RHSVecTy->getNumElements());
	    LHS = S.ImpCastExprToType(LHS.get(), VecTy, CK_VectorSplat);
	    LHSType = VecTy;
	  } else if (RHSVecTy) {
	    // OpenCL v1.1 s6.3.j says that for vector types, the operators
	    // are applied component-wise. So if RHS is a vector, then ensure
	    // that the number of elements is the same as LHS...
	    if (RHSVecTy->getNumElements() != LHSVecTy->getNumElements()) {
	      S.Diag(Loc, diag::err_typecheck_vector_lengths_not_equal)
	        << LHS.get()->getType() << RHS.get()->getType()
	        << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	      return QualType();
	    }
	    if (!S.LangOpts.OpenCL && !S.LangOpts.ZVector) {
	      // Outside OpenCL and ZVector, differing element sizes only warn.
	      const BuiltinType *LHSBT = LHSEleType->getAs<clang::BuiltinType>();
	      const BuiltinType *RHSBT = RHSEleType->getAs<clang::BuiltinType>();
	      if (LHSBT != RHSBT &&
	          S.Context.getTypeSize(LHSBT) != S.Context.getTypeSize(RHSBT)) {
	        S.Diag(Loc, diag::warn_typecheck_vector_element_sizes_not_equal)
	            << LHS.get()->getType() << RHS.get()->getType()
	            << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	      }
	    }
	  } else {
	    // ...else expand RHS to match the number of elements in LHS.
	    QualType VecTy =
	      S.Context.getExtVectorType(RHSEleType, LHSVecTy->getNumElements());
	    RHS = S.ImpCastExprToType(RHS.get(), VecTy, CK_VectorSplat);
	  }

	  return LHSType;
	}

	// C99 6.5.7
	/// \brief Type-check the operands of a shift ('<<', '>>') or compound shift
	/// assignment.
	///
	/// Vector operands are delegated to checkVectorShift(). Under ZVector, a
	/// "vector bool" operand on either side is rejected first.
	///
	/// \param LHS, RHS     The operands; they may be replaced by their converted
	///                     forms.
	/// \param Loc          Location used for the diagnostics.
	/// \param Opc          The shift opcode, forwarded to DiagnoseBadShiftValues().
	/// \param IsCompAssign True for a compound assignment; the left operand is
	///                     then restored to its unconverted form.
	/// \returns the type of the left operand after the usual unary conversions,
	///          a null QualType when a conversion fails, or the result of
	///          InvalidOperands() or checkVectorShift() on the paths that
	///          delegate to them.
	QualType Sema::CheckShiftOperands(ExprResult &LHS, ExprResult &RHS,
	                                  SourceLocation Loc, BinaryOperatorKind Opc,
	                                  bool IsCompAssign) {
	  checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/false);

	  // Vector shifts promote their scalar inputs to vector type.
	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType()) {
	    if (LangOpts.ZVector) {
	      // The shift operators for the z vector extensions work basically
	      // like general shifts, except that neither the LHS nor the RHS is
	      // allowed to be a "vector bool".
	      if (auto LHSVecType = LHS.get()->getType()->getAs<VectorType>())
	        if (LHSVecType->getVectorKind() == VectorType::AltiVecBool)
	          return InvalidOperands(Loc, LHS, RHS);
	      if (auto RHSVecType = RHS.get()->getType()->getAs<VectorType>())
	        if (RHSVecType->getVectorKind() == VectorType::AltiVecBool)
	          return InvalidOperands(Loc, LHS, RHS);
	    }
	    return checkVectorShift(*this, LHS, RHS, Loc, IsCompAssign);
	  }

	  // Shifts don't perform usual arithmetic conversions, they just do integer
	  // promotions on each operand. C99 6.5.7p3

	  // For the LHS, do usual unary conversions, but then reset them away
	  // if this is a compound assignment.
	  ExprResult OldLHS = LHS;
	  LHS = UsualUnaryConversions(LHS.get());
	  if (LHS.isInvalid())
	    return QualType();
	  QualType LHSType = LHS.get()->getType();
	  if (IsCompAssign) LHS = OldLHS;

	  // The RHS is simpler.
	  RHS = UsualUnaryConversions(RHS.get());
	  if (RHS.isInvalid())
	    return QualType();
	  QualType RHSType = RHS.get()->getType();

	  // C99 6.5.7p2: Each of the operands shall have integer type.
	  if (!LHSType->hasIntegerRepresentation() ||
	      !RHSType->hasIntegerRepresentation())
	    return InvalidOperands(Loc, LHS, RHS);

	  // C++0x: Don't allow scoped enums. FIXME: Use something better than
	  // hasIntegerRepresentation() above instead of this.
	  if (isScopedEnumerationType(LHSType) ||
	      isScopedEnumerationType(RHSType)) {
	    return InvalidOperands(Loc, LHS, RHS);
	  }
	  // Sanity-check shift operands
	  DiagnoseBadShiftValues(*this, LHS, RHS, Loc, Opc, LHSType);

	  // "The type of the result is that of the promoted left operand."
	  return LHSType;
	}

	/// \brief Returns true when the declaration context of \p D is a class
	/// template specialization, or a function that is a function template
	/// specialization.
	static bool IsWithinTemplateSpecialization(Decl *D) {
	  DeclContext *Ctx = D->getDeclContext();
	  if (!Ctx)
	    return false;

	  if (isa<ClassTemplateSpecializationDecl>(Ctx))
	    return true;

	  FunctionDecl *Fn = dyn_cast<FunctionDecl>(Ctx);
	  return Fn && Fn->isFunctionTemplateSpecialization();
	}

	/// \brief Warn when the operands of a comparison are values of two different
	/// named enumeration types.
	///
	/// Types are examined after stripping parentheses and implicit casts. No
	/// warning is issued unless both sides are enums, both enums have an
	/// identifier, and the two types differ.
	static void checkEnumComparison(Sema &S, SourceLocation Loc, Expr *LHS,
	                                Expr *RHS) {
	  QualType LTy = LHS->IgnoreParenImpCasts()->getType();
	  QualType RTy = RHS->IgnoreParenImpCasts()->getType();

	  // Both sides have to be enumerations.
	  const EnumType *LEnum = LTy->getAs<EnumType>();
	  const EnumType *REnum = LEnum ? RTy->getAs<EnumType>() : nullptr;
	  if (!LEnum || !REnum)
	    return;

	  // Anonymous enums are exempt.
	  if (!LEnum->getDecl()->getIdentifier() ||
	      !REnum->getDecl()->getIdentifier())
	    return;

	  if (!S.Context.hasSameUnqualifiedType(LTy, RTy))
	    S.Diag(Loc, diag::warn_comparison_of_mixed_enum_types)
	        << LTy << RTy
	        << LHS->getSourceRange() << RHS->getSourceRange();
	}

	/// \brief Report a comparison between distinct pointer types.
	///
	/// \param IsError Selects the error diagnostic when true and the extension
	///                diagnostic when false.
	static void diagnoseDistinctPointerComparison(Sema &S, SourceLocation Loc,
	                                              ExprResult &LHS, ExprResult &RHS,
	                                              bool IsError) {
	  unsigned DiagID = diag::ext_typecheck_comparison_of_distinct_pointers;
	  if (IsError)
	    DiagID = diag::err_typecheck_comparison_of_distinct_pointers;

	  Expr *Left = LHS.get();
	  Expr *Right = RHS.get();
	  S.Diag(Loc, DiagID)
	      << Left->getType() << Right->getType()
	      << Left->getSourceRange() << Right->getSourceRange();
	}

	/// \brief Returns false if the pointers are converted to a composite type,
	/// true otherwise.
	///
	/// When no composite pointer type is found, a diagnostic is produced: the
	/// distinct-pointer-comparison error if both operands are pointers or member
	/// pointers, otherwise whatever S.InvalidOperands() reports. On success both
	/// operands are cast to the composite type with CK_BitCast.
	///
	/// \param S        Sema instance used for conversions and diagnostics.
	/// \param Loc      Location used for the diagnostics.
	/// \param LHS, RHS The operands; replaced by the cast expressions on success.
	static bool convertPointersToCompositeType(Sema &S, SourceLocation Loc,
	                                           ExprResult &LHS, ExprResult &RHS) {
	  // C++ [expr.rel]p2:
	  //   [...] Pointer conversions (4.10) and qualification
	  //   conversions (4.4) are performed on pointer operands (or on
	  //   a pointer operand and a null pointer constant) to bring
	  //   them to their composite pointer type. [...]
	  //
	  // C++ [expr.eq]p1 uses the same notion for (in)equality
	  // comparisons of pointers.

	  // Capture the types up front; they are used below to choose the diagnostic.
	  QualType LHSType = LHS.get()->getType();
	  QualType RHSType = RHS.get()->getType();
	  assert(LHSType->isPointerType() || RHSType->isPointerType() ||
	         LHSType->isMemberPointerType() || RHSType->isMemberPointerType());

	  QualType T = S.FindCompositePointerType(Loc, LHS, RHS);
	  if (T.isNull()) {
	    if ((LHSType->isPointerType() || LHSType->isMemberPointerType()) &&
	        (RHSType->isPointerType() || RHSType->isMemberPointerType()))
	      diagnoseDistinctPointerComparison(S, Loc, LHS, RHS, /*isError*/true);
	    else
	      S.InvalidOperands(Loc, LHS, RHS);
	    return true;
	  }

	  LHS = S.ImpCastExprToType(LHS.get(), T, CK_BitCast);
	  RHS = S.ImpCastExprToType(RHS.get(), T, CK_BitCast);
	  return false;
	}

	/// \brief Report a comparison between a function pointer and a void pointer.
	///
	/// \param IsError Selects the error diagnostic when true and the extension
	///                diagnostic when false.
	static void diagnoseFunctionPointerToVoidComparison(Sema &S, SourceLocation Loc,
	                                                    ExprResult &LHS,
	                                                    ExprResult &RHS,
	                                                    bool IsError) {
	  unsigned DiagID = diag::ext_typecheck_comparison_of_fptr_to_void;
	  if (IsError)
	    DiagID = diag::err_typecheck_comparison_of_fptr_to_void;

	  Expr *Left = LHS.get();
	  Expr *Right = RHS.get();
	  S.Diag(Loc, DiagID)
	      << Left->getType() << Right->getType()
	      << Left->getSourceRange() << Right->getSourceRange();
	}

	/// \brief Returns true if \p E, ignoring parentheses and implicit casts, is
	/// an Objective-C array, dictionary, string, or boxed-expression literal.
	static bool isObjCObjectLiteral(ExprResult &E) {
	  const auto Kind = E.get()->IgnoreParenImpCasts()->getStmtClass();
	  // Note that ObjCBoolLiteral is NOT an object literal!
	  return Kind == Stmt::ObjCArrayLiteralClass ||
	         Kind == Stmt::ObjCDictionaryLiteralClass ||
	         Kind == Stmt::ObjCStringLiteralClass ||
	         Kind == Stmt::ObjCBoxedExprClass;
	}

	/// \brief Determine whether an -isEqual: method usable for comparing \p LHS
	/// with \p RHS can be found.
	///
	/// Both operands must have Objective-C object pointer type. The method is
	/// looked up on the LHS's pointee type first; if that fails, the global
	/// method pool is consulted for 'id', and the qualified type otherwise.
	///
	/// \returns true only if a method is found whose first parameter has
	///          Objective-C object pointer type and whose return type is scalar.
	static bool hasIsEqualMethod(Sema &S, const Expr *LHS, const Expr *RHS) {
	  const ObjCObjectPointerType *Type =
	    LHS->getType()->getAs<ObjCObjectPointerType>();

	  // If this is not actually an Objective-C object, bail out.
	  if (!Type)
	    return false;

	  // Get the LHS object's interface type.
	  QualType InterfaceType = Type->getPointeeType();

	  // If the RHS isn't an Objective-C object, bail out.
	  if (!RHS->getType()->isObjCObjectPointerType())
	    return false;

	  // Try to find the -isEqual: method.
	  Selector IsEqualSel = S.NSAPIObj->getIsEqualSelector();
	  ObjCMethodDecl *Method = S.LookupMethodInObjectType(IsEqualSel,
	                                                      InterfaceType,
	                                                      /*instance=*/true);
	  if (!Method) {
	    if (Type->isObjCIdType()) {
	      // For 'id', just check the global pool.
	      Method = S.LookupInstanceMethodInGlobalPool(IsEqualSel, SourceRange(),
	                                                  /*receiverId=*/true);
	    } else {
	      // Check protocols.
	      Method = S.LookupMethodInQualifiedType(IsEqualSel, Type,
	                                             /*instance=*/true);
	    }
	  }

	  if (!Method)
	    return false;

	  // The argument must be an object pointer...
	  QualType T = Method->parameters()[0]->getType();
	  if (!T->isObjCObjectPointerType())
	    return false;

	  // ...and the result must be a scalar.
	  QualType R = Method->getReturnType();
	  if (!R->isScalarType())
	    return false;

	  return true;
	}

	/// \brief Classify \p FromE, ignoring parentheses and implicit casts, as one
	/// of the ObjCLiteralKind values.
	///
	/// A boxed expression is LK_Numeric when its operand (ignoring parentheses)
	/// is an integer, floating, character, or boolean literal, or an implicit
	/// cast of kind CK_IntegralToBoolean or CK_IntegralCast; any other boxed
	/// expression is LK_Boxed.
	///
	/// \returns LK_None when the expression is none of the recognized kinds.
	Sema::ObjCLiteralKind Sema::CheckLiteralKind(Expr *FromE) {
	  FromE = FromE->IgnoreParenImpCasts();
	  switch (FromE->getStmtClass()) {
	    default:
	      break;
	    case Stmt::ObjCStringLiteralClass:
	      // "string literal"
	      return LK_String;
	    case Stmt::ObjCArrayLiteralClass:
	      // "array literal"
	      return LK_Array;
	    case Stmt::ObjCDictionaryLiteralClass:
	      // "dictionary literal"
	      return LK_Dictionary;
	    case Stmt::BlockExprClass:
	      return LK_Block;
	    case Stmt::ObjCBoxedExprClass: {
	      Expr *Inner = cast<ObjCBoxedExpr>(FromE)->getSubExpr()->IgnoreParens();
	      switch (Inner->getStmtClass()) {
	        case Stmt::IntegerLiteralClass:
	        case Stmt::FloatingLiteralClass:
	        case Stmt::CharacterLiteralClass:
	        case Stmt::ObjCBoolLiteralExprClass:
	        case Stmt::CXXBoolLiteralExprClass:
	          // "numeric literal"
	          return LK_Numeric;
	        case Stmt::ImplicitCastExprClass: {
	          CastKind CK = cast<CastExpr>(Inner)->getCastKind();
	          // Boolean literals can be represented by implicit casts.
	          if (CK == CK_IntegralToBoolean || CK == CK_IntegralCast)
	            return LK_Numeric;
	          break;
	        }
	        default:
	          break;
	      }
	      return LK_Boxed;
	    }
	  }
	  return LK_None;
	}

	/// \brief Warn about a comparison in which one operand is an Objective-C
	/// object literal.
	///
	/// Nothing is reported when the other operand, ignoring parentheses and
	/// casts, is a null pointer constant. String literals get their own warning.
	/// For equality operators, when hasIsEqualMethod() succeeds, a note with
	/// fix-its rewriting the comparison as an -isEqual: message is attached.
	static void diagnoseObjCLiteralComparison(Sema &S, SourceLocation Loc,
	                                          ExprResult &LHS, ExprResult &RHS,
	                                          BinaryOperator::Opcode Opc){
	  // If the LHS is not the literal, the RHS is taken to be.
	  const bool LiteralOnLHS = isObjCObjectLiteral(LHS);
	  Expr *Literal = LiteralOnLHS ? LHS.get() : RHS.get();
	  Expr *Other = (LiteralOnLHS ? RHS.get() : LHS.get())->IgnoreParenCasts();

	  // Don't warn on comparisons against nil.
	  if (Other->isNullPointerConstant(S.getASTContext(),
	                                   Expr::NPC_ValueDependentIsNotNull))
	    return;

	  // This should be kept in sync with warn_objc_literal_comparison.
	  // LK_String should always be after the other literals, since it has its own
	  // warning flag.
	  Sema::ObjCLiteralKind LiteralKind = S.CheckLiteralKind(Literal);
	  assert(LiteralKind != Sema::LK_Block);
	  if (LiteralKind == Sema::LK_None) {
	    llvm_unreachable("Unknown Objective-C object literal kind");
	  }

	  if (LiteralKind != Sema::LK_String)
	    S.Diag(Loc, diag::warn_objc_literal_comparison)
	      << LiteralKind << Literal->getSourceRange();
	  else
	    S.Diag(Loc, diag::warn_objc_string_literal_comparison)
	      << Literal->getSourceRange();

	  // The fix-it note only applies to ==/!= with a usable -isEqual: method.
	  if (!BinaryOperator::isEqualityOp(Opc) ||
	      !hasIsEqualMethod(S, LHS.get(), RHS.get()))
	    return;

	  SourceLocation ExprBegin = LHS.get()->getLocStart();
	  SourceLocation ExprEnd = S.getLocForEndOfToken(RHS.get()->getLocEnd());
	  CharSourceRange OperatorRange =
	    CharSourceRange::getCharRange(Loc, S.getLocForEndOfToken(Loc));

	  S.Diag(Loc, diag::note_objc_literal_comparison_isequal)
	    << FixItHint::CreateInsertion(ExprBegin, Opc == BO_EQ ? "[" : "![")
	    << FixItHint::CreateReplacement(OperatorRange, " isEqual:")
	    << FixItHint::CreateInsertion(ExprEnd, "]");
	}

	/// Warns on !x < y, !x & y where !(x < y), !(x & y) was probably intended.
	///
	/// The warning is emitted only when the left operand, ignoring implicit
	/// casts, is a logical-not expression, and neither the right operand nor the
	/// operand of the '!' is known to have a boolean value. Two notes with
	/// fix-its follow: one parenthesizing the whole check, one parenthesizing
	/// only the negation to silence the warning.
	///
	/// \param S        Sema instance used to emit diagnostics.
	/// \param LHS, RHS Operands of the binary operator.
	/// \param Loc      Location of the binary operator, streamed into the warning.
	/// \param Opc      The binary opcode; BO_And, BO_Or and BO_Xor select the
	///                 bitwise wording.
	static void diagnoseLogicalNotOnLHSofCheck(Sema &S, ExprResult &LHS,
	                                           ExprResult &RHS, SourceLocation Loc,
	                                           BinaryOperatorKind Opc) {
	  // Check that left hand side is !something.
	  UnaryOperator *UO = dyn_cast<UnaryOperator>(LHS.get()->IgnoreImpCasts());
	  if (!UO || UO->getOpcode() != UO_LNot) return;

	  // Only check if the right hand side is non-bool arithmetic type.
	  if (RHS.get()->isKnownToHaveBooleanValue()) return;

	  // Make sure that the something in !something is not bool.
	  Expr *SubExpr = UO->getSubExpr()->IgnoreImpCasts();
	  if (SubExpr->isKnownToHaveBooleanValue()) return;

	  // Emit warning.
	  bool IsBitwiseOp = Opc == BO_And || Opc == BO_Or || Opc == BO_Xor;
	  S.Diag(UO->getOperatorLoc(), diag::warn_logical_not_on_lhs_of_check)
	      << Loc << IsBitwiseOp;

	  // First note suggest !(x < y)
	  SourceLocation FirstOpen = SubExpr->getLocStart();
	  SourceLocation FirstClose = RHS.get()->getLocEnd();
	  FirstClose = S.getLocForEndOfToken(FirstClose);
	  // Without a valid closing location, invalidate the opening one as well.
	  if (FirstClose.isInvalid())
	    FirstOpen = SourceLocation();
	  S.Diag(UO->getOperatorLoc(), diag::note_logical_not_fix)
	      << IsBitwiseOp
	      << FixItHint::CreateInsertion(FirstOpen, "(")
	      << FixItHint::CreateInsertion(FirstClose, ")");

	  // Second note suggests (!x) < y
	  SourceLocation SecondOpen = LHS.get()->getLocStart();
	  SourceLocation SecondClose = LHS.get()->getLocEnd();
	  SecondClose = S.getLocForEndOfToken(SecondClose);
	  if (SecondClose.isInvalid())
	    SecondOpen = SourceLocation();
	  S.Diag(UO->getOperatorLoc(), diag::note_logical_not_silence_with_parens)
	      << FixItHint::CreateInsertion(SecondOpen, "(")
	      << FixItHint::CreateInsertion(SecondClose, ")");
	}

	// Get the decl for a simple expression: a reference to a variable,
	// an implicit C++ field reference, or an implicit ObjC ivar reference.
	// Returns nullptr for any other expression, including ivar references that
	// are not free ivars and member accesses that are not implicit.
	static ValueDecl *getCompareDecl(Expr *E) {
	  if (DeclRefExpr* DR = dyn_cast<DeclRefExpr>(E))
	    return DR->getDecl();
	  if (ObjCIvarRefExpr* Ivar = dyn_cast<ObjCIvarRefExpr>(E)) {
	    if (Ivar->isFreeIvar())
	      return Ivar->getDecl();
	  }
	  if (MemberExpr* Mem = dyn_cast<MemberExpr>(E)) {
	    if (Mem->isImplicitAccess())
	      return Mem->getMemberDecl();
	  }
	  return nullptr;
	}

	// C99 6.5.8, C++ [expr.rel]
	QualType Sema::CheckCompareOperands(ExprResult &LHS, ExprResult &RHS,
	SourceLocation Loc, BinaryOperatorKind Opc,
	bool IsRelational) {
	checkArithmeticNull(this, LHS, RHS, Loc, /isCompare=*/true);

	// Handle vector comparisons separately.
	if (LHS.get()->getType()->isVectorType() \|\|
	RHS.get()->getType()->isVectorType())
	return CheckVectorCompareOperands(LHS, RHS, Loc, IsRelational);

	QualType LHSType = LHS.get()->getType();
	QualType RHSType = RHS.get()->getType();

	Expr *LHSStripped = LHS.get()->IgnoreParenImpCasts();
	Expr *RHSStripped = RHS.get()->IgnoreParenImpCasts();

	checkEnumComparison(*this, Loc, LHS.get(), RHS.get());
	diagnoseLogicalNotOnLHSofCheck(*this, LHS, RHS, Loc, Opc);

	if (!LHSType->hasFloatingRepresentation() &&
	!(LHSType->isBlockPointerType() && IsRelational) &&
	!LHS.get()->getLocStart().isMacroID() &&
	!RHS.get()->getLocStart().isMacroID() &&
	!inTemplateInstantiation()) {
	// For non-floating point types, check for self-comparisons of the form
	// x == x, x != x, x < x, etc. These always evaluate to a constant, and
	// often indicate logic errors in the program.
	//
	// NOTE: Don't warn about comparison expressions resulting from macro
	// expansion. Also don't warn about comparisons which are only self
	// comparisons within a template specialization. The warnings should catch
	// obvious cases in the definition of the template anyways. The idea is to
	// warn when the typed comparison operator will always evaluate to the same
	// result.
	ValueDecl *DL = getCompareDecl(LHSStripped);
	ValueDecl *DR = getCompareDecl(RHSStripped);
	if (DL && DR && DL == DR && !IsWithinTemplateSpecialization(DL)) {
	DiagRuntimeBehavior(Loc, nullptr, PDiag(diag::warn_comparison_always)
	<< 0 // self-
	<< (Opc == BO_EQ
	\|\| Opc == BO_LE
	\|\| Opc == BO_GE));
	} else if (DL && DR && LHSType->isArrayType() && RHSType->isArrayType() &&
	!DL->getType()->isReferenceType() &&
	!DR->getType()->isReferenceType()) {
	// what is it always going to eval to?
	char always_evals_to;
	switch(Opc) {
	case BO_EQ: // e.g. array1 == array2
	always_evals_to = 0; // false
	break;
	case BO_NE: // e.g. array1 != array2
	always_evals_to = 1; // true
	break;
	default:
	// best we can say is 'a constant'
	always_evals_to = 2; // e.g. array1 <= array2
	break;
	}
	DiagRuntimeBehavior(Loc, nullptr, PDiag(diag::warn_comparison_always)
	<< 1 // array
	<< always_evals_to);
	}

	if (isa<CastExpr>(LHSStripped))
	LHSStripped = LHSStripped->IgnoreParenCasts();
	if (isa<CastExpr>(RHSStripped))
	RHSStripped = RHSStripped->IgnoreParenCasts();

	// Warn about comparisons against a string constant (unless the other
	// operand is null), the user probably wants strcmp.
	Expr *literalString = nullptr;
	Expr *literalStringStripped = nullptr;
	if ((isa<StringLiteral>(LHSStripped) \|\| isa<ObjCEncodeExpr>(LHSStripped)) &&
	!RHSStripped->isNullPointerConstant(Context,
	Expr::NPC_ValueDependentIsNull)) {
	literalString = LHS.get();
	literalStringStripped = LHSStripped;
	} else if ((isa<StringLiteral>(RHSStripped) \|\|
	isa<ObjCEncodeExpr>(RHSStripped)) &&
	!LHSStripped->isNullPointerConstant(Context,
	Expr::NPC_ValueDependentIsNull)) {
	literalString = RHS.get();
	literalStringStripped = RHSStripped;
	}

	if (literalString) {
	DiagRuntimeBehavior(Loc, nullptr,
	PDiag(diag::warn_stringcompare)
	<< isa<ObjCEncodeExpr>(literalStringStripped)
	<< literalString->getSourceRange());
	}
	}

	// C99 6.5.8p3 / C99 6.5.9p4
	UsualArithmeticConversions(LHS, RHS);
	if (LHS.isInvalid() \|\| RHS.isInvalid())
	return QualType();

	LHSType = LHS.get()->getType();
	RHSType = RHS.get()->getType();

	// The result of comparisons is 'bool' in C++, 'int' in C.
	QualType ResultTy = Context.getLogicalOperationType();

	if (IsRelational) {
	if (LHSType->isRealType() && RHSType->isRealType())
	return ResultTy;
	} else {
	// Check for comparisons of floating point operands using != and ==.
	if (LHSType->hasFloatingRepresentation())
	CheckFloatComparison(Loc, LHS.get(), RHS.get());

	if (LHSType->isArithmeticType() && RHSType->isArithmeticType())
	return ResultTy;
	}

	const Expr::NullPointerConstantKind LHSNullKind =
	LHS.get()->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull);
	const Expr::NullPointerConstantKind RHSNullKind =
	RHS.get()->isNullPointerConstant(Context, Expr::NPC_ValueDependentIsNull);
	bool LHSIsNull = LHSNullKind != Expr::NPCK_NotNull;
	bool RHSIsNull = RHSNullKind != Expr::NPCK_NotNull;

	if (!IsRelational && LHSIsNull != RHSIsNull) {
	bool IsEquality = Opc == BO_EQ;
	if (RHSIsNull)
	DiagnoseAlwaysNonNullPointer(LHS.get(), RHSNullKind, IsEquality,
	RHS.get()->getSourceRange());
	else
	DiagnoseAlwaysNonNullPointer(RHS.get(), LHSNullKind, IsEquality,
	LHS.get()->getSourceRange());
	}

	if ((LHSType->isIntegerType() && !LHSIsNull) \|\|
	(RHSType->isIntegerType() && !RHSIsNull)) {
	// Skip normal pointer conversion checks in this case; we have better
	// diagnostics for this below.
	} else if (getLangOpts().CPlusPlus) {
	// Equality comparison of a function pointer to a void pointer is invalid,
	// but we allow it as an extension.
	// FIXME: If we really want to allow this, should it be part of composite
	// pointer type computation so it works in conditionals too?
	if (!IsRelational &&
	((LHSType->isFunctionPointerType() && RHSType->isVoidPointerType()) \|\|
	(RHSType->isFunctionPointerType() && LHSType->isVoidPointerType()))) {
	// This is a gcc extension compatibility comparison.
	// In a SFINAE context, we treat this as a hard error to maintain
	// conformance with the C++ standard.
	diagnoseFunctionPointerToVoidComparison(
	this, Loc, LHS, RHS, /isError*/ (bool)isSFINAEContext());

	if (isSFINAEContext())
	return QualType();

	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
	return ResultTy;
	}

	// C++ [expr.eq]p2:
	// If at least one operand is a pointer [...] bring them to their
	// composite pointer type.
	// C++ [expr.rel]p2:
	// If both operands are pointers, [...] bring them to their composite
	// pointer type.
	if ((int)LHSType->isPointerType() + (int)RHSType->isPointerType() >=
	(IsRelational ? 2 : 1) &&
	(!LangOpts.ObjCAutoRefCount \|\|
	!(LHSType->isObjCObjectPointerType() \|\|
	RHSType->isObjCObjectPointerType()))) {
	if (convertPointersToCompositeType(*this, Loc, LHS, RHS))
	return QualType();
	else
	return ResultTy;
	}
	} else if (LHSType->isPointerType() &&
	RHSType->isPointerType()) { // C99 6.5.8p2
	// All of the following pointer-related warnings are GCC extensions, except
	// when handling null pointer constants.
	QualType LCanPointeeTy =
	LHSType->castAs<PointerType>()->getPointeeType().getCanonicalType();
	QualType RCanPointeeTy =
	RHSType->castAs<PointerType>()->getPointeeType().getCanonicalType();

	// C99 6.5.9p2 and C99 6.5.8p2
	if (Context.typesAreCompatible(LCanPointeeTy.getUnqualifiedType(),
	RCanPointeeTy.getUnqualifiedType())) {
	// Valid unless a relational comparison of function pointers
	if (IsRelational && LCanPointeeTy->isFunctionType()) {
	Diag(Loc, diag::ext_typecheck_ordered_comparison_of_function_pointers)
	<< LHSType << RHSType << LHS.get()->getSourceRange()
	<< RHS.get()->getSourceRange();
	}
	} else if (!IsRelational &&
	(LCanPointeeTy->isVoidType() \|\| RCanPointeeTy->isVoidType())) {
	// Valid unless comparison between non-null pointer and function pointer
	if ((LCanPointeeTy->isFunctionType() \|\| RCanPointeeTy->isFunctionType())
	&& !LHSIsNull && !RHSIsNull)
	diagnoseFunctionPointerToVoidComparison(*this, Loc, LHS, RHS,
	/isError/false);
	} else {
	// Invalid
	diagnoseDistinctPointerComparison(this, Loc, LHS, RHS, /isError*/false);
	}
	if (LCanPointeeTy != RCanPointeeTy) {
	// Treat NULL constant as a special case in OpenCL.
	if (getLangOpts().OpenCL && !LHSIsNull && !RHSIsNull) {
	const PointerType *LHSPtr = LHSType->getAs<PointerType>();
	if (!LHSPtr->isAddressSpaceOverlapping(*RHSType->getAs<PointerType>())) {
	Diag(Loc,
	diag::err_typecheck_op_on_nonoverlapping_address_space_pointers)
	<< LHSType << RHSType << 0 /* comparison */
	<< LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
	}
	}
	unsigned AddrSpaceL = LCanPointeeTy.getAddressSpace();
	unsigned AddrSpaceR = RCanPointeeTy.getAddressSpace();
	CastKind Kind = AddrSpaceL != AddrSpaceR ? CK_AddressSpaceConversion
	: CK_BitCast;
	if (LHSIsNull && !RHSIsNull)
	LHS = ImpCastExprToType(LHS.get(), RHSType, Kind);
	else
	RHS = ImpCastExprToType(RHS.get(), LHSType, Kind);
	}
	return ResultTy;
	}

	if (getLangOpts().CPlusPlus) {
	// C++ [expr.eq]p4:
	// Two operands of type std::nullptr_t or one operand of type
	// std::nullptr_t and the other a null pointer constant compare equal.
	if (!IsRelational && LHSIsNull && RHSIsNull) {
	if (LHSType->isNullPtrType()) {
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_NullToPointer);
	return ResultTy;
	}
	if (RHSType->isNullPtrType()) {
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_NullToPointer);
	return ResultTy;
	}
	}

	// Comparison of Objective-C pointers and block pointers against nullptr_t.
	// These aren't covered by the composite pointer type rules.
	if (!IsRelational && RHSType->isNullPtrType() &&
	(LHSType->isObjCObjectPointerType() \|\| LHSType->isBlockPointerType())) {
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_NullToPointer);
	return ResultTy;
	}
	if (!IsRelational && LHSType->isNullPtrType() &&
	(RHSType->isObjCObjectPointerType() \|\| RHSType->isBlockPointerType())) {
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_NullToPointer);
	return ResultTy;
	}

	if (IsRelational &&
	((LHSType->isNullPtrType() && RHSType->isPointerType()) \|\|
	(RHSType->isNullPtrType() && LHSType->isPointerType()))) {
	// HACK: Relational comparison of nullptr_t against a pointer type is
	// invalid per DR583, but we allow it within std::less<> and friends,
	// since otherwise common uses of it break.
	// FIXME: Consider removing this hack once LWG fixes std::less<> and
	// friends to have std::nullptr_t overload candidates.
	DeclContext *DC = CurContext;
	if (isa<FunctionDecl>(DC))
	DC = DC->getParent();
	if (auto *CTSD = dyn_cast<ClassTemplateSpecializationDecl>(DC)) {
	if (CTSD->isInStdNamespace() &&
	llvm::StringSwitch<bool>(CTSD->getName())
	.Cases("less", "less_equal", "greater", "greater_equal", true)
	.Default(false)) {
	if (RHSType->isNullPtrType())
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_NullToPointer);
	else
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_NullToPointer);
	return ResultTy;
	}
	}
	}

	// C++ [expr.eq]p2:
	// If at least one operand is a pointer to member, [...] bring them to
	// their composite pointer type.
	if (!IsRelational &&
	(LHSType->isMemberPointerType() \|\| RHSType->isMemberPointerType())) {
	if (convertPointersToCompositeType(*this, Loc, LHS, RHS))
	return QualType();
	else
	return ResultTy;
	}

	// Handle scoped enumeration types specifically, since they don't promote
	// to integers.
	if (LHS.get()->getType()->isEnumeralType() &&
	Context.hasSameUnqualifiedType(LHS.get()->getType(),
	RHS.get()->getType()))
	return ResultTy;
	}

	// Handle block pointer types.
	if (!IsRelational && LHSType->isBlockPointerType() &&
	RHSType->isBlockPointerType()) {
	QualType lpointee = LHSType->castAs<BlockPointerType>()->getPointeeType();
	QualType rpointee = RHSType->castAs<BlockPointerType>()->getPointeeType();

	if (!LHSIsNull && !RHSIsNull &&
	!Context.typesAreCompatible(lpointee, rpointee)) {
	Diag(Loc, diag::err_typecheck_comparison_of_distinct_blocks)
	<< LHSType << RHSType << LHS.get()->getSourceRange()
	<< RHS.get()->getSourceRange();
	}
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
	return ResultTy;
	}

	// Allow block pointers to be compared with null pointer constants.
	if (!IsRelational
	&& ((LHSType->isBlockPointerType() && RHSType->isPointerType())
	\|\| (LHSType->isPointerType() && RHSType->isBlockPointerType()))) {
	if (!LHSIsNull && !RHSIsNull) {
	if (!((RHSType->isPointerType() && RHSType->castAs<PointerType>()
	->getPointeeType()->isVoidType())
	\|\| (LHSType->isPointerType() && LHSType->castAs<PointerType>()
	->getPointeeType()->isVoidType())))
	Diag(Loc, diag::err_typecheck_comparison_of_distinct_blocks)
	<< LHSType << RHSType << LHS.get()->getSourceRange()
	<< RHS.get()->getSourceRange();
	}
	if (LHSIsNull && !RHSIsNull)
	LHS = ImpCastExprToType(LHS.get(), RHSType,
	RHSType->isPointerType() ? CK_BitCast
	: CK_AnyPointerToBlockPointerCast);
	else
	RHS = ImpCastExprToType(RHS.get(), LHSType,
	LHSType->isPointerType() ? CK_BitCast
	: CK_AnyPointerToBlockPointerCast);
	return ResultTy;
	}

	if (LHSType->isObjCObjectPointerType() \|\|
	RHSType->isObjCObjectPointerType()) {
	const PointerType *LPT = LHSType->getAs<PointerType>();
	const PointerType *RPT = RHSType->getAs<PointerType>();
	if (LPT \|\| RPT) {
	bool LPtrToVoid = LPT ? LPT->getPointeeType()->isVoidType() : false;
	bool RPtrToVoid = RPT ? RPT->getPointeeType()->isVoidType() : false;

	if (!LPtrToVoid && !RPtrToVoid &&
	!Context.typesAreCompatible(LHSType, RHSType)) {
	diagnoseDistinctPointerComparison(*this, Loc, LHS, RHS,
	/isError/false);
	}
	if (LHSIsNull && !RHSIsNull) {
	Expr *E = LHS.get();
	if (getLangOpts().ObjCAutoRefCount)
	CheckObjCConversion(SourceRange(), RHSType, E,
	CCK_ImplicitConversion);
	LHS = ImpCastExprToType(E, RHSType,
	RPT ? CK_BitCast :CK_CPointerToObjCPointerCast);
	}
	else {
	Expr *E = RHS.get();
	if (getLangOpts().ObjCAutoRefCount)
	CheckObjCConversion(SourceRange(), LHSType, E, CCK_ImplicitConversion,
	/Diagnose=/true,
	/DiagnoseCFAudited=/false, Opc);
	RHS = ImpCastExprToType(E, LHSType,
	LPT ? CK_BitCast :CK_CPointerToObjCPointerCast);
	}
	return ResultTy;
	}
	if (LHSType->isObjCObjectPointerType() &&
	RHSType->isObjCObjectPointerType()) {
	if (!Context.areComparableObjCPointerTypes(LHSType, RHSType))
	diagnoseDistinctPointerComparison(*this, Loc, LHS, RHS,
	/isError/false);
	if (isObjCObjectLiteral(LHS) \|\| isObjCObjectLiteral(RHS))
	diagnoseObjCLiteralComparison(*this, Loc, LHS, RHS, Opc);

	if (LHSIsNull && !RHSIsNull)
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_BitCast);
	else
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_BitCast);
	return ResultTy;
	}
	}
	if ((LHSType->isAnyPointerType() && RHSType->isIntegerType()) \|\|
	(LHSType->isIntegerType() && RHSType->isAnyPointerType())) {
	unsigned DiagID = 0;
	bool isError = false;
	if (LangOpts.DebuggerSupport) {
	// Under a debugger, allow the comparison of pointers to integers,
	// since users tend to want to compare addresses.
	} else if ((LHSIsNull && LHSType->isIntegerType()) \|\|
	(RHSIsNull && RHSType->isIntegerType())) {
	if (IsRelational) {
	isError = getLangOpts().CPlusPlus;
	DiagID =
	isError ? diag::err_typecheck_ordered_comparison_of_pointer_and_zero
	: diag::ext_typecheck_ordered_comparison_of_pointer_and_zero;
	}
	} else if (getLangOpts().CPlusPlus) {
	DiagID = diag::err_typecheck_comparison_of_pointer_integer;
	isError = true;
	} else if (IsRelational)
	DiagID = diag::ext_typecheck_ordered_comparison_of_pointer_integer;
	else
	DiagID = diag::ext_typecheck_comparison_of_pointer_integer;

	if (DiagID) {
	Diag(Loc, DiagID)
	<< LHSType << RHSType << LHS.get()->getSourceRange()
	<< RHS.get()->getSourceRange();
	if (isError)
	return QualType();
	}

	if (LHSType->isIntegerType())
	LHS = ImpCastExprToType(LHS.get(), RHSType,
	LHSIsNull ? CK_NullToPointer : CK_IntegralToPointer);
	else
	RHS = ImpCastExprToType(RHS.get(), LHSType,
	RHSIsNull ? CK_NullToPointer : CK_IntegralToPointer);
	return ResultTy;
	}

	// Handle block pointers.
	if (!IsRelational && RHSIsNull
	&& LHSType->isBlockPointerType() && RHSType->isIntegerType()) {
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_NullToPointer);
	return ResultTy;
	}
	if (!IsRelational && LHSIsNull
	&& LHSType->isIntegerType() && RHSType->isBlockPointerType()) {
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_NullToPointer);
	return ResultTy;
	}

	if (getLangOpts().OpenCLVersion >= 200) {
	if (LHSIsNull && RHSType->isQueueT()) {
	LHS = ImpCastExprToType(LHS.get(), RHSType, CK_NullToPointer);
	return ResultTy;
	}

	if (LHSType->isQueueT() && RHSIsNull) {
	RHS = ImpCastExprToType(RHS.get(), LHSType, CK_NullToPointer);
	return ResultTy;
	}
	}

	return InvalidOperands(Loc, LHS, RHS);
	}

	// Map a vector type onto the signed integer vector type that has the same
	// element width and the same number of elements (a floating point element
	// type is therefore replaced by an equally wide integer type).
	// ext_vector_type inputs produce ext_vector_type results and are matched
	// against char, short, int, long and finally long long. Every other vector
	// produces a generic vector and is matched from the widest type down to the
	// narrowest, so that long long is chosen over long when both have the same
	// width.
	QualType Sema::GetSignedVectorType(QualType V) {
	  const VectorType *VecTy = V->getAs<VectorType>();
	  const unsigned ElemBits = Context.getTypeSize(VecTy->getElementType());

	  if (isa<ExtVectorType>(VecTy)) {
	    // Narrowest candidate first; long long is the asserted fallback.
	    const QualType Candidates[] = {Context.CharTy, Context.ShortTy,
	                                   Context.IntTy, Context.LongTy};
	    for (QualType ElemTy : Candidates)
	      if (ElemBits == Context.getTypeSize(ElemTy))
	        return Context.getExtVectorType(ElemTy, VecTy->getNumElements());

	    assert(ElemBits == Context.getTypeSize(Context.LongLongTy) &&
	           "Unhandled vector element size in vector compare");
	    return Context.getExtVectorType(Context.LongLongTy,
	                                    VecTy->getNumElements());
	  }

	  // Widest candidate first; char is the asserted fallback.
	  const QualType Candidates[] = {Context.LongLongTy, Context.LongTy,
	                                 Context.IntTy, Context.ShortTy};
	  for (QualType ElemTy : Candidates)
	    if (ElemBits == Context.getTypeSize(ElemTy))
	      return Context.getVectorType(ElemTy, VecTy->getNumElements(),
	                                   VectorType::GenericVector);

	  assert(ElemBits == Context.getTypeSize(Context.CharTy) &&
	         "Unhandled vector element size in vector compare");
	  return Context.getVectorType(Context.CharTy, VecTy->getNumElements(),
	                               VectorType::GenericVector);
	}

	/// CheckVectorCompareOperands - vector comparisons are a clang extension that
	/// operates on extended vector types. Instead of producing an IntTy result,
	/// like a scalar comparison, a vector comparison produces a vector of integer
	/// types.
	///
	/// LHS and RHS are passed by reference to CheckVectorOperands. Loc is the
	/// location used for diagnostics. IsRelational is false for == and !=, in
	/// which case floating point operands are additionally passed to
	/// CheckFloatComparison.
	///
	/// Returns a null type when CheckVectorOperands fails, the logical operation
	/// type for AltiVec vectors when AltiVec is enabled, and otherwise the signed
	/// vector type computed by GetSignedVectorType.
	QualType Sema::CheckVectorCompareOperands(ExprResult &LHS, ExprResult &RHS,
	                                          SourceLocation Loc,
	                                          bool IsRelational) {
	  // Check to make sure we're operating on vectors of the same type and width,
	  // Allowing one side to be a scalar of element type.
	  QualType vType = CheckVectorOperands(LHS, RHS, Loc, /*isCompAssign*/false,
	                                       /*AllowBothBool*/true,
	                                       /*AllowBoolConversions*/getLangOpts().ZVector);
	  if (vType.isNull())
	    return vType;

	  QualType LHSType = LHS.get()->getType();

	  // If AltiVec, the comparison results in a numeric type, i.e.
	  // bool for C++, int for C
	  if (getLangOpts().AltiVec &&
	      vType->getAs<VectorType>()->getVectorKind() == VectorType::AltiVecVector)
	    return Context.getLogicalOperationType();

	  // For non-floating point types, check for self-comparisons of the form
	  // x == x, x != x, x < x, etc. These always evaluate to a constant, and
	  // often indicate logic errors in the program.
	  if (!LHSType->hasFloatingRepresentation() && !inTemplateInstantiation()) {
	    if (DeclRefExpr* DRL
	          = dyn_cast<DeclRefExpr>(LHS.get()->IgnoreParenImpCasts()))
	      if (DeclRefExpr* DRR
	            = dyn_cast<DeclRefExpr>(RHS.get()->IgnoreParenImpCasts()))
	        if (DRL->getDecl() == DRR->getDecl())
	          DiagRuntimeBehavior(Loc, nullptr,
	                              PDiag(diag::warn_comparison_always)
	                                << 0 // self-
	                                << 2 // "a constant"
	                              );
	  }

	  // Check for comparisons of floating point operands using != and ==.
	  if (!IsRelational && LHSType->hasFloatingRepresentation()) {
	    assert (RHS.get()->getType()->hasFloatingRepresentation());
	    CheckFloatComparison(Loc, LHS.get(), RHS.get());
	  }

	  // Return a signed type for the vector.
	  return GetSignedVectorType(vType);
	}

	/// Type-check the operands of a logical operator when at least one of them
	/// is a vector.
	///
	/// Returns the result of InvalidOperands when CheckVectorOperands rejects the
	/// operands, or when compiling OpenCL older than version 120 and the vector
	/// type has a floating point representation. Returns the result of
	/// InvalidLogicalVectorOperands outside of C++ when the vector is not an
	/// ext_vector_type. Otherwise returns the signed vector type that
	/// GetSignedVectorType computes from the (converted) LHS type.
	QualType Sema::CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
	                                          SourceLocation Loc) {
	  // Ensure that either both operands are of the same vector type, or
	  // one operand is of a vector type and the other is of its element type.
	  QualType vType = CheckVectorOperands(LHS, RHS, Loc, false,
	                                       /*AllowBothBool*/true,
	                                       /*AllowBoolConversions*/false);
	  if (vType.isNull())
	    return InvalidOperands(Loc, LHS, RHS);
	  if (getLangOpts().OpenCL && getLangOpts().OpenCLVersion < 120 &&
	      vType->hasFloatingRepresentation())
	    return InvalidOperands(Loc, LHS, RHS);
	  // FIXME: The check for C++ here is for GCC compatibility. GCC rejects the
	  //        usage of the logical operators && and || with vectors in C. This
	  //        check could be notionally dropped.
	  if (!getLangOpts().CPlusPlus &&
	      !(isa<ExtVectorType>(vType->getAs<VectorType>())))
	    return InvalidLogicalVectorOperands(Loc, LHS, RHS);

	  return GetSignedVectorType(LHS.get()->getType());
	}

	/// Type-check the operands of a bitwise operator (or of the corresponding
	/// compound assignment, recognised via Opc).
	///
	/// Vector operands are forwarded to CheckVectorOperands provided both sides
	/// have an integer representation. Scalar operands go through
	/// UsualArithmeticConversions, and the converted expressions are written back
	/// to LHS and RHS. Returns the computation type when it is an integral or
	/// unscoped enumeration type, a null type when a conversion failed, and the
	/// result of InvalidOperands in every other case.
	inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
	                                           SourceLocation Loc,
	                                           BinaryOperatorKind Opc) {
	  checkArithmeticNull(*this, LHS, RHS, Loc, /*isCompare=*/false);

	  bool IsCompAssign =
	      Opc == BO_AndAssign || Opc == BO_OrAssign || Opc == BO_XorAssign;

	  if (LHS.get()->getType()->isVectorType() ||
	      RHS.get()->getType()->isVectorType()) {
	    if (LHS.get()->getType()->hasIntegerRepresentation() &&
	        RHS.get()->getType()->hasIntegerRepresentation())
	      return CheckVectorOperands(LHS, RHS, Loc, IsCompAssign,
	                                 /*AllowBothBool*/true,
	                                 /*AllowBoolConversions*/getLangOpts().ZVector);
	    return InvalidOperands(Loc, LHS, RHS);
	  }

	  if (Opc == BO_And)
	    diagnoseLogicalNotOnLHSofCheck(*this, LHS, RHS, Loc, Opc);

	  // Convert copies first so that LHS/RHS are left untouched when either
	  // conversion turns out to be invalid.
	  ExprResult LHSResult = LHS, RHSResult = RHS;
	  QualType compType = UsualArithmeticConversions(LHSResult, RHSResult,
	                                                 IsCompAssign);
	  if (LHSResult.isInvalid() || RHSResult.isInvalid())
	    return QualType();
	  LHS = LHSResult.get();
	  RHS = RHSResult.get();

	  if (!compType.isNull() && compType->isIntegralOrUnscopedEnumerationType())
	    return compType;
	  return InvalidOperands(Loc, LHS, RHS);
	}

	// C99 6.5.[13,14]
	/// Type-check the operands of the logical operators && and ||.
	///
	/// Vector operands are delegated to CheckVectorLogicalOperands. For scalar
	/// integer operands this may emit warn_logical_instead_of_bitwise together
	/// with fix-it notes. Outside of C++ both operands undergo
	/// UsualUnaryConversions, must be scalar, and the result is Context.IntTy.
	/// In C++ both operands are contextually converted to bool and the result is
	/// Context.BoolTy. On failure the function returns either a null type or the
	/// result of InvalidOperands.
	inline QualType Sema::CheckLogicalOperands(ExprResult &LHS, ExprResult &RHS,
	                                           SourceLocation Loc,
	                                           BinaryOperatorKind Opc) {
	  // Check vector operands differently.
	  if (LHS.get()->getType()->isVectorType() || RHS.get()->getType()->isVectorType())
	    return CheckVectorLogicalOperands(LHS, RHS, Loc);

	  // Diagnose cases where the user wrote a logical and/or but probably meant a
	  // bitwise one. We do this when the LHS is a non-bool integer and the RHS
	  // is a constant.
	  if (LHS.get()->getType()->isIntegerType() &&
	      !LHS.get()->getType()->isBooleanType() &&
	      RHS.get()->getType()->isIntegerType() && !RHS.get()->isValueDependent() &&
	      // Don't warn in macros or template instantiations.
	      !Loc.isMacroID() && !inTemplateInstantiation()) {
	    // If the RHS can be constant folded, and if it constant folds to something
	    // that isn't 0 or 1 (which indicate a potential logical operation that
	    // happened to fold to true/false) then warn.
	    // Parens on the RHS are ignored.
	    llvm::APSInt Result;
	    if (RHS.get()->EvaluateAsInt(Result, Context))
	      if ((getLangOpts().Bool && !RHS.get()->getType()->isBooleanType() &&
	           !RHS.get()->getExprLoc().isMacroID()) ||
	          (Result != 0 && Result != 1)) {
	        Diag(Loc, diag::warn_logical_instead_of_bitwise)
	          << RHS.get()->getSourceRange()
	          << (Opc == BO_LAnd ? "&&" : "||");
	        // Suggest replacing the logical operator with the bitwise version
	        Diag(Loc, diag::note_logical_instead_of_bitwise_change_operator)
	            << (Opc == BO_LAnd ? "&" : "|")
	            << FixItHint::CreateReplacement(SourceRange(
	                                                 Loc, getLocForEndOfToken(Loc)),
	                                            Opc == BO_LAnd ? "&" : "|");
	        if (Opc == BO_LAnd)
	          // Suggest replacing "Foo() && kNonZero" with "Foo()"
	          Diag(Loc, diag::note_logical_instead_of_bitwise_remove_constant)
	              << FixItHint::CreateRemoval(
	                  SourceRange(getLocForEndOfToken(LHS.get()->getLocEnd()),
	                              RHS.get()->getLocEnd()));
	      }
	  }

	  if (!Context.getLangOpts().CPlusPlus) {
	    // OpenCL v1.1 s6.3.g: The logical operators and (&&), or (||) do
	    // not operate on the built-in scalar and vector float types.
	    if (Context.getLangOpts().OpenCL &&
	        Context.getLangOpts().OpenCLVersion < 120) {
	      if (LHS.get()->getType()->isFloatingType() ||
	          RHS.get()->getType()->isFloatingType())
	        return InvalidOperands(Loc, LHS, RHS);
	    }

	    LHS = UsualUnaryConversions(LHS.get());
	    if (LHS.isInvalid())
	      return QualType();

	    RHS = UsualUnaryConversions(RHS.get());
	    if (RHS.isInvalid())
	      return QualType();

	    if (!LHS.get()->getType()->isScalarType() ||
	        !RHS.get()->getType()->isScalarType())
	      return InvalidOperands(Loc, LHS, RHS);

	    return Context.IntTy;
	  }

	  // The following is safe because we only use this method for
	  // non-overloadable operands.

	  // C++ [expr.log.and]p1
	  // C++ [expr.log.or]p1
	  // The operands are both contextually converted to type bool.
	  ExprResult LHSRes = PerformContextuallyConvertToBool(LHS.get());
	  if (LHSRes.isInvalid())
	    return InvalidOperands(Loc, LHS, RHS);
	  LHS = LHSRes;

	  ExprResult RHSRes = PerformContextuallyConvertToBool(RHS.get());
	  if (RHSRes.isInvalid())
	    return InvalidOperands(Loc, LHS, RHS);
	  RHS = RHSRes;

	  // C++ [expr.log.and]p2
	  // C++ [expr.log.or]p2
	  // The result is a bool.
	  return Context.BoolTy;
	}

	/// Returns true when E is a member access naming a field whose base, once
	/// implicit nodes, parentheses and implicit casts are stripped, is an
	/// Objective-C message expression that has a method declaration. The Sema
	/// argument is not consulted.
	static bool IsReadonlyMessage(Expr *E, Sema &S) {
	  if (const auto *Member = dyn_cast<MemberExpr>(E)) {
	    if (isa<FieldDecl>(Member->getMemberDecl())) {
	      auto *Stripped =
	          Member->getBase()->IgnoreImplicit()->IgnoreParenImpCasts();
	      if (auto *Message = dyn_cast<ObjCMessageExpr>(Stripped))
	        return Message->getMethodDecl() != nullptr;
	    }
	  }
	  return false;
	}

	/// Classification returned by isReferenceToNonConstCapture: no capture was
	/// involved, or the capturing context was a block or a lambda respectively.
	enum NonConstCaptureKind { NCCK_None, NCCK_Block, NCCK_Lambda };

	/// Is the given expression (which must be 'const') a reference to a
	/// variable which was originally non-const, but which has become
	/// 'const' due to being captured within a block?
	///
	/// Returns NCCK_None when E (ignoring parentheses) is not a DeclRefExpr that
	/// refers to an enclosing variable or capture, or when the referenced
	/// variable is itself declared const. Otherwise returns NCCK_Block or
	/// NCCK_Lambda depending on the declaration context found by walking outwards
	/// from S.CurContext.
	static NonConstCaptureKind isReferenceToNonConstCapture(Sema &S, Expr *E) {
	  assert(E->isLValue() && E->getType().isConstQualified());
	  E = E->IgnoreParens();

	  // Must be a reference to a declaration from an enclosing scope.
	  DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E);
	  if (!DRE) return NCCK_None;
	  if (!DRE->refersToEnclosingVariableOrCapture()) return NCCK_None;

	  // The declaration must be a variable which is not declared 'const'.
	  VarDecl *var = dyn_cast<VarDecl>(DRE->getDecl());
	  if (!var) return NCCK_None;
	  if (var->getType().isConstQualified()) return NCCK_None;
	  assert(var->hasLocalStorage() && "capture added 'const' to non-local?");

	  // Decide whether the first capture was for a block or a lambda.
	  DeclContext *DC = S.CurContext, *Prev = nullptr;
	  while (DC) {
	    // For init-capture, it is possible that the variable belongs to the
	    // template pattern of the current context.
	    if (auto *FD = dyn_cast<FunctionDecl>(DC))
	      if (var->isInitCapture() &&
	          FD->getTemplateInstantiationPattern() == var->getDeclContext())
	        break;
	    if (DC == var->getDeclContext())
	      break;
	    Prev = DC;
	    DC = DC->getParent();
	  }
	  // Unless we have an init-capture, we've gone one step too far.
	  if (!var->isInitCapture())
	    DC = Prev;
	  return (isa<BlockDecl>(DC) ? NCCK_Block : NCCK_Lambda);
	}

	/// Returns true when Ty, with any reference removed, is not const-qualified.
	/// When IsDereference is set and that type is a pointer, the const
	/// qualification of the pointee is examined instead.
	static bool IsTypeModifiable(QualType Ty, bool IsDereference) {
	  const QualType Referent = Ty.getNonReferenceType();
	  const bool LookThroughPointer = IsDereference && Referent->isPointerType();
	  const QualType Examined =
	      LookThroughPointer ? Referent->getPointeeType() : Referent;
	  return !Examined.isConstQualified();
	}

	/// Emit the "read-only variable not assignable" error and print notes to give
	/// more information about why the variable is not assignable, such as pointing
	/// to the declaration of a const variable, showing that a method is const, or
	/// that the function is returning a const reference.
	///
	/// The expression is walked from the outside in: chains of MemberExpr are
	/// followed through their bases, then the innermost expression is examined
	/// as a call, a declaration reference or 'this'. The error itself is emitted
	/// at most once at Loc; each const entity found adds a note. If nothing
	/// specific is found the generic ConstUnknown form of the error is emitted.
	static void DiagnoseConstAssignment(Sema &S, const Expr *E,
	                                    SourceLocation Loc) {
	  // Update err_typecheck_assign_const and note_typecheck_assign_const
	  // when this enum is changed.
	  enum {
	    ConstFunction,
	    ConstVariable,
	    ConstMember,
	    ConstMethod,
	    ConstUnknown,  // Keep as last element
	  };

	  SourceRange ExprRange = E->getSourceRange();

	  // Only emit one error on the first const found.  All other consts will emit
	  // a note to the error.
	  bool DiagnosticEmitted = false;

	  // Track if the current expression is the result of a dereference, and if the
	  // next checked expression is the result of a dereference.
	  bool IsDereference = false;
	  bool NextIsDereference = false;

	  // Loop to process MemberExpr chains.
	  while (true) {
	    IsDereference = NextIsDereference;

	    E = E->IgnoreImplicit()->IgnoreParenImpCasts();
	    if (const MemberExpr *ME = dyn_cast<MemberExpr>(E)) {
	      NextIsDereference = ME->isArrow();
	      const ValueDecl *VD = ME->getMemberDecl();
	      if (const FieldDecl *Field = dyn_cast<FieldDecl>(VD)) {
	        // Mutable fields can be modified even if the class is const.
	        if (Field->isMutable()) {
	          assert(DiagnosticEmitted && "Expected diagnostic not emitted.");
	          break;
	        }

	        if (!IsTypeModifiable(Field->getType(), IsDereference)) {
	          if (!DiagnosticEmitted) {
	            S.Diag(Loc, diag::err_typecheck_assign_const)
	                << ExprRange << ConstMember << false /*static*/ << Field
	                << Field->getType();
	            DiagnosticEmitted = true;
	          }
	          S.Diag(VD->getLocation(), diag::note_typecheck_assign_const)
	              << ConstMember << false /*static*/ << Field << Field->getType()
	              << Field->getSourceRange();
	        }
	        // Continue with the object the field was accessed on.
	        E = ME->getBase();
	        continue;
	      } else if (const VarDecl *VDecl = dyn_cast<VarDecl>(VD)) {
	        if (VDecl->getType().isConstQualified()) {
	          if (!DiagnosticEmitted) {
	            S.Diag(Loc, diag::err_typecheck_assign_const)
	                << ExprRange << ConstMember << true /*static*/ << VDecl
	                << VDecl->getType();
	            DiagnosticEmitted = true;
	          }
	          S.Diag(VD->getLocation(), diag::note_typecheck_assign_const)
	              << ConstMember << true /*static*/ << VDecl << VDecl->getType()
	              << VDecl->getSourceRange();
	        }
	        // Static fields do not inherit constness from parents.
	        break;
	      }
	      break;
	    } // End MemberExpr
	    break;
	  }

	  if (const CallExpr *CE = dyn_cast<CallExpr>(E)) {
	    // Function calls
	    const FunctionDecl *FD = CE->getDirectCallee();
	    if (FD && !IsTypeModifiable(FD->getReturnType(), IsDereference)) {
	      if (!DiagnosticEmitted) {
	        S.Diag(Loc, diag::err_typecheck_assign_const) << ExprRange
	                                                      << ConstFunction << FD;
	        DiagnosticEmitted = true;
	      }
	      S.Diag(FD->getReturnTypeSourceRange().getBegin(),
	             diag::note_typecheck_assign_const)
	          << ConstFunction << FD << FD->getReturnType()
	          << FD->getReturnTypeSourceRange();
	    }
	  } else if (const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
	    // Point to variable declaration.
	    if (const ValueDecl *VD = DRE->getDecl()) {
	      if (!IsTypeModifiable(VD->getType(), IsDereference)) {
	        if (!DiagnosticEmitted) {
	          S.Diag(Loc, diag::err_typecheck_assign_const)
	              << ExprRange << ConstVariable << VD << VD->getType();
	          DiagnosticEmitted = true;
	        }
	        S.Diag(VD->getLocation(), diag::note_typecheck_assign_const)
	            << ConstVariable << VD << VD->getType() << VD->getSourceRange();
	      }
	    }
	  } else if (isa<CXXThisExpr>(E)) {
	    if (const DeclContext *DC = S.getFunctionLevelDeclContext()) {
	      if (const CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(DC)) {
	        if (MD->isConst()) {
	          if (!DiagnosticEmitted) {
	            S.Diag(Loc, diag::err_typecheck_assign_const) << ExprRange
	                                                          << ConstMethod << MD;
	            DiagnosticEmitted = true;
	          }
	          S.Diag(MD->getLocation(), diag::note_typecheck_assign_const)
	              << ConstMethod << MD << MD->getSourceRange();
	        }
	      }
	    }
	  }

	  if (DiagnosticEmitted)
	    return;

	  // Can't determine a more specific message, so display the generic error.
	  S.Diag(Loc, diag::err_typecheck_assign_const) << ExprRange << ConstUnknown;
	}

	/// CheckForModifiableLvalue - Verify that E is a modifiable lvalue.  If not,
	/// emit an error and return true.  If so, return false.
	///
	/// Loc is the location diagnostics are reported at; isModifiableLvalue may
	/// move it, in which case the original location is attached to the
	/// diagnostic as an extra source range. Note that the ARC pseudo-strong case
	/// below emits an error but still returns false.
	static bool CheckForModifiableLvalue(Expr *E, SourceLocation Loc, Sema &S) {
	  assert(!E->hasPlaceholderType(BuiltinType::PseudoObject));

	  S.CheckShadowingDeclModification(E, Loc);

	  SourceLocation OrigLoc = Loc;
	  Expr::isModifiableLvalueResult IsLV = E->isModifiableLvalue(S.Context,
	                                                              &Loc);
	  if (IsLV == Expr::MLV_ClassTemporary && IsReadonlyMessage(E, S))
	    IsLV = Expr::MLV_InvalidMessageExpression;
	  if (IsLV == Expr::MLV_Valid)
	    return false;

	  unsigned DiagID = 0;
	  bool NeedType = false;
	  switch (IsLV) { // C99 6.5.16p2
	  case Expr::MLV_ConstQualified:
	    // Use a specialized diagnostic when we're assigning to an object
	    // from an enclosing function or block.
	    if (NonConstCaptureKind NCCK = isReferenceToNonConstCapture(S, E)) {
	      if (NCCK == NCCK_Block)
	        DiagID = diag::err_block_decl_ref_not_modifiable_lvalue;
	      else
	        DiagID = diag::err_lambda_decl_ref_not_modifiable_lvalue;
	      break;
	    }

	    // In ARC, use some specialized diagnostics for occasions where we
	    // infer 'const'.  These are always pseudo-strong variables.
	    if (S.getLangOpts().ObjCAutoRefCount) {
	      DeclRefExpr *declRef = dyn_cast<DeclRefExpr>(E->IgnoreParenCasts());
	      if (declRef && isa<VarDecl>(declRef->getDecl())) {
	        VarDecl *var = cast<VarDecl>(declRef->getDecl());

	        // Use the normal diagnostic if it's pseudo-__strong but the
	        // user actually wrote 'const'.
	        if (var->isARCPseudoStrong() &&
	            (!var->getTypeSourceInfo() ||
	             !var->getTypeSourceInfo()->getType().isConstQualified())) {
	          // There are two pseudo-strong cases:
	          //  - self
	          ObjCMethodDecl *method = S.getCurMethodDecl();
	          if (method && var == method->getSelfDecl())
	            DiagID = method->isClassMethod()
	              ? diag::err_typecheck_arc_assign_self_class_method
	              : diag::err_typecheck_arc_assign_self;

	          //  - fast enumeration variables
	          else
	            DiagID = diag::err_typecheck_arr_assign_enumeration;

	          SourceRange Assign;
	          if (Loc != OrigLoc)
	            Assign = SourceRange(OrigLoc, OrigLoc);
	          S.Diag(Loc, DiagID) << E->getSourceRange() << Assign;
	          // We need to preserve the AST regardless, so migration tool
	          // can do its job.
	          return false;
	        }
	      }
	    }

	    // If none of the special cases above are triggered, then this is a
	    // simple const assignment.
	    if (DiagID == 0) {
	      DiagnoseConstAssignment(S, E, Loc);
	      return true;
	    }

	    break;
	  case Expr::MLV_ConstAddrSpace:
	    DiagnoseConstAssignment(S, E, Loc);
	    return true;
	  case Expr::MLV_ArrayType:
	  case Expr::MLV_ArrayTemporary:
	    DiagID = diag::err_typecheck_array_not_modifiable_lvalue;
	    NeedType = true;
	    break;
	  case Expr::MLV_NotObjectType:
	    DiagID = diag::err_typecheck_non_object_not_modifiable_lvalue;
	    NeedType = true;
	    break;
	  case Expr::MLV_LValueCast:
	    DiagID = diag::err_typecheck_lvalue_casts_not_supported;
	    break;
	  case Expr::MLV_Valid:
	    llvm_unreachable("did not take early return for MLV_Valid");
	  case Expr::MLV_InvalidExpression:
	  case Expr::MLV_MemberFunction:
	  case Expr::MLV_ClassTemporary:
	    DiagID = diag::err_typecheck_expression_not_modifiable_lvalue;
	    break;
	  case Expr::MLV_IncompleteType:
	  case Expr::MLV_IncompleteVoidType:
	    return S.RequireCompleteType(Loc, E->getType(),
	             diag::err_typecheck_incomplete_type_not_modifiable_lvalue, E);
	  case Expr::MLV_DuplicateVectorComponents:
	    DiagID = diag::err_typecheck_duplicate_vector_components_not_mlvalue;
	    break;
	  case Expr::MLV_NoSetterProperty:
	    llvm_unreachable("readonly properties should be processed differently");
	  case Expr::MLV_InvalidMessageExpression:
	    DiagID = diag::err_readonly_message_assignment;
	    break;
	  case Expr::MLV_SubObjCPropertySetting:
	    DiagID = diag::err_no_subobject_property_setting;
	    break;
	  }

	  // Attach the original location as a range when isModifiableLvalue moved
	  // Loc elsewhere.
	  SourceRange Assign;
	  if (Loc != OrigLoc)
	    Assign = SourceRange(OrigLoc, OrigLoc);
	  if (NeedType)
	    S.Diag(Loc, DiagID) << E->getType() << E->getSourceRange() << Assign;
	  else
	    S.Diag(Loc, DiagID) << E->getSourceRange() << Assign;
	  return true;
	}

	static void CheckIdentityFieldAssignment(Expr LHSExpr, Expr RHSExpr,
	SourceLocation Loc,
	Sema &Sema) {
	// C / C++ fields
	MemberExpr *ML = dyn_cast<MemberExpr>(LHSExpr);
	MemberExpr *MR = dyn_cast<MemberExpr>(RHSExpr);
	if (ML && MR && ML->getMemberDecl() == MR->getMemberDecl()) {
	if (isa<CXXThisExpr>(ML->getBase()) && isa<CXXThisExpr>(MR->getBase()))
	Sema.Diag(Loc, diag::warn_identity_field_assign) << 0;
	}

	// Objective-C instance variables
	ObjCIvarRefExpr *OL = dyn_cast<ObjCIvarRefExpr>(LHSExpr);
	ObjCIvarRefExpr *OR = dyn_cast<ObjCIvarRefExpr>(RHSExpr);
	if (OL && OR && OL->getDecl() == OR->getDecl()) {
	DeclRefExpr *RL = dyn_cast<DeclRefExpr>(OL->getBase()->IgnoreImpCasts());
	DeclRefExpr *RR = dyn_cast<DeclRefExpr>(OR->getBase()->IgnoreImpCasts());
	if (RL && RR && RL->getDecl() == RR->getDecl())
	Sema.Diag(Loc, diag::warn_identity_field_assign) << 1;
	}
	}

	// C99 6.5.16.1
	/// Type-check an assignment to LHSExpr.
	///
	/// A null CompoundType denotes a simple assignment: RHS is checked (and may
	/// be converted in place) by CheckSingleAssignmentConstraints, and several
	/// assignment-specific warnings are considered. A non-null CompoundType
	/// denotes a compound assignment such as "x += y", and is used as the type of
	/// the right-hand side for CheckAssignmentConstraints.
	///
	/// Returns a null type when the LHS is not a modifiable lvalue, when the RHS
	/// is invalid, or when a diagnosed error prevents the assignment. Otherwise
	/// returns the LHS type, unqualified outside of C++.
	QualType Sema::CheckAssignmentOperands(Expr *LHSExpr, ExprResult &RHS,
	                                       SourceLocation Loc,
	                                       QualType CompoundType) {
	  assert(!LHSExpr->hasPlaceholderType(BuiltinType::PseudoObject));

	  // Verify that LHS is a modifiable lvalue, and emit error if not.
	  if (CheckForModifiableLvalue(LHSExpr, Loc, *this))
	    return QualType();

	  QualType LHSType = LHSExpr->getType();
	  QualType RHSType = CompoundType.isNull() ? RHS.get()->getType() :
	                                             CompoundType;
	  // OpenCL v1.2 s6.1.1.1 p2:
	  // The half data type can only be used to declare a pointer to a buffer that
	  // contains half values
	  if (getLangOpts().OpenCL && !getOpenCLOptions().isEnabled("cl_khr_fp16") &&
	    LHSType->isHalfType()) {
	    Diag(Loc, diag::err_opencl_half_load_store) << 1
	        << LHSType.getUnqualifiedType();
	    return QualType();
	  }

	  AssignConvertType ConvTy;
	  if (CompoundType.isNull()) {
	    Expr *RHSCheck = RHS.get();

	    CheckIdentityFieldAssignment(LHSExpr, RHSCheck, Loc, *this);

	    QualType LHSTy(LHSType);
	    ConvTy = CheckSingleAssignmentConstraints(LHSTy, RHS);
	    if (RHS.isInvalid())
	      return QualType();
	    // Special case of NSObject attributes on c-style pointer types.
	    if (ConvTy == IncompatiblePointer &&
	        ((Context.isObjCNSObjectType(LHSType) &&
	          RHSType->isObjCObjectPointerType()) ||
	         (Context.isObjCNSObjectType(RHSType) &&
	          LHSType->isObjCObjectPointerType())))
	      ConvTy = Compatible;

	    if (ConvTy == Compatible &&
	        LHSType->isObjCObjectType())
	        Diag(Loc, diag::err_objc_object_assignment)
	          << LHSType;

	    // If the RHS is a unary plus or minus, check to see if the = and + are
	    // right next to each other.  If so, the user may have typo'd "x =+ 4"
	    // instead of "x += 4".
	    if (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(RHSCheck))
	      RHSCheck = ICE->getSubExpr();
	    if (UnaryOperator *UO = dyn_cast<UnaryOperator>(RHSCheck)) {
	      if ((UO->getOpcode() == UO_Plus ||
	           UO->getOpcode() == UO_Minus) &&
	          Loc.isFileID() && UO->getOperatorLoc().isFileID() &&
	          // Only if the two operators are exactly adjacent.
	          Loc.getLocWithOffset(1) == UO->getOperatorLoc() &&
	          // And there is a space or other character before the subexpr of the
	          // unary +/-.  We don't want to warn on "x=-1".
	          Loc.getLocWithOffset(2) != UO->getSubExpr()->getLocStart() &&
	          UO->getSubExpr()->getLocStart().isFileID()) {
	        Diag(Loc, diag::warn_not_compound_assign)
	          << (UO->getOpcode() == UO_Plus ? "+" : "-")
	          << SourceRange(UO->getOperatorLoc(), UO->getOperatorLoc());
	      }
	    }

	    if (ConvTy == Compatible) {
	      if (LHSType.getObjCLifetime() == Qualifiers::OCL_Strong) {
	        // Warn about retain cycles where a block captures the LHS, but
	        // not if the LHS is a simple variable into which the block is
	        // being stored...unless that variable can be captured by reference!
	        const Expr *InnerLHS = LHSExpr->IgnoreParenCasts();
	        const DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(InnerLHS);
	        if (!DRE || DRE->getDecl()->hasAttr<BlocksAttr>())
	          checkRetainCycles(LHSExpr, RHS.get());
	      }

	      if (LHSType.getObjCLifetime() == Qualifiers::OCL_Strong ||
	          LHSType.isNonWeakInMRRWithObjCWeak(Context)) {
	        // It is safe to assign a weak reference into a strong variable.
	        // Although this code can still have problems:
	        //   id x = self.weakProp;
	        //   id y = self.weakProp;
	        // we do not warn, to avoid warning spuriously when 'x' and 'y' are on
	        // separate paths through the function. This should be revisited if
	        // -Wrepeated-use-of-weak is made flow-sensitive.
	        // For ObjCWeak only, we do not warn if the assign is to a non-weak
	        // variable, which will be valid for the current autorelease scope.
	        if (!Diags.isIgnored(diag::warn_arc_repeated_use_of_weak,
	                             RHS.get()->getLocStart()))
	          getCurFunction()->markSafeWeakUse(RHS.get());

	      } else if (getLangOpts().ObjCAutoRefCount || getLangOpts().ObjCWeak) {
	        checkUnsafeExprAssigns(Loc, LHSExpr, RHS.get());
	      }
	    }
	  } else {
	    // Compound assignment "x += y"
	    ConvTy = CheckAssignmentConstraints(Loc, LHSType, RHSType);
	  }

	  if (DiagnoseAssignmentResult(ConvTy, Loc, LHSType, RHSType,
	                               RHS.get(), AA_Assigning))
	    return QualType();

	  CheckForNullPointerDereference(*this, LHSExpr);

	  // C99 6.5.16p3: The type of an assignment expression is the type of the
	  // left operand unless the left operand has qualified type, in which case
	  // it is the unqualified version of the type of the left operand.
	  // C99 6.5.16.1p2: In simple assignment, the value of the right operand
	  // is converted to the type of the assignment expression (above).
	  // C++ 5.17p1: the type of the assignment expression is that of its left
	  // operand.
	  return (getLangOpts().CPlusPlus
	          ? LHSType : LHSType.getUnqualifiedType());
	}

	// Only ignore explicit casts to void.
	static bool IgnoreCommaOperand(const Expr *E) {
	E = E->IgnoreParens();

	if (const CastExpr *CE = dyn_cast<CastExpr>(E)) {
	if (CE->getCastKind() == CK_ToVoid) {
	return true;
	}
	}

	return false;
	}

	// Look for instances where it is likely the comma operator is confused with
	// another operator.  There is a whitelist of acceptable expressions for the
	// left hand side of the comma operator, otherwise emit a warning.
	//
	// LHS is the left operand of the comma and Loc the location of the comma
	// itself. Nothing is emitted when Loc is inside a macro, during template
	// instantiation, or when the current scope flags match the initialization or
	// increment portion of a for loop. Otherwise warn_comma_operator is emitted
	// together with a note carrying fix-its that wrap the operand in a cast to
	// void.
	void Sema::DiagnoseCommaOperator(const Expr *LHS, SourceLocation Loc) {
	  // No warnings in macros
	  if (Loc.isMacroID())
	    return;

	  // Don't warn in template instantiations.
	  if (inTemplateInstantiation())
	    return;

	  // Scope isn't fine-grained enough to whitelist the specific cases, so
	  // instead, skip more than needed, then call back into here with the
	  // CommaVisitor in SemaStmt.cpp.
	  // The whitelisted locations are the initialization and increment portions
	  // of a for loop.  The additional checks are on the condition of
	  // if statements, do/while loops, and for loops.
	  const unsigned ForIncrementFlags =
	      Scope::ControlScope | Scope::ContinueScope | Scope::BreakScope;
	  const unsigned ForInitFlags = Scope::ControlScope | Scope::DeclScope;
	  const unsigned ScopeFlags = getCurScope()->getFlags();
	  if ((ScopeFlags & ForIncrementFlags) == ForIncrementFlags ||
	      (ScopeFlags & ForInitFlags) == ForInitFlags)
	    return;

	  // If there are multiple comma operators used together, use the RHS of the
	  // nested comma operator as the LHS to examine.
	  while (const BinaryOperator *BO = dyn_cast<BinaryOperator>(LHS)) {
	    if (BO->getOpcode() != BO_Comma)
	      break;
	    LHS = BO->getRHS();
	  }

	  // Only allow some expressions on LHS to not warn.
	  if (IgnoreCommaOperand(LHS))
	    return;

	  Diag(Loc, diag::warn_comma_operator);
	  Diag(LHS->getLocStart(), diag::note_cast_to_void)
	      << LHS->getSourceRange()
	      << FixItHint::CreateInsertion(LHS->getLocStart(),
	                                    LangOpts.CPlusPlus ? "static_cast<void>("
	                                                       : "(void)(")
	      << FixItHint::CreateInsertion(PP.getLocForEndOfToken(LHS->getLocEnd()),
	                                    ")");
	}

	// C99 6.5.17
	/// Type-check the operands of the comma operator.
	///
	/// Placeholder expressions on both sides are resolved first. The LHS is then
	/// treated as an ignored value and checked for an unused result. Outside of
	/// C++ the RHS additionally undergoes DefaultFunctionArrayLvalueConversion
	/// and, unless it is void, must have a complete type. Returns the type of the
	/// (possibly converted) RHS, or a null type when any step yields an invalid
	/// expression.
	static QualType CheckCommaOperands(Sema &S, ExprResult &LHS, ExprResult &RHS,
	                                   SourceLocation Loc) {
	  LHS = S.CheckPlaceholderExpr(LHS.get());
	  RHS = S.CheckPlaceholderExpr(RHS.get());
	  if (LHS.isInvalid() || RHS.isInvalid())
	    return QualType();

	  // C's comma performs lvalue conversion (C99 6.3.2.1) on both its
	  // operands, but not unary promotions.
	  // C++'s comma does not do any conversions at all (C++ [expr.comma]p1).

	  // So we treat the LHS as a ignored value, and in C++ we allow the
	  // containing site to determine what should be done with the RHS.
	  LHS = S.IgnoredValueConversions(LHS.get());
	  if (LHS.isInvalid())
	    return QualType();

	  S.DiagnoseUnusedExprResult(LHS.get());

	  if (!S.getLangOpts().CPlusPlus) {
	    RHS = S.DefaultFunctionArrayLvalueConversion(RHS.get());
	    if (RHS.isInvalid())
	      return QualType();
	    if (!RHS.get()->getType()->isVoidType())
	      S.RequireCompleteType(Loc, RHS.get()->getType(),
	                            diag::err_incomplete_type);
	  }

	  // Skip the comma-operator analysis entirely when its warning is disabled
	  // at this location.
	  if (!S.getDiagnostics().isIgnored(diag::warn_comma_operator, Loc))
	    S.DiagnoseCommaOperator(LHS.get(), Loc);

	  return RHS.get()->getType();
	}

	/// CheckIncrementDecrementOperand - Type-check the operand of a built-in
	/// increment or decrement. Unlike most "Check" methods, this routine
	/// doesn't need to call UsualUnaryConversions or UsualArithmeticConversions.
	///
	/// \param Op       The operand expression.
	/// \param VK       Receives the value kind of the result once the operand
	///                 has been accepted.
	/// \param OK       Receives the operand's object kind for a C++ prefix
	///                 operator; left untouched on every other path.
	/// \param OpLoc    Location of the operator, used for diagnostics.
	/// \param IsInc    True for an increment, false for a decrement.
	/// \param IsPrefix True for the prefix form, false for the postfix form.
	///
	/// \returns the result type, Context.DependentTy for a type-dependent
	/// operand, or a null QualType after an error has been diagnosed.
	static QualType CheckIncrementDecrementOperand(Sema &S, Expr *Op,
	                                               ExprValueKind &VK,
	                                               ExprObjectKind &OK,
	                                               SourceLocation OpLoc,
	                                               bool IsInc, bool IsPrefix) {
	  if (Op->isTypeDependent())
	    return S.Context.DependentTy;

	  QualType ResType = Op->getType();
	  // Atomic types can be used for increment / decrement where the non-atomic
	  // versions can, so ignore the _Atomic() specifier for the purpose of
	  // checking.
	  if (const AtomicType *ResAtomicType = ResType->getAs<AtomicType>())
	    ResType = ResAtomicType->getValueType();

	  assert(!ResType.isNull() && "no type for increment/decrement expression");

	  if (S.getLangOpts().CPlusPlus && ResType->isBooleanType()) {
	    // Decrement of bool is not allowed.
	    if (!IsInc) {
	      S.Diag(OpLoc, diag::err_decrement_bool) << Op->getSourceRange();
	      return QualType();
	    }
	    // Increment of bool sets it to true, but is deprecated.
	    S.Diag(OpLoc, S.getLangOpts().CPlusPlus1z ? diag::ext_increment_bool
	                                              : diag::warn_increment_bool)
	      << Op->getSourceRange();
	  } else if (S.getLangOpts().CPlusPlus && ResType->isEnumeralType()) {
	    // Error on enum increments and decrements in C++ mode
	    S.Diag(OpLoc, diag::err_increment_decrement_enum) << IsInc << ResType;
	    return QualType();
	  } else if (ResType->isRealType()) {
	    // OK!
	  } else if (ResType->isPointerType()) {
	    // C99 6.5.2.4p2, 6.5.6p2
	    if (!checkArithmeticOpPointerOperand(S, OpLoc, Op))
	      return QualType();
	  } else if (ResType->isObjCObjectPointerType()) {
	    // On modern runtimes, ObjC pointer arithmetic is forbidden.
	    // Otherwise, we just need a complete type.
	    if (checkArithmeticIncompletePointerType(S, OpLoc, Op) ||
	        checkArithmeticOnObjCPointer(S, OpLoc, Op))
	      return QualType();
	  } else if (ResType->isAnyComplexType()) {
	    // C99 does not support ++/-- on complex types, we allow as an extension.
	    S.Diag(OpLoc, diag::ext_integer_increment_complex)
	      << ResType << Op->getSourceRange();
	  } else if (ResType->isPlaceholderType()) {
	    // Resolve the placeholder and re-run the check on the resulting
	    // expression.
	    ExprResult PR = S.CheckPlaceholderExpr(Op);
	    if (PR.isInvalid()) return QualType();
	    return CheckIncrementDecrementOperand(S, PR.get(), VK, OK, OpLoc,
	                                          IsInc, IsPrefix);
	  } else if (S.getLangOpts().AltiVec && ResType->isVectorType()) {
	    // OK! ( C/C++ Language Extensions for CBEA(Version 2.6) 10.3 )
	  } else if (S.getLangOpts().ZVector && ResType->isVectorType() &&
	             (ResType->getAs<VectorType>()->getVectorKind() !=
	              VectorType::AltiVecBool)) {
	    // The z vector extensions allow ++ and -- for non-bool vectors.
	  } else if (S.getLangOpts().OpenCL && ResType->isVectorType() &&
	             ResType->getAs<VectorType>()->getElementType()->isIntegerType()) {
	    // OpenCL V1.2 6.3 says dec/inc ops operate on integer vector types.
	  } else {
	    S.Diag(OpLoc, diag::err_typecheck_illegal_increment_decrement)
	      << ResType << int(IsInc) << Op->getSourceRange();
	    return QualType();
	  }

	  // At this point the operand type has been accepted by one of the branches
	  // above. Now make sure the operand is a modifiable lvalue.
	  if (CheckForModifiableLvalue(Op, OpLoc, S))
	    return QualType();

	  // In C++, a prefix increment is the same type as the operand. Otherwise
	  // (in C or with postfix), the increment is the unqualified type of the
	  // operand.
	  if (IsPrefix && S.getLangOpts().CPlusPlus) {
	    VK = VK_LValue;
	    OK = Op->getObjectKind();
	    return ResType;
	  }

	  VK = VK_RValue;
	  return ResType.getUnqualifiedType();
	}


	/// getPrimaryDecl - Helper function for CheckAddressOfOperand().
	/// This routine allows us to typecheck complex/recursive expressions
	/// where the declaration is needed for type checking. We only need to
	/// handle cases when the expression references a function designator
	/// or is an lvalue. Here are some examples:
	///  - &(x) => x
	///  - &*****f => f for f a function designator.
	///  - &s.xx => s
	///  - &s.zz[1].yy -> s, if zz is an array
	///  - *(x + 1) -> x, if x is an array
	///  - &"123"[2] -> 0
	///  - & __real__ x -> x
	///
	/// \returns the declaration the expression ultimately refers to, or nullptr
	/// when no such declaration is found by the cases handled below.
	static ValueDecl *getPrimaryDecl(Expr *E) {
	  switch (E->getStmtClass()) {
	  case Stmt::DeclRefExprClass:
	    return cast<DeclRefExpr>(E)->getDecl();
	  case Stmt::MemberExprClass:
	    // If this is an arrow operator, the address is an offset from
	    // the base's value, so the object the base refers to is
	    // irrelevant.
	    if (cast<MemberExpr>(E)->isArrow())
	      return nullptr;
	    // Otherwise, the expression refers to a part of the base
	    return getPrimaryDecl(cast<MemberExpr>(E)->getBase());
	  case Stmt::ArraySubscriptExprClass: {
	    // FIXME: This code shouldn't be necessary!  We should catch the implicit
	    // promotion of register arrays earlier.
	    Expr *Base = cast<ArraySubscriptExpr>(E)->getBase();
	    if (ImplicitCastExpr *ICE = dyn_cast<ImplicitCastExpr>(Base)) {
	      if (ICE->getSubExpr()->getType()->isArrayType())
	        return getPrimaryDecl(ICE->getSubExpr());
	    }
	    return nullptr;
	  }
	  case Stmt::UnaryOperatorClass: {
	    UnaryOperator *UO = cast<UnaryOperator>(E);

	    // Only operators that designate (part of) their operand are looked
	    // through; every other unary operator yields no primary declaration.
	    switch (UO->getOpcode()) {
	    case UO_Real:
	    case UO_Imag:
	    case UO_Extension:
	      return getPrimaryDecl(UO->getSubExpr());
	    default:
	      return nullptr;
	    }
	  }
	  case Stmt::ParenExprClass:
	    return getPrimaryDecl(cast<ParenExpr>(E)->getSubExpr());
	  case Stmt::ImplicitCastExprClass:
	    // If the result of an implicit cast is an l-value, we care about
	    // the sub-expression; otherwise, the result here doesn't matter.
	    return getPrimaryDecl(cast<ImplicitCastExpr>(E)->getSubExpr());
	  default:
	    return nullptr;
	  }
	}

	namespace {
	// Reason codes recording why an operand cannot have its address taken.
	// CheckAddressOfOperand() stores one of these in its AddressOfError local
	// and diagnoseAddressOfInvalidType() streams the value into
	// diag::err_typecheck_address_of.
	// NOTE(review): presumably the numeric values index the alternatives of
	// that diagnostic's text, so they must stay in sync with it -- confirm
	// against the diagnostic definition before renumbering.
	enum {
	// The operand is a bit-field.
	AO_Bit_Field = 0,
	// The operand is an element of a vector.
	AO_Vector_Element = 1,
	// The operand is a property reference (pseudo-object or MS property).
	AO_Property_Expansion = 2,
	// The operand is a variable declared 'register' (rejected outside C++).
	AO_Register_Variable = 3,
	// Initial value: nothing wrong has been detected.
	AO_No_Error = 4
	};
	}
	/// \brief Report an operand whose address cannot be taken.
	///
	/// Emits diag::err_typecheck_address_of at \p Loc, highlighting the source
	/// range of \p E.
	///
	/// \param Type The kind of operand which cannot have its address taken.
	static void diagnoseAddressOfInvalidType(Sema &S, SourceLocation Loc,
	                                         Expr *E, unsigned Type) {
	  const SourceRange OperandRange = E->getSourceRange();
	  S.Diag(Loc, diag::err_typecheck_address_of) << Type << OperandRange;
	}

	/// CheckAddressOfOperand - The operand of & must be either a function
	/// designator or an lvalue designating an object. If it is an lvalue, the
	/// object cannot be declared with storage class register or be a bit field.
	/// Note: The usual conversions are not applied to the operand of the &
	/// operator (C99 6.3.2.1p[2-4]), and its result is never an lvalue.
	/// In C++, the operand might be an overloaded function name, in which case
	/// we allow the '&' but retain the overloaded-function type.
	///
	/// \param OrigOp The operand; it may be replaced in place (placeholder
	///               resolution, temporary materialization).
	/// \param OpLoc  Location of the '&' token, used for diagnostics.
	/// \returns the type of the address-of expression, or a null QualType after
	/// an error has been diagnosed.
	QualType Sema::CheckAddressOfOperand(ExprResult &OrigOp, SourceLocation OpLoc) {
	  if (const BuiltinType *PTy = OrigOp.get()->getType()->getAsPlaceholderType()) {
	    if (PTy->getKind() == BuiltinType::Overload) {
	      Expr *E = OrigOp.get()->IgnoreParens();
	      if (!isa<OverloadExpr>(E)) {
	        assert(cast<UnaryOperator>(E)->getOpcode() == UO_AddrOf);
	        Diag(OpLoc, diag::err_typecheck_invalid_lvalue_addrof_addrof_function)
	          << OrigOp.get()->getSourceRange();
	        return QualType();
	      }

	      OverloadExpr *Ovl = cast<OverloadExpr>(E);
	      if (isa<UnresolvedMemberExpr>(Ovl))
	        if (!ResolveSingleFunctionTemplateSpecialization(Ovl)) {
	          Diag(OpLoc, diag::err_invalid_form_pointer_member_function)
	            << OrigOp.get()->getSourceRange();
	          return QualType();
	        }

	      return Context.OverloadTy;
	    }

	    if (PTy->getKind() == BuiltinType::UnknownAny)
	      return Context.UnknownAnyTy;

	    if (PTy->getKind() == BuiltinType::BoundMember) {
	      Diag(OpLoc, diag::err_invalid_form_pointer_member_function)
	        << OrigOp.get()->getSourceRange();
	      return QualType();
	    }

	    OrigOp = CheckPlaceholderExpr(OrigOp.get());
	    if (OrigOp.isInvalid()) return QualType();
	  }

	  if (OrigOp.get()->isTypeDependent())
	    return Context.DependentTy;

	  assert(!OrigOp.get()->getType()->isPlaceholderType());

	  // Make sure to ignore parentheses in subsequent checks
	  Expr *op = OrigOp.get()->IgnoreParens();

	  // OpenCL v1.0 s6.8.a.3: Pointers to functions are not allowed.
	  if (LangOpts.OpenCL && op->getType()->isFunctionType()) {
	    Diag(op->getExprLoc(), diag::err_opencl_taking_function_address);
	    return QualType();
	  }

	  if (getLangOpts().C99) {
	    // Implement C99-only parts of addressof rules.
	    if (UnaryOperator *uOp = dyn_cast<UnaryOperator>(op)) {
	      if (uOp->getOpcode() == UO_Deref)
	        // Per C99 6.5.3.2, the address of a deref always returns a valid result
	        // (assuming the deref expression is valid).
	        return uOp->getSubExpr()->getType();
	    }
	    // Technically, there should be a check for array subscript
	    // expressions here, but the result of one is always an lvalue anyway.
	  }
	  ValueDecl *dcl = getPrimaryDecl(op);

	  if (auto *FD = dyn_cast_or_null<FunctionDecl>(dcl))
	    if (!checkAddressOfFunctionIsAvailable(FD, /*Complain=*/true,
	                                           op->getLocStart()))
	      return QualType();

	  Expr::LValueClassification lval = op->ClassifyLValue(Context);
	  unsigned AddressOfError = AO_No_Error;

	  if (lval == Expr::LV_ClassTemporary || lval == Expr::LV_ArrayTemporary) {
	    // In a SFINAE context this is a hard error; otherwise it is only an
	    // extension diagnostic and checking continues.
	    bool sfinae = (bool)isSFINAEContext();
	    Diag(OpLoc, sfinae ? diag::err_typecheck_addrof_temporary
	                       : diag::ext_typecheck_addrof_temporary)
	      << op->getType() << op->getSourceRange();
	    if (sfinae)
	      return QualType();
	    // Materialize the temporary as an lvalue so that we can take its address.
	    OrigOp = op =
	        CreateMaterializeTemporaryExpr(op->getType(), OrigOp.get(), true);
	  } else if (isa<ObjCSelectorExpr>(op)) {
	    return Context.getPointerType(op->getType());
	  } else if (lval == Expr::LV_MemberFunction) {
	    // If it's an instance method, make a member pointer.
	    // The expression must have exactly the form &A::foo.

	    // If the underlying expression isn't a decl ref, give up.
	    if (!isa<DeclRefExpr>(op)) {
	      Diag(OpLoc, diag::err_invalid_form_pointer_member_function)
	        << OrigOp.get()->getSourceRange();
	      return QualType();
	    }
	    DeclRefExpr *DRE = cast<DeclRefExpr>(op);
	    CXXMethodDecl *MD = cast<CXXMethodDecl>(DRE->getDecl());

	    // The id-expression was parenthesized.
	    if (OrigOp.get() != DRE) {
	      Diag(OpLoc, diag::err_parens_pointer_member_function)
	        << OrigOp.get()->getSourceRange();

	    // The method was named without a qualifier.
	    } else if (!DRE->getQualifier()) {
	      if (MD->getParent()->getName().empty())
	        Diag(OpLoc, diag::err_unqualified_pointer_member_function)
	          << op->getSourceRange();
	      else {
	        // Offer a fix-it that inserts "Class::" in front of the name.
	        SmallString<32> Str;
	        StringRef Qual = (MD->getParent()->getName() + "::").toStringRef(Str);
	        Diag(OpLoc, diag::err_unqualified_pointer_member_function)
	          << op->getSourceRange()
	          << FixItHint::CreateInsertion(op->getSourceRange().getBegin(), Qual);
	      }
	    }

	    // Taking the address of a dtor is illegal per C++ [class.dtor]p2.
	    if (isa<CXXDestructorDecl>(MD))
	      Diag(OpLoc, diag::err_typecheck_addrof_dtor) << op->getSourceRange();

	    QualType MPTy = Context.getMemberPointerType(
	        op->getType(), Context.getTypeDeclType(MD->getParent()).getTypePtr());
	    // Under the MS ABI, lock down the inheritance model now.
	    if (Context.getTargetInfo().getCXXABI().isMicrosoft())
	      (void)isCompleteType(OpLoc, MPTy);
	    return MPTy;
	  } else if (lval != Expr::LV_Valid && lval != Expr::LV_IncompleteVoidType) {
	    // C99 6.5.3.2p1
	    // The operand must be either an l-value or a function designator
	    if (!op->getType()->isFunctionType()) {
	      // Use a special diagnostic for loads from property references.
	      if (isa<PseudoObjectExpr>(op)) {
	        AddressOfError = AO_Property_Expansion;
	      } else {
	        Diag(OpLoc, diag::err_typecheck_invalid_lvalue_addrof)
	          << op->getType() << op->getSourceRange();
	        return QualType();
	      }
	    }
	  } else if (op->getObjectKind() == OK_BitField) { // C99 6.5.3.2p1
	    // The operand cannot be a bit-field
	    AddressOfError = AO_Bit_Field;
	  } else if (op->getObjectKind() == OK_VectorComponent) {
	    // The operand cannot be an element of a vector
	    AddressOfError = AO_Vector_Element;
	  } else if (dcl) { // C99 6.5.3.2p1
	    // We have an lvalue with a decl. Make sure the decl is not declared
	    // with the register storage-class specifier.
	    if (const VarDecl *vd = dyn_cast<VarDecl>(dcl)) {
	      // in C++ it is not error to take address of a register
	      // variable (c++03 7.1.1P3)
	      if (vd->getStorageClass() == SC_Register &&
	          !getLangOpts().CPlusPlus) {
	        AddressOfError = AO_Register_Variable;
	      }
	    } else if (isa<MSPropertyDecl>(dcl)) {
	      AddressOfError = AO_Property_Expansion;
	    } else if (isa<FunctionTemplateDecl>(dcl)) {
	      return Context.OverloadTy;
	    } else if (isa<FieldDecl>(dcl) || isa<IndirectFieldDecl>(dcl)) {
	      // Okay: we can take the address of a field.
	      // Could be a pointer to member, though, if there is an explicit
	      // scope qualifier for the class.
	      if (isa<DeclRefExpr>(op) && cast<DeclRefExpr>(op)->getQualifier()) {
	        DeclContext *Ctx = dcl->getDeclContext();
	        if (Ctx && Ctx->isRecord()) {
	          if (dcl->getType()->isReferenceType()) {
	            Diag(OpLoc,
	                 diag::err_cannot_form_pointer_to_member_of_reference_type)
	              << dcl->getDeclName() << dcl->getType();
	            return QualType();
	          }

	          // Walk out of anonymous structs/unions so the member pointer is
	          // formed against the enclosing named record.
	          while (cast<RecordDecl>(Ctx)->isAnonymousStructOrUnion())
	            Ctx = Ctx->getParent();

	          QualType MPTy = Context.getMemberPointerType(
	              op->getType(),
	              Context.getTypeDeclType(cast<RecordDecl>(Ctx)).getTypePtr());
	          // Under the MS ABI, lock down the inheritance model now.
	          if (Context.getTargetInfo().getCXXABI().isMicrosoft())
	            (void)isCompleteType(OpLoc, MPTy);
	          return MPTy;
	        }
	      }
	    } else if (!isa<FunctionDecl>(dcl) && !isa<NonTypeTemplateParmDecl>(dcl) &&
	               !isa<BindingDecl>(dcl))
	      llvm_unreachable("Unknown/unexpected decl type");
	  }

	  if (AddressOfError != AO_No_Error) {
	    diagnoseAddressOfInvalidType(*this, OpLoc, op, AddressOfError);
	    return QualType();
	  }

	  if (lval == Expr::LV_IncompleteVoidType) {
	    // Taking the address of a void variable is technically illegal, but we
	    // allow it in cases which are otherwise valid.
	    // Example: "extern void x; void* y = &x;".
	    Diag(OpLoc, diag::ext_typecheck_addrof_void) << op->getSourceRange();
	  }

	  // If the operand has type "type", the result has type "pointer to type".
	  if (op->getType()->isObjCObjectType())
	    return Context.getObjCObjectPointerType(op->getType());

	  CheckAddressOfPackedMember(op);

	  return Context.getPointerType(op->getType());
	}

	/// Remember that a parameter covered by a nonnull attribute is being
	/// modified in the current function.
	///
	/// Nothing is recorded unless \p Exp is a direct reference to a ParmVarDecl.
	/// When the parameter's declaration context is a FunctionDecl, either that
	/// function or the parameter itself must carry NonNullAttr. The parameter is
	/// then added to ModifiedNonNullParams of the current function scope, if
	/// there is one.
	static void RecordModifiableNonNullParam(Sema &S, const Expr *Exp) {
	  const auto *Ref = dyn_cast<DeclRefExpr>(Exp);
	  if (!Ref)
	    return;

	  const Decl *Referenced = Ref->getDecl();
	  if (!Referenced)
	    return;

	  const auto *Param = dyn_cast<ParmVarDecl>(Referenced);
	  if (!Param)
	    return;

	  if (const auto *Owner = dyn_cast<FunctionDecl>(Param->getDeclContext())) {
	    const bool CoveredByNonNull =
	        Owner->hasAttr<NonNullAttr>() || Param->hasAttr<NonNullAttr>();
	    if (!CoveredByNonNull)
	      return;
	  }

	  FunctionScopeInfo *CurFn = S.getCurFunction();
	  if (!CurFn)
	    return;
	  if (CurFn->ModifiedNonNullParams.count(Param) == 0)
	    CurFn->ModifiedNonNullParams.insert(Param);
	}

	/// CheckIndirectionOperand - Type check unary indirection (prefix '*').
	///
	/// \param Op    The operand; the usual unary conversions are applied to it
	///              before its type is inspected.
	/// \param VK    Receives the value kind of the result on success.
	/// \param OpLoc Location of the '*' token, used for diagnostics.
	/// \returns the pointee type, Context.DependentTy for a type-dependent
	/// operand, or a null QualType after an error has been diagnosed.
	static QualType CheckIndirectionOperand(Sema &S, Expr *Op, ExprValueKind &VK,
	                                        SourceLocation OpLoc) {
	  if (Op->isTypeDependent())
	    return S.Context.DependentTy;

	  ExprResult ConvResult = S.UsualUnaryConversions(Op);
	  if (ConvResult.isInvalid())
	    return QualType();
	  Op = ConvResult.get();
	  QualType OpTy = Op->getType();
	  QualType Result;

	  if (isa<CXXReinterpretCastExpr>(Op)) {
	    QualType OpOrigType = Op->IgnoreParenCasts()->getType();
	    S.CheckCompatibleReinterpretCast(OpOrigType, OpTy, /*IsDereference*/true,
	                                     Op->getSourceRange());
	  }

	  if (const PointerType *PT = OpTy->getAs<PointerType>()) {
	    Result = PT->getPointeeType();
	  } else if (const ObjCObjectPointerType *OPT =
	                 OpTy->getAs<ObjCObjectPointerType>()) {
	    Result = OPT->getPointeeType();
	  } else {
	    // Not a pointer: resolve any placeholder and, if that produced a
	    // different expression, restart the check on it. Otherwise Result stays
	    // null and the error below is emitted.
	    ExprResult PR = S.CheckPlaceholderExpr(Op);
	    if (PR.isInvalid()) return QualType();
	    if (PR.get() != Op)
	      return CheckIndirectionOperand(S, PR.get(), VK, OpLoc);
	  }

	  if (Result.isNull()) {
	    S.Diag(OpLoc, diag::err_typecheck_indirection_requires_pointer)
	      << OpTy << Op->getSourceRange();
	    return QualType();
	  }

	  // Note that per both C89 and C99, indirection is always legal, even if Result
	  // is an incomplete type or void.  It would be possible to warn about
	  // dereferencing a void pointer, but it's completely well-defined, and such a
	  // warning is unlikely to catch any mistakes. In C++, indirection is not valid
	  // for pointers to 'void' but is fine for any other pointer type:
	  //
	  // C++ [expr.unary.op]p1:
	  //   [...] the expression to which [the unary * operator] is applied shall
	  //   be a pointer to an object type, or a pointer to a function type
	  if (S.getLangOpts().CPlusPlus && Result->isVoidType())
	    S.Diag(OpLoc, diag::ext_typecheck_indirection_through_void_pointer)
	      << OpTy << Op->getSourceRange();

	  // Dereferences are usually l-values...
	  VK = VK_LValue;

	  // ...except that certain expressions are never l-values in C.
	  if (!S.getLangOpts().CPlusPlus && Result.isCForbiddenLValueType())
	    VK = VK_RValue;

	  return Result;
	}

	/// Map a binary-operator token kind to the corresponding BinaryOperatorKind.
	/// Passing a token that is not listed below is a programming error and
	/// reaches llvm_unreachable.
	BinaryOperatorKind Sema::ConvertTokenKindToBinaryOpcode(tok::TokenKind Kind) {
	  switch (Kind) {
	  // Pointer-to-member access.
	  case tok::periodstar:          return BO_PtrMemD;
	  case tok::arrowstar:           return BO_PtrMemI;
	  // Multiplicative and additive.
	  case tok::star:                return BO_Mul;
	  case tok::slash:               return BO_Div;
	  case tok::percent:             return BO_Rem;
	  case tok::plus:                return BO_Add;
	  case tok::minus:               return BO_Sub;
	  // Shifts.
	  case tok::lessless:            return BO_Shl;
	  case tok::greatergreater:      return BO_Shr;
	  // Relational and equality.
	  case tok::lessequal:           return BO_LE;
	  case tok::less:                return BO_LT;
	  case tok::greaterequal:        return BO_GE;
	  case tok::greater:             return BO_GT;
	  case tok::exclaimequal:        return BO_NE;
	  case tok::equalequal:          return BO_EQ;
	  // Bitwise and logical.
	  case tok::amp:                 return BO_And;
	  case tok::caret:               return BO_Xor;
	  case tok::pipe:                return BO_Or;
	  case tok::ampamp:              return BO_LAnd;
	  case tok::pipepipe:            return BO_LOr;
	  // Simple and compound assignment.
	  case tok::equal:               return BO_Assign;
	  case tok::starequal:           return BO_MulAssign;
	  case tok::slashequal:          return BO_DivAssign;
	  case tok::percentequal:        return BO_RemAssign;
	  case tok::plusequal:           return BO_AddAssign;
	  case tok::minusequal:          return BO_SubAssign;
	  case tok::lesslessequal:       return BO_ShlAssign;
	  case tok::greatergreaterequal: return BO_ShrAssign;
	  case tok::ampequal:            return BO_AndAssign;
	  case tok::caretequal:          return BO_XorAssign;
	  case tok::pipeequal:           return BO_OrAssign;
	  // Comma.
	  case tok::comma:               return BO_Comma;
	  default:
	    break;
	  }
	  llvm_unreachable("Unknown binop!");
	}

	/// Map a prefix unary-operator token kind to the corresponding
	/// UnaryOperatorKind. The '++' and '--' tokens map to the prefix forms.
	/// Passing a token that is not listed below is a programming error and
	/// reaches llvm_unreachable.
	static inline UnaryOperatorKind ConvertTokenKindToUnaryOpcode(
	    tok::TokenKind Kind) {
	  switch (Kind) {
	  case tok::plusplus:         return UO_PreInc;
	  case tok::minusminus:       return UO_PreDec;
	  case tok::amp:              return UO_AddrOf;
	  case tok::star:             return UO_Deref;
	  case tok::plus:             return UO_Plus;
	  case tok::minus:            return UO_Minus;
	  case tok::tilde:            return UO_Not;
	  case tok::exclaim:          return UO_LNot;
	  case tok::kw___real:        return UO_Real;
	  case tok::kw___imag:        return UO_Imag;
	  case tok::kw___extension__: return UO_Extension;
	  default:
	    break;
	  }
	  llvm_unreachable("Unknown unary op!");
	}

	/// DiagnoseSelfAssignment - Emits a warning if a value is assigned to itself.
	/// This warning is only emitted for builtin assignment operations. It is also
	/// suppressed in the event of macro expansions.
	static void DiagnoseSelfAssignment(Sema &S, Expr LHSExpr, Expr RHSExpr,
	SourceLocation OpLoc) {
	if (S.inTemplateInstantiation())
	return;
	if (OpLoc.isInvalid() \|\| OpLoc.isMacroID())
	return;
	LHSExpr = LHSExpr->IgnoreParenImpCasts();
	RHSExpr = RHSExpr->IgnoreParenImpCasts();
	const DeclRefExpr *LHSDeclRef = dyn_cast<DeclRefExpr>(LHSExpr);
	const DeclRefExpr *RHSDeclRef = dyn_cast<DeclRefExpr>(RHSExpr);
	if (!LHSDeclRef \|\| !RHSDeclRef \|\|
	LHSDeclRef->getLocation().isMacroID() \|\|
	RHSDeclRef->getLocation().isMacroID())
	return;
	const ValueDecl *LHSDecl =
	cast<ValueDecl>(LHSDeclRef->getDecl()->getCanonicalDecl());
	const ValueDecl *RHSDecl =
	cast<ValueDecl>(RHSDeclRef->getDecl()->getCanonicalDecl());
	if (LHSDecl != RHSDecl)
	return;
	if (LHSDecl->getType().isVolatileQualified())
	return;
	if (const ReferenceType *RefTy = LHSDecl->getType()->getAs<ReferenceType>())
	if (RefTy->getPointeeType().isVolatileQualified())
	return;

	S.Diag(OpLoc, diag::warn_self_assignment)
	<< LHSDeclRef->getType()
	<< LHSExpr->getSourceRange() << RHSExpr->getSourceRange();
	}

	/// Check if a bitwise-& is performed on an Objective-C pointer.  This
	/// is usually indicative of introspection within the Objective-C pointer.
	///
	/// A warning is emitted only when one operand has Objective-C object pointer
	/// type (ignoring parentheses and casts) and the other operand is an integer
	/// literal. Does nothing unless Objective-C is enabled.
	static void checkObjCPointerIntrospection(Sema &S, ExprResult &L, ExprResult &R,
	                                          SourceLocation OpLoc) {
	  if (!S.getLangOpts().ObjC1)
	    return;

	  const Expr *ObjCPointerExpr = nullptr, *OtherExpr = nullptr;
	  const Expr *LHS = L.get();
	  const Expr *RHS = R.get();

	  if (LHS->IgnoreParenCasts()->getType()->isObjCObjectPointerType()) {
	    ObjCPointerExpr = LHS;
	    OtherExpr = RHS;
	  }
	  else if (RHS->IgnoreParenCasts()->getType()->isObjCObjectPointerType()) {
	    ObjCPointerExpr = RHS;
	    OtherExpr = LHS;
	  }

	  // This warning is deliberately made very specific to reduce false
	  // positives with logic that uses '&' for hashing.  This logic mainly
	  // looks for code trying to introspect into tagged pointers, which
	  // code should generally never do.
	  if (ObjCPointerExpr && isa<IntegerLiteral>(OtherExpr->IgnoreParenCasts())) {
	    unsigned Diag = diag::warn_objc_pointer_masking;
	    // Determine if we are introspecting the result of performSelectorXXX.
	    const Expr *Ex = ObjCPointerExpr->IgnoreParenCasts();
	    // Special case messages to -performSelector and friends, which
	    // can return non-pointer values boxed in a pointer value.
	    // Some clients may wish to silence warnings in this subcase.
	    if (const ObjCMessageExpr *ME = dyn_cast<ObjCMessageExpr>(Ex)) {
	      // Named 'Sel' rather than 'S' so it does not shadow the Sema parameter.
	      Selector Sel = ME->getSelector();
	      StringRef SelArg0 = Sel.getNameForSlot(0);
	      if (SelArg0.startswith("performSelector"))
	        Diag = diag::warn_objc_pointer_masking_performSelector;
	    }

	    S.Diag(OpLoc, Diag)
	      << ObjCPointerExpr->getSourceRange();
	  }
	}

	/// Return the declaration named by \p E when it is a DeclRefExpr, a
	/// MemberExpr or an ObjCIvarRefExpr.
	///
	/// \returns nullptr if \p E is null or is any other kind of expression.
	static NamedDecl *getDeclFromExpr(Expr *E) {
	  if (!E)
	    return nullptr;
	  if (auto *DRE = dyn_cast<DeclRefExpr>(E))
	    return DRE->getDecl();
	  if (auto *ME = dyn_cast<MemberExpr>(E))
	    return ME->getMemberDecl();
	  if (auto *IRE = dyn_cast<ObjCIvarRefExpr>(E))
	    return IRE->getDecl();
	  return nullptr;
	}

	/// CreateBuiltinBinOp - Creates a new built-in binary operation with
	/// operator @p Opc at location @p OpLoc. This routine only supports
	/// built-in operations; ActOnBinOp handles overloaded operators.
	///
	/// The operands are type-checked by the Check*Operands routine matching
	/// @p Opc. For compound assignments the computation types are tracked
	/// separately and a CompoundAssignOperator is built; every other opcode
	/// yields a plain BinaryOperator.
	///
	/// @returns the new expression, or ExprError() if checking failed.
	ExprResult Sema::CreateBuiltinBinOp(SourceLocation OpLoc,
	                                    BinaryOperatorKind Opc,
	                                    Expr *LHSExpr, Expr *RHSExpr) {
	  if (getLangOpts().CPlusPlus11 && isa<InitListExpr>(RHSExpr)) {
	    // The syntax only allows initializer lists on the RHS of assignment,
	    // so we don't need to worry about accepting invalid code for
	    // non-assignment operators.
	    // C++11 5.17p9:
	    //   The meaning of x = {v} [...] is that of x = T(v) [...]. The meaning
	    //   of x = {} is x = T().
	    InitializationKind Kind =
	        InitializationKind::CreateDirectList(RHSExpr->getLocStart());
	    InitializedEntity Entity =
	        InitializedEntity::InitializeTemporary(LHSExpr->getType());
	    InitializationSequence InitSeq(*this, Entity, Kind, RHSExpr);
	    ExprResult Init = InitSeq.Perform(*this, Entity, Kind, RHSExpr);
	    if (Init.isInvalid())
	      return Init;
	    RHSExpr = Init.get();
	  }

	  ExprResult LHS = LHSExpr, RHS = RHSExpr;
	  QualType ResultTy;     // Result type of the binary operator.
	  // The following two variables are used for compound assignment operators
	  QualType CompLHSTy;    // Type of LHS after promotions for computation
	  QualType CompResultTy; // Type of computation result
	  ExprValueKind VK = VK_RValue;
	  ExprObjectKind OK = OK_Ordinary;

	  if (!getLangOpts().CPlusPlus) {
	    // C cannot handle TypoExpr nodes on either side of a binop because it
	    // doesn't handle dependent types properly, so make sure any TypoExprs have
	    // been dealt with before checking the operands.
	    LHS = CorrectDelayedTyposInExpr(LHSExpr);
	    RHS = CorrectDelayedTyposInExpr(RHSExpr, [Opc, LHS](Expr *E) {
	      if (Opc != BO_Assign)
	        return ExprResult(E);
	      // Avoid correcting the RHS to the same Expr as the LHS.
	      Decl *D = getDeclFromExpr(E);
	      return (D && D == getDeclFromExpr(LHS.get())) ? ExprError() : E;
	    });
	    if (!LHS.isUsable() || !RHS.isUsable())
	      return ExprError();
	  }

	  if (getLangOpts().OpenCL) {
	    QualType LHSTy = LHSExpr->getType();
	    QualType RHSTy = RHSExpr->getType();
	    // OpenCLC v2.0 s6.13.11.1 allows atomic variables to be initialized by
	    // the ATOMIC_VAR_INIT macro.
	    if (LHSTy->isAtomicType() || RHSTy->isAtomicType()) {
	      SourceRange SR(LHSExpr->getLocStart(), RHSExpr->getLocEnd());
	      if (BO_Assign == Opc)
	        Diag(OpLoc, diag::err_opencl_atomic_init) << 0 << SR;
	      else
	        ResultTy = InvalidOperands(OpLoc, LHS, RHS);
	      return ExprError();
	    }

	    // OpenCL special types - image, sampler, pipe, and blocks are to be used
	    // only with a builtin functions and therefore should be disallowed here.
	    if (LHSTy->isImageType() || RHSTy->isImageType() ||
	        LHSTy->isSamplerT() || RHSTy->isSamplerT() ||
	        LHSTy->isPipeType() || RHSTy->isPipeType() ||
	        LHSTy->isBlockPointerType() || RHSTy->isBlockPointerType()) {
	      ResultTy = InvalidOperands(OpLoc, LHS, RHS);
	      return ExprError();
	    }
	  }

	  switch (Opc) {
	  case BO_Assign:
	    ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, QualType());
	    if (getLangOpts().CPlusPlus &&
	        LHS.get()->getObjectKind() != OK_ObjCProperty) {
	      VK = LHS.get()->getValueKind();
	      OK = LHS.get()->getObjectKind();
	    }
	    if (!ResultTy.isNull()) {
	      DiagnoseSelfAssignment(*this, LHS.get(), RHS.get(), OpLoc);
	      DiagnoseSelfMove(LHS.get(), RHS.get(), OpLoc);
	    }
	    RecordModifiableNonNullParam(*this, LHS.get());
	    break;
	  case BO_PtrMemD:
	  case BO_PtrMemI:
	    ResultTy = CheckPointerToMemberOperands(LHS, RHS, VK, OpLoc,
	                                            Opc == BO_PtrMemI);
	    break;
	  case BO_Mul:
	  case BO_Div:
	    ResultTy = CheckMultiplyDivideOperands(LHS, RHS, OpLoc, false,
	                                           Opc == BO_Div);
	    break;
	  case BO_Rem:
	    ResultTy = CheckRemainderOperands(LHS, RHS, OpLoc);
	    break;
	  case BO_Add:
	    ResultTy = CheckAdditionOperands(LHS, RHS, OpLoc, Opc);
	    break;
	  case BO_Sub:
	    ResultTy = CheckSubtractionOperands(LHS, RHS, OpLoc);
	    break;
	  case BO_Shl:
	  case BO_Shr:
	    ResultTy = CheckShiftOperands(LHS, RHS, OpLoc, Opc);
	    break;
	  case BO_LE:
	  case BO_LT:
	  case BO_GE:
	  case BO_GT:
	    ResultTy = CheckCompareOperands(LHS, RHS, OpLoc, Opc, true);
	    break;
	  case BO_EQ:
	  case BO_NE:
	    ResultTy = CheckCompareOperands(LHS, RHS, OpLoc, Opc, false);
	    break;
	  case BO_And:
	    checkObjCPointerIntrospection(*this, LHS, RHS, OpLoc);
	    LLVM_FALLTHROUGH;
	  case BO_Xor:
	  case BO_Or:
	    ResultTy = CheckBitwiseOperands(LHS, RHS, OpLoc, Opc);
	    break;
	  case BO_LAnd:
	  case BO_LOr:
	    ResultTy = CheckLogicalOperands(LHS, RHS, OpLoc, Opc);
	    break;
	  case BO_MulAssign:
	  case BO_DivAssign:
	    CompResultTy = CheckMultiplyDivideOperands(LHS, RHS, OpLoc, true,
	                                               Opc == BO_DivAssign);
	    CompLHSTy = CompResultTy;
	    if (!CompResultTy.isNull() && !LHS.isInvalid() && !RHS.isInvalid())
	      ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, CompResultTy);
	    break;
	  case BO_RemAssign:
	    CompResultTy = CheckRemainderOperands(LHS, RHS, OpLoc, true);
	    CompLHSTy = CompResultTy;
	    if (!CompResultTy.isNull() && !LHS.isInvalid() && !RHS.isInvalid())
	      ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, CompResultTy);
	    break;
	  case BO_AddAssign:
	    CompResultTy = CheckAdditionOperands(LHS, RHS, OpLoc, Opc, &CompLHSTy);
	    if (!CompResultTy.isNull() && !LHS.isInvalid() && !RHS.isInvalid())
	      ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, CompResultTy);
	    break;
	  case BO_SubAssign:
	    CompResultTy = CheckSubtractionOperands(LHS, RHS, OpLoc, &CompLHSTy);
	    if (!CompResultTy.isNull() && !LHS.isInvalid() && !RHS.isInvalid())
	      ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, CompResultTy);
	    break;
	  case BO_ShlAssign:
	  case BO_ShrAssign:
	    CompResultTy = CheckShiftOperands(LHS, RHS, OpLoc, Opc, true);
	    CompLHSTy = CompResultTy;
	    if (!CompResultTy.isNull() && !LHS.isInvalid() && !RHS.isInvalid())
	      ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, CompResultTy);
	    break;
	  case BO_AndAssign:
	  case BO_OrAssign: // fallthrough
	    // Only '&=' and '|=' get the self-assignment check; '^=' joins below.
	    DiagnoseSelfAssignment(*this, LHS.get(), RHS.get(), OpLoc);
	    LLVM_FALLTHROUGH;
	  case BO_XorAssign:
	    CompResultTy = CheckBitwiseOperands(LHS, RHS, OpLoc, Opc);
	    CompLHSTy = CompResultTy;
	    if (!CompResultTy.isNull() && !LHS.isInvalid() && !RHS.isInvalid())
	      ResultTy = CheckAssignmentOperands(LHS.get(), RHS, OpLoc, CompResultTy);
	    break;
	  case BO_Comma:
	    ResultTy = CheckCommaOperands(*this, LHS, RHS, OpLoc);
	    if (getLangOpts().CPlusPlus && !RHS.isInvalid()) {
	      VK = RHS.get()->getValueKind();
	      OK = RHS.get()->getObjectKind();
	    }
	    break;
	  }
	  if (ResultTy.isNull() || LHS.isInvalid() || RHS.isInvalid())
	    return ExprError();

	  // Check for array bounds violations for both sides of the BinaryOperator
	  CheckArrayAccess(LHS.get());
	  CheckArrayAccess(RHS.get());

	  if (const ObjCIsaExpr *OISA = dyn_cast<ObjCIsaExpr>(LHS.get()->IgnoreParenCasts())) {
	    NamedDecl *ObjectSetClass = LookupSingleName(TUScope,
	                                                 &Context.Idents.get("object_setClass"),
	                                                 SourceLocation(), LookupOrdinaryName);
	    // A fix-it rewriting to object_setClass(...) is only attached when that
	    // function is visible and the LHS is the isa expression itself.
	    if (ObjectSetClass && isa<ObjCIsaExpr>(LHS.get())) {
	      SourceLocation RHSLocEnd = getLocForEndOfToken(RHS.get()->getLocEnd());
	      Diag(LHS.get()->getExprLoc(), diag::warn_objc_isa_assign) <<
	        FixItHint::CreateInsertion(LHS.get()->getLocStart(), "object_setClass(") <<
	        FixItHint::CreateReplacement(SourceRange(OISA->getOpLoc(), OpLoc), ",") <<
	        FixItHint::CreateInsertion(RHSLocEnd, ")");
	    }
	    else
	      Diag(LHS.get()->getExprLoc(), diag::warn_objc_isa_assign);
	  }
	  else if (const ObjCIvarRefExpr *OIRE =
	           dyn_cast<ObjCIvarRefExpr>(LHS.get()->IgnoreParenCasts()))
	    DiagnoseDirectIsaAccess(*this, OIRE, OpLoc, RHS.get());

	  // A null CompResultTy means this was not a compound assignment.
	  if (CompResultTy.isNull())
	    return new (Context) BinaryOperator(LHS.get(), RHS.get(), Opc, ResultTy, VK,
	                                        OK, OpLoc, FPFeatures);
	  if (getLangOpts().CPlusPlus && LHS.get()->getObjectKind() !=
	      OK_ObjCProperty) {
	    VK = VK_LValue;
	    OK = LHS.get()->getObjectKind();
	  }
	  return new (Context) CompoundAssignOperator(
	      LHS.get(), RHS.get(), Opc, ResultTy, VK, OK, CompLHSTy, CompResultTy,
	      OpLoc, FPFeatures);
	}

	/// DiagnoseBitwisePrecedence - Emit a warning when bitwise and comparison
	/// operators are mixed in a way that suggests that the programmer forgot that
	/// comparison operators have higher precedence. The most typical example of
	/// such code is "flags & 0x0020 != 0", which is equivalent to "flags & 1".
	///
	/// Exactly one operand must be a comparison and neither operand may itself
	/// be a bitwise operation. Two notes follow the warning: one parenthesizing
	/// the comparison as written, one parenthesizing the bitwise operation first.
	static void DiagnoseBitwisePrecedence(Sema &Self, BinaryOperatorKind Opc,
	                                      SourceLocation OpLoc, Expr *LHSExpr,
	                                      Expr *RHSExpr) {
	  BinaryOperator *LHSBO = dyn_cast<BinaryOperator>(LHSExpr);
	  BinaryOperator *RHSBO = dyn_cast<BinaryOperator>(RHSExpr);

	  // Check that one of the sides is a comparison operator and the other isn't.
	  bool isLeftComp = LHSBO && LHSBO->isComparisonOp();
	  bool isRightComp = RHSBO && RHSBO->isComparisonOp();
	  if (isLeftComp == isRightComp)
	    return;

	  // Bitwise operations are sometimes used as eager logical ops.
	  // Don't diagnose this.
	  bool isLeftBitwise = LHSBO && LHSBO->isBitwiseOp();
	  bool isRightBitwise = RHSBO && RHSBO->isBitwiseOp();
	  if (isLeftBitwise || isRightBitwise)
	    return;

	  SourceRange DiagRange = isLeftComp ? SourceRange(LHSExpr->getLocStart(),
	                                                   OpLoc)
	                                     : SourceRange(OpLoc, RHSExpr->getLocEnd());
	  StringRef OpStr = isLeftComp ? LHSBO->getOpcodeStr() : RHSBO->getOpcodeStr();
	  // Range covering the bitwise operation together with the adjacent operand
	  // of the comparison, i.e. what the second note proposes to parenthesize.
	  SourceRange ParensRange = isLeftComp ?
	      SourceRange(LHSBO->getRHS()->getLocStart(), RHSExpr->getLocEnd())
	    : SourceRange(LHSExpr->getLocStart(), RHSBO->getLHS()->getLocEnd());

	  Self.Diag(OpLoc, diag::warn_precedence_bitwise_rel)
	    << DiagRange << BinaryOperator::getOpcodeStr(Opc) << OpStr;
	  SuggestParentheses(Self, OpLoc,
	    Self.PDiag(diag::note_precedence_silence) << OpStr,
	    (isLeftComp ? LHSExpr : RHSExpr)->getSourceRange());
	  SuggestParentheses(Self, OpLoc,
	    Self.PDiag(diag::note_precedence_bitwise_first)
	      << BinaryOperator::getOpcodeStr(Opc),
	    ParensRange);
	}

	/// \brief Warn about a '&&' expression nested directly inside a '||' one.
	///
	/// The warning is attached to the '&&' operator and is followed by a note
	/// carrying a fix-it that wraps the '&&' expression in parentheses.
	///
	/// \param OpLoc Location of the enclosing '||' operator.
	/// \param Bop   The nested expression; must be a logical AND.
	static void
	EmitDiagnosticForLogicalAndInLogicalOr(Sema &Self, SourceLocation OpLoc,
	                                       BinaryOperator *Bop) {
	  assert(Bop->getOpcode() == BO_LAnd);

	  const SourceLocation AndLoc = Bop->getOperatorLoc();
	  const SourceRange AndRange = Bop->getSourceRange();

	  Self.Diag(AndLoc, diag::warn_logical_and_in_logical_or)
	      << AndRange << OpLoc;
	  SuggestParentheses(Self, AndLoc,
	                     Self.PDiag(diag::note_precedence_silence)
	                         << Bop->getOpcodeStr(),
	                     AndRange);
	}

	/// \brief Returns true only if \p E is not value-dependent and can be
	/// evaluated as a boolean condition whose value is 'true'.
	static bool EvaluatesAsTrue(Sema &S, Expr *E) {
	  if (E->isValueDependent())
	    return false;

	  bool Value = false;
	  if (!E->EvaluateAsBooleanCondition(Value, S.getASTContext()))
	    return false;
	  return Value;
	}

	/// \brief Returns true only if \p E is not value-dependent and can be
	/// evaluated as a boolean condition whose value is 'false'.
	static bool EvaluatesAsFalse(Sema &S, Expr *E) {
	  if (E->isValueDependent())
	    return false;

	  bool Value = false;
	  if (!E->EvaluateAsBooleanCondition(Value, S.getASTContext()))
	    return false;
	  return !Value;
	}

	/// \brief Look for '&&' in the left hand of a '\|\|' expr.
	static void DiagnoseLogicalAndInLogicalOrLHS(Sema &S, SourceLocation OpLoc,
	Expr LHSExpr, Expr RHSExpr) {
	if (BinaryOperator *Bop = dyn_cast<BinaryOperator>(LHSExpr)) {
	if (Bop->getOpcode() == BO_LAnd) {
	// If it's "a && b \|\| 0" don't warn since the precedence doesn't matter.
	if (EvaluatesAsFalse(S, RHSExpr))
	return;
	// If it's "1 && a \|\| b" don't warn since the precedence doesn't matter.
	if (!EvaluatesAsTrue(S, Bop->getLHS()))
	return EmitDiagnosticForLogicalAndInLogicalOr(S, OpLoc, Bop);
	} else if (Bop->getOpcode() == BO_LOr) {
	if (BinaryOperator *RBop = dyn_cast<BinaryOperator>(Bop->getRHS())) {
	// If it's "a \|\| b && 1 \|\| c" we didn't warn earlier for
	// "a \|\| b && 1", but warn now.
	if (RBop->getOpcode() == BO_LAnd && EvaluatesAsTrue(S, RBop->getRHS()))
	return EmitDiagnosticForLogicalAndInLogicalOr(S, OpLoc, RBop);
	}
	}
	}
	}

	/// \brief Look for '&&' in the right hand of a '\|\|' expr.
	static void DiagnoseLogicalAndInLogicalOrRHS(Sema &S, SourceLocation OpLoc,
	Expr LHSExpr, Expr RHSExpr) {
	if (BinaryOperator *Bop = dyn_cast<BinaryOperator>(RHSExpr)) {
	if (Bop->getOpcode() == BO_LAnd) {
	// If it's "0 \|\| a && b" don't warn since the precedence doesn't matter.
	if (EvaluatesAsFalse(S, LHSExpr))
	return;
	// If it's "a \|\| b && 1" don't warn since the precedence doesn't matter.
	if (!EvaluatesAsTrue(S, Bop->getRHS()))
	return EmitDiagnosticForLogicalAndInLogicalOr(S, OpLoc, Bop);
	}
	}
	}

	/// \brief Check whether an operand of a bitwise operator is itself a bitwise
	/// operator whose opcode orders before \p Opc; if so, emit a warning plus a
	/// note suggesting parentheses around that inner expression.
	///
	/// \param S        Sema instance used to emit the diagnostics.
	/// \param Opc      Opcode of the outer bitwise operator.
	/// \param OpLoc    Location of the outer bitwise operator.
	/// \param SubExpr  One operand of the outer operator.
	static void DiagnoseBitwiseOpInBitwiseOp(Sema &S, BinaryOperatorKind Opc,
	                                         SourceLocation OpLoc, Expr *SubExpr) {
	  BinaryOperator *Inner = dyn_cast<BinaryOperator>(SubExpr);
	  if (!Inner)
	    return;
	  if (!Inner->isBitwiseOp())
	    return;
	  if (!(Inner->getOpcode() < Opc))
	    return;

	  const SourceLocation InnerLoc = Inner->getOperatorLoc();
	  S.Diag(InnerLoc, diag::warn_bitwise_op_in_bitwise_op)
	    << Inner->getOpcodeStr() << BinaryOperator::getOpcodeStr(Opc)
	    << Inner->getSourceRange() << OpLoc;
	  SuggestParentheses(S, InnerLoc,
	                     S.PDiag(diag::note_precedence_silence)
	                       << Inner->getOpcodeStr(),
	                     Inner->getSourceRange());
	}

	/// \brief Warn when an operand of a shift is an unparenthesized '+' or '-'
	/// expression, and attach a note suggesting parentheses around it.
	///
	/// \param S        Sema instance used to emit the diagnostics.
	/// \param OpLoc    Location of the shift operator.
	/// \param SubExpr  One operand of the shift.
	/// \param Shift    Spelling of the shift operator, streamed into the warning.
	static void DiagnoseAdditionInShift(Sema &S, SourceLocation OpLoc,
	                                    Expr *SubExpr, StringRef Shift) {
	  if (BinaryOperator *Bop = dyn_cast<BinaryOperator>(SubExpr)) {
	    if (Bop->getOpcode() == BO_Add || Bop->getOpcode() == BO_Sub) {
	      StringRef Op = Bop->getOpcodeStr();
	      S.Diag(Bop->getOperatorLoc(), diag::warn_addition_in_bitshift)
	        << Bop->getSourceRange() << OpLoc << Shift << Op;
	      SuggestParentheses(S, Bop->getOperatorLoc(),
	        S.PDiag(diag::note_precedence_silence) << Op,
	        Bop->getSourceRange());
	    }
	  }
	}

	static void DiagnoseShiftCompare(Sema &S, SourceLocation OpLoc,
	Expr LHSExpr, Expr RHSExpr) {
	CXXOperatorCallExpr *OCE = dyn_cast<CXXOperatorCallExpr>(LHSExpr);
	if (!OCE)
	return;

	FunctionDecl *FD = OCE->getDirectCallee();
	if (!FD \|\| !FD->isOverloadedOperator())
	return;

	OverloadedOperatorKind Kind = FD->getOverloadedOperator();
	if (Kind != OO_LessLess && Kind != OO_GreaterGreater)
	return;

	S.Diag(OpLoc, diag::warn_overloaded_shift_in_comparison)
	<< LHSExpr->getSourceRange() << RHSExpr->getSourceRange()
	<< (Kind == OO_LessLess);
	SuggestParentheses(S, OCE->getOperatorLoc(),
	S.PDiag(diag::note_precedence_silence)
	<< (Kind == OO_LessLess ? "<<" : ">>"),
	OCE->getSourceRange());
	SuggestParentheses(S, OpLoc,
	S.PDiag(diag::note_evaluate_comparison_first),
	SourceRange(OCE->getArg(1)->getLocStart(),
	RHSExpr->getLocEnd()));
	}

	/// DiagnoseBinOpPrecedence - Emit warnings for expressions with tricky
	/// precedence.
	///
	/// \param Self     Sema instance used to emit the diagnostics.
	/// \param Opc      Opcode of the binary operator being built.
	/// \param OpLoc    Location of that operator.
	/// \param LHSExpr  Left operand.
	/// \param RHSExpr  Right operand.
	static void DiagnoseBinOpPrecedence(Sema &Self, BinaryOperatorKind Opc,
	                                    SourceLocation OpLoc, Expr *LHSExpr,
	                                    Expr *RHSExpr){
	  // Diagnose "arg1 'bitwise' arg2 'eq' arg3".
	  if (BinaryOperator::isBitwiseOp(Opc))
	    DiagnoseBitwisePrecedence(Self, Opc, OpLoc, LHSExpr, RHSExpr);

	  // Diagnose "arg1 & arg2 | arg3"
	  if ((Opc == BO_Or || Opc == BO_Xor) &&
	      !OpLoc.isMacroID()/* Don't warn in macros. */) {
	    DiagnoseBitwiseOpInBitwiseOp(Self, Opc, OpLoc, LHSExpr);
	    DiagnoseBitwiseOpInBitwiseOp(Self, Opc, OpLoc, RHSExpr);
	  }

	  // Warn about arg1 || arg2 && arg3, as GCC 4.3+ does.
	  // We don't warn for 'assert(a || b && "bad")' since this is safe.
	  if (Opc == BO_LOr && !OpLoc.isMacroID()/* Don't warn in macros. */) {
	    DiagnoseLogicalAndInLogicalOrLHS(Self, OpLoc, LHSExpr, RHSExpr);
	    DiagnoseLogicalAndInLogicalOrRHS(Self, OpLoc, LHSExpr, RHSExpr);
	  }

	  // Diagnose "arg1 + arg2 << arg3" and friends. For '<<' this only applies
	  // when the left operand has integral type.
	  if ((Opc == BO_Shl && LHSExpr->getType()->isIntegralType(Self.getASTContext()))
	      || Opc == BO_Shr) {
	    StringRef Shift = BinaryOperator::getOpcodeStr(Opc);
	    DiagnoseAdditionInShift(Self, OpLoc, LHSExpr, Shift);
	    DiagnoseAdditionInShift(Self, OpLoc, RHSExpr, Shift);
	  }

	  // Warn on overloaded shift operators and comparisons, such as:
	  // cout << 5 == 4;
	  if (BinaryOperator::isComparisonOp(Opc))
	    DiagnoseShiftCompare(Self, OpLoc, LHSExpr, RHSExpr);
	}

	// Binary Operators. 'Kind' is the token kind for the operator.
	//
	// Converts the token kind to a binary opcode, emits precedence warnings for
	// the two operands, and then builds the operation via BuildBinOp. Both
	// operands must be non-null.
	ExprResult Sema::ActOnBinOp(Scope *S, SourceLocation TokLoc,
	                            tok::TokenKind Kind,
	                            Expr *LHSExpr, Expr *RHSExpr) {
	  BinaryOperatorKind Opc = ConvertTokenKindToBinaryOpcode(Kind);
	  assert(LHSExpr && "ActOnBinOp(): missing left expression");
	  assert(RHSExpr && "ActOnBinOp(): missing right expression");

	  // Emit warnings for tricky precedence issues, e.g. "bitfield & 0x4 == 0"
	  DiagnoseBinOpPrecedence(*this, Opc, TokLoc, LHSExpr, RHSExpr);

	  return BuildBinOp(S, TokLoc, Opc, LHSExpr, RHSExpr);
	}

	/// Build an overloaded binary operator expression in the given scope.
	///
	/// \param S      Sema instance performing the lookup and building the node.
	/// \param Sc     Scope used for operator-name lookup; lookup is skipped when
	///               this is null.
	/// \param OpLoc  Location of the operator.
	/// \param Opc    Opcode of the binary operator.
	/// \param LHS    Left operand.
	/// \param RHS    Right operand.
	static ExprResult BuildOverloadedBinOp(Sema &S, Scope *Sc, SourceLocation OpLoc,
	                                       BinaryOperatorKind Opc,
	                                       Expr *LHS, Expr *RHS) {
	  // Find all of the overloaded operators visible from this
	  // point. We perform both an operator-name lookup from the local
	  // scope and an argument-dependent lookup based on the types of
	  // the arguments.
	  UnresolvedSet<16> Functions;
	  OverloadedOperatorKind OverOp
	    = BinaryOperator::getOverloadedOperator(Opc);
	  // No scope-based lookup is done for '=' or for opcodes that have no
	  // overloaded-operator counterpart.
	  if (Sc && OverOp != OO_None && OverOp != OO_Equal)
	    S.LookupOverloadedOperatorName(OverOp, Sc, LHS->getType(),
	                                   RHS->getType(), Functions);

	  // Build the (potentially-overloaded, potentially-dependent)
	  // binary operation.
	  return S.CreateOverloadedBinOp(OpLoc, Opc, Functions, LHS, RHS);
	}

	/// Build a binary operation from two already-parsed operands, resolving
	/// placeholder types first and then dispatching to pseudo-object assignment
	/// checking, overloaded-operator building, or the built-in operator.
	///
	/// \param S        Scope passed on to overloaded-operator lookup.
	/// \param OpLoc    Location of the operator.
	/// \param Opc      Opcode of the binary operator.
	/// \param LHSExpr  Left operand.
	/// \param RHSExpr  Right operand.
	/// \returns the built expression, or ExprError() if placeholder resolution
	/// fails or the missing-'template' diagnostic below is emitted.
	ExprResult Sema::BuildBinOp(Scope *S, SourceLocation OpLoc,
	                            BinaryOperatorKind Opc,
	                            Expr *LHSExpr, Expr *RHSExpr) {
	  // We want to end up calling one of checkPseudoObjectAssignment
	  // (if the LHS is a pseudo-object), BuildOverloadedBinOp (if
	  // both expressions are overloadable or either is type-dependent),
	  // or CreateBuiltinBinOp (in any other case). We also want to get
	  // any placeholder types out of the way.

	  // Handle pseudo-objects in the LHS.
	  if (const BuiltinType *pty = LHSExpr->getType()->getAsPlaceholderType()) {
	    // Assignments with a pseudo-object l-value need special analysis.
	    if (pty->getKind() == BuiltinType::PseudoObject &&
	        BinaryOperator::isAssignmentOp(Opc))
	      return checkPseudoObjectAssignment(S, OpLoc, Opc, LHSExpr, RHSExpr);

	    // Don't resolve overloads if the other type is overloadable.
	    if (getLangOpts().CPlusPlus && pty->getKind() == BuiltinType::Overload) {
	      // We can't actually test that if we still have a placeholder,
	      // though. Fortunately, none of the exceptions we see in that
	      // code below are valid when the LHS is an overload set. Note
	      // that an overload set can be dependently-typed, but it never
	      // instantiates to having an overloadable type.
	      ExprResult resolvedRHS = CheckPlaceholderExpr(RHSExpr);
	      if (resolvedRHS.isInvalid()) return ExprError();
	      RHSExpr = resolvedRHS.get();

	      if (RHSExpr->isTypeDependent() ||
	          RHSExpr->getType()->isOverloadableType())
	        return BuildOverloadedBinOp(*this, S, OpLoc, Opc, LHSExpr, RHSExpr);
	    }

	    // If we're instantiating "a.x < b" or "A::x < b" and 'x' names a function
	    // template, diagnose the missing 'template' keyword instead of diagnosing
	    // an invalid use of a bound member function.
	    //
	    // Note that "A::x < b" might be valid if 'b' has an overloadable type due
	    // to C++1z [over.over]/1.4, but we already checked for that case above.
	    if (Opc == BO_LT && inTemplateInstantiation() &&
	        (pty->getKind() == BuiltinType::BoundMember ||
	         pty->getKind() == BuiltinType::Overload)) {
	      auto *OE = dyn_cast<OverloadExpr>(LHSExpr);
	      if (OE && !OE->hasTemplateKeyword() && !OE->hasExplicitTemplateArgs() &&
	          std::any_of(OE->decls_begin(), OE->decls_end(), [](NamedDecl *ND) {
	            return isa<FunctionTemplateDecl>(ND);
	          })) {
	        Diag(OE->getQualifier() ? OE->getQualifierLoc().getBeginLoc()
	                                : OE->getNameLoc(),
	             diag::err_template_kw_missing)
	          << OE->getName().getAsString() << "";
	        return ExprError();
	      }
	    }

	    ExprResult LHS = CheckPlaceholderExpr(LHSExpr);
	    if (LHS.isInvalid()) return ExprError();
	    LHSExpr = LHS.get();
	  }

	  // Handle pseudo-objects in the RHS.
	  if (const BuiltinType *pty = RHSExpr->getType()->getAsPlaceholderType()) {
	    // An overload in the RHS can potentially be resolved by the type
	    // being assigned to.
	    if (Opc == BO_Assign && pty->getKind() == BuiltinType::Overload) {
	      if (getLangOpts().CPlusPlus &&
	          (LHSExpr->isTypeDependent() || RHSExpr->isTypeDependent() ||
	           LHSExpr->getType()->isOverloadableType()))
	        return BuildOverloadedBinOp(*this, S, OpLoc, Opc, LHSExpr, RHSExpr);

	      return CreateBuiltinBinOp(OpLoc, Opc, LHSExpr, RHSExpr);
	    }

	    // Don't resolve overloads if the other type is overloadable.
	    if (getLangOpts().CPlusPlus && pty->getKind() == BuiltinType::Overload &&
	        LHSExpr->getType()->isOverloadableType())
	      return BuildOverloadedBinOp(*this, S, OpLoc, Opc, LHSExpr, RHSExpr);

	    ExprResult resolvedRHS = CheckPlaceholderExpr(RHSExpr);
	    if (!resolvedRHS.isUsable()) return ExprError();
	    RHSExpr = resolvedRHS.get();
	  }

	  if (getLangOpts().CPlusPlus) {
	    // If either expression is type-dependent, always build an
	    // overloaded op.
	    if (LHSExpr->isTypeDependent() || RHSExpr->isTypeDependent())
	      return BuildOverloadedBinOp(*this, S, OpLoc, Opc, LHSExpr, RHSExpr);

	    // Otherwise, build an overloaded op if either expression has an
	    // overloadable type.
	    if (LHSExpr->getType()->isOverloadableType() ||
	        RHSExpr->getType()->isOverloadableType())
	      return BuildOverloadedBinOp(*this, S, OpLoc, Opc, LHSExpr, RHSExpr);
	  }

	  // Build a built-in binary operation.
	  return CreateBuiltinBinOp(OpLoc, Opc, LHSExpr, RHSExpr);
	}

	/// Build a built-in (non-overloaded) unary operator expression.
	///
	/// Applies the operand conversions and type checks specific to \p Opc,
	/// computes the result type, value kind and object kind, and creates the
	/// UnaryOperator node. For UO_Coawait the input expression is returned
	/// unchanged instead.
	///
	/// \param OpLoc      Location of the operator.
	/// \param Opc        Opcode of the unary operator.
	/// \param InputExpr  The operand.
	/// \returns the built expression, or ExprError() if a check fails.
	ExprResult Sema::CreateBuiltinUnaryOp(SourceLocation OpLoc,
	                                      UnaryOperatorKind Opc,
	                                      Expr *InputExpr) {
	  ExprResult Input = InputExpr;
	  ExprValueKind VK = VK_RValue;
	  ExprObjectKind OK = OK_Ordinary;
	  QualType resultType;
	  if (getLangOpts().OpenCL) {
	    QualType Ty = InputExpr->getType();
	    // The only legal unary operation for atomics is '&'.
	    if ((Opc != UO_AddrOf && Ty->isAtomicType()) ||
	        // OpenCL special types - image, sampler, pipe, and blocks are to be used
	        // only with a builtin functions and therefore should be disallowed here.
	        (Ty->isImageType() || Ty->isSamplerT() || Ty->isPipeType()
	        || Ty->isBlockPointerType())) {
	      return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	                       << InputExpr->getType()
	                       << Input.get()->getSourceRange());
	    }
	  }
	  switch (Opc) {
	  case UO_PreInc:
	  case UO_PreDec:
	  case UO_PostInc:
	  case UO_PostDec:
	    // The two boolean arguments are "is an increment" and "is a prefix op".
	    resultType = CheckIncrementDecrementOperand(*this, Input.get(), VK, OK,
	                                                OpLoc,
	                                                Opc == UO_PreInc ||
	                                                Opc == UO_PostInc,
	                                                Opc == UO_PreInc ||
	                                                Opc == UO_PreDec);
	    break;
	  case UO_AddrOf:
	    resultType = CheckAddressOfOperand(Input, OpLoc);
	    RecordModifiableNonNullParam(*this, InputExpr);
	    break;
	  case UO_Deref: {
	    Input = DefaultFunctionArrayLvalueConversion(Input.get());
	    if (Input.isInvalid()) return ExprError();
	    resultType = CheckIndirectionOperand(*this, Input.get(), VK, OpLoc);
	    break;
	  }
	  case UO_Plus:
	  case UO_Minus:
	    Input = UsualUnaryConversions(Input.get());
	    if (Input.isInvalid()) return ExprError();
	    resultType = Input.get()->getType();
	    if (resultType->isDependentType())
	      break;
	    if (resultType->isArithmeticType()) // C99 6.5.3.3p1
	      break;
	    else if (resultType->isVectorType() &&
	             // The z vector extensions don't allow + or - with bool vectors.
	             (!Context.getLangOpts().ZVector ||
	              resultType->getAs<VectorType>()->getVectorKind() !=
	              VectorType::AltiVecBool))
	      break;
	    else if (getLangOpts().CPlusPlus && // C++ [expr.unary.op]p6
	             Opc == UO_Plus &&
	             resultType->isPointerType())
	      break;

	    // None of the accepted operand categories matched.
	    return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	      << resultType << Input.get()->getSourceRange());

	  case UO_Not: // bitwise complement
	    Input = UsualUnaryConversions(Input.get());
	    if (Input.isInvalid())
	      return ExprError();
	    resultType = Input.get()->getType();
	    if (resultType->isDependentType())
	      break;
	    // C99 6.5.3.3p1. We allow complex int and float as a GCC extension.
	    if (resultType->isComplexType() || resultType->isComplexIntegerType())
	      // C99 does not support '~' for complex conjugation.
	      Diag(OpLoc, diag::ext_integer_complement_complex)
	          << resultType << Input.get()->getSourceRange();
	    else if (resultType->hasIntegerRepresentation())
	      break;
	    else if (resultType->isExtVectorType() && Context.getLangOpts().OpenCL) {
	      // OpenCL v1.1 s6.3.f: The bitwise operator not (~) does not operate
	      // on vector float types.
	      QualType T = resultType->getAs<ExtVectorType>()->getElementType();
	      if (!T->isIntegerType())
	        return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	                         << resultType << Input.get()->getSourceRange());
	    } else {
	      return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	                       << resultType << Input.get()->getSourceRange());
	    }
	    break;

	  case UO_LNot: // logical negation
	    // Unlike +/-/~, integer promotions aren't done here (C99 6.5.3.3p5).
	    Input = DefaultFunctionArrayLvalueConversion(Input.get());
	    if (Input.isInvalid()) return ExprError();
	    resultType = Input.get()->getType();

	    // Though we still have to promote half FP to float...
	    if (resultType->isHalfType() && !Context.getLangOpts().NativeHalfType) {
	      Input = ImpCastExprToType(Input.get(), Context.FloatTy, CK_FloatingCast).get();
	      resultType = Context.FloatTy;
	    }

	    if (resultType->isDependentType())
	      break;
	    if (resultType->isScalarType() && !isScopedEnumerationType(resultType)) {
	      // C99 6.5.3.3p1: ok, fallthrough;
	      if (Context.getLangOpts().CPlusPlus) {
	        // C++03 [expr.unary.op]p8, C++0x [expr.unary.op]p9:
	        // operand contextually converted to bool.
	        Input = ImpCastExprToType(Input.get(), Context.BoolTy,
	                                  ScalarTypeToBooleanCastKind(resultType));
	      } else if (Context.getLangOpts().OpenCL &&
	                 Context.getLangOpts().OpenCLVersion < 120) {
	        // OpenCL v1.1 6.3.h: The logical operator not (!) does not
	        // operate on scalar float types.
	        if (!resultType->isIntegerType() && !resultType->isPointerType())
	          return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	                           << resultType << Input.get()->getSourceRange());
	      }
	    } else if (resultType->isExtVectorType()) {
	      if (Context.getLangOpts().OpenCL &&
	          Context.getLangOpts().OpenCLVersion < 120) {
	        // OpenCL v1.1 6.3.h: The logical operator not (!) does not
	        // operate on vector float types.
	        QualType T = resultType->getAs<ExtVectorType>()->getElementType();
	        if (!T->isIntegerType())
	          return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	                           << resultType << Input.get()->getSourceRange());
	      }
	      // Vector logical not returns the signed variant of the operand type.
	      resultType = GetSignedVectorType(resultType);
	      break;
	    } else {
	      // FIXME: GCC's vector extension permits the usage of '!' with a vector
	      //        type in C++. We should allow that here too.
	      return ExprError(Diag(OpLoc, diag::err_typecheck_unary_expr)
	        << resultType << Input.get()->getSourceRange());
	    }

	    // LNot always has type int. C99 6.5.3.3p5.
	    // In C++, it's bool. C++ 5.3.1p8
	    resultType = Context.getLogicalOperationType();
	    break;
	  case UO_Real:
	  case UO_Imag:
	    resultType = CheckRealImagOperand(*this, Input, OpLoc, Opc == UO_Real);
	    // _Real maps ordinary l-values into ordinary l-values. _Imag maps ordinary
	    // complex l-values to ordinary l-values and all other values to r-values.
	    if (Input.isInvalid()) return ExprError();
	    if (Opc == UO_Real || Input.get()->getType()->isAnyComplexType()) {
	      if (Input.get()->getValueKind() != VK_RValue &&
	          Input.get()->getObjectKind() == OK_Ordinary)
	        VK = Input.get()->getValueKind();
	    } else if (!getLangOpts().CPlusPlus) {
	      // In C, a volatile scalar is read by __imag. In C++, it is not.
	      Input = DefaultLvalueConversion(Input.get());
	    }
	    break;
	  case UO_Extension:
	    // __extension__ passes type, value kind and object kind through.
	    resultType = Input.get()->getType();
	    VK = Input.get()->getValueKind();
	    OK = Input.get()->getObjectKind();
	    break;
	  case UO_Coawait:
	    // It's unnecessary to represent the pass-through operator co_await in the
	    // AST; just return the input expression instead.
	    assert(!Input.get()->getType()->isDependentType() &&
	                   "the co_await expression must be non-dependant before "
	                   "building operator co_await");
	    return Input;
	  }
	  if (resultType.isNull() || Input.isInvalid())
	    return ExprError();

	  // Check for array bounds violations in the operand of the UnaryOperator,
	  // except for the '*' and '&' operators that have to be handled specially
	  // by CheckArrayAccess (as there are special cases like &array[arraysize]
	  // that are explicitly defined as valid by the standard).
	  if (Opc != UO_AddrOf && Opc != UO_Deref)
	    CheckArrayAccess(Input.get());

	  return new (Context)
	      UnaryOperator(Input.get(), Opc, resultType, VK, OK, OpLoc);
	}

	/// \brief Determine whether the given expression is a qualified member
	/// access expression, of a form that could be turned into a pointer to member
	/// with the address-of operator.
	///
	/// \returns true for a qualified DeclRefExpr naming a field, indirect field
	/// or instance method of a class, and for a qualified UnresolvedLookupExpr
	/// whose leading declarations include an instance method; false otherwise.
	static bool isQualifiedMemberAccess(Expr *E) {
	  if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
	    if (!DRE->getQualifier())
	      return false;

	    ValueDecl *VD = DRE->getDecl();
	    if (!VD->isCXXClassMember())
	      return false;

	    if (isa<FieldDecl>(VD) || isa<IndirectFieldDecl>(VD))
	      return true;
	    if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(VD))
	      return Method->isInstance();

	    return false;
	  }

	  if (UnresolvedLookupExpr *ULE = dyn_cast<UnresolvedLookupExpr>(E)) {
	    if (!ULE->getQualifier())
	      return false;

	    // Scan the overload set, stopping at the first declaration that is not
	    // a method.
	    for (NamedDecl *D : ULE->decls()) {
	      if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(D)) {
	        if (Method->isInstance())
	          return true;
	      } else {
	        // Overload set does not contain methods.
	        break;
	      }
	    }

	    return false;
	  }

	  return false;
	}

	/// Build a unary operation, resolving placeholder types first and then
	/// choosing between an overloaded operator and the built-in operator.
	///
	/// \param S      Scope used for overloaded-operator lookup; lookup is skipped
	///               when this is null.
	/// \param OpLoc  Location of the operator.
	/// \param Opc    Opcode of the unary operator.
	/// \param Input  The operand.
	/// \returns the built expression, or ExprError() if placeholder resolution
	/// fails.
	ExprResult Sema::BuildUnaryOp(Scope *S, SourceLocation OpLoc,
	                              UnaryOperatorKind Opc, Expr *Input) {
	  // First things first: handle placeholders so that the
	  // overloaded-operator check considers the right type.
	  if (const BuiltinType *pty = Input->getType()->getAsPlaceholderType()) {
	    // Increment and decrement of pseudo-object references.
	    if (pty->getKind() == BuiltinType::PseudoObject &&
	        UnaryOperator::isIncrementDecrementOp(Opc))
	      return checkPseudoObjectIncDec(S, OpLoc, Opc, Input);

	    // extension is always a builtin operator.
	    if (Opc == UO_Extension)
	      return CreateBuiltinUnaryOp(OpLoc, Opc, Input);

	    // & gets special logic for several kinds of placeholder.
	    // The builtin code knows what to do.
	    if (Opc == UO_AddrOf &&
	        (pty->getKind() == BuiltinType::Overload ||
	         pty->getKind() == BuiltinType::UnknownAny ||
	         pty->getKind() == BuiltinType::BoundMember))
	      return CreateBuiltinUnaryOp(OpLoc, Opc, Input);

	    // Anything else needs to be handled now.
	    ExprResult Result = CheckPlaceholderExpr(Input);
	    if (Result.isInvalid()) return ExprError();
	    Input = Result.get();
	  }

	  // '&' applied to a qualified member access is excluded here and goes to
	  // the built-in operator below.
	  if (getLangOpts().CPlusPlus && Input->getType()->isOverloadableType() &&
	      UnaryOperator::getOverloadedOperator(Opc) != OO_None &&
	      !(Opc == UO_AddrOf && isQualifiedMemberAccess(Input))) {
	    // Find all of the overloaded operators visible from this
	    // point. We perform both an operator-name lookup from the local
	    // scope and an argument-dependent lookup based on the types of
	    // the arguments.
	    UnresolvedSet<16> Functions;
	    OverloadedOperatorKind OverOp = UnaryOperator::getOverloadedOperator(Opc);
	    if (S && OverOp != OO_None)
	      LookupOverloadedOperatorName(OverOp, S, Input->getType(), QualType(),
	                                   Functions);

	    return CreateOverloadedUnaryOp(OpLoc, Opc, Functions, Input);
	  }

	  return CreateBuiltinUnaryOp(OpLoc, Opc, Input);
	}

	// Unary Operators. 'Op' is the token kind for the operator; it is converted
	// to a unary opcode and handed to BuildUnaryOp.
	ExprResult Sema::ActOnUnaryOp(Scope *S, SourceLocation OpLoc,
	                              tok::TokenKind Op, Expr *Input) {
	  const UnaryOperatorKind Opc = ConvertTokenKindToUnaryOpcode(Op);
	  return BuildUnaryOp(S, OpLoc, Opc, Input);
	}

	/// ActOnAddrLabel - Handle the GNU address-of-label extension: "&&foo".
	/// Marks the label as used and builds an AddrLabelExpr of type 'void*'.
	ExprResult Sema::ActOnAddrLabel(SourceLocation OpLoc, SourceLocation LabLoc,
	                                LabelDecl *TheDecl) {
	  TheDecl->markUsed(Context);

	  // The address of a label always has type 'void*'.
	  QualType VoidPtrTy = Context.getPointerType(Context.VoidTy);
	  return new (Context) AddrLabelExpr(OpLoc, LabLoc, TheDecl, VoidPtrTy);
	}

	/// Given the last statement in a statement-expression, check whether
	/// the result is a producing expression (like a call to an
	/// ns_returns_retained function) and, if so, rebuild it to hoist the
	/// release out of the full-expression. Otherwise, return null.
	/// Cannot fail.
	///
	/// \param Statement  The statement to inspect.
	/// \returns the ExprWithCleanups with its CK_ARCConsumeObject cast removed,
	/// or null if \p Statement does not have that shape.
	static Expr *maybeRebuildARCConsumingStmt(Stmt *Statement) {
	  // Should always be wrapped with one of these.
	  ExprWithCleanups *cleanups = dyn_cast<ExprWithCleanups>(Statement);
	  if (!cleanups) return nullptr;

	  ImplicitCastExpr *cast = dyn_cast<ImplicitCastExpr>(cleanups->getSubExpr());
	  if (!cast || cast->getCastKind() != CK_ARCConsumeObject)
	    return nullptr;

	  // Splice out the cast. This shouldn't modify any interesting
	  // features of the statement.
	  Expr *producer = cast->getSubExpr();
	  assert(producer->getType() == cast->getType());
	  assert(producer->getValueKind() == cast->getValueKind());
	  cleanups->setSubExpr(producer);
	  return cleanups;
	}

	/// Called when a statement expression begins: pushes a new expression
	/// evaluation context that reuses the kind of the current innermost one.
	void Sema::ActOnStartStmtExpr() {
	  const auto EnclosingContext = ExprEvalContexts.back().Context;
	  PushExpressionEvaluationContext(EnclosingContext);
	}

	/// Abandon a statement expression: discard the cleanups recorded in the
	/// current evaluation context and pop that context.
	///
	/// This is also called by TreeTransform when leaving a StmtExpr scope
	/// without rebuilding anything.
	void Sema::ActOnStmtExprError() {
	  DiscardCleanupsInEvaluationContext();
	  PopExpressionEvaluationContext();
	}

	/// Finish a GNU statement expression "({ ... })" and build its StmtExpr node.
	///
	/// Pops the current expression evaluation context, then derives the
	/// expression's type from the last statement of the compound body (looking
	/// through any labels). The type stays 'void' when the body is empty or its
	/// last statement is not an expression.
	///
	/// \param LPLoc    Location passed to the StmtExpr and used for result
	///                 initialization.
	/// \param SubStmt  The body; must be a non-null CompoundStmt.
	/// \param RPLoc    Location passed to the StmtExpr.
	/// \returns the StmtExpr (passed through MaybeBindToTemporary when the last
	/// expression was rewritten), or ExprError() if a conversion or the result
	/// initialization fails.
	ExprResult
	Sema::ActOnStmtExpr(SourceLocation LPLoc, Stmt *SubStmt,
	SourceLocation RPLoc) { // "({..})"
	assert(SubStmt && isa<CompoundStmt>(SubStmt) && "Invalid action invocation!");
	CompoundStmt *Compound = cast<CompoundStmt>(SubStmt);

	// After unrecoverable errors the pending cleanups are dropped, so the
	// assertion below only has to hold for well-formed input.
	if (hasAnyUnrecoverableErrorsInThisFunction())
	DiscardCleanupsInEvaluationContext();
	assert(!Cleanup.exprNeedsCleanups() &&
	"cleanups within StmtExpr not correctly bound!");
	PopExpressionEvaluationContext();

	// FIXME: there are a variety of strange constraints to enforce here, for
	// example, it is not possible to goto into a stmt expression apparently.
	// More semantic analysis is needed.

	// If there are sub-stmts in the compound stmt, take the type of the last one
	// as the type of the stmtexpr.
	QualType Ty = Context.VoidTy;
	bool StmtExprMayBindToTemp = false;
	if (!Compound->body_empty()) {
	Stmt *LastStmt = Compound->body_back();
	// Innermost label wrapping the last statement, if any; the rewritten
	// expression is stored back through it.
	LabelStmt *LastLabelStmt = nullptr;
	// If LastStmt is a label, skip down through into the body.
	while (LabelStmt *Label = dyn_cast<LabelStmt>(LastStmt)) {
	LastLabelStmt = Label;
	LastStmt = Label->getSubStmt();
	}

	if (Expr *LastE = dyn_cast<Expr>(LastStmt)) {
	// Do function/array conversion on the last expression, but not
	// lvalue-to-rvalue. However, initialize an unqualified type.
	ExprResult LastExpr = DefaultFunctionArrayConversion(LastE);
	if (LastExpr.isInvalid())
	return ExprError();
	Ty = LastExpr.get()->getType().getUnqualifiedType();

	if (!Ty->isDependentType() && !LastExpr.get()->isTypeDependent()) {
	// In ARC, if the final expression ends in a consume, splice
	// the consume out and bind it later. In the alternate case
	// (when dealing with a retainable type), the result
	// initialization will create a produce. In both cases the
	// result will be +1, and we'll need to balance that out with
	// a bind.
	if (Expr *rebuiltLastStmt
	= maybeRebuildARCConsumingStmt(LastExpr.get())) {
	LastExpr = rebuiltLastStmt;
	} else {
	LastExpr = PerformCopyInitialization(
	InitializedEntity::InitializeResult(LPLoc,
	Ty,
	false),
	SourceLocation(),
	LastExpr);
	}

	if (LastExpr.isInvalid())
	return ExprError();
	// Store the rewritten expression back where the original last
	// expression was: directly in the compound statement, or under the
	// innermost label.
	if (LastExpr.get() != nullptr) {
	if (!LastLabelStmt)
	Compound->setLastStmt(LastExpr.get());
	else
	LastLabelStmt->setSubStmt(LastExpr.get());
	StmtExprMayBindToTemp = true;
	}
	}
	}
	}

	// FIXME: Check that expression type is complete/non-abstract; statement
	// expressions are not lvalues.
	Expr *ResStmtExpr = new (Context) StmtExpr(Compound, Ty, LPLoc, RPLoc);
	if (StmtExprMayBindToTemp)
	return MaybeBindToTemporary(ResStmtExpr);
	return ResStmtExpr;
	}

	/// Build an OffsetOfExpr for an offsetof-style builtin from its type operand
	/// and its list of designator components.
	///
	/// Each component is either an array subscript or a field name. The
	/// components are walked in order, tracking the current type, and are
	/// translated into OffsetOfNodes (plus index expressions for subscripts).
	///
	/// \param BuiltinLoc  Location of the builtin; used for several diagnostics.
	/// \param TInfo       Type source info for the type operand.
	/// \param Components  The designator components, in source order.
	/// \param RParenLoc   Location of the closing parenthesis.
	/// \returns the OffsetOfExpr, typed with the context's size type, or
	/// ExprError() once any error diagnostic has been emitted.
	ExprResult Sema::BuildBuiltinOffsetOf(SourceLocation BuiltinLoc,
	TypeSourceInfo *TInfo,
	ArrayRef<OffsetOfComponent> Components,
	SourceLocation RParenLoc) {
	QualType ArgTy = TInfo->getType();
	bool Dependent = ArgTy->isDependentType();
	SourceRange TypeRange = TInfo->getTypeLoc().getLocalSourceRange();

	// We must have at least one component that refers to the type, and the first
	// one is known to be a field designator. Verify that the ArgTy represents
	// a struct/union/class.
	if (!Dependent && !ArgTy->isRecordType())
	return ExprError(Diag(BuiltinLoc, diag::err_offsetof_record_type)
	<< ArgTy << TypeRange);

	// Type must be complete per C99 7.17p3 because declaring a variable
	// with an incomplete type would be ill-formed.
	if (!Dependent
	&& RequireCompleteType(BuiltinLoc, ArgTy,
	diag::err_offsetof_incomplete_type, TypeRange))
	return ExprError();

	// offsetof with non-identifier designators (e.g. "offsetof(x, a.b[c])") are a
	// GCC extension, diagnose them.
	// FIXME: This diagnostic isn't actually visible because the location is in
	// a system header!
	// NOTE(review): this indexes Components[1] whenever the size is not 1, so it
	// relies on Components being non-empty -- confirm callers guarantee that.
	if (Components.size() != 1)
	Diag(BuiltinLoc, diag::ext_offsetof_extended_field_designator)
	<< SourceRange(Components[1].LocStart, Components.back().LocEnd);

	bool DidWarnAboutNonPOD = false;
	// Type reached so far while walking the designator chain.
	QualType CurrentType = ArgTy;
	SmallVector<OffsetOfNode, 4> Comps;
	SmallVector<Expr*, 4> Exprs;
	for (const OffsetOfComponent &OC : Components) {
	if (OC.isBrackets) {
	// Offset of an array sub-field. TODO: Should we allow vector elements?
	if (!CurrentType->isDependentType()) {
	const ArrayType *AT = Context.getAsArrayType(CurrentType);
	if(!AT)
	return ExprError(Diag(OC.LocEnd, diag::err_offsetof_array_type)
	<< CurrentType);
	CurrentType = AT->getElementType();
	} else
	CurrentType = Context.DependentTy;

	ExprResult IdxRval = DefaultLvalueConversion(static_cast<Expr*>(OC.U.E));
	if (IdxRval.isInvalid())
	return ExprError();
	Expr *Idx = IdxRval.get();

	// The expression must be an integral expression.
	// FIXME: An integral constant expression?
	if (!Idx->isTypeDependent() && !Idx->isValueDependent() &&
	!Idx->getType()->isIntegerType())
	return ExprError(Diag(Idx->getLocStart(),
	diag::err_typecheck_subscript_not_integer)
	<< Idx->getSourceRange());

	// Record this array index. The node stores the position the index
	// expression is about to take in Exprs.
	Comps.push_back(OffsetOfNode(OC.LocStart, Exprs.size(), OC.LocEnd));
	Exprs.push_back(Idx);
	continue;
	}

	// Offset of a field.
	if (CurrentType->isDependentType()) {
	// We have the offset of a field, but we can't look into the dependent
	// type. Just record the identifier of the field.
	Comps.push_back(OffsetOfNode(OC.LocStart, OC.U.IdentInfo, OC.LocEnd));
	CurrentType = Context.DependentTy;
	continue;
	}

	// We need to have a complete type to look into.
	if (RequireCompleteType(OC.LocStart, CurrentType,
	diag::err_offsetof_incomplete_type))
	return ExprError();

	// Look for the designated field.
	const RecordType *RC = CurrentType->getAs<RecordType>();
	if (!RC)
	return ExprError(Diag(OC.LocEnd, diag::err_offsetof_record_type)
	<< CurrentType);
	RecordDecl *RD = RC->getDecl();

	// C++ [lib.support.types]p5:
	// The macro offsetof accepts a restricted set of type arguments in this
	// International Standard. type shall be a POD structure or a POD union
	// (clause 9).
	// C++11 [support.types]p4:
	// If type is not a standard-layout class (Clause 9), the results are
	// undefined.
	if (CXXRecordDecl *CRD = dyn_cast<CXXRecordDecl>(RD)) {
	bool IsSafe = LangOpts.CPlusPlus11? CRD->isStandardLayout() : CRD->isPOD();
	unsigned DiagID =
	LangOpts.CPlusPlus11? diag::ext_offsetof_non_standardlayout_type
	: diag::ext_offsetof_non_pod_type;

	// DidWarnAboutNonPOD is set once DiagRuntimeBehavior returns true, which
	// suppresses this diagnostic for the remaining components.
	if (!IsSafe && !DidWarnAboutNonPOD &&
	DiagRuntimeBehavior(BuiltinLoc, nullptr,
	PDiag(DiagID)
	<< SourceRange(Components[0].LocStart, OC.LocEnd)
	<< CurrentType))
	DidWarnAboutNonPOD = true;
	}

	// Look for the field.
	LookupResult R(*this, OC.U.IdentInfo, OC.LocStart, LookupMemberName);
	LookupQualifiedName(R, RD);
	FieldDecl *MemberDecl = R.getAsSingle<FieldDecl>();
	IndirectFieldDecl *IndirectMemberDecl = nullptr;
	// Fall back to an indirect field when the lookup did not yield a single
	// direct field.
	if (!MemberDecl) {
	if ((IndirectMemberDecl = R.getAsSingle<IndirectFieldDecl>()))
	MemberDecl = IndirectMemberDecl->getAnonField();
	}

	if (!MemberDecl)
	return ExprError(Diag(BuiltinLoc, diag::err_no_member)
	<< OC.U.IdentInfo << RD << SourceRange(OC.LocStart,
	OC.LocEnd));

	// C99 7.17p3:
	// (If the specified member is a bit-field, the behavior is undefined.)
	//
	// We diagnose this as an error.
	if (MemberDecl->isBitField()) {
	Diag(OC.LocEnd, diag::err_offsetof_bitfield)
	<< MemberDecl->getDeclName()
	<< SourceRange(BuiltinLoc, RParenLoc);
	Diag(MemberDecl->getLocation(), diag::note_bitfield_decl);
	return ExprError();
	}

	// Record that declares the member; for an indirect field this is the
	// declaration context of the IndirectFieldDecl.
	RecordDecl *Parent = MemberDecl->getParent();
	if (IndirectMemberDecl)
	Parent = cast<RecordDecl>(IndirectMemberDecl->getDeclContext());

	// If the member was found in a base class, introduce OffsetOfNodes for
	// the base class indirections.
	CXXBasePaths Paths;
	if (IsDerivedFrom(OC.LocStart, CurrentType, Context.getTypeDeclType(Parent),
	Paths)) {
	if (Paths.getDetectedVirtual()) {
	Diag(OC.LocEnd, diag::err_offsetof_field_of_virtual_base)
	<< MemberDecl->getDeclName()
	<< SourceRange(BuiltinLoc, RParenLoc);
	return ExprError();
	}

	CXXBasePath &Path = Paths.front();
	for (const CXXBasePathElement &B : Path)
	Comps.push_back(OffsetOfNode(B.Base));
	}

	// An indirect field contributes one node per field in its chain.
	if (IndirectMemberDecl) {
	for (auto *FI : IndirectMemberDecl->chain()) {
	assert(isa<FieldDecl>(FI));
	Comps.push_back(OffsetOfNode(OC.LocStart,
	cast<FieldDecl>(FI), OC.LocEnd));
	}
	} else
	Comps.push_back(OffsetOfNode(OC.LocStart, MemberDecl, OC.LocEnd));

	// Continue the walk from the member's type, looking through references.
	CurrentType = MemberDecl->getType().getNonReferenceType();
	}

	return OffsetOfExpr::Create(Context, Context.getSizeType(), BuiltinLoc, TInfo,
	Comps, Exprs, RParenLoc);
	}

	/// Parser entry point for an offsetof-style builtin: converts the parsed
	/// type operand into a TypeSourceInfo and forwards to BuildBuiltinOffsetOf.
	///
	/// \returns ExprError() if the parsed type is null; otherwise whatever
	/// BuildBuiltinOffsetOf returns.
	ExprResult Sema::ActOnBuiltinOffsetOf(Scope *S,
	                                      SourceLocation BuiltinLoc,
	                                      SourceLocation TypeLoc,
	                                      ParsedType ParsedArgTy,
	                                      ArrayRef<OffsetOfComponent> Components,
	                                      SourceLocation RParenLoc) {
	  TypeSourceInfo *ParsedTInfo = nullptr;
	  const QualType ArgTy = GetTypeFromParser(ParsedArgTy, &ParsedTInfo);
	  if (ArgTy.isNull())
	    return ExprError();

	  // Synthesize trivial source info at TypeLoc when the parser supplied none.
	  TypeSourceInfo *ArgTInfo =
	      ParsedTInfo ? ParsedTInfo
	                  : Context.getTrivialTypeSourceInfo(ArgTy, TypeLoc);

	  return BuildBuiltinOffsetOf(BuiltinLoc, ArgTInfo, Components, RParenLoc);
	}


	/// Build a ChooseExpr from a condition and two alternative expressions.
	///
	/// When the condition is type- or value-dependent the result has dependent
	/// type. Otherwise the condition must be an integer constant expression, and
	/// the result takes its type, value kind, object kind and value-dependence
	/// from the selected operand.
	///
	/// \param BuiltinLoc  Location of the builtin.
	/// \param CondExpr    The condition; must be non-null.
	/// \param LHSExpr     Operand selected when the condition is non-zero.
	/// \param RHSExpr     Operand selected when the condition is zero.
	/// \param RPLoc       Location of the closing parenthesis.
	/// \returns the ChooseExpr, or ExprError() if the condition is not a valid
	/// integer constant expression.
	ExprResult Sema::ActOnChooseExpr(SourceLocation BuiltinLoc,
	                                 Expr *CondExpr,
	                                 Expr *LHSExpr, Expr *RHSExpr,
	                                 SourceLocation RPLoc) {
	  assert((CondExpr && LHSExpr && RHSExpr) && "Missing type argument(s)");

	  ExprValueKind VK = VK_RValue;
	  ExprObjectKind OK = OK_Ordinary;
	  QualType resType;
	  bool ValueDependent = false;
	  bool CondIsTrue = false;
	  if (CondExpr->isTypeDependent() || CondExpr->isValueDependent()) {
	    resType = Context.DependentTy;
	    ValueDependent = true;
	  } else {
	    // The conditional expression is required to be a constant expression.
	    llvm::APSInt condEval(32);
	    ExprResult CondICE
	      = VerifyIntegerConstantExpression(CondExpr, &condEval,
	          diag::err_typecheck_choose_expr_requires_constant, false);
	    if (CondICE.isInvalid())
	      return ExprError();
	    CondExpr = CondICE.get();
	    CondIsTrue = condEval.getZExtValue();

	    // If the condition is non-zero, then the AST type is the same as the
	    // LHSExpr; otherwise it is the same as the RHSExpr.
	    Expr *ActiveExpr = CondIsTrue ? LHSExpr : RHSExpr;

	    resType = ActiveExpr->getType();
	    ValueDependent = ActiveExpr->isValueDependent();
	    VK = ActiveExpr->getValueKind();
	    OK = ActiveExpr->getObjectKind();
	  }

	  return new (Context)
	      ChooseExpr(BuiltinLoc, CondExpr, LHSExpr, RHSExpr, resType, VK, OK, RPLoc,
	                 CondIsTrue, resType->isDependentType(), ValueDependent);
	}

	//===----------------------------------------------------------------------===//
	// Clang Extensions.
	//===----------------------------------------------------------------------===//

	/// ActOnBlockStart - Invoked when the parser begins a block literal.
	///
	/// Creates the BlockDecl, optionally assigns it a mangling number (C++
	/// only), pushes the block scope and declaration context, and opens a
	/// fresh potentially-evaluated expression context.
	void Sema::ActOnBlockStart(SourceLocation CaretLoc, Scope *CurScope) {
	  BlockDecl *NewBlock = BlockDecl::Create(Context, CurContext, CaretLoc);

	  // In C++, number the block for mangling when a numbering context exists.
	  if (LangOpts.CPlusPlus) {
	    Decl *NumberingDecl;
	    MangleNumberingContext *Numbering =
	        getCurrentMangleNumberContext(NewBlock->getDeclContext(),
	                                      NumberingDecl);
	    if (Numbering)
	      NewBlock->setBlockMangling(Numbering->getManglingNumber(NewBlock),
	                                 NumberingDecl);
	  }

	  PushBlockScope(CurScope, NewBlock);
	  CurContext->addDecl(NewBlock);

	  // Without a parser scope, switch the current context by hand.
	  if (!CurScope)
	    CurContext = NewBlock;
	  else
	    PushDeclContext(CurScope, NewBlock);

	  // Until ActOnBlockArguments sees an explicit return type, the return
	  // type is treated as implicit.
	  getCurBlock()->HasImplicitReturnType = true;

	  // Insulate the block from cleanups belonging to the enclosing
	  // full-expression by entering a new evaluation context.
	  PushExpressionEvaluationContext(
	      ExpressionEvaluationContext::PotentiallyEvaluated);
	}

	/// Processes the declarator of a block literal: records the written
	/// signature and function type on the current block, determines the return
	/// type and variadic-ness, collects (or synthesizes) the parameters,
	/// applies declaration attributes, and puts named parameters in scope.
	///
	/// \param CaretLoc  location of the '^' introducing the block.
	/// \param ParamInfo the block-literal declarator; it must have no identifier.
	/// \param CurScope  the scope used to resolve the declarator.
	void Sema::ActOnBlockArguments(SourceLocation CaretLoc, Declarator &ParamInfo,
	                               Scope *CurScope) {
	  assert(ParamInfo.getIdentifier() == nullptr &&
	         "block-id should have no identifier!");
	  assert(ParamInfo.getContext() == Declarator::BlockLiteralContext);
	  BlockScopeInfo *CurBlock = getCurBlock();

	  TypeSourceInfo *Sig = GetTypeForDeclarator(ParamInfo, CurScope);
	  QualType T = Sig->getType();

	  // FIXME: We should allow unexpanded parameter packs here, but that would,
	  // in turn, make the block expression contain unexpanded parameter packs.
	  if (DiagnoseUnexpandedParameterPack(CaretLoc, Sig, UPPC_Block)) {
	    // Drop the parameters.
	    FunctionProtoType::ExtProtoInfo EPI;
	    EPI.HasTrailingReturn = false;
	    EPI.TypeQuals |= DeclSpec::TQ_const;
	    T = Context.getFunctionType(Context.DependentTy, None, EPI);
	    Sig = Context.getTrivialTypeSourceInfo(T);
	  }

	  // GetTypeForDeclarator always produces a function type for a block
	  // literal signature.  Furthermore, it is always a FunctionProtoType
	  // unless the function was written with a typedef.
	  assert(T->isFunctionType() &&
	         "GetTypeForDeclarator made a non-function block signature");

	  // Look for an explicit signature in that function type.
	  FunctionProtoTypeLoc ExplicitSignature;

	  TypeLoc tmp = Sig->getTypeLoc().IgnoreParens();
	  if ((ExplicitSignature = tmp.getAs<FunctionProtoTypeLoc>())) {

	    // Check whether that explicit signature was synthesized by
	    // GetTypeForDeclarator.  If so, don't save that as part of the
	    // written signature.
	    if (ExplicitSignature.getLocalRangeBegin() ==
	        ExplicitSignature.getLocalRangeEnd()) {
	      // This would be much cheaper if we stored TypeLocs instead of
	      // TypeSourceInfos.
	      TypeLoc Result = ExplicitSignature.getReturnLoc();
	      unsigned Size = Result.getFullDataSize();
	      Sig = Context.CreateTypeSourceInfo(Result.getType(), Size);
	      Sig->getTypeLoc().initializeFullCopy(Result, Size);

	      ExplicitSignature = FunctionProtoTypeLoc();
	    }
	  }

	  CurBlock->TheDecl->setSignatureAsWritten(Sig);
	  CurBlock->FunctionType = T;

	  const FunctionType *Fn = T->getAs<FunctionType>();
	  QualType RetTy = Fn->getReturnType();
	  // Only prototyped function types can be variadic.
	  const auto *Proto = dyn_cast<FunctionProtoType>(Fn);
	  bool isVariadic = Proto && Proto->isVariadic();

	  CurBlock->TheDecl->setIsVariadic(isVariadic);

	  // Context.DependentTy is used as a placeholder for a missing block
	  // return type.  TODO:  what should we do with declarators like:
	  //   ^ * { ... }
	  // If the answer is "apply template argument deduction"....
	  if (RetTy != Context.DependentTy) {
	    CurBlock->ReturnType = RetTy;
	    CurBlock->TheDecl->setBlockMissingReturnType(false);
	    CurBlock->HasImplicitReturnType = false;
	  }

	  // Push block parameters from the declarator if we had them.
	  SmallVector<ParmVarDecl*, 8> Params;
	  if (ExplicitSignature) {
	    for (unsigned I = 0, E = ExplicitSignature.getNumParams(); I != E; ++I) {
	      ParmVarDecl *Param = ExplicitSignature.getParam(I);
	      // Outside C++, a written parameter must be named.
	      if (Param->getIdentifier() == nullptr &&
	          !Param->isImplicit() &&
	          !Param->isInvalidDecl() &&
	          !getLangOpts().CPlusPlus)
	        Diag(Param->getLocation(), diag::err_parameter_name_omitted);
	      Params.push_back(Param);
	    }

	  // Fake up parameter variables if we have a typedef, like
	  //   ^ fntype { ... }
	  } else if (const FunctionProtoType *TypedefProto =
	                 T->getAs<FunctionProtoType>()) {
	    for (const auto &I : TypedefProto->param_types()) {
	      ParmVarDecl *Param = BuildParmVarDeclForTypedef(
	          CurBlock->TheDecl, ParamInfo.getLocStart(), I);
	      Params.push_back(Param);
	    }
	  }

	  // Set the parameters on the block decl.
	  if (!Params.empty()) {
	    CurBlock->TheDecl->setParams(Params);
	    CheckParmsForFunctionDef(CurBlock->TheDecl->parameters(),
	                             /*CheckParameterNames=*/false);
	  }

	  // Finally we can process decl attributes.
	  ProcessDeclAttributes(CurScope, CurBlock->TheDecl, ParamInfo);

	  // Put the parameter variables in scope.
	  for (auto AI : CurBlock->TheDecl->parameters()) {
	    AI->setOwningFunction(CurBlock->TheDecl);

	    // If this has an identifier, add it to the scope stack.
	    if (AI->getIdentifier()) {
	      CheckShadow(CurBlock->TheScope, AI);

	      PushOnScopeChains(AI, CurBlock->TheScope);
	    }
	  }
	}

	/// ActOnBlockError - If there is an error parsing a block, this callback
	/// is invoked to pop the information about the block from the action impl.
	///
	/// Neither parameter is read by this function. The teardown discards any
	/// pending cleanups, pops the expression evaluation context, then pops the
	/// declaration context and the function scope info.
	void Sema::ActOnBlockError(SourceLocation CaretLoc, Scope *CurScope) {
	// Leave the expression-evaluation context, dropping any cleanups that
	// were recorded inside it.
	DiscardCleanupsInEvaluationContext();
	PopExpressionEvaluationContext();

	// Pop off CurBlock, handle nested blocks.
	PopDeclContext();
	PopFunctionScopeInfo();
	}

	/// ActOnBlockStmtExpr - This is called when the body of a block statement
	/// literal was successfully completed.  ^(int x){...}
	///
	/// Finalizes the current block scope: deduces an implicit return type,
	/// transfers the captures to the BlockDecl, computes the block pointer
	/// type, attaches the body, runs jump / availability / NRVO processing,
	/// and builds the resulting BlockExpr.
	///
	/// \param CaretLoc location of the '^'; used for the blocks-disabled
	///                 diagnostic.
	/// \param Body     the block body; it is cast to CompoundStmt.
	/// \param CurScope not read by this function.
	/// \returns the newly created BlockExpr.
	ExprResult Sema::ActOnBlockStmtExpr(SourceLocation CaretLoc,
	                                    Stmt *Body, Scope *CurScope) {
	  // If blocks are disabled, emit an error.
	  if (!LangOpts.Blocks)
	    Diag(CaretLoc, diag::err_blocks_disable) << LangOpts.OpenCL;

	  // Leave the expression-evaluation context.
	  if (hasAnyUnrecoverableErrorsInThisFunction())
	    DiscardCleanupsInEvaluationContext();
	  assert(!Cleanup.exprNeedsCleanups() &&
	         "cleanups within block not correctly bound!");
	  PopExpressionEvaluationContext();

	  BlockScopeInfo *BSI = cast<BlockScopeInfo>(FunctionScopes.back());

	  if (BSI->HasImplicitReturnType)
	    deduceClosureReturnType(*BSI);

	  PopDeclContext();

	  // A block with no recorded return type returns void.
	  QualType RetTy = Context.VoidTy;
	  if (!BSI->ReturnType.isNull())
	    RetTy = BSI->ReturnType;

	  bool NoReturn = BSI->TheDecl->hasAttr<NoReturnAttr>();
	  QualType BlockTy;

	  // Set the captured variables on the block.
	  // FIXME: Share capture structure between BlockDecl and CapturingScopeInfo!
	  SmallVector<BlockDecl::Capture, 4> Captures;
	  for (CapturingScopeInfo::Capture &Cap : BSI->Captures) {
	    // 'this' captures are conveyed through the flag passed to setCaptures
	    // below rather than as entries in the list.
	    if (Cap.isThisCapture())
	      continue;
	    BlockDecl::Capture NewCap(Cap.getVariable(), Cap.isBlockCapture(),
	                              Cap.isNested(), Cap.getInitExpr());
	    Captures.push_back(NewCap);
	  }
	  BSI->TheDecl->setCaptures(Context, Captures, BSI->CXXThisCaptureIndex != 0);

	  // If the user wrote a function type in some form, try to use that.
	  if (!BSI->FunctionType.isNull()) {
	    const FunctionType *FTy = BSI->FunctionType->getAs<FunctionType>();

	    FunctionType::ExtInfo Ext = FTy->getExtInfo();
	    if (NoReturn && !Ext.getNoReturn()) Ext = Ext.withNoReturn(true);

	    // Turn protoless block types into nullary block types.
	    if (isa<FunctionNoProtoType>(FTy)) {
	      FunctionProtoType::ExtProtoInfo EPI;
	      EPI.ExtInfo = Ext;
	      BlockTy = Context.getFunctionType(RetTy, None, EPI);

	    // Otherwise, if we don't need to change anything about the function type,
	    // preserve its sugar structure.
	    } else if (FTy->getReturnType() == RetTy &&
	               (!NoReturn || FTy->getNoReturnAttr())) {
	      BlockTy = BSI->FunctionType;

	    // Otherwise, make the minimal modifications to the function type.
	    } else {
	      const FunctionProtoType *FPT = cast<FunctionProtoType>(FTy);
	      FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo();
	      EPI.TypeQuals = 0; // FIXME: silently?
	      EPI.ExtInfo = Ext;
	      BlockTy = Context.getFunctionType(RetTy, FPT->getParamTypes(), EPI);
	    }

	  // If we don't have a function type, just build one from nothing.
	  } else {
	    FunctionProtoType::ExtProtoInfo EPI;
	    EPI.ExtInfo = FunctionType::ExtInfo().withNoReturn(NoReturn);
	    BlockTy = Context.getFunctionType(RetTy, None, EPI);
	  }

	  DiagnoseUnusedParameters(BSI->TheDecl->parameters());
	  BlockTy = Context.getBlockPointerType(BlockTy);

	  // The body is used as a CompoundStmt from here on; convert it once.
	  CompoundStmt *BlockBody = cast<CompoundStmt>(Body);

	  // If needed, diagnose invalid gotos and switches in the block.
	  if (getCurFunction()->NeedsScopeChecking() &&
	      !PP.isCodeCompletionEnabled())
	    DiagnoseInvalidJumps(BlockBody);

	  BSI->TheDecl->setBody(BlockBody);

	  if (Body && getCurFunction()->HasPotentialAvailabilityViolations)
	    DiagnoseUnguardedAvailabilityViolations(BSI->TheDecl);

	  // Try to apply the named return value optimization. We have to check again
	  // if we can do this, though, because blocks keep return statements around
	  // to deduce an implicit return type.
	  if (getLangOpts().CPlusPlus && RetTy->isRecordType() &&
	      !BSI->TheDecl->isDependentContext())
	    computeNRVO(Body, BSI);

	  BlockExpr *Result = new (Context) BlockExpr(BSI->TheDecl, BlockTy);
	  AnalysisBasedWarnings::Policy WP = AnalysisWarnings.getDefaultPolicy();
	  PopFunctionScopeInfo(&WP, Result->getBlockDecl(), Result);

	  // If the block isn't obviously global, i.e. it captures anything at
	  // all, then we need to do a few things in the surrounding context:
	  if (Result->getBlockDecl()->hasCaptures()) {
	    // First, this expression has a new cleanup object.
	    ExprCleanupObjects.push_back(Result->getBlockDecl());
	    Cleanup.setExprNeedsCleanups(true);

	    // It also gets a branch-protected scope if any of the captured
	    // variables needs destruction.
	    for (const auto &CI : Result->getBlockDecl()->captures()) {
	      const VarDecl *var = CI.getVariable();
	      if (var->getType().isDestructedType() != QualType::DK_none) {
	        getCurFunction()->setHasBranchProtectedScope();
	        break;
	      }
	    }
	  }

	  return Result;
	}

	/// Parser-facing entry point for a va_arg expression: resolves the parsed
	/// type operand to its TypeSourceInfo and delegates to BuildVAArgExpr.
	ExprResult Sema::ActOnVAArg(SourceLocation BuiltinLoc, Expr *E, ParsedType Ty,
	                            SourceLocation RPLoc) {
	  // Only the source info is needed; the returned QualType is ignored.
	  TypeSourceInfo *ArgTSI;
	  GetTypeFromParser(Ty, &ArgTSI);

	  return BuildVAArgExpr(BuiltinLoc, E, ArgTSI, RPLoc);
	}

	/// Type-checks and builds a VAArgExpr.
	///
	/// \param BuiltinLoc location of the builtin keyword.
	/// \param E          the va_list operand.
	/// \param TInfo      the type being extracted.
	/// \param RPLoc      location of the closing parenthesis.
	/// \returns the new VAArgExpr, or ExprError() when the operand is not a
	///          suitable va_list, the requested type is incomplete or abstract,
	///          or va_arg is used in CUDA device code.
	ExprResult Sema::BuildVAArgExpr(SourceLocation BuiltinLoc,
	                                Expr *E, TypeSourceInfo *TInfo,
	                                SourceLocation RPLoc) {
	  // Remember the operand as written; E may be replaced by a converted
	  // expression below, and diagnostics should name the original type.
	  Expr *OrigExpr = E;
	  bool IsMS = false;

	  // CUDA device code does not support varargs.
	  if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) {
	    if (const FunctionDecl *F = dyn_cast<FunctionDecl>(CurContext)) {
	      CUDAFunctionTarget T = IdentifyCUDATarget(F);
	      if (T == CFT_Global || T == CFT_Device || T == CFT_HostDevice)
	        return ExprError(Diag(E->getLocStart(), diag::err_va_arg_in_device));
	    }
	  }

	  // It might be a __builtin_ms_va_list. (But don't ever mark a va_arg()
	  // as Microsoft ABI on an actual Microsoft platform, where
	  // __builtin_ms_va_list and __builtin_va_list are the same.)
	  if (!E->isTypeDependent() && Context.getTargetInfo().hasBuiltinMSVaList() &&
	      Context.getTargetInfo().getBuiltinVaListKind() !=
	          TargetInfo::CharPtrBuiltinVaList) {
	    QualType MSVaListType = Context.getBuiltinMSVaListType();
	    if (Context.hasSameType(MSVaListType, E->getType())) {
	      if (CheckForModifiableLvalue(E, BuiltinLoc, *this))
	        return ExprError();
	      IsMS = true;
	    }
	  }

	  // Get the va_list type
	  QualType VaListType = Context.getBuiltinVaListType();
	  if (!IsMS) {
	    if (VaListType->isArrayType()) {
	      // Deal with implicit array decay; for example, on x86-64,
	      // va_list is an array, but it's supposed to decay to
	      // a pointer for va_arg.
	      VaListType = Context.getArrayDecayedType(VaListType);
	      // Make sure the input expression also decays appropriately.
	      ExprResult Result = UsualUnaryConversions(E);
	      if (Result.isInvalid())
	        return ExprError();
	      E = Result.get();
	    } else if (VaListType->isRecordType() && getLangOpts().CPlusPlus) {
	      // If va_list is a record type and we are compiling in C++ mode,
	      // check the argument using reference binding.
	      InitializedEntity Entity = InitializedEntity::InitializeParameter(
	          Context, Context.getLValueReferenceType(VaListType), false);
	      ExprResult Init = PerformCopyInitialization(Entity, SourceLocation(), E);
	      if (Init.isInvalid())
	        return ExprError();
	      E = Init.getAs<Expr>();
	    } else {
	      // Otherwise, the va_list argument must be an l-value because
	      // it is modified by va_arg.
	      if (!E->isTypeDependent() &&
	          CheckForModifiableLvalue(E, BuiltinLoc, *this))
	        return ExprError();
	    }
	  }

	  if (!IsMS && !E->isTypeDependent() &&
	      !Context.hasSameType(VaListType, E->getType()))
	    return ExprError(Diag(E->getLocStart(),
	                         diag::err_first_argument_to_va_arg_not_of_type_va_list)
	      << OrigExpr->getType() << E->getSourceRange());

	  if (!TInfo->getType()->isDependentType()) {
	    if (RequireCompleteType(TInfo->getTypeLoc().getBeginLoc(), TInfo->getType(),
	                            diag::err_second_parameter_to_va_arg_incomplete,
	                            TInfo->getTypeLoc()))
	      return ExprError();

	    if (RequireNonAbstractType(TInfo->getTypeLoc().getBeginLoc(),
	                               TInfo->getType(),
	                               diag::err_second_parameter_to_va_arg_abstract,
	                               TInfo->getTypeLoc()))
	      return ExprError();

	    // Non-POD types only produce a warning; the expression is still built.
	    if (!TInfo->getType().isPODType(Context)) {
	      Diag(TInfo->getTypeLoc().getBeginLoc(),
	           TInfo->getType()->isObjCLifetimeType()
	             ? diag::warn_second_parameter_to_va_arg_ownership_qualified
	             : diag::warn_second_parameter_to_va_arg_not_pod)
	        << TInfo->getType()
	        << TInfo->getTypeLoc().getSourceRange();
	    }

	    // Check for va_arg where arguments of the given type will be promoted
	    // (i.e. this va_arg is guaranteed to have undefined behavior).
	    QualType PromoteType;
	    if (TInfo->getType()->isPromotableIntegerType()) {
	      PromoteType = Context.getPromotedIntegerType(TInfo->getType());
	      if (Context.typesAreCompatible(PromoteType, TInfo->getType()))
	        PromoteType = QualType();
	    }
	    if (TInfo->getType()->isSpecificBuiltinType(BuiltinType::Float))
	      PromoteType = Context.DoubleTy;
	    if (!PromoteType.isNull())
	      DiagRuntimeBehavior(TInfo->getTypeLoc().getBeginLoc(), E,
	                  PDiag(diag::warn_second_parameter_to_va_arg_never_compatible)
	                          << TInfo->getType()
	                          << PromoteType
	                          << TInfo->getTypeLoc().getSourceRange());
	  }

	  QualType T = TInfo->getType().getNonLValueExprType(Context);
	  return new (Context) VAArgExpr(BuiltinLoc, E, TInfo, RPLoc, T, IsMS);
	}

	/// Builds a GNUNullExpr for the __null token.
	///
	/// The expression's type is whichever of int, long, or long long has the
	/// same width as a pointer in address space 0 on the target; any other
	/// pointer width is treated as unreachable.
	ExprResult Sema::ActOnGNUNullExpr(SourceLocation TokenLoc) {
	  const TargetInfo &Target = Context.getTargetInfo();
	  const unsigned PtrWidth = Target.getPointerWidth(0);

	  QualType NullTy;
	  if (PtrWidth == Target.getIntWidth()) {
	    NullTy = Context.IntTy;
	  } else if (PtrWidth == Target.getLongWidth()) {
	    NullTy = Context.LongTy;
	  } else if (PtrWidth == Target.getLongLongWidth()) {
	    NullTy = Context.LongLongTy;
	  } else {
	    llvm_unreachable("I don't know size of pointer!");
	  }

	  return new (Context) GNUNullExpr(NullTy, TokenLoc);
	}

	/// Checks whether \p Exp is an ASCII C string literal being converted to
	/// 'id' or an 'NSString' pointer, i.e. a literal that is probably missing
	/// its '@' prefix.
	///
	/// \param DstType  the destination type of the conversion.
	/// \param Exp      the source expression; when \p Diagnose is true and the
	///                 check matches, it is replaced by the result of
	///                 BuildObjCStringLiteral.
	/// \param Diagnose whether to emit the missing-'@' error with its fix-it.
	/// \returns true when the pattern matched, false otherwise (including when
	///          Objective-C is not enabled).
	bool Sema::ConversionToObjCStringLiteralCheck(QualType DstType, Expr *&Exp,
	                                              bool Diagnose) {
	  if (!getLangOpts().ObjC1)
	    return false;

	  const ObjCObjectPointerType *PT = DstType->getAs<ObjCObjectPointerType>();
	  if (!PT)
	    return false;

	  if (!PT->isObjCIdType()) {
	    // Check if the destination is the 'NSString' interface.
	    const ObjCInterfaceDecl *ID = PT->getInterfaceDecl();
	    if (!ID || !ID->getIdentifier()->isStr("NSString"))
	      return false;
	  }

	  // Ignore any parens, implicit casts (should only be
	  // array-to-pointer decays), and not-so-opaque values.  The last is
	  // important for making this trigger for property assignments.
	  Expr *SrcExpr = Exp->IgnoreParenImpCasts();
	  if (OpaqueValueExpr *OV = dyn_cast<OpaqueValueExpr>(SrcExpr))
	    if (OV->getSourceExpr())
	      SrcExpr = OV->getSourceExpr()->IgnoreParenImpCasts();

	  StringLiteral *SL = dyn_cast<StringLiteral>(SrcExpr);
	  if (!SL || !SL->isAscii())
	    return false;
	  if (Diagnose) {
	    Diag(SL->getLocStart(), diag::err_missing_atsign_prefix)
	      << FixItHint::CreateInsertion(SL->getLocStart(), "@");
	    Exp = BuildObjCStringLiteral(SL->getLocStart(), SL).get();
	  }
	  return true;
	}

	/// When a function-typed expression that directly names a FunctionDecl is
	/// converted to a function pointer, asks Sema whether that function's
	/// address may be taken, with complaints enabled.
	///
	/// \returns true when checkAddressOfFunctionIsAvailable reports the
	///          address as unavailable; false when the pattern does not apply
	///          or the address is available.
	static bool maybeDiagnoseAssignmentToFunction(Sema &S, QualType DstType,
	                                              const Expr *SrcExpr) {
	  if (!DstType->isFunctionPointerType() ||
	      !SrcExpr->getType()->isFunctionType())
	    return false;

	  auto *DRE = dyn_cast<DeclRefExpr>(SrcExpr->IgnoreParenImpCasts());
	  if (!DRE)
	    return false;

	  auto *FD = dyn_cast<FunctionDecl>(DRE->getDecl());
	  if (!FD)
	    return false;

	  return !S.checkAddressOfFunctionIsAvailable(FD,
	                                              /*Complain=*/true,
	                                              SrcExpr->getLocStart());
	}

	/// Emits the diagnostic that corresponds to an assignment-conversion
	/// result.
	///
	/// \param ConvTy     the classification of the conversion.
	/// \param Loc        where to report the diagnostic.
	/// \param DstType    the destination type.
	/// \param SrcType    the source type.
	/// \param SrcExpr    the source expression.
	/// \param Action     what kind of conversion is taking place; it selects
	///                   the order in which the two types are streamed into
	///                   the diagnostic.
	/// \param Complained if non-null, set to true when a diagnostic was
	///                   emitted and to false otherwise.
	/// \returns true if the conversion is invalid (a hard error), false when
	///          nothing or only an extension/warning-style diagnostic was
	///          issued.
	bool Sema::DiagnoseAssignmentResult(AssignConvertType ConvTy,
	                                    SourceLocation Loc,
	                                    QualType DstType, QualType SrcType,
	                                    Expr *SrcExpr, AssignmentAction Action,
	                                    bool *Complained) {
	  if (Complained)
	    *Complained = false;

	  // Decode the result (notice that AST's are still created for extensions).
	  bool CheckInferredResultType = false;
	  bool isInvalid = false;
	  unsigned DiagKind = 0;
	  FixItHint Hint;
	  ConversionFixItGenerator ConvHints;
	  bool MayHaveConvFixit = false;
	  bool MayHaveFunctionDiff = false;
	  const ObjCInterfaceDecl *IFace = nullptr;
	  const ObjCProtocolDecl *PDecl = nullptr;

	  switch (ConvTy) {
	  case Compatible:
	      DiagnoseAssignmentEnum(DstType, SrcType, SrcExpr);
	      return false;

	  case PointerToInt:
	    DiagKind = diag::ext_typecheck_convert_pointer_int;
	    ConvHints.tryToFixConversion(SrcExpr, SrcType, DstType, *this);
	    MayHaveConvFixit = true;
	    break;
	  case IntToPointer:
	    DiagKind = diag::ext_typecheck_convert_int_pointer;
	    ConvHints.tryToFixConversion(SrcExpr, SrcType, DstType, *this);
	    MayHaveConvFixit = true;
	    break;
	  case IncompatiblePointer:
	    if (Action == AA_Passing_CFAudited)
	      DiagKind = diag::err_arc_typecheck_convert_incompatible_pointer;
	    else if (SrcType->isFunctionPointerType() &&
	             DstType->isFunctionPointerType())
	      DiagKind = diag::ext_typecheck_convert_incompatible_function_pointer;
	    else
	      DiagKind = diag::ext_typecheck_convert_incompatible_pointer;

	    CheckInferredResultType = DstType->isObjCObjectPointerType() &&
	      SrcType->isObjCObjectPointerType();
	    if (Hint.isNull() && !CheckInferredResultType) {
	      ConvHints.tryToFixConversion(SrcExpr, SrcType, DstType, *this);
	    }
	    else if (CheckInferredResultType) {
	      SrcType = SrcType.getUnqualifiedType();
	      DstType = DstType.getUnqualifiedType();
	    }
	    MayHaveConvFixit = true;
	    break;
	  case IncompatiblePointerSign:
	    DiagKind = diag::ext_typecheck_convert_incompatible_pointer_sign;
	    break;
	  case FunctionVoidPointer:
	    DiagKind = diag::ext_typecheck_convert_pointer_void_func;
	    break;
	  case IncompatiblePointerDiscardsQualifiers: {
	    // Perform array-to-pointer decay if necessary.
	    if (SrcType->isArrayType()) SrcType = Context.getArrayDecayedType(SrcType);

	    // Note: lhq holds the source pointee's qualifiers and rhq the
	    // destination pointee's.
	    Qualifiers lhq = SrcType->getPointeeType().getQualifiers();
	    Qualifiers rhq = DstType->getPointeeType().getQualifiers();
	    if (lhq.getAddressSpace() != rhq.getAddressSpace()) {
	      DiagKind = diag::err_typecheck_incompatible_address_space;
	      break;
	    } else if (lhq.getObjCLifetime() != rhq.getObjCLifetime()) {
	      DiagKind = diag::err_typecheck_incompatible_ownership;
	      break;
	    }

	    llvm_unreachable("unknown error case for discarding qualifiers!");
	    // fallthrough
	  }
	  case CompatiblePointerDiscardsQualifiers:
	    // If the qualifiers lost were because we were applying the
	    // (deprecated) C++ conversion from a string literal to a char*
	    // (or wchar_t*), then there was no error (C++ 4.2p2).  FIXME:
	    // Ideally, this check would be performed in
	    // checkPointerTypesForAssignment. However, that would require a
	    // bit of refactoring (so that the second argument is an
	    // expression, rather than a type), which should be done as part
	    // of a larger effort to fix checkPointerTypesForAssignment for
	    // C++ semantics.
	    if (getLangOpts().CPlusPlus &&
	        IsStringLiteralToNonConstPointerConversion(SrcExpr, DstType))
	      return false;
	    DiagKind = diag::ext_typecheck_convert_discards_qualifiers;
	    break;
	  case IncompatibleNestedPointerQualifiers:
	    DiagKind = diag::ext_nested_pointer_qualifier_mismatch;
	    break;
	  case IntToBlockPointer:
	    DiagKind = diag::err_int_to_block_pointer;
	    break;
	  case IncompatibleBlockPointer:
	    DiagKind = diag::err_typecheck_convert_incompatible_block_pointer;
	    break;
	  case IncompatibleObjCQualifiedId: {
	    // Record the first protocol qualifier of the qualified-id side and the
	    // interface of the other side for the follow-up note emitted below.
	    if (SrcType->isObjCQualifiedIdType()) {
	      const ObjCObjectPointerType *srcOPT =
	                SrcType->getAs<ObjCObjectPointerType>();
	      for (auto *srcProto : srcOPT->quals()) {
	        PDecl = srcProto;
	        break;
	      }
	      if (const ObjCInterfaceType *IFaceT =
	            DstType->getAs<ObjCObjectPointerType>()->getInterfaceType())
	        IFace = IFaceT->getDecl();
	    }
	    else if (DstType->isObjCQualifiedIdType()) {
	      const ObjCObjectPointerType *dstOPT =
	        DstType->getAs<ObjCObjectPointerType>();
	      for (auto *dstProto : dstOPT->quals()) {
	        PDecl = dstProto;
	        break;
	      }
	      if (const ObjCInterfaceType *IFaceT =
	            SrcType->getAs<ObjCObjectPointerType>()->getInterfaceType())
	        IFace = IFaceT->getDecl();
	    }
	    DiagKind = diag::warn_incompatible_qualified_id;
	    break;
	  }
	  case IncompatibleVectors:
	    DiagKind = diag::warn_incompatible_vectors;
	    break;
	  case IncompatibleObjCWeakRef:
	    DiagKind = diag::err_arc_weak_unavailable_assign;
	    break;
	  case Incompatible:
	    if (maybeDiagnoseAssignmentToFunction(*this, DstType, SrcExpr)) {
	      if (Complained)
	        *Complained = true;
	      return true;
	    }

	    DiagKind = diag::err_typecheck_convert_incompatible;
	    ConvHints.tryToFixConversion(SrcExpr, SrcType, DstType, *this);
	    MayHaveConvFixit = true;
	    isInvalid = true;
	    MayHaveFunctionDiff = true;
	    break;
	  }

	  QualType FirstType, SecondType;
	  switch (Action) {
	  case AA_Assigning:
	  case AA_Initializing:
	    // The destination type comes first.
	    FirstType = DstType;
	    SecondType = SrcType;
	    break;

	  case AA_Returning:
	  case AA_Passing:
	  case AA_Passing_CFAudited:
	  case AA_Converting:
	  case AA_Sending:
	  case AA_Casting:
	    // The source type comes first.
	    FirstType = SrcType;
	    SecondType = DstType;
	    break;
	  }

	  PartialDiagnostic FDiag = PDiag(DiagKind);
	  // CF-audited passing is reported with the plain AA_Passing action value.
	  if (Action == AA_Passing_CFAudited)
	    FDiag << FirstType << SecondType << AA_Passing << SrcExpr->getSourceRange();
	  else
	    FDiag << FirstType << SecondType << Action << SrcExpr->getSourceRange();

	  // If we can fix the conversion, suggest the FixIts.
	  assert(ConvHints.isNull() || Hint.isNull());
	  if (!ConvHints.isNull()) {
	    for (FixItHint &H : ConvHints.Hints)
	      FDiag << H;
	  } else {
	    FDiag << Hint;
	  }
	  if (MayHaveConvFixit) { FDiag << (unsigned) (ConvHints.Kind); }

	  if (MayHaveFunctionDiff)
	    HandleFunctionTypeMismatch(FDiag, SecondType, FirstType);

	  Diag(Loc, FDiag);
	  if (DiagKind == diag::warn_incompatible_qualified_id &&
	      PDecl && IFace && !IFace->hasDefinition())
	      Diag(IFace->getLocation(), diag::note_incomplete_class_and_qualified_id)
	        << IFace->getName() << PDecl->getName();

	  if (SecondType == Context.OverloadTy)
	    NoteAllOverloadCandidates(OverloadExpr::find(SrcExpr).Expression,
	                              FirstType, /*TakingAddress=*/true);

	  if (CheckInferredResultType)
	    EmitRelatedResultTypeNote(SrcExpr);

	  if (Action == AA_Returning && ConvTy == IncompatiblePointer)
	    EmitRelatedResultTypeNoteForReturn(DstType);

	  if (Complained)
	    *Complained = true;
	  return isInvalid;
	}

	/// Convenience overload: verifies that \p E is an integer constant
	/// expression, reporting failures with the generic err_expr_not_ice
	/// diagnostic, and forwards to the diagnoser-taking overload.
	ExprResult Sema::VerifyIntegerConstantExpression(Expr *E,
	                                                 llvm::APSInt *Result) {
	  // Diagnoser that emits the generic "not an ICE" error.
	  class GenericNotICEDiagnoser : public VerifyICEDiagnoser {
	  public:
	    void diagnoseNotICE(Sema &S, SourceLocation Loc, SourceRange SR) override {
	      S.Diag(Loc, diag::err_expr_not_ice) << S.LangOpts.CPlusPlus << SR;
	    }
	  };

	  GenericNotICEDiagnoser NotICE;
	  return VerifyIntegerConstantExpression(E, Result, NotICE);
	}

	/// Convenience overload: verifies that \p E is an integer constant
	/// expression, reporting failures with the caller-chosen diagnostic
	/// \p DiagID. A \p DiagID of 0 constructs the diagnoser in suppressed mode.
	ExprResult Sema::VerifyIntegerConstantExpression(Expr *E,
	                                                 llvm::APSInt *Result,
	                                                 unsigned DiagID,
	                                                 bool AllowFold) {
	  // Diagnoser that reports a fixed diagnostic ID.
	  class FixedIDDiagnoser : public VerifyICEDiagnoser {
	    unsigned ID;

	  public:
	    FixedIDDiagnoser(unsigned DiagID)
	        : VerifyICEDiagnoser(DiagID == 0), ID(DiagID) {}

	    void diagnoseNotICE(Sema &S, SourceLocation Loc, SourceRange SR) override {
	      S.Diag(Loc, ID) << SR;
	    }
	  };

	  FixedIDDiagnoser ByID(DiagID);
	  return VerifyIntegerConstantExpression(E, Result, ByID, AllowFold);
	}

	/// Default handler for an expression that is being folded to a constant:
	/// emits ext_expr_not_ice at \p Loc, streaming the source range \p SR and
	/// the C++ language flag into the diagnostic.
	void Sema::VerifyICEDiagnoser::diagnoseFold(Sema &S, SourceLocation Loc,
	SourceRange SR) {
	S.Diag(Loc, diag::ext_expr_not_ice) << SR << S.LangOpts.CPlusPlus;
	}

	/// Verifies that \p E is an integer constant expression.
	///
	/// \param E         the expression to check; in C++11 mode it is first
	///                  contextually converted, and the converted expression
	///                  is what gets returned.
	/// \param Result    if non-null, receives the evaluated value on success.
	/// \param Diagnoser supplies the diagnostics; nothing is reported for a
	///                  non-ICE when its Suppress flag is set.
	/// \param AllowFold whether an expression that is not an ICE but can be
	///                  folded to an integer is accepted (after diagnoseFold).
	/// \returns the (possibly converted) expression, or ExprError().
	ExprResult
	Sema::VerifyIntegerConstantExpression(Expr *E, llvm::APSInt *Result,
	                                      VerifyICEDiagnoser &Diagnoser,
	                                      bool AllowFold) {
	  SourceLocation DiagLoc = E->getLocStart();

	  if (getLangOpts().CPlusPlus11) {
	    // C++11 [expr.const]p5:
	    //   If an expression of literal class type is used in a context where an
	    //   integral constant expression is required, then that class type shall
	    //   have a single non-explicit conversion function to an integral or
	    //   unscoped enumeration type
	    ExprResult Converted;
	    class CXX11ConvertDiagnoser : public ICEConvertDiagnoser {
	    public:
	      CXX11ConvertDiagnoser(bool Silent)
	          : ICEConvertDiagnoser(/*AllowScopedEnumerations*/false,
	                                Silent, true) {}

	      SemaDiagnosticBuilder diagnoseNotInt(Sema &S, SourceLocation Loc,
	                                           QualType T) override {
	        return S.Diag(Loc, diag::err_ice_not_integral) << T;
	      }

	      SemaDiagnosticBuilder diagnoseIncomplete(
	          Sema &S, SourceLocation Loc, QualType T) override {
	        return S.Diag(Loc, diag::err_ice_incomplete_type) << T;
	      }

	      SemaDiagnosticBuilder diagnoseExplicitConv(
	          Sema &S, SourceLocation Loc, QualType T, QualType ConvTy) override {
	        return S.Diag(Loc, diag::err_ice_explicit_conversion) << T << ConvTy;
	      }

	      SemaDiagnosticBuilder noteExplicitConv(
	          Sema &S, CXXConversionDecl *Conv, QualType ConvTy) override {
	        return S.Diag(Conv->getLocation(), diag::note_ice_conversion_here)
	                 << ConvTy->isEnumeralType() << ConvTy;
	      }

	      SemaDiagnosticBuilder diagnoseAmbiguous(
	          Sema &S, SourceLocation Loc, QualType T) override {
	        return S.Diag(Loc, diag::err_ice_ambiguous_conversion) << T;
	      }

	      SemaDiagnosticBuilder noteAmbiguous(
	          Sema &S, CXXConversionDecl *Conv, QualType ConvTy) override {
	        return S.Diag(Conv->getLocation(), diag::note_ice_conversion_here)
	                 << ConvTy->isEnumeralType() << ConvTy;
	      }

	      SemaDiagnosticBuilder diagnoseConversion(
	          Sema &S, SourceLocation Loc, QualType T, QualType ConvTy) override {
	        llvm_unreachable("conversion functions are permitted");
	      }
	    } ConvertDiagnoser(Diagnoser.Suppress);

	    Converted = PerformContextualImplicitConversion(DiagLoc, E,
	                                                    ConvertDiagnoser);
	    if (Converted.isInvalid())
	      return Converted;
	    E = Converted.get();
	    if (!E->getType()->isIntegralOrUnscopedEnumerationType())
	      return ExprError();
	  } else if (!E->getType()->isIntegralOrUnscopedEnumerationType()) {
	    // An ICE must be of integral or unscoped enumeration type.
	    if (!Diagnoser.Suppress)
	      Diagnoser.diagnoseNotICE(*this, DiagLoc, E->getSourceRange());
	    return ExprError();
	  }

	  // Circumvent ICE checking in C++11 to avoid evaluating the expression twice
	  // in the non-ICE case.
	  if (!getLangOpts().CPlusPlus11 && E->isIntegerConstantExpr(Context)) {
	    if (Result)
	      *Result = E->EvaluateKnownConstInt(Context);
	    return E;
	  }

	  Expr::EvalResult EvalResult;
	  SmallVector<PartialDiagnosticAt, 8> Notes;
	  EvalResult.Diag = &Notes;

	  // Try to evaluate the expression, and produce diagnostics explaining why it's
	  // not a constant expression as a side-effect.
	  bool Folded = E->EvaluateAsRValue(EvalResult, Context) &&
	                EvalResult.Val.isInt() && !EvalResult.HasSideEffects;

	  // In C++11, we can rely on diagnostics being produced for any expression
	  // which is not a constant expression. If no diagnostics were produced, then
	  // this is a constant expression.
	  if (Folded && getLangOpts().CPlusPlus11 && Notes.empty()) {
	    if (Result)
	      *Result = EvalResult.Val.getInt();
	    return E;
	  }

	  // If our only note is the usual "invalid subexpression" note, just point
	  // the caret at its location rather than producing an essentially
	  // redundant note.
	  if (Notes.size() == 1 && Notes[0].second.getDiagID() ==
	        diag::note_invalid_subexpr_in_const_expr) {
	    DiagLoc = Notes[0].first;
	    Notes.clear();
	  }

	  // Not foldable, or folding is not permitted here: this is an error.
	  if (!Folded || !AllowFold) {
	    if (!Diagnoser.Suppress) {
	      Diagnoser.diagnoseNotICE(*this, DiagLoc, E->getSourceRange());
	      for (const PartialDiagnosticAt &Note : Notes)
	        Diag(Note.first, Note.second);
	    }

	    return ExprError();
	  }

	  // Foldable and folding is allowed: report the fold, then accept the value.
	  Diagnoser.diagnoseFold(*this, DiagLoc, E->getSourceRange());
	  for (const PartialDiagnosticAt &Note : Notes)
	    Diag(Note.first, Note.second);

	  if (Result)
	    *Result = EvalResult.Val.getInt();
	  return E;
	}

	namespace {
	  // Tree transform used when an expression that was speculatively treated
	  // as unevaluated turns out to be evaluated after all. It forces every
	  // node to be rebuilt so that semantic analysis is redone.
	  class TransformToPE : public TreeTransform<TransformToPE> {
	    using Base = TreeTransform<TransformToPE>;

	  public:
	    TransformToPE(Sema &SemaRef) : Base(SemaRef) { }

	    // Always rebuild, so semantic analysis runs again on each node.
	    bool AlwaysRebuild() { return true; }

	    // Detach the label's declaration from its statement before delegating
	    // to the base transform.
	    // FIXME: This does the right thing, but maybe we need a more general
	    // fix to TreeTransform?
	    StmtResult TransformLabelStmt(LabelStmt *S) {
	      S->getDecl()->setStmt(nullptr);
	      return Base::TransformLabelStmt(S);
	    }

	    // DeclRefExprs that refer to FieldDecls and are not part of a member
	    // pointer formation need special handling; the normal tree transform
	    // does not catch them because of how they are represented in the AST.
	    // FIXME: This is a bit ugly; is it really the best way to handle this
	    // case?
	    //
	    // Such a reference is an error once the context is no longer
	    // unevaluated.
	    ExprResult TransformDeclRefExpr(DeclRefExpr *E) {
	      if (!isa<FieldDecl>(E->getDecl()) ||
	          SemaRef.isUnevaluatedContext())
	        return Base::TransformDeclRefExpr(E);

	      return SemaRef.Diag(E->getLocation(),
	                          diag::err_invalid_non_static_member_use)
	          << E->getDecl() << E->getSourceRange();
	    }

	    // Exception: address-of expressions that form a member pointer are
	    // passed through untouched.
	    ExprResult TransformUnaryOperator(UnaryOperator *E) {
	      const bool FormsMemberPointer =
	          E->getOpcode() == UO_AddrOf && E->getType()->isMemberPointerType();
	      if (!FormsMemberPointer)
	        return Base::TransformUnaryOperator(E);

	      return E;
	    }

	    // Lambdas never need to be transformed.
	    ExprResult TransformLambdaExpr(LambdaExpr *E) {
	      return E;
	    }
	  };
	}

	/// Re-analyzes \p E after discovering that the innermost (unevaluated)
	/// expression evaluation context should take on the kind of its parent.
	///
	/// The innermost context record is overwritten with the parent's kind. If
	/// the result is still unevaluated, \p E is returned unchanged; otherwise
	/// \p E is rebuilt through TransformToPE.
	ExprResult Sema::TransformToPotentiallyEvaluated(Expr *E) {
	  assert(isUnevaluatedContext() &&
	         "Should only transform unevaluated expressions");

	  // Copy the enclosing record's context kind into the innermost record.
	  auto &Innermost = ExprEvalContexts.back();
	  Innermost.Context = ExprEvalContexts[ExprEvalContexts.size()-2].Context;

	  if (!isUnevaluatedContext())
	    return TransformToPE(*this).TransformExpr(E);

	  return E;
	}

	/// Pushes a new expression evaluation context record.
	///
	/// The record captures the current number of cleanup objects and the
	/// current cleanup state, which is then reset. Any pending
	/// MaybeODRUseExprs are moved into the new record's saved set.
	void
	Sema::PushExpressionEvaluationContext(ExpressionEvaluationContext NewContext,
	                                      Decl *LambdaContextDecl,
	                                      bool IsDecltype) {
	  ExprEvalContexts.emplace_back(NewContext, ExprCleanupObjects.size(), Cleanup,
	                                LambdaContextDecl, IsDecltype);
	  Cleanup.reset();

	  // Nothing pending: leave the new record's saved set as constructed.
	  if (MaybeODRUseExprs.empty())
	    return;

	  auto &NewRecord = ExprEvalContexts.back();
	  std::swap(MaybeODRUseExprs, NewRecord.SavedMaybeODRUseExprs);
	}

	/// Pushes a new expression evaluation context that reuses the mangling
	/// context declaration of the current innermost context record.
	void
	Sema::PushExpressionEvaluationContext(ExpressionEvaluationContext NewContext,
	                                      ReuseLambdaContextDecl_t,
	                                      bool IsDecltype) {
	  PushExpressionEvaluationContext(
	      NewContext, ExprEvalContexts.back().ManglingContextDecl, IsDecltype);
	}

	// Pops the innermost expression evaluation context: diagnoses or finalizes
	// the lambdas recorded in it, restores or merges the cleanup and
	// maybe-odr-use state, and hands its outstanding typo count to the parent.
	void Sema::PopExpressionEvaluationContext() {
	  ExpressionEvaluationContextRecord &Rec = ExprEvalContexts.back();
	  // Copied out now because Rec is destroyed by the pop_back() below.
	  unsigned NumTypos = Rec.NumTypos;

	  if (!Rec.Lambdas.empty()) {
	    if (Rec.isUnevaluated() || Rec.isConstantEvaluated()) {
	      unsigned D;
	      if (Rec.isUnevaluated()) {
	        // C++11 [expr.prim.lambda]p2:
	        //   A lambda-expression shall not appear in an unevaluated operand
	        //   (Clause 5).
	        D = diag::err_lambda_unevaluated_operand;
	      } else {
	        // C++1y [expr.const]p2:
	        //   A conditional-expression e is a core constant expression unless
	        //   the evaluation of e, following the rules of the abstract machine,
	        //   would evaluate [...] a lambda-expression.
	        D = diag::err_lambda_in_constant_expression;
	      }

	      // C++1z allows lambda expressions as core constant expressions.
	      // FIXME: In C++1z, reinstate the restrictions on lambda expressions (CWG
	      // 1607) from appearing within template-arguments and array-bounds that
	      // are part of function-signatures. Be mindful that P0315 (Lambdas in
	      // unevaluated contexts) might lift some of these restrictions in a
	      // future version.
	      if (!Rec.isConstantEvaluated() || !getLangOpts().CPlusPlus1z)
	        for (const auto *L : Rec.Lambdas)
	          Diag(L->getLocStart(), D);
	    } else {
	      // Mark the capture expressions odr-used. This was deferred
	      // during lambda expression creation.
	      for (auto *Lambda : Rec.Lambdas) {
	        for (auto *C : Lambda->capture_inits())
	          MarkDeclarationsReferencedInExpr(C);
	      }
	    }
	  }

	  // When we are coming out of an unevaluated context, clear out any
	  // temporaries that we may have created as part of the evaluation of
	  // the expression in that context: they aren't relevant because they
	  // will never be constructed.
	  if (Rec.isUnevaluated() || Rec.isConstantEvaluated()) {
	    ExprCleanupObjects.erase(ExprCleanupObjects.begin() + Rec.NumCleanupObjects,
	                             ExprCleanupObjects.end());
	    Cleanup = Rec.ParentCleanup;
	    CleanupVarDeclMarking();
	    std::swap(MaybeODRUseExprs, Rec.SavedMaybeODRUseExprs);
	  // Otherwise, merge the contexts together.
	  } else {
	    Cleanup.mergeFrom(Rec.ParentCleanup);
	    MaybeODRUseExprs.insert(Rec.SavedMaybeODRUseExprs.begin(),
	                            Rec.SavedMaybeODRUseExprs.end());
	  }

	  // Pop the current expression evaluation context off the stack.
	  ExprEvalContexts.pop_back();

	  if (!ExprEvalContexts.empty())
	    ExprEvalContexts.back().NumTypos += NumTypos;
	  else
	    assert(NumTypos == 0 && "There are outstanding typos after popping the "
	                            "last ExpressionEvaluationContextRecord");
	}

	// Throws away the cleanup objects added since the innermost evaluation
	// context was entered, along with the current cleanup state and the pending
	// maybe-odr-use expressions.
	void Sema::DiscardCleanupsInEvaluationContext() {
	  // NumCleanupObjects is the offset from which this context's objects start.
	  auto FirstInContext =
	      ExprCleanupObjects.begin() + ExprEvalContexts.back().NumCleanupObjects;
	  ExprCleanupObjects.erase(FirstInContext, ExprCleanupObjects.end());

	  Cleanup.reset();
	  MaybeODRUseExprs.clear();
	}

	// Re-transforms the operand as potentially evaluated when its type is
	// variably modified; any other operand is returned unchanged.
	ExprResult Sema::HandleExprEvaluationContextForTypeof(Expr *E) {
	  if (E->getType()->isVariablyModifiedType())
	    return TransformToPotentiallyEvaluated(E);
	  return E;
	}

	/// Tells whether the innermost expression evaluation context is one in which
	/// some evaluation could be performed (be it constant evaluation or runtime
	/// evaluation). Sadly, this notion is not quite captured by C++'s idea of an
	/// "unevaluated context".
	static bool isEvaluatableContext(Sema &SemaRef) {
	  using EvalContext = Sema::ExpressionEvaluationContext;

	  const EvalContext Innermost = SemaRef.ExprEvalContexts.back().Context;
	  switch (Innermost) {
	  // Expressions in these contexts could be evaluated.
	  case EvalContext::UnevaluatedList:
	  case EvalContext::ConstantEvaluated:
	  case EvalContext::PotentiallyEvaluated:
	    return true;

	  // Expressions in these contexts are never evaluated.
	  case EvalContext::Unevaluated:
	  case EvalContext::UnevaluatedAbstract:
	  case EvalContext::DiscardedStatement:
	  // Referenced declarations will only be used if the construct in the
	  // containing expression is used, at which point we'll be given another
	  // turn to mark them.
	  case EvalContext::PotentiallyEvaluatedIfUsed:
	    return false;
	  }
	  llvm_unreachable("Invalid context");
	}

	/// Tells whether references to resolved functions or to variables result in
	/// odr-use in the current context.
	///
	/// \param SkipDependentUses when true, a dependent current declaration
	/// context is never treated as an odr-use context.
	static bool isOdrUseContext(Sema &SemaRef, bool SkipDependentUses = true) {
	  using EvalContext = Sema::ExpressionEvaluationContext;

	  // An expression in a template is not really an expression until it's been
	  // instantiated, so it doesn't trigger odr-use.
	  const bool InDependentContext =
	      SkipDependentUses && SemaRef.CurContext->isDependentContext();
	  if (InDependentContext)
	    return false;

	  const EvalContext Innermost = SemaRef.ExprEvalContexts.back().Context;
	  switch (Innermost) {
	  case EvalContext::ConstantEvaluated:
	  case EvalContext::PotentiallyEvaluated:
	    return true;

	  case EvalContext::Unevaluated:
	  case EvalContext::UnevaluatedList:
	  case EvalContext::UnevaluatedAbstract:
	  case EvalContext::DiscardedStatement:
	  case EvalContext::PotentiallyEvaluatedIfUsed:
	    return false;
	  }
	  llvm_unreachable("Invalid context");
	}

	// Returns true for a constexpr function that is either implicitly
	// instantiable or a C++ method that is not user-provided.
	static bool isImplicitlyDefinableConstexprFunction(FunctionDecl *Func) {
	  CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(Func);
	  return Func->isConstexpr() &&
	         (Func->isImplicitlyInstantiable() || (MD && !MD->isUserProvided()));
	}

	/// \brief Mark a function referenced, and check whether it is odr-used
	/// (C++ [basic.def.odr]p2, C99 6.9p3)
	///
	/// \param Loc location of the reference; used for diagnostics and as the
	/// point of instantiation when none has been recorded yet.
	/// \param Func the function being referenced; must not be null.
	/// \param MightBeOdrUse when false, the reference is never treated as an
	/// odr-use, whatever the current evaluation context is.
	void Sema::MarkFunctionReferenced(SourceLocation Loc, FunctionDecl *Func,
	                                  bool MightBeOdrUse) {
	  assert(Func && "No function?");

	  Func->setReferenced();

	  // C++11 [basic.def.odr]p3:
	  //   A function whose name appears as a potentially-evaluated expression is
	  //   odr-used if it is the unique lookup result or the selected member of a
	  //   set of overloaded functions [...].
	  //
	  // We (incorrectly) mark overload resolution as an unevaluated context, so we
	  // can just check that here.
	  bool OdrUse = MightBeOdrUse && isOdrUseContext(*this);

	  // Determine whether we require a function definition to exist, per
	  // C++11 [temp.inst]p3:
	  //   Unless a function template specialization has been explicitly
	  //   instantiated or explicitly specialized, the function template
	  //   specialization is implicitly instantiated when the specialization is
	  //   referenced in a context that requires a function definition to exist.
	  //
	  // That is either when this is an odr-use, or when a usage of a constexpr
	  // function occurs within an evaluatable context.
	  bool NeedDefinition =
	      OdrUse || (isEvaluatableContext(*this) &&
	                 isImplicitlyDefinableConstexprFunction(Func));

	  // C++14 [temp.expl.spec]p6:
	  //   If a template [...] is explicitly specialized then that specialization
	  //   shall be declared before the first use of that specialization that would
	  //   cause an implicit instantiation to take place, in every translation unit
	  //   in which such a use occurs
	  if (NeedDefinition &&
	      (Func->getTemplateSpecializationKind() != TSK_Undeclared ||
	       Func->getMemberSpecializationInfo()))
	    checkSpecializationVisibility(Loc, Func);

	  // C++14 [except.spec]p17:
	  //   An exception-specification is considered to be needed when:
	  //   - the function is odr-used or, if it appears in an unevaluated operand,
	  //     would be odr-used if the expression were potentially-evaluated;
	  //
	  // Note, we do this even if MightBeOdrUse is false. That indicates that the
	  // function is a pure virtual function we're calling, and in that case the
	  // function was selected by overload resolution and we need to resolve its
	  // exception specification for a different reason.
	  const FunctionProtoType *FPT = Func->getType()->getAs<FunctionProtoType>();
	  if (FPT && isUnresolvedExceptionSpec(FPT->getExceptionSpecType()))
	    ResolveExceptionSpec(Loc, FPT);

	  // If we don't need to mark the function as used, and we don't need to
	  // try to provide a definition, there's nothing more to do.
	  if ((Func->isUsed(/*CheckUsedAttr=*/false) || !OdrUse) &&
	      (!NeedDefinition || Func->getBody()))
	    return;

	  // Note that this declaration has been used.
	  if (CXXConstructorDecl *Constructor = dyn_cast<CXXConstructorDecl>(Func)) {
	    Constructor = cast<CXXConstructorDecl>(Constructor->getFirstDecl());
	    if (Constructor->isDefaulted() && !Constructor->isDeleted()) {
	      if (Constructor->isDefaultConstructor()) {
	        if (Constructor->isTrivial() && !Constructor->hasAttr<DLLExportAttr>())
	          return;
	        DefineImplicitDefaultConstructor(Loc, Constructor);
	      } else if (Constructor->isCopyConstructor()) {
	        DefineImplicitCopyConstructor(Loc, Constructor);
	      } else if (Constructor->isMoveConstructor()) {
	        DefineImplicitMoveConstructor(Loc, Constructor);
	      }
	    } else if (Constructor->getInheritedConstructor()) {
	      DefineInheritingConstructor(Loc, Constructor);
	    }
	  } else if (CXXDestructorDecl *Destructor =
	                 dyn_cast<CXXDestructorDecl>(Func)) {
	    Destructor = cast<CXXDestructorDecl>(Destructor->getFirstDecl());
	    if (Destructor->isDefaulted() && !Destructor->isDeleted()) {
	      if (Destructor->isTrivial() && !Destructor->hasAttr<DLLExportAttr>())
	        return;
	      DefineImplicitDestructor(Loc, Destructor);
	    }
	    if (Destructor->isVirtual() && getLangOpts().AppleKext)
	      MarkVTableUsed(Loc, Destructor->getParent());
	  } else if (CXXMethodDecl *MethodDecl = dyn_cast<CXXMethodDecl>(Func)) {
	    if (MethodDecl->isOverloadedOperator() &&
	        MethodDecl->getOverloadedOperator() == OO_Equal) {
	      MethodDecl = cast<CXXMethodDecl>(MethodDecl->getFirstDecl());
	      if (MethodDecl->isDefaulted() && !MethodDecl->isDeleted()) {
	        if (MethodDecl->isCopyAssignmentOperator())
	          DefineImplicitCopyAssignment(Loc, MethodDecl);
	        else if (MethodDecl->isMoveAssignmentOperator())
	          DefineImplicitMoveAssignment(Loc, MethodDecl);
	      }
	    } else if (isa<CXXConversionDecl>(MethodDecl) &&
	               MethodDecl->getParent()->isLambda()) {
	      CXXConversionDecl *Conversion =
	          cast<CXXConversionDecl>(MethodDecl->getFirstDecl());
	      if (Conversion->isLambdaToBlockPointerConversion())
	        DefineImplicitLambdaToBlockPointerConversion(Loc, Conversion);
	      else
	        DefineImplicitLambdaToFunctionPointerConversion(Loc, Conversion);
	    } else if (MethodDecl->isVirtual() && getLangOpts().AppleKext)
	      MarkVTableUsed(Loc, MethodDecl->getParent());
	  }

	  // Recursive functions should be marked when used from another function.
	  // FIXME: Is this really right?
	  if (CurContext == Func) return;

	  // Implicit instantiation of function templates and member functions of
	  // class templates.
	  if (Func->isImplicitlyInstantiable()) {
	    bool AlreadyInstantiated = false;
	    SourceLocation PointOfInstantiation = Loc;
	    if (FunctionTemplateSpecializationInfo *SpecInfo
	                              = Func->getTemplateSpecializationInfo()) {
	      if (SpecInfo->getPointOfInstantiation().isInvalid())
	        SpecInfo->setPointOfInstantiation(Loc);
	      else if (SpecInfo->getTemplateSpecializationKind()
	                 == TSK_ImplicitInstantiation) {
	        AlreadyInstantiated = true;
	        PointOfInstantiation = SpecInfo->getPointOfInstantiation();
	      }
	    } else if (MemberSpecializationInfo *MSInfo
	                                = Func->getMemberSpecializationInfo()) {
	      if (MSInfo->getPointOfInstantiation().isInvalid())
	        MSInfo->setPointOfInstantiation(Loc);
	      else if (MSInfo->getTemplateSpecializationKind()
	                 == TSK_ImplicitInstantiation) {
	        AlreadyInstantiated = true;
	        PointOfInstantiation = MSInfo->getPointOfInstantiation();
	      }
	    }

	    if (!AlreadyInstantiated || Func->isConstexpr()) {
	      if (isa<CXXRecordDecl>(Func->getDeclContext()) &&
	          cast<CXXRecordDecl>(Func->getDeclContext())->isLocalClass() &&
	          CodeSynthesisContexts.size())
	        PendingLocalImplicitInstantiations.push_back(
	            std::make_pair(Func, PointOfInstantiation));
	      else if (Func->isConstexpr())
	        // Do not defer instantiations of constexpr functions, to avoid the
	        // expression evaluator needing to call back into Sema if it sees a
	        // call to such a function.
	        InstantiateFunctionDefinition(PointOfInstantiation, Func);
	      else {
	        Func->setInstantiationIsPending(true);
	        PendingInstantiations.push_back(std::make_pair(Func,
	                                                       PointOfInstantiation));
	        // Notify the consumer that a function was implicitly instantiated.
	        Consumer.HandleCXXImplicitFunctionInstantiation(Func);
	      }
	    }
	  } else {
	    // Walk redefinitions, as some of them may be instantiable.
	    for (auto i : Func->redecls()) {
	      if (!i->isUsed(false) && i->isImplicitlyInstantiable())
	        MarkFunctionReferenced(Loc, i, OdrUse);
	    }
	  }

	  if (!OdrUse) return;

	  // Keep track of used but undefined functions.
	  if (!Func->isDefined()) {
	    if (mightHaveNonExternalLinkage(Func))
	      UndefinedButUsed.insert(std::make_pair(Func->getCanonicalDecl(), Loc));
	    else if (Func->getMostRecentDecl()->isInlined() &&
	             !LangOpts.GNUInline &&
	             !Func->getMostRecentDecl()->hasAttr<GNUInlineAttr>())
	      UndefinedButUsed.insert(std::make_pair(Func->getCanonicalDecl(), Loc));
	  }

	  Func->markUsed(Context);
	}

	static void
	diagnoseUncapturableValueReference(Sema &S, SourceLocation loc,
	ValueDecl var, DeclContext DC) {
	DeclContext *VarDC = var->getDeclContext();

	// If the parameter still belongs to the translation unit, then
	// we're actually just using one parameter in the declaration of
	// the next.
	if (isa<ParmVarDecl>(var) &&
	isa<TranslationUnitDecl>(VarDC))
	return;

	// For C code, don't diagnose about capture if we're not actually in code
	// right now; it's impossible to write a non-constant expression outside of
	// function context, so we'll get other (more useful) diagnostics later.
	//
	// For C++, things get a bit more nasty... it would be nice to suppress this
	// diagnostic for certain cases like using a local variable in an array bound
	// for a member of a local class, but the correct predicate is not obvious.
	if (!S.getLangOpts().CPlusPlus && !S.CurContext->isFunctionOrMethod())
	return;

	unsigned ValueKind = isa<BindingDecl>(var) ? 1 : 0;
	unsigned ContextKind = 3; // unknown
	if (isa<CXXMethodDecl>(VarDC) &&
	cast<CXXRecordDecl>(VarDC->getParent())->isLambda()) {
	ContextKind = 2;
	} else if (isa<FunctionDecl>(VarDC)) {
	ContextKind = 0;
	} else if (isa<BlockDecl>(VarDC)) {
	ContextKind = 1;
	}

	S.Diag(loc, diag::err_reference_to_local_in_enclosing_context)
	<< var << ValueKind << ContextKind << VarDC;
	S.Diag(var->getLocation(), diag::note_entity_declared_at)
	<< var;

	// FIXME: Add additional diagnostic info about class etc. which prevents
	// capture.
	}


	static bool isVariableAlreadyCapturedInScopeInfo(CapturingScopeInfo CSI, VarDecl Var,
	bool &SubCapturesAreNested,
	QualType &CaptureType,
	QualType &DeclRefType) {
	// Check whether we've already captured it.
	if (CSI->CaptureMap.count(Var)) {
	// If we found a capture, any subcaptures are nested.
	SubCapturesAreNested = true;

	// Retrieve the capture type for this variable.
	CaptureType = CSI->getCapture(Var).getCaptureType();

	// Compute the type of an expression that refers to this variable.
	DeclRefType = CaptureType.getNonReferenceType();

	// Similarly to mutable captures in lambda, all the OpenMP captures by copy
	// are mutable in the sense that user can change their value - they are
	// private instances of the captured declarations.
	const CapturingScopeInfo::Capture &Cap = CSI->getCapture(Var);
	if (Cap.isCopyCapture() &&
	!(isa<LambdaScopeInfo>(CSI) && cast<LambdaScopeInfo>(CSI)->Mutable) &&
	!(isa<CapturedRegionScopeInfo>(CSI) &&
	cast<CapturedRegionScopeInfo>(CSI)->CapRegionKind == CR_OpenMP))
	DeclRefType.addConst();
	return true;
	}
	return false;
	}

	// Only block literals, captured statements, and lambda expressions can
	// capture; other scopes don't work.
	//
	// Returns the lambda-aware parent of DC when DC is one of those capturing
	// contexts, and nullptr otherwise. In the nullptr case a diagnostic is issued
	// if Var has local storage and Diagnose is set.
	static DeclContext *getParentOfCapturingContextOrNull(DeclContext *DC,
	                                                      VarDecl *Var,
	                                                      SourceLocation Loc,
	                                                      const bool Diagnose,
	                                                      Sema &S) {
	  if (isa<BlockDecl>(DC) || isa<CapturedDecl>(DC) || isLambdaCallOperator(DC))
	    return getLambdaAwareParentOfDeclContext(DC);

	  if (Var->hasLocalStorage() && Diagnose)
	    diagnoseUncapturableValueReference(S, Loc, Var, DC);
	  return nullptr;
	}

	// Certain capturing entities (lambdas, blocks etc.) are not allowed to capture
	// certain types of variables (unnamed, variably modified types etc.)
	// so check for eligibility.
	static bool isVariableCapturable(CapturingScopeInfo CSI, VarDecl Var,
	SourceLocation Loc,
	const bool Diagnose, Sema &S) {

	bool IsBlock = isa<BlockScopeInfo>(CSI);
	bool IsLambda = isa<LambdaScopeInfo>(CSI);

	// Lambdas are not allowed to capture unnamed variables
	// (e.g. anonymous unions).
	// FIXME: The C++11 rule don't actually state this explicitly, but I'm
	// assuming that's the intent.
	if (IsLambda && !Var->getDeclName()) {
	if (Diagnose) {
	S.Diag(Loc, diag::err_lambda_capture_anonymous_var);
	S.Diag(Var->getLocation(), diag::note_declared_at);
	}
	return false;
	}

	// Prohibit variably-modified types in blocks; they're difficult to deal with.
	if (Var->getType()->isVariablyModifiedType() && IsBlock) {
	if (Diagnose) {
	S.Diag(Loc, diag::err_ref_vm_type);
	S.Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	}
	return false;
	}
	// Prohibit structs with flexible array members too.
	// We cannot capture what is in the tail end of the struct.
	if (const RecordType *VTTy = Var->getType()->getAs<RecordType>()) {
	if (VTTy->getDecl()->hasFlexibleArrayMember()) {
	if (Diagnose) {
	if (IsBlock)
	S.Diag(Loc, diag::err_ref_flexarray_type);
	else
	S.Diag(Loc, diag::err_lambda_capture_flexarray_type)
	<< Var->getDeclName();
	S.Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	}
	return false;
	}
	}
	const bool HasBlocksAttr = Var->hasAttr<BlocksAttr>();
	// Lambdas and captured statements are not allowed to capture __block
	// variables; they don't support the expected semantics.
	if (HasBlocksAttr && (IsLambda \|\| isa<CapturedRegionScopeInfo>(CSI))) {
	if (Diagnose) {
	S.Diag(Loc, diag::err_capture_block_variable)
	<< Var->getDeclName() << !IsLambda;
	S.Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	}
	return false;
	}
	// OpenCL v2.0 s6.12.5: Blocks cannot reference/capture other blocks
	if (S.getLangOpts().OpenCL && IsBlock &&
	Var->getType()->isBlockPointerType()) {
	if (Diagnose)
	S.Diag(Loc, diag::err_opencl_block_ref_block);
	return false;
	}

	return true;
	}

	// Returns true if the capture by block was successful.
	static bool captureInBlock(BlockScopeInfo BSI, VarDecl Var,
	SourceLocation Loc,
	const bool BuildAndDiagnose,
	QualType &CaptureType,
	QualType &DeclRefType,
	const bool Nested,
	Sema &S) {
	Expr *CopyExpr = nullptr;
	bool ByRef = false;

	// Blocks are not allowed to capture arrays.
	if (CaptureType->isArrayType()) {
	if (BuildAndDiagnose) {
	S.Diag(Loc, diag::err_ref_array_type);
	S.Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	}
	return false;
	}

	// Forbid the block-capture of autoreleasing variables.
	if (CaptureType.getObjCLifetime() == Qualifiers::OCL_Autoreleasing) {
	if (BuildAndDiagnose) {
	S.Diag(Loc, diag::err_arc_autoreleasing_capture)
	<< /block/ 0;
	S.Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	}
	return false;
	}

	// Warn about implicitly autoreleasing indirect parameters captured by blocks.
	if (const auto *PT = CaptureType->getAs<PointerType>()) {
	// This function finds out whether there is an AttributedType of kind
	// attr_objc_ownership in Ty. The existence of AttributedType of kind
	// attr_objc_ownership implies __autoreleasing was explicitly specified
	// rather than being added implicitly by the compiler.
	auto IsObjCOwnershipAttributedType = [](QualType Ty) {
	while (const auto *AttrTy = Ty->getAs<AttributedType>()) {
	if (AttrTy->getAttrKind() == AttributedType::attr_objc_ownership)
	return true;

	// Peel off AttributedTypes that are not of kind objc_ownership.
	Ty = AttrTy->getModifiedType();
	}

	return false;
	};

	QualType PointeeTy = PT->getPointeeType();

	if (PointeeTy->getAs<ObjCObjectPointerType>() &&
	PointeeTy.getObjCLifetime() == Qualifiers::OCL_Autoreleasing &&
	!IsObjCOwnershipAttributedType(PointeeTy)) {
	if (BuildAndDiagnose) {
	SourceLocation VarLoc = Var->getLocation();
	S.Diag(Loc, diag::warn_block_capture_autoreleasing);
	{
	auto AddAutoreleaseNote =
	S.Diag(VarLoc, diag::note_declare_parameter_autoreleasing);
	// Provide a fix-it for the '__autoreleasing' keyword at the
	// appropriate location in the variable's type.
	if (const auto *TSI = Var->getTypeSourceInfo()) {
	PointerTypeLoc PTL =
	TSI->getTypeLoc().getAsAdjusted<PointerTypeLoc>();
	if (PTL) {
	SourceLocation Loc = PTL.getPointeeLoc().getEndLoc();
	Loc = Lexer::getLocForEndOfToken(Loc, 0, S.getSourceManager(),
	S.getLangOpts());
	if (Loc.isValid()) {
	StringRef CharAtLoc = Lexer::getSourceText(
	CharSourceRange::getCharRange(Loc, Loc.getLocWithOffset(1)),
	S.getSourceManager(), S.getLangOpts());
	AddAutoreleaseNote << FixItHint::CreateInsertion(
	Loc, CharAtLoc.empty() \|\| !isWhitespace(CharAtLoc[0])
	? " __autoreleasing "
	: " __autoreleasing");
	}
	}
	}
	}
	S.Diag(VarLoc, diag::note_declare_parameter_strong);
	}
	}
	}

	const bool HasBlocksAttr = Var->hasAttr<BlocksAttr>();
	if (HasBlocksAttr \|\| CaptureType->isReferenceType() \|\|
	(S.getLangOpts().OpenMP && S.IsOpenMPCapturedDecl(Var))) {
	// Block capture by reference does not change the capture or
	// declaration reference types.
	ByRef = true;
	} else {
	// Block capture by copy introduces 'const'.
	CaptureType = CaptureType.getNonReferenceType().withConst();
	DeclRefType = CaptureType;

	if (S.getLangOpts().CPlusPlus && BuildAndDiagnose) {
	if (const RecordType *Record = DeclRefType->getAs<RecordType>()) {
	// The capture logic needs the destructor, so make sure we mark it.
	// Usually this is unnecessary because most local variables have
	// their destructors marked at declaration time, but parameters are
	// an exception because it's technically only the call site that
	// actually requires the destructor.
	if (isa<ParmVarDecl>(Var))
	S.FinalizeVarWithDestructor(Var, Record);

	// Enter a new evaluation context to insulate the copy
	// full-expression.
	EnterExpressionEvaluationContext scope(
	S, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);

	// According to the blocks spec, the capture of a variable from
	// the stack requires a const copy constructor. This is not true
	// of the copy/move done to move a __block variable to the heap.
	Expr *DeclRef = new (S.Context) DeclRefExpr(Var, Nested,
	DeclRefType.withConst(),
	VK_LValue, Loc);

	ExprResult Result
	= S.PerformCopyInitialization(
	InitializedEntity::InitializeBlock(Var->getLocation(),
	CaptureType, false),
	Loc, DeclRef);

	// Build a full-expression copy expression if initialization
	// succeeded and used a non-trivial constructor. Recover from
	// errors by pretending that the copy isn't necessary.
	if (!Result.isInvalid() &&
	!cast<CXXConstructExpr>(Result.get())->getConstructor()
	->isTrivial()) {
	Result = S.MaybeCreateExprWithCleanups(Result);
	CopyExpr = Result.get();
	}
	}
	}
	}

	// Actually capture the variable.
	if (BuildAndDiagnose)
	BSI->addCapture(Var, HasBlocksAttr, ByRef, Nested, Loc,
	SourceLocation(), CaptureType, CopyExpr);

	return true;

	}


	/// \brief Capture the given variable in the captured region.
	///
	/// Computes CaptureType from DeclRefType: an lvalue reference to it when
	/// capturing by reference, DeclRefType itself otherwise. Only when
	/// BuildAndDiagnose is set does it also add an implicit private field to the
	/// region's record, mark Var referenced and used, and record the capture in
	/// RSI. Always returns true.
	static bool captureInCapturedRegion(CapturedRegionScopeInfo *RSI,
	                                    VarDecl *Var,
	                                    SourceLocation Loc,
	                                    const bool BuildAndDiagnose,
	                                    QualType &CaptureType,
	                                    QualType &DeclRefType,
	                                    const bool RefersToCapturedVariable,
	                                    Sema &S) {
	  // By default, capture variables by reference.
	  bool ByRef = true;
	  // Using an LValue reference type is consistent with Lambdas (see below).
	  if (S.getLangOpts().OpenMP && RSI->CapRegionKind == CR_OpenMP) {
	    if (S.IsOpenMPCapturedDecl(Var))
	      DeclRefType = DeclRefType.getUnqualifiedType();
	    ByRef = S.IsOpenMPCapturedByRef(Var, RSI->OpenMPLevel);
	  }

	  if (ByRef)
	    CaptureType = S.Context.getLValueReferenceType(DeclRefType);
	  else
	    CaptureType = DeclRefType;

	  // Without BuildAndDiagnose only the types above are computed.
	  if (!BuildAndDiagnose)
	    return true;

	  // The current implementation assumes that all variables are captured
	  // by references. Since there is no capture by copy, no expression
	  // evaluation will be needed.
	  RecordDecl *RD = RSI->TheRecordDecl;

	  FieldDecl *Field
	    = FieldDecl::Create(S.Context, RD, Loc, Loc, nullptr, CaptureType,
	                        S.Context.getTrivialTypeSourceInfo(CaptureType, Loc),
	                        nullptr, false, ICIS_NoInit);
	  Field->setImplicit(true);
	  Field->setAccess(AS_private);
	  RD->addDecl(Field);

	  Expr *CopyExpr = new (S.Context) DeclRefExpr(Var, RefersToCapturedVariable,
	                                               DeclRefType, VK_LValue, Loc);
	  Var->setReferenced(true);
	  Var->markUsed(S.Context);

	  // Actually capture the variable.
	  RSI->addCapture(Var, /*isBlock*/false, ByRef, RefersToCapturedVariable, Loc,
	                  SourceLocation(), CaptureType, CopyExpr);

	  return true;
	}

	/// \brief Add an implicit, private, unnamed non-static data member of type
	/// \p FieldType to the lambda class of \p LSI, for the variable being
	/// captured.
	///
	/// \p DeclRefType and \p RefersToCapturedVariable are accepted but not read.
	static void addAsFieldToClosureType(Sema &S, LambdaScopeInfo *LSI,
	                                    QualType FieldType, QualType DeclRefType,
	                                    SourceLocation Loc,
	                                    bool RefersToCapturedVariable) {
	  CXXRecordDecl *Closure = LSI->Lambda;

	  // Trivial type source info for the member, located at the capture site.
	  auto *MemberTSI = S.Context.getTrivialTypeSourceInfo(FieldType, Loc);

	  // Build the non-static data member and attach it to the closure class.
	  FieldDecl *Member =
	      FieldDecl::Create(S.Context, Closure, Loc, Loc, nullptr, FieldType,
	                        MemberTSI, nullptr, false, ICIS_NoInit);
	  Member->setImplicit(true);
	  Member->setAccess(AS_private);
	  Closure->addDecl(Member);
	}

	/// \brief Capture the given variable in the lambda.
	///
	/// Decides between by-reference and by-copy capture, computes CaptureType
	/// (the type of the closure field) and DeclRefType (the type of a reference
	/// to the captured variable). Only when BuildAndDiagnose is set does it also
	/// add the field to the closure type and record the capture in LSI.
	///
	/// \returns false when a by-copy capture is rejected (autoreleasing,
	/// incomplete or abstract type); true otherwise.
	static bool captureInLambda(LambdaScopeInfo *LSI,
	                            VarDecl *Var,
	                            SourceLocation Loc,
	                            const bool BuildAndDiagnose,
	                            QualType &CaptureType,
	                            QualType &DeclRefType,
	                            const bool RefersToCapturedVariable,
	                            const Sema::TryCaptureKind Kind,
	                            SourceLocation EllipsisLoc,
	                            const bool IsTopScope,
	                            Sema &S) {

	  // Determine whether we are capturing by reference or by value.
	  bool ByRef = false;
	  if (IsTopScope && Kind != Sema::TryCapture_Implicit) {
	    ByRef = (Kind == Sema::TryCapture_ExplicitByRef);
	  } else {
	    ByRef = (LSI->ImpCaptureStyle == LambdaScopeInfo::ImpCap_LambdaByref);
	  }

	  // Compute the type of the field that will capture this variable.
	  if (ByRef) {
	    // C++11 [expr.prim.lambda]p15:
	    //   An entity is captured by reference if it is implicitly or
	    //   explicitly captured but not captured by copy. It is
	    //   unspecified whether additional unnamed non-static data
	    //   members are declared in the closure type for entities
	    //   captured by reference.
	    //
	    // FIXME: It is not clear whether we want to build an lvalue reference
	    // to the DeclRefType or to CaptureType.getNonReferenceType(). GCC appears
	    // to do the former, while EDG does the latter. Core issue 1249 will
	    // clarify, but for now we follow GCC because it's a more permissive and
	    // easily defensible position.
	    CaptureType = S.Context.getLValueReferenceType(DeclRefType);
	  } else {
	    // C++11 [expr.prim.lambda]p14:
	    //   For each entity captured by copy, an unnamed non-static
	    //   data member is declared in the closure type. The
	    //   declaration order of these members is unspecified. The type
	    //   of such a data member is the type of the corresponding
	    //   captured entity if the entity is not a reference to an
	    //   object, or the referenced type otherwise. [Note: If the
	    //   captured entity is a reference to a function, the
	    //   corresponding data member is also a reference to a
	    //   function. - end note ]
	    if (const ReferenceType *RefType = CaptureType->getAs<ReferenceType>()){
	      if (!RefType->getPointeeType()->isFunctionType())
	        CaptureType = RefType->getPointeeType();
	    }

	    // Forbid the lambda copy-capture of autoreleasing variables.
	    if (CaptureType.getObjCLifetime() == Qualifiers::OCL_Autoreleasing) {
	      if (BuildAndDiagnose) {
	        S.Diag(Loc, diag::err_arc_autoreleasing_capture) << /*lambda*/ 1;
	        S.Diag(Var->getLocation(), diag::note_previous_decl)
	          << Var->getDeclName();
	      }
	      return false;
	    }

	    // Make sure that by-copy captures are of a complete and non-abstract type.
	    if (BuildAndDiagnose) {
	      if (!CaptureType->isDependentType() &&
	          S.RequireCompleteType(Loc, CaptureType,
	                                diag::err_capture_of_incomplete_type,
	                                Var->getDeclName()))
	        return false;

	      if (S.RequireNonAbstractType(Loc, CaptureType,
	                                   diag::err_capture_of_abstract_type))
	        return false;
	    }
	  }

	  // Capture this variable in the lambda.
	  if (BuildAndDiagnose)
	    addAsFieldToClosureType(S, LSI, CaptureType, DeclRefType, Loc,
	                            RefersToCapturedVariable);

	  // Compute the type of a reference to this captured variable.
	  if (ByRef)
	    DeclRefType = CaptureType.getNonReferenceType();
	  else {
	    // C++ [expr.prim.lambda]p5:
	    //   The closure type for a lambda-expression has a public inline
	    //   function call operator [...]. This function call operator is
	    //   declared const (9.3.1) if and only if the lambda-expression's
	    //   parameter-declaration-clause is not followed by mutable.
	    DeclRefType = CaptureType.getNonReferenceType();
	    if (!LSI->Mutable && !CaptureType->isReferenceType())
	      DeclRefType.addConst();
	  }

	  // Add the capture.
	  if (BuildAndDiagnose)
	    LSI->addCapture(Var, /*IsBlock=*/false, ByRef, RefersToCapturedVariable,
	                    Loc, EllipsisLoc, CaptureType, /*CopyExpr=*/nullptr);

	  return true;
	}

	bool Sema::tryCaptureVariable(
	VarDecl *Var, SourceLocation ExprLoc, TryCaptureKind Kind,
	SourceLocation EllipsisLoc, bool BuildAndDiagnose, QualType &CaptureType,
	QualType &DeclRefType, const unsigned *const FunctionScopeIndexToStopAt) {
	// An init-capture is notionally from the context surrounding its
	// declaration, but its parent DC is the lambda class.
	DeclContext *VarDC = Var->getDeclContext();
	if (Var->isInitCapture())
	VarDC = VarDC->getParent();

	DeclContext *DC = CurContext;
	const unsigned MaxFunctionScopesIndex = FunctionScopeIndexToStopAt
	? *FunctionScopeIndexToStopAt : FunctionScopes.size() - 1;
	// We need to sync up the Declaration Context with the
	// FunctionScopeIndexToStopAt
	if (FunctionScopeIndexToStopAt) {
	unsigned FSIndex = FunctionScopes.size() - 1;
	while (FSIndex != MaxFunctionScopesIndex) {
	DC = getLambdaAwareParentOfDeclContext(DC);
	--FSIndex;
	}
	}


	// If the variable is declared in the current context, there is no need to
	// capture it.
	if (VarDC == DC) return true;

	// Capture global variables if it is required to use private copy of this
	// variable.
	bool IsGlobal = !Var->hasLocalStorage();
	if (IsGlobal && !(LangOpts.OpenMP && IsOpenMPCapturedDecl(Var)))
	return true;

	// Walk up the stack to determine whether we can capture the variable,
	// performing the "simple" checks that don't depend on type. We stop when
	// we've either hit the declared scope of the variable or find an existing
	// capture of that variable. We start from the innermost capturing-entity
	// (the DC) and ensure that all intervening capturing-entities
	// (blocks/lambdas etc.) between the innermost capturer and the variable`s
	// declcontext can either capture the variable or have already captured
	// the variable.
	CaptureType = Var->getType();
	DeclRefType = CaptureType.getNonReferenceType();
	bool Nested = false;
	bool Explicit = (Kind != TryCapture_Implicit);
	unsigned FunctionScopesIndex = MaxFunctionScopesIndex;
	do {
	// Only block literals, captured statements, and lambda expressions can
	// capture; other scopes don't work.
	DeclContext *ParentDC = getParentOfCapturingContextOrNull(DC, Var,
	ExprLoc,
	BuildAndDiagnose,
	*this);
	// We need to check for the parent first because, if we have
	// private-captured a global variable, we need to recursively capture it in
	// intermediate blocks, lambdas, etc.
	if (!ParentDC) {
	if (IsGlobal) {
	FunctionScopesIndex = MaxFunctionScopesIndex - 1;
	break;
	}
	return true;
	}

	FunctionScopeInfo *FSI = FunctionScopes[FunctionScopesIndex];
	CapturingScopeInfo *CSI = cast<CapturingScopeInfo>(FSI);


	// Check whether we've already captured it.
	if (isVariableAlreadyCapturedInScopeInfo(CSI, Var, Nested, CaptureType,
	DeclRefType)) {
	CSI->getCapture(Var).markUsed(BuildAndDiagnose);
	break;
	}
	// If we are instantiating a generic lambda call operator body,
	// we do not want to capture new variables. What was captured
	// during either a lambdas transformation or initial parsing
	// should be used.
	if (isGenericLambdaCallOperatorSpecialization(DC)) {
	if (BuildAndDiagnose) {
	LambdaScopeInfo *LSI = cast<LambdaScopeInfo>(CSI);
	if (LSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_None) {
	Diag(ExprLoc, diag::err_lambda_impcap) << Var->getDeclName();
	Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	Diag(LSI->Lambda->getLocStart(), diag::note_lambda_decl);
	} else
	diagnoseUncapturableValueReference(*this, ExprLoc, Var, DC);
	}
	return true;
	}
	// Certain capturing entities (lambdas, blocks etc.) are not allowed to capture
	// certain types of variables (unnamed, variably modified types etc.)
	// so check for eligibility.
	if (!isVariableCapturable(CSI, Var, ExprLoc, BuildAndDiagnose, *this))
	return true;

	// Try to capture variable-length arrays types.
	if (Var->getType()->isVariablyModifiedType()) {
	// We're going to walk down into the type and look for VLA
	// expressions.
	QualType QTy = Var->getType();
	if (ParmVarDecl *PVD = dyn_cast_or_null<ParmVarDecl>(Var))
	QTy = PVD->getOriginalType();
	captureVariablyModifiedType(Context, QTy, CSI);
	}

	if (getLangOpts().OpenMP) {
	if (auto *RSI = dyn_cast<CapturedRegionScopeInfo>(CSI)) {
	// OpenMP private variables should not be captured in outer scope, so
	// just break here. Similarly, global variables that are captured in a
	// target region should not be captured outside the scope of the region.
	if (RSI->CapRegionKind == CR_OpenMP) {
	auto IsTargetCap = isOpenMPTargetCapturedDecl(Var, RSI->OpenMPLevel);
	// When we detect target captures we are looking from inside the
	// target region, therefore we need to propagate the capture from the
	// enclosing region. Therefore, the capture is not initially nested.
	if (IsTargetCap)
	FunctionScopesIndex--;

	if (IsTargetCap \|\| isOpenMPPrivateDecl(Var, RSI->OpenMPLevel)) {
	Nested = !IsTargetCap;
	DeclRefType = DeclRefType.getUnqualifiedType();
	CaptureType = Context.getLValueReferenceType(DeclRefType);
	break;
	}
	}
	}
	}
	if (CSI->ImpCaptureStyle == CapturingScopeInfo::ImpCap_None && !Explicit) {
	// No capture-default, and this is not an explicit capture
	// so cannot capture this variable.
	if (BuildAndDiagnose) {
	Diag(ExprLoc, diag::err_lambda_impcap) << Var->getDeclName();
	Diag(Var->getLocation(), diag::note_previous_decl)
	<< Var->getDeclName();
	if (cast<LambdaScopeInfo>(CSI)->Lambda)
	Diag(cast<LambdaScopeInfo>(CSI)->Lambda->getLocStart(),
	diag::note_lambda_decl);
	// FIXME: If we error out because an outer lambda can not implicitly
	// capture a variable that an inner lambda explicitly captures, we
	// should have the inner lambda do the explicit capture - because
	// it makes for cleaner diagnostics later. This would purely be done
	// so that the diagnostic does not misleadingly claim that a variable
	// can not be captured by a lambda implicitly even though it is captured
	// explicitly. Suggestion:
	// - create const bool VariableCaptureWasInitiallyExplicit = Explicit
	// at the function head
	// - cache the StartingDeclContext - this must be a lambda
	// - captureInLambda in the innermost lambda the variable.
	}
	return true;
	}

	FunctionScopesIndex--;
	DC = ParentDC;
	Explicit = false;
	} while (!VarDC->Equals(DC));

	// Walk back down the scope stack, (e.g. from outer lambda to inner lambda)
	// computing the type of the capture at each step, checking type-specific
	// requirements, and adding captures if requested.
	// If the variable had already been captured previously, we start capturing
	// at the lambda nested within that one.
	for (unsigned I = ++FunctionScopesIndex, N = MaxFunctionScopesIndex + 1; I != N;
	++I) {
	CapturingScopeInfo *CSI = cast<CapturingScopeInfo>(FunctionScopes[I]);

	if (BlockScopeInfo *BSI = dyn_cast<BlockScopeInfo>(CSI)) {
	if (!captureInBlock(BSI, Var, ExprLoc,
	BuildAndDiagnose, CaptureType,
	DeclRefType, Nested, *this))
	return true;
	Nested = true;
	} else if (CapturedRegionScopeInfo *RSI = dyn_cast<CapturedRegionScopeInfo>(CSI)) {
	if (!captureInCapturedRegion(RSI, Var, ExprLoc,
	BuildAndDiagnose, CaptureType,
	DeclRefType, Nested, *this))
	return true;
	Nested = true;
	} else {
	LambdaScopeInfo *LSI = cast<LambdaScopeInfo>(CSI);
	if (!captureInLambda(LSI, Var, ExprLoc,
	BuildAndDiagnose, CaptureType,
	DeclRefType, Nested, Kind, EllipsisLoc,
	/IsTopScope/I == N - 1, *this))
	return true;
	Nested = true;
	}
	}
	return false;
	}

	/// Convenience overload of tryCaptureVariable that always builds captures
	/// and emits diagnostics.
	///
	/// The capture type and the type of a reference to the captured variable are
	/// computed by the full overload but discarded here.
	///
	/// \returns the result of the full overload (which returns true on its
	/// failure paths and false once the captures have been processed).
	bool Sema::tryCaptureVariable(VarDecl *Var, SourceLocation Loc,
	                              TryCaptureKind Kind, SourceLocation EllipsisLoc) {
	  // Out-parameters of the full overload; callers of this form do not need
	  // them.
	  QualType CaptureType;
	  QualType DeclRefType;
	  return tryCaptureVariable(Var, Loc, Kind, EllipsisLoc,
	                            /*BuildAndDiagnose=*/true, CaptureType,
	                            DeclRefType, nullptr);
	}

	/// Query, without building anything or emitting diagnostics, whether an
	/// implicit capture of \p Var at \p Loc would succeed.
	///
	/// \returns true when the dry-run call to tryCaptureVariable reports success
	/// (i.e. returns false).
	bool Sema::NeedToCaptureVariable(VarDecl *Var, SourceLocation Loc) {
	  // Out-parameters required by the full overload; their values are unused.
	  QualType CaptureType;
	  QualType DeclRefType;
	  return !tryCaptureVariable(Var, Loc, TryCapture_Implicit, SourceLocation(),
	                             /*BuildAndDiagnose=*/false, CaptureType,
	                             DeclRefType, nullptr);
	}

	/// Compute the type that a reference to \p Var would have at \p Loc if the
	/// variable were implicitly captured.
	///
	/// No captures are built and no diagnostics are emitted.
	///
	/// \returns the DeclRefType computed by tryCaptureVariable, or a null
	/// QualType when the variable cannot be captured.
	QualType Sema::getCapturedDeclRefType(VarDecl *Var, SourceLocation Loc) {
	  QualType CaptureType;
	  QualType DeclRefType;

	  // Determine whether we can capture this variable. A true result from
	  // tryCaptureVariable signals failure.
	  if (tryCaptureVariable(Var, Loc, TryCapture_Implicit, SourceLocation(),
	                         /*BuildAndDiagnose=*/false, CaptureType,
	                         DeclRefType, nullptr))
	    return QualType();

	  return DeclRefType;
	}



	// Answers "is this variable definitely a constant expression?" for a
	// variable that may or may not be dependent. Yields false when the
	// variable's type is dependent, when no declaration carrying an initializer
	// can be found, or when that initializer is value-dependent; otherwise
	// defers to IsVariableAConstantExpression.
	static inline bool IsVariableNonDependentAndAConstantExpression(VarDecl *Var,
	    ASTContext &Context) {
	  const bool HasDependentType = Var->getType()->isDependentType();
	  if (HasDependentType)
	    return false;

	  // Locate the declaration that actually owns the initializer, if any.
	  const VarDecl *InitOwner = nullptr;
	  Var->getAnyInitializer(InitOwner);
	  if (InitOwner == nullptr)
	    return false;

	  Expr *InitExpr = cast<Expr>(InitOwner->ensureEvaluatedStmt()->Value);
	  return !InitExpr->isValueDependent() &&
	         IsVariableAConstantExpression(Var, Context);
	}


	void Sema::UpdateMarkingForLValueToRValue(Expr *E) {
	  // C++11 [basic.def.odr] says a variable is odr-used "unless it is an
	  // object that satisfies the requirements for appearing in a constant
	  // expression (5.19) and the lvalue-to-rvalue conversion (4.1) is
	  // immediately applied." This is the lvalue-to-rvalue half of that rule:
	  // the expression no longer counts as a possible odr-use.
	  Expr *Stripped = E->IgnoreParens();
	  MaybeODRUseExprs.erase(Stripped);

	  // Outside of a lambda there is nothing more to record.
	  LambdaScopeInfo *CurLambda = getCurLambda();
	  if (!CurLambda)
	    return;

	  // Inside a lambda, work out which variable (if any) the DeclRefExpr or
	  // MemberExpr names.
	  VarDecl *Named = nullptr;
	  if (DeclRefExpr *Ref = dyn_cast<DeclRefExpr>(Stripped)) {
	    Named = dyn_cast<VarDecl>(Ref->getFoundDecl());
	  } else if (MemberExpr *Member = dyn_cast<MemberExpr>(Stripped)) {
	    Named = dyn_cast<VarDecl>(Member->getMemberDecl());
	  }
	  if (!Named)
	    return;

	  // A reference to a constant-expression variable does not odr-use it, so
	  // tell the lambda scope about it.
	  if (IsVariableNonDependentAndAConstantExpression(Named, Context))
	    CurLambda->markVariableExprAsNonODRUsed(Stripped);
	}

	ExprResult Sema::ActOnConstantExpression(ExprResult Res) {
	  Res = CorrectDelayedTyposInExpr(Res);

	  // When the constant-expression is a reference to a variable whose odr-use
	  // status is still undecided, assume the lvalue-to-rvalue conversion will
	  // be applied. The single exception (a non-type template argument) is
	  // handled specially elsewhere. Unusable results are passed through
	  // untouched.
	  if (Res.isUsable())
	    UpdateMarkingForLValueToRValue(Res.get());

	  return Res;
	}

	/// Treat every expression still recorded in MaybeODRUseExprs as an odr-use
	/// of the variable it names, then empty the set.
	void Sema::CleanupVarDeclMarking() {
	  for (Expr *E : MaybeODRUseExprs) {
	    VarDecl *Var;
	    SourceLocation Loc;
	    // Only DeclRefExprs and MemberExprs are expected here, and each must
	    // name a VarDecl (cast<> asserts on anything else).
	    if (DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E)) {
	      Var = cast<VarDecl>(DRE->getDecl());
	      Loc = DRE->getLocation();
	    } else if (MemberExpr *ME = dyn_cast<MemberExpr>(E)) {
	      Var = cast<VarDecl>(ME->getMemberDecl());
	      Loc = ME->getMemberLoc();
	    } else {
	      llvm_unreachable("Unexpected expression");
	    }

	    MarkVarDeclODRUsed(Var, Loc, *this,
	                       /*MaxFunctionScopeIndex Pointer*/ nullptr);
	  }

	  MaybeODRUseExprs.clear();
	}


	static void DoMarkVarDeclReferenced(Sema &SemaRef, SourceLocation Loc,
	VarDecl Var, Expr E) {
	assert((!E \|\| isa<DeclRefExpr>(E) \|\| isa<MemberExpr>(E)) &&
	"Invalid Expr argument to DoMarkVarDeclReferenced");
	Var->setReferenced();

	TemplateSpecializationKind TSK = Var->getTemplateSpecializationKind();

	bool OdrUseContext = isOdrUseContext(SemaRef);
	bool NeedDefinition =
	OdrUseContext \|\| (isEvaluatableContext(SemaRef) &&
	Var->isUsableInConstantExpressions(SemaRef.Context));

	VarTemplateSpecializationDecl *VarSpec =
	dyn_cast<VarTemplateSpecializationDecl>(Var);
	assert(!isa<VarTemplatePartialSpecializationDecl>(Var) &&
	"Can't instantiate a partial template specialization.");

	// If this might be a member specialization of a static data member, check
	// the specialization is visible. We already did the checks for variable
	// template specializations when we created them.
	if (NeedDefinition && TSK != TSK_Undeclared &&
	!isa<VarTemplateSpecializationDecl>(Var))
	SemaRef.checkSpecializationVisibility(Loc, Var);

	// Perform implicit instantiation of static data members, static data member
	// templates of class templates, and variable template specializations. Delay
	// instantiations of variable templates, except for those that could be used
	// in a constant expression.
	if (NeedDefinition && isTemplateInstantiation(TSK)) {
	bool TryInstantiating = TSK == TSK_ImplicitInstantiation;

	if (TryInstantiating && !isa<VarTemplateSpecializationDecl>(Var)) {
	if (Var->getPointOfInstantiation().isInvalid()) {
	// This is a modification of an existing AST node. Notify listeners.
	if (ASTMutationListener *L = SemaRef.getASTMutationListener())
	L->StaticDataMemberInstantiated(Var);
	} else if (!Var->isUsableInConstantExpressions(SemaRef.Context))
	// Don't bother trying to instantiate it again, unless we might need
	// its initializer before we get to the end of the TU.
	TryInstantiating = false;
	}

	if (Var->getPointOfInstantiation().isInvalid())
	Var->setTemplateSpecializationKind(TSK, Loc);

	if (TryInstantiating) {
	SourceLocation PointOfInstantiation = Var->getPointOfInstantiation();
	bool InstantiationDependent = false;
	bool IsNonDependent =
	VarSpec ? !TemplateSpecializationType::anyDependentTemplateArguments(
	VarSpec->getTemplateArgsInfo(), InstantiationDependent)
	: true;

	// Do not instantiate specializations that are still type-dependent.
	if (IsNonDependent) {
	if (Var->isUsableInConstantExpressions(SemaRef.Context)) {
	// Do not defer instantiations of variables which could be used in a
	// constant expression.
	SemaRef.InstantiateVariableDefinition(PointOfInstantiation, Var);
	} else {
	SemaRef.PendingInstantiations
	.push_back(std::make_pair(Var, PointOfInstantiation));
	}
	}
	}
	}

	// Per C++11 [basic.def.odr], a variable is odr-used "unless it satisfies
	// the requirements for appearing in a constant expression (5.19) and, if
	// it is an object, the lvalue-to-rvalue conversion (4.1)
	// is immediately applied." We check the first part here, and
	// Sema::UpdateMarkingForLValueToRValue deals with the second part.
	// Note that we use the C++11 definition everywhere because nothing in
	// C++03 depends on whether we get the C++03 version correct. The second
	// part does not apply to references, since they are not objects.
	if (OdrUseContext && E &&
	IsVariableAConstantExpression(Var, SemaRef.Context)) {
	// A reference initialized by a constant expression can never be
	// odr-used, so simply ignore it.
	if (!Var->getType()->isReferenceType())
	SemaRef.MaybeODRUseExprs.insert(E);
	} else if (OdrUseContext) {
	MarkVarDeclODRUsed(Var, Loc, SemaRef,
	/MaxFunctionScopeIndex ptr/ nullptr);
	} else if (isOdrUseContext(SemaRef, /SkipDependentUses/false)) {
	// If this is a dependent context, we don't need to mark variables as
	// odr-used, but we may still need to track them for lambda capture.
	// FIXME: Do we also need to do this inside dependent typeid expressions
	// (which are modeled as unevaluated at this point)?
	const bool RefersToEnclosingScope =
	(SemaRef.CurContext != Var->getDeclContext() &&
	Var->getDeclContext()->isFunctionOrMethod() && Var->hasLocalStorage());
	if (RefersToEnclosingScope) {
	LambdaScopeInfo *const LSI =
	SemaRef.getCurLambda(/IgnoreNonLambdaCapturingScope=/true);
	if (LSI && !LSI->CallOperator->Encloses(Var->getDeclContext())) {
	// If a variable could potentially be odr-used, defer marking it so
	// until we finish analyzing the full expression for any
	// lvalue-to-rvalue
	// or discarded value conversions that would obviate odr-use.
	// Add it to the list of potential captures that will be analyzed
	// later (ActOnFinishFullExpr) for eventual capture and odr-use marking
	// unless the variable is a reference that was initialized by a constant
	// expression (this will never need to be captured or odr-used).
	assert(E && "Capture variable should be used in an expression.");
	if (!Var->getType()->isReferenceType() \|\|
	!IsVariableNonDependentAndAConstantExpression(Var, SemaRef.Context))
	LSI->addPotentialCapture(E->IgnoreParens());
	}
	}
	}
	}

	/// \brief Record a reference to \p Var at \p Loc and determine whether the
	/// variable is odr-used (C++ [basic.def.odr]p2, C99 6.9p3).
	///
	/// Not intended for ordinary expressions that name a VarDecl: no referring
	/// expression is supplied to the underlying marking routine.
	void Sema::MarkVariableReferenced(SourceLocation Loc, VarDecl *Var) {
	  Expr *const NoReferringExpr = nullptr;
	  DoMarkVarDeclReferenced(*this, Loc, Var, NoReferringExpr);
	}

	static void MarkExprReferenced(Sema &SemaRef, SourceLocation Loc,
	Decl D, Expr E, bool MightBeOdrUse) {
	if (SemaRef.isInOpenMPDeclareTargetContext())
	SemaRef.checkDeclIsAllowedInOpenMPTarget(E, D);

	if (VarDecl *Var = dyn_cast<VarDecl>(D)) {
	DoMarkVarDeclReferenced(SemaRef, Loc, Var, E);
	return;
	}

	SemaRef.MarkAnyDeclReferenced(Loc, D, MightBeOdrUse);

	// If this is a call to a method via a cast, also mark the method in the
	// derived class used in case codegen can devirtualize the call.
	const MemberExpr *ME = dyn_cast<MemberExpr>(E);
	if (!ME)
	return;
	CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(ME->getMemberDecl());
	if (!MD)
	return;
	// Only attempt to devirtualize if this is truly a virtual call.
	bool IsVirtualCall = MD->isVirtual() &&
	ME->performsVirtualDispatch(SemaRef.getLangOpts());
	if (!IsVirtualCall)
	return;

	// If it's possible to devirtualize the call, mark the called function
	// referenced.
	CXXMethodDecl *DM = MD->getDevirtualizedMethod(
	ME->getBase(), SemaRef.getLangOpts().AppleKext);
	if (DM)
	SemaRef.MarkAnyDeclReferenced(Loc, DM, MightBeOdrUse);
	}

	/// \brief Perform reference-marking and odr-use handling for a DeclRefExpr.
	void Sema::MarkDeclRefReferenced(DeclRefExpr E, const Expr Base) {
	// TODO: update this with DR# once a defect report is filed.
	// C++11 defect. The address of a pure member should not be an ODR use, even
	// if it's a qualified reference.
	bool OdrUse = true;
	if (const CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(E->getDecl()))
	if (Method->isVirtual() &&
	!Method->getDevirtualizedMethod(Base, getLangOpts().AppleKext))
	OdrUse = false;
	MarkExprReferenced(*this, E->getLocation(), E->getDecl(), E, OdrUse);
	}

	/// \brief Perform reference-marking and odr-use handling for a MemberExpr.
	void Sema::MarkMemberReferenced(MemberExpr *E) {
	  // C++11 [basic.def.odr]p2:
	  //   A non-overloaded function whose name appears as a potentially-evaluated
	  //   expression or a member of a set of candidate functions, if selected by
	  //   overload resolution when referred to from a potentially-evaluated
	  //   expression, is odr-used, unless it is a pure virtual function and its
	  //   name is not explicitly qualified.
	  bool IsPureViaVirtualDispatch = false;
	  if (E->performsVirtualDispatch(getLangOpts()))
	    if (CXXMethodDecl *Method = dyn_cast<CXXMethodDecl>(E->getMemberDecl()))
	      IsPureViaVirtualDispatch = Method->isPure();

	  // Prefer the member's own location; fall back to the start of the
	  // expression when that location is not valid.
	  SourceLocation Loc = E->getMemberLoc();
	  if (Loc.isInvalid())
	    Loc = E->getLocStart();

	  MarkExprReferenced(*this, Loc, E->getMemberDecl(), E,
	                     !IsPureViaVirtualDispatch);
	}

	/// \brief Mark an arbitrary declaration as referenced, applying odr-use
	/// checking to functions and variables.
	///
	/// Do not use this when building an ordinary expression that refers to a
	/// variable.
	void Sema::MarkAnyDeclReferenced(SourceLocation Loc, Decl *D,
	                                 bool MightBeOdrUse) {
	  // Variables get the full variable treatment only when the reference might
	  // be an odr-use; otherwise they fall through to the generic handling.
	  VarDecl *AsVariable = MightBeOdrUse ? dyn_cast<VarDecl>(D) : nullptr;

	  if (AsVariable)
	    MarkVariableReferenced(Loc, AsVariable);
	  else if (FunctionDecl *AsFunction = dyn_cast<FunctionDecl>(D))
	    MarkFunctionReferenced(Loc, AsFunction, MightBeOdrUse);
	  else
	    D->setReferenced();
	}

	namespace {
	// Walks a type and marks as referenced the declarations that it uses.
	//
	// FIXME: Not fully implemented yet! We need to have a better understanding
	// of when we're entering a context we should not recurse into.
	// FIXME: This and EvaluatedExprMarker are more-or-less equivalent to
	// TreeTransforms rebuilding the type in a new context. Rather than
	// duplicating the TreeTransform logic, we should consider reusing it here.
	// Currently that causes problems when rebuilding LambdaExprs.
	class MarkReferencedDecls : public RecursiveASTVisitor<MarkReferencedDecls> {
	  Sema &S;            // Analyzer used to perform the marking.
	  SourceLocation Loc; // Location attributed to every reference.

	public:
	  typedef RecursiveASTVisitor<MarkReferencedDecls> Inherited;

	  MarkReferencedDecls(Sema &SemaRef, SourceLocation MarkLoc)
	      : S(SemaRef), Loc(MarkLoc) {}

	  // Marks what a template argument refers to before recursing into it.
	  bool TraverseTemplateArgument(const TemplateArgument &Arg);
	};
	}

	bool MarkReferencedDecls::TraverseTemplateArgument(
	const TemplateArgument &Arg) {
	{
	// A non-type template argument is a constant-evaluated context.
	EnterExpressionEvaluationContext Evaluated(
	S, Sema::ExpressionEvaluationContext::ConstantEvaluated);
	if (Arg.getKind() == TemplateArgument::Declaration) {
	if (Decl *D = Arg.getAsDecl())
	S.MarkAnyDeclReferenced(Loc, D, true);
	} else if (Arg.getKind() == TemplateArgument::Expression) {
	S.MarkDeclarationsReferencedInExpr(Arg.getAsExpr(), false);
	}
	}

	return Inherited::TraverseTemplateArgument(Arg);
	}

	// Marks the declarations used by type T as referenced, attributing every
	// reference to Loc.
	void Sema::MarkDeclarationsReferencedInType(SourceLocation Loc, QualType T) {
	  MarkReferencedDecls(*this, Loc).TraverseType(T);
	}

	namespace {
	/// \brief Visitor that marks as "referenced" every declaration named by a
	/// potentially-evaluated subexpression.
	class EvaluatedExprMarker : public EvaluatedExprVisitor<EvaluatedExprMarker> {
	  Sema &S;
	  bool SkipLocalVariables;

	public:
	  typedef EvaluatedExprVisitor<EvaluatedExprMarker> Inherited;

	  EvaluatedExprMarker(Sema &SemaRef, bool SkipLocals)
	      : Inherited(SemaRef.Context), S(SemaRef),
	        SkipLocalVariables(SkipLocals) {}

	  void VisitDeclRefExpr(DeclRefExpr *E) {
	    // When asked to skip local variables, leave references to variables
	    // with local storage unmarked.
	    const VarDecl *MaybeLocal =
	        SkipLocalVariables ? dyn_cast<VarDecl>(E->getDecl()) : nullptr;
	    if (MaybeLocal && MaybeLocal->hasLocalStorage())
	      return;

	    S.MarkDeclRefReferenced(E);
	  }

	  void VisitMemberExpr(MemberExpr *E) {
	    S.MarkMemberReferenced(E);
	    Inherited::VisitMemberExpr(E);
	  }

	  void VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
	    // The temporary's destructor counts as referenced.
	    CXXDestructorDecl *Dtor =
	        const_cast<CXXDestructorDecl*>(E->getTemporary()->getDestructor());
	    S.MarkFunctionReferenced(E->getLocStart(), Dtor);
	    Visit(E->getSubExpr());
	  }

	  void VisitCXXNewExpr(CXXNewExpr *E) {
	    // Mark whichever allocation/deallocation functions were selected.
	    if (auto *OpNew = E->getOperatorNew())
	      S.MarkFunctionReferenced(E->getLocStart(), OpNew);
	    if (auto *OpDelete = E->getOperatorDelete())
	      S.MarkFunctionReferenced(E->getLocStart(), OpDelete);
	    Inherited::VisitCXXNewExpr(E);
	  }

	  void VisitCXXDeleteExpr(CXXDeleteExpr *E) {
	    if (auto *OpDelete = E->getOperatorDelete())
	      S.MarkFunctionReferenced(E->getLocStart(), OpDelete);

	    // For class types (including arrays of them) the destructor is
	    // referenced too.
	    QualType ElementTy = S.Context.getBaseElementType(E->getDestroyedType());
	    if (const RecordType *RecordTy = ElementTy->getAs<RecordType>()) {
	      CXXRecordDecl *Class = cast<CXXRecordDecl>(RecordTy->getDecl());
	      S.MarkFunctionReferenced(E->getLocStart(), S.LookupDestructor(Class));
	    }

	    Inherited::VisitCXXDeleteExpr(E);
	  }

	  void VisitCXXConstructExpr(CXXConstructExpr *E) {
	    S.MarkFunctionReferenced(E->getLocStart(), E->getConstructor());
	    Inherited::VisitCXXConstructExpr(E);
	  }

	  void VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) {
	    // Look through to the default argument expression itself.
	    Visit(E->getExpr());
	  }

	  void VisitImplicitCastExpr(ImplicitCastExpr *E) {
	    Inherited::VisitImplicitCastExpr(E);

	    // Only after the operand has been visited: an lvalue-to-rvalue
	    // conversion may cancel a pending "maybe odr-use" of the operand.
	    if (E->getCastKind() == CK_LValueToRValue)
	      S.UpdateMarkingForLValueToRValue(E->getSubExpr());
	  }
	};
	}

	/// \brief Mark as "referenced" every declaration appearing in \p E or in any
	/// of its potentially-evaluated subexpressions.
	///
	/// \param SkipLocalVariables When true, local variables are left unmarked.
	void Sema::MarkDeclarationsReferencedInExpr(Expr *E,
	                                            bool SkipLocalVariables) {
	  EvaluatedExprMarker Marker(*this, SkipLocalVariables);
	  Marker.Visit(E);
	}

	/// \brief Emit a diagnostic about the run-time behavior of the program being
	/// compiled.
	///
	/// The diagnostic is produced only when the code currently being
	/// type-checked is "potentially evaluated", i.e. it might actually execute.
	/// Code inside sizeof() expressions, code used only during overload
	/// resolution, and the like are not potentially evaluated. For those the
	/// diagnostic is suppressed or, in the absolutely nutty case of potentially
	/// potentially evaluated expressions (C++ typeid), queued to potentially be
	/// emitted later.
	///
	/// Use this routine for every diagnostic that describes run-time behavior,
	/// such as passing a non-POD value through an ellipsis. Skipping it is likely
	/// to produce spurious diagnostics or failures during overload resolution or
	/// within sizeof/alignof/typeof/typeid.
	///
	/// \returns true when the diagnostic was emitted or queued, false when it was
	/// suppressed.
	bool Sema::DiagRuntimeBehavior(SourceLocation Loc, const Stmt *Statement,
	                               const PartialDiagnostic &PD) {
	  switch (ExprEvalContexts.back().Context) {
	  case ExpressionEvaluationContext::Unevaluated:
	  case ExpressionEvaluationContext::UnevaluatedList:
	  case ExpressionEvaluationContext::UnevaluatedAbstract:
	  case ExpressionEvaluationContext::DiscardedStatement:
	    // The argument will never be evaluated, so don't complain.
	    return false;

	  case ExpressionEvaluationContext::ConstantEvaluated:
	    // Relevant diagnostics should be produced by constant evaluation.
	    return false;

	  case ExpressionEvaluationContext::PotentiallyEvaluated:
	  case ExpressionEvaluationContext::PotentiallyEvaluatedIfUsed:
	    // Without a statement or an enclosing function/method, emit right away;
	    // otherwise record the diagnostic against the current function scope.
	    if (!Statement || !getCurFunctionOrMethodDecl()) {
	      Diag(Loc, PD);
	      return true;
	    }
	    FunctionScopes.back()->PossiblyUnreachableDiags.
	        push_back(sema::PossiblyUnreachableDiag(PD, Loc, Statement));
	    return true;
	  }

	  return false;
	}

	bool Sema::CheckCallReturnType(QualType ReturnType, SourceLocation Loc,
	CallExpr CE, FunctionDecl FD) {
	if (ReturnType->isVoidType() \|\| !ReturnType->isIncompleteType())
	return false;

	// If we're inside a decltype's expression, don't check for a valid return
	// type or construct temporaries until we know whether this is the last call.
	if (ExprEvalContexts.back().IsDecltype) {
	ExprEvalContexts.back().DelayedDecltypeCalls.push_back(CE);
	return false;
	}

	class CallReturnIncompleteDiagnoser : public TypeDiagnoser {
	FunctionDecl *FD;
	CallExpr *CE;

	public:
	CallReturnIncompleteDiagnoser(FunctionDecl FD, CallExpr CE)
	: FD(FD), CE(CE) { }

	void diagnose(Sema &S, SourceLocation Loc, QualType T) override {
	if (!FD) {
	S.Diag(Loc, diag::err_call_incomplete_return)
	<< T << CE->getSourceRange();
	return;
	}

	S.Diag(Loc, diag::err_call_function_incomplete_return)
	<< CE->getSourceRange() << FD->getDeclName() << T;
	S.Diag(FD->getLocation(), diag::note_entity_declared_at)
	<< FD->getDeclName();
	}
	} Diagnoser(FD, CE);

	if (RequireCompleteType(Loc, ReturnType, Diagnoser))
	return true;

	return false;
	}

	// Diagnose the s/=/==/ and s/\|=/!=/ typos, i.e. an assignment or
	// or-assignment written where a comparison was probably meant. Wrapping the
	// condition in parentheses keeps this from triggering, which is what we
	// want.
	void Sema::DiagnoseAssignmentAsCondition(Expr *E) {
	  unsigned WarningID = diag::warn_condition_is_assignment;
	  bool IsOrAssign = false;
	  SourceLocation OpLoc;

	  if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(E)) {
	    const auto Opcode = Bin->getOpcode();
	    if (Opcode != BO_Assign && Opcode != BO_OrAssign)
	      return;
	    IsOrAssign = (Opcode == BO_OrAssign);

	    // Greylist some idioms by putting them into a warning subcategory.
	    if (ObjCMessageExpr *Msg =
	            dyn_cast<ObjCMessageExpr>(Bin->getRHS()->IgnoreParenCasts())) {
	      Selector Sel = Msg->getSelector();

	      // self = [<foo> init...]
	      const bool IsSelfInit =
	          isSelfExpr(Bin->getLHS()) && Msg->getMethodFamily() == OMF_init;

	      // <foo> = [<bar> nextObject]
	      if (IsSelfInit ||
	          (Sel.isUnarySelector() && Sel.getNameForSlot(0) == "nextObject"))
	        WarningID = diag::warn_condition_is_idiomatic_assignment;
	    }

	    OpLoc = Bin->getOperatorLoc();
	  } else if (CXXOperatorCallExpr *Call = dyn_cast<CXXOperatorCallExpr>(E)) {
	    const auto Overloaded = Call->getOperator();
	    if (Overloaded != OO_Equal && Overloaded != OO_PipeEqual)
	      return;
	    IsOrAssign = (Overloaded == OO_PipeEqual);
	    OpLoc = Call->getOperatorLoc();
	  } else if (PseudoObjectExpr *Pseudo = dyn_cast<PseudoObjectExpr>(E)) {
	    // Examine what the user actually wrote.
	    DiagnoseAssignmentAsCondition(Pseudo->getSyntacticForm());
	    return;
	  } else {
	    // Not an assignment.
	    return;
	  }

	  Diag(OpLoc, WarningID) << E->getSourceRange();

	  // First note: silence the warning with an extra pair of parentheses.
	  SourceLocation OpenParen = E->getLocStart();
	  SourceLocation CloseParen =
	      getLocForEndOfToken(E->getSourceRange().getEnd());
	  Diag(OpLoc, diag::note_condition_assign_silence)
	      << FixItHint::CreateInsertion(OpenParen, "(")
	      << FixItHint::CreateInsertion(CloseParen, ")");

	  // Second note: turn the operator into the comparison that was likely
	  // intended.
	  if (IsOrAssign) {
	    Diag(OpLoc, diag::note_condition_or_assign_to_comparison)
	        << FixItHint::CreateReplacement(OpLoc, "!=");
	  } else {
	    Diag(OpLoc, diag::note_condition_assign_to_comparison)
	        << FixItHint::CreateReplacement(OpLoc, "==");
	  }
	}

	/// \brief Redundant parentheses over an equality comparison can indicate
	/// that the user intended an assignment used as condition.
	///
	/// Warns when \p ParenE wraps an `==` whose left-hand side is a modifiable
	/// lvalue, and offers two fix-its: remove the parentheses, or turn the
	/// comparison into an assignment.
	void Sema::DiagnoseEqualityWithExtraParens(ParenExpr *ParenE) {
	  // Don't warn if the parens came from a macro.
	  SourceLocation parenLoc = ParenE->getLocStart();
	  if (parenLoc.isInvalid() || parenLoc.isMacroID())
	    return;
	  // Don't warn for dependent expressions.
	  if (ParenE->isTypeDependent())
	    return;

	  Expr *E = ParenE->IgnoreParens();

	  if (BinaryOperator *opE = dyn_cast<BinaryOperator>(E))
	    if (opE->getOpcode() == BO_EQ &&
	        opE->getLHS()->IgnoreParenImpCasts()->isModifiableLvalue(Context)
	                                                           == Expr::MLV_Valid) {
	      SourceLocation Loc = opE->getOperatorLoc();

	      Diag(Loc, diag::warn_equality_with_extra_parens) << E->getSourceRange();
	      SourceRange ParenERange = ParenE->getSourceRange();
	      Diag(Loc, diag::note_equality_comparison_silence)
	          << FixItHint::CreateRemoval(ParenERange.getBegin())
	          << FixItHint::CreateRemoval(ParenERange.getEnd());
	      Diag(Loc, diag::note_equality_comparison_to_assign)
	          << FixItHint::CreateReplacement(Loc, "=");
	    }
	}

	// Checks an expression used as a boolean condition and returns the
	// (possibly converted) expression, or an error result.
	ExprResult Sema::CheckBooleanCondition(SourceLocation Loc, Expr *E,
	                                       bool IsConstexpr) {
	  // The typo-style warnings look at the expression exactly as passed in.
	  DiagnoseAssignmentAsCondition(E);
	  if (ParenExpr *Parenthesized = dyn_cast<ParenExpr>(E))
	    DiagnoseEqualityWithExtraParens(Parenthesized);

	  ExprResult Resolved = CheckPlaceholderExpr(E);
	  if (Resolved.isInvalid())
	    return ExprError();
	  E = Resolved.get();

	  // Type-dependent conditions are returned without further checking here.
	  if (E->isTypeDependent())
	    return E;

	  if (getLangOpts().CPlusPlus)
	    return CheckCXXBooleanCondition(E, IsConstexpr); // C++ 6.4p4

	  ExprResult Converted = DefaultFunctionArrayLvalueConversion(E);
	  if (Converted.isInvalid())
	    return ExprError();
	  E = Converted.get();

	  QualType CondTy = E->getType();
	  if (!CondTy->isScalarType()) { // C99 6.8.4.1p1
	    Diag(Loc, diag::err_typecheck_statement_requires_scalar)
	        << CondTy << E->getSourceRange();
	    return ExprError();
	  }
	  CheckBoolLikeConversion(E, Loc);

	  return E;
	}

	// Checks a condition expression according to its kind and wraps it as a
	// full-expression ConditionResult.
	Sema::ConditionResult Sema::ActOnCondition(Scope *S, SourceLocation Loc,
	                                           Expr *SubExpr, ConditionKind CK) {
	  // Empty conditions are valid in for-statements.
	  if (SubExpr == nullptr)
	    return ConditionResult();

	  // Dispatch to the check that matches the kind of condition.
	  ExprResult Checked;
	  if (CK == ConditionKind::Boolean)
	    Checked = CheckBooleanCondition(Loc, SubExpr);
	  else if (CK == ConditionKind::ConstexprIf)
	    Checked = CheckBooleanCondition(Loc, SubExpr, true);
	  else if (CK == ConditionKind::Switch)
	    Checked = CheckSwitchCondition(Loc, SubExpr);

	  if (Checked.isInvalid())
	    return ConditionError();

	  // FIXME: FullExprArg doesn't have an invalid bit, so check nullness instead.
	  FullExprArg Full = MakeFullExpr(Checked.get(), Loc);
	  if (Full.get() == nullptr)
	    return ConditionError();

	  const bool IsConstexprIf = (CK == ConditionKind::ConstexprIf);
	  return ConditionResult(*this, nullptr, Full, IsConstexprIf);
	}

	namespace {
	/// A visitor for rebuilding a call to an __unknown_any expression
	/// to have an appropriate type.
	///
	/// Each Visit* method returns the (mutated in place) expression on success
	/// or ExprError() after emitting a diagnostic.
	struct RebuildUnknownAnyFunction
	    : StmtVisitor<RebuildUnknownAnyFunction, ExprResult> {

	  Sema &S;

	  RebuildUnknownAnyFunction(Sema &S) : S(S) {}

	  /// Non-expression statements are never expected here.
	  ExprResult VisitStmt(Stmt *S) {
	    llvm_unreachable("unexpected statement!");
	  }

	  /// Fallback for expression kinds without a dedicated visitor: diagnose
	  /// the call as unsupported.
	  ExprResult VisitExpr(Expr *E) {
	    S.Diag(E->getExprLoc(), diag::err_unsupported_unknown_any_call)
	        << E->getSourceRange();
	    return ExprError();
	  }

	  /// Rebuild an expression which simply semantically wraps another
	  /// expression which it shares the type and value kind of.
	  template <class T> ExprResult rebuildSugarExpr(T *E) {
	    ExprResult SubResult = Visit(E->getSubExpr());
	    if (SubResult.isInvalid()) return ExprError();

	    // Propagate the rebuilt operand's type and value kind to the wrapper.
	    Expr *SubExpr = SubResult.get();
	    E->setSubExpr(SubExpr);
	    E->setType(SubExpr->getType());
	    E->setValueKind(SubExpr->getValueKind());
	    assert(E->getObjectKind() == OK_Ordinary);
	    return E;
	  }

	  ExprResult VisitParenExpr(ParenExpr *E) {
	    return rebuildSugarExpr(E);
	  }

	  ExprResult VisitUnaryExtension(UnaryOperator *E) {
	    return rebuildSugarExpr(E);
	  }

	  /// Rebuild the operand, then give the address-of expression the
	  /// corresponding pointer type.
	  ExprResult VisitUnaryAddrOf(UnaryOperator *E) {
	    ExprResult SubResult = Visit(E->getSubExpr());
	    if (SubResult.isInvalid()) return ExprError();

	    Expr *SubExpr = SubResult.get();
	    E->setSubExpr(SubExpr);
	    E->setType(S.Context.getPointerType(SubExpr->getType()));
	    assert(E->getValueKind() == VK_RValue);
	    assert(E->getObjectKind() == OK_Ordinary);
	    return E;
	  }

	  /// Give \p E the type of the function declaration \p VD it refers to.
	  /// Anything that is not a FunctionDecl is diagnosed via VisitExpr.
	  ExprResult resolveDecl(Expr *E, ValueDecl *VD) {
	    if (!isa<FunctionDecl>(VD)) return VisitExpr(E);

	    E->setType(VD->getType());

	    assert(E->getValueKind() == VK_RValue);
	    // In C++, everything except an instance method becomes an lvalue.
	    if (S.getLangOpts().CPlusPlus &&
	        !(isa<CXXMethodDecl>(VD) &&
	          cast<CXXMethodDecl>(VD)->isInstance()))
	      E->setValueKind(VK_LValue);

	    return E;
	  }

	  ExprResult VisitMemberExpr(MemberExpr *E) {
	    return resolveDecl(E, E->getMemberDecl());
	  }

	  ExprResult VisitDeclRefExpr(DeclRefExpr *E) {
	    return resolveDecl(E, E->getDecl());
	  }
	};
	}

	/// Try to rebuild a function expression of unknown-any type so that it has
	/// a function type, then apply the default function/array conversion to the
	/// result.
	static ExprResult rebuildUnknownAnyFunction(Sema &S, Expr *FunctionExpr) {
	  RebuildUnknownAnyFunction Rebuilder(S);
	  ExprResult Rebuilt = Rebuilder.Visit(FunctionExpr);
	  if (Rebuilt.isInvalid())
	    return ExprError();

	  return S.DefaultFunctionArrayConversion(Rebuilt.get());
	}

	namespace {
	/// A visitor for rebuilding an expression of type __unknown_anytype
	/// into one which resolves the type directly on the referring
	/// expression.  Strict preservation of the original source
	/// structure is not a goal.
	struct RebuildUnknownAnyExpr
	    : StmtVisitor<RebuildUnknownAnyExpr, ExprResult> {

	  Sema &S;

	  /// The current destination type. VisitUnaryAddrOf replaces it with the
	  /// pointee type before visiting its operand.
	  QualType DestType;

	  RebuildUnknownAnyExpr(Sema &S, QualType CastType)
	      : S(S), DestType(CastType) {}

	  /// Non-expression statements are never expected here.
	  ExprResult VisitStmt(Stmt *S) {
	    llvm_unreachable("unexpected statement!");
	  }

	  /// Fallback for expression kinds without a dedicated visitor: diagnose
	  /// the expression as unsupported.
	  ExprResult VisitExpr(Expr *E) {
	    S.Diag(E->getExprLoc(), diag::err_unsupported_unknown_any_expr)
	        << E->getSourceRange();
	    return ExprError();
	  }

	  ExprResult VisitCallExpr(CallExpr *E);
	  ExprResult VisitObjCMessageExpr(ObjCMessageExpr *E);

	  /// Rebuild an expression which simply semantically wraps another
	  /// expression which it shares the type and value kind of.
	  template <class T> ExprResult rebuildSugarExpr(T *E) {
	    ExprResult SubResult = Visit(E->getSubExpr());
	    if (SubResult.isInvalid()) return ExprError();
	    // Propagate the rebuilt operand's type and value kind to the wrapper.
	    Expr *SubExpr = SubResult.get();
	    E->setSubExpr(SubExpr);
	    E->setType(SubExpr->getType());
	    E->setValueKind(SubExpr->getValueKind());
	    assert(E->getObjectKind() == OK_Ordinary);
	    return E;
	  }

	  ExprResult VisitParenExpr(ParenExpr *E) {
	    return rebuildSugarExpr(E);
	  }

	  ExprResult VisitUnaryExtension(UnaryOperator *E) {
	    return rebuildSugarExpr(E);
	  }

	  /// Resolve `&expr` against a pointer destination type. A non-pointer
	  /// destination, or a call expression as the operand, is diagnosed.
	  ExprResult VisitUnaryAddrOf(UnaryOperator *E) {
	    const PointerType *Ptr = DestType->getAs<PointerType>();
	    if (!Ptr) {
	      S.Diag(E->getOperatorLoc(), diag::err_unknown_any_addrof)
	          << E->getSourceRange();
	      return ExprError();
	    }

	    if (isa<CallExpr>(E->getSubExpr())) {
	      S.Diag(E->getOperatorLoc(), diag::err_unknown_any_addrof_call)
	          << E->getSourceRange();
	      return ExprError();
	    }

	    assert(E->getValueKind() == VK_RValue);
	    assert(E->getObjectKind() == OK_Ordinary);
	    E->setType(DestType);

	    // Build the sub-expression as if it were an object of the pointee type.
	    DestType = Ptr->getPointeeType();
	    ExprResult SubResult = Visit(E->getSubExpr());
	    if (SubResult.isInvalid()) return ExprError();
	    E->setSubExpr(SubResult.get());
	    return E;
	  }

	  ExprResult VisitImplicitCastExpr(ImplicitCastExpr *E);

	  /// Resolve expression \p E, which refers to declaration \p VD, against the
	  /// current destination type (defined out of line).
	  ExprResult resolveDecl(Expr *E, ValueDecl *VD);

	  ExprResult VisitMemberExpr(MemberExpr *E) {
	    return resolveDecl(E, E->getMemberDecl());
	  }

	  ExprResult VisitDeclRefExpr(DeclRefExpr *E) {
	    return resolveDecl(E, E->getDecl());
	  }
	};
	}

	/// Rebuilds a call expression which yielded __unknown_anytype.
	///
	/// On entry DestType is the desired result type of the call.  The call is
	/// given that type, DestType is replaced by the corresponding callee type
	/// (function, pointer-to-function or block pointer), and the callee is then
	/// rebuilt against it.  Returns ExprError() after diagnosing an array or
	/// function result type, or when rebuilding the callee fails.
	ExprResult RebuildUnknownAnyExpr::VisitCallExpr(CallExpr *E) {
	  Expr *CalleeExpr = E->getCallee();

	  enum FnKind {
	    FK_MemberFunction,
	    FK_FunctionPointer,
	    FK_BlockPointer
	  };

	  // Classify the callee and peel its type down to the function type.
	  FnKind Kind;
	  QualType CalleeType = CalleeExpr->getType();
	  if (CalleeType == S.Context.BoundMemberTy) {
	    assert(isa<CXXMemberCallExpr>(E) || isa<CXXOperatorCallExpr>(E));
	    Kind = FK_MemberFunction;
	    CalleeType = Expr::findBoundMemberType(CalleeExpr);
	  } else if (const PointerType *Ptr = CalleeType->getAs<PointerType>()) {
	    CalleeType = Ptr->getPointeeType();
	    Kind = FK_FunctionPointer;
	  } else {
	    CalleeType = CalleeType->castAs<BlockPointerType>()->getPointeeType();
	    Kind = FK_BlockPointer;
	  }
	  const FunctionType *FnType = CalleeType->castAs<FunctionType>();

	  // Verify that this is a legal result type of a function.
	  if (DestType->isArrayType() || DestType->isFunctionType()) {
	    unsigned diagID = diag::err_func_returning_array_function;
	    if (Kind == FK_BlockPointer)
	      diagID = diag::err_block_returning_array_function;

	    S.Diag(E->getExprLoc(), diagID)
	      << DestType->isFunctionType() << DestType;
	    return ExprError();
	  }

	  // Otherwise, go ahead and set DestType as the call's result.
	  E->setType(DestType.getNonLValueExprType(S.Context));
	  E->setValueKind(Expr::getValueKindForType(DestType));
	  assert(E->getObjectKind() == OK_Ordinary);

	  // Rebuild the function type, replacing the result type with DestType.
	  const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FnType);
	  if (Proto) {
	    // __unknown_anytype(...) is a special case used by the debugger when
	    // it has no idea what a function's signature is.
	    //
	    // We want to build this call essentially under the K&R
	    // unprototyped rules, but making a FunctionNoProtoType in C++
	    // would foul up all sorts of assumptions.  However, we cannot
	    // simply pass all arguments as variadic arguments, nor can we
	    // portably just call the function under a non-variadic type; see
	    // the comment on IR-gen's TargetInfo::isNoProtoCallVariadic.
	    // However, it turns out that in practice it is generally safe to
	    // call a function declared as "A foo(B,C,D);" under the prototype
	    // "A foo(B,C,D,...);".  The only known exception is with the
	    // Windows ABI, where any variadic function is implicitly cdecl
	    // regardless of its normal CC.  Therefore we change the parameter
	    // types to match the types of the arguments.
	    //
	    // This is a hack, but it is far superior to moving the
	    // corresponding target-specific code from IR-gen to Sema/AST.

	    ArrayRef<QualType> ParamTypes = Proto->getParamTypes();
	    SmallVector<QualType, 8> ArgTypes;
	    if (ParamTypes.empty() && Proto->isVariadic()) { // the special case
	      ArgTypes.reserve(E->getNumArgs());
	      for (unsigned i = 0, e = E->getNumArgs(); i != e; ++i) {
	        Expr *Arg = E->getArg(i);
	        QualType ArgType = Arg->getType();
	        // NOTE(review): the value-kind checks below query the call
	        // expression E, not the argument Arg.  Possibly Arg was intended;
	        // left unchanged pending confirmation.
	        if (E->isLValue()) {
	          ArgType = S.Context.getLValueReferenceType(ArgType);
	        } else if (E->isXValue()) {
	          ArgType = S.Context.getRValueReferenceType(ArgType);
	        }
	        ArgTypes.push_back(ArgType);
	      }
	      // ArgTypes outlives ParamTypes' use below, so the ArrayRef is valid.
	      ParamTypes = ArgTypes;
	    }
	    DestType = S.Context.getFunctionType(DestType, ParamTypes,
	                                         Proto->getExtProtoInfo());
	  } else {
	    DestType = S.Context.getFunctionNoProtoType(DestType,
	                                                FnType->getExtInfo());
	  }

	  // Rebuild the appropriate pointer-to-function type.
	  switch (Kind) {
	  case FK_MemberFunction:
	    // Nothing to do.
	    break;

	  case FK_FunctionPointer:
	    DestType = S.Context.getPointerType(DestType);
	    break;

	  case FK_BlockPointer:
	    DestType = S.Context.getBlockPointerType(DestType);
	    break;
	  }

	  // Finally, we can recurse.
	  ExprResult CalleeResult = Visit(CalleeExpr);
	  if (!CalleeResult.isUsable()) return ExprError();
	  E->setCallee(CalleeResult.get());

	  // Bind a temporary if necessary.
	  return S.MaybeBindToTemporary(E);
	}

	/// Rebuilds an Objective-C message send which yielded __unknown_anytype,
	/// giving it the current destination type as its result.
	///
	/// Array and function destination types are diagnosed and rejected.  When
	/// the message has a method declaration, that declaration's return type is
	/// rewritten as well.
	ExprResult RebuildUnknownAnyExpr::VisitObjCMessageExpr(ObjCMessageExpr *E) {
	  // Verify that this is a legal result type of a call.
	  if (DestType->isArrayType() || DestType->isFunctionType()) {
	    S.Diag(E->getExprLoc(), diag::err_func_returning_array_function)
	      << DestType->isFunctionType() << DestType;
	    return ExprError();
	  }

	  // Rewrite the method result type if available.
	  if (ObjCMethodDecl *Method = E->getMethodDecl()) {
	    assert(Method->getReturnType() == S.Context.UnknownAnyTy);
	    Method->setReturnType(DestType);
	  }

	  // Change the type of the message.
	  E->setType(DestType.getNonReferenceType());
	  E->setValueKind(Expr::getValueKindForType(DestType));

	  return S.MaybeBindToTemporary(E);
	}

	/// Rebuilds an implicit cast of unknown-any type.
	///
	/// Two cast kinds are handled: function-to-pointer decay and
	/// lvalue-to-rvalue conversion.  In both cases the cast takes the current
	/// destination type and the operand is rebuilt against a derived type; any
	/// other cast kind is unreachable.
	ExprResult RebuildUnknownAnyExpr::VisitImplicitCastExpr(ImplicitCastExpr *E) {
	  const bool IsDecay = E->getCastKind() == CK_FunctionToPointerDecay;
	  const bool IsLoad = !IsDecay && E->getCastKind() == CK_LValueToRValue;
	  if (!IsDecay && !IsLoad)
	    llvm_unreachable("Unhandled cast type!");

	  assert(E->getValueKind() == VK_RValue);
	  assert(E->getObjectKind() == OK_Ordinary);
	  if (IsLoad)
	    assert(isa<BlockPointerType>(E->getType()));

	  E->setType(DestType);

	  if (IsDecay) {
	    // Rebuild the sub-expression as the pointee (function) type.
	    DestType = DestType->castAs<PointerType>()->getPointeeType();
	  } else {
	    // The sub-expression has to be a lvalue reference, so rebuild it as such.
	    DestType = S.Context.getLValueReferenceType(DestType);
	  }

	  ExprResult Operand = Visit(E->getSubExpr());
	  if (!Operand.isUsable())
	    return ExprError();

	  E->setSubExpr(Operand.get());
	  return E;
	}

	/// Resolve a reference \p E to the declaration \p VD against the current
	/// destination type.
	///
	/// Functions and variables are supported; any other declaration kind is
	/// diagnosed.  On success the declaration's type is overwritten with
	/// DestType and \p E receives the resolved type and value kind.
	///
	/// \param E  the referring expression (a DeclRefExpr or MemberExpr at the
	///           call sites in this file).
	/// \param VD the declaration \p E refers to.
	/// \returns \p E on success (wrapped in a function-to-pointer decay when
	///          the destination is a pointer and \p VD is a function), or
	///          ExprError() after emitting a diagnostic.
	ExprResult RebuildUnknownAnyExpr::resolveDecl(Expr *E, ValueDecl *VD) {
	  ExprValueKind ValueKind = VK_LValue;
	  QualType Type = DestType;

	  // We know how to make this work for certain kinds of decls:

	  //  - functions
	  if (FunctionDecl *FD = dyn_cast<FunctionDecl>(VD)) {
	    // A pointer destination means the function is resolved against the
	    // pointee type and then decayed to a pointer.
	    if (const PointerType *Ptr = Type->getAs<PointerType>()) {
	      DestType = Ptr->getPointeeType();
	      ExprResult Result = resolveDecl(E, VD);
	      if (Result.isInvalid()) return ExprError();
	      return S.ImpCastExprToType(Result.get(), Type,
	                                 CK_FunctionToPointerDecay, VK_RValue);
	    }

	    if (!Type->isFunctionType()) {
	      S.Diag(E->getExprLoc(), diag::err_unknown_any_function)
	        << VD << E->getSourceRange();
	      return ExprError();
	    }
	    if (const FunctionProtoType *FT = Type->getAs<FunctionProtoType>()) {
	      // We must match the FunctionDecl's type to the hack introduced in
	      // RebuildUnknownAnyExpr::VisitCallExpr to vararg functions of unknown
	      // type. See the lengthy commentary in that routine.
	      QualType FDT = FD->getType();
	      // castAs never yields null, so a plain dyn_cast suffices here.
	      const FunctionType *FnType = FDT->castAs<FunctionType>();
	      const FunctionProtoType *Proto = dyn_cast<FunctionProtoType>(FnType);
	      DeclRefExpr *DRE = dyn_cast<DeclRefExpr>(E);
	      if (DRE && Proto && Proto->getParamTypes().empty() && Proto->isVariadic()) {
	        // Create a replacement declaration typed as DestType, with one
	        // parameter per destination parameter type, and retarget the
	        // reference at it.
	        SourceLocation Loc = FD->getLocation();
	        FunctionDecl *NewFD = FunctionDecl::Create(FD->getASTContext(),
	                                                   FD->getDeclContext(),
	                                                   Loc, Loc, FD->getNameInfo().getName(),
	                                                   DestType, FD->getTypeSourceInfo(),
	                                                   SC_None, false/*isInlineSpecified*/,
	                                                   FD->hasPrototype(),
	                                                   false/*isConstexprSpecified*/);

	        if (FD->getQualifier())
	          NewFD->setQualifierInfo(FD->getQualifierLoc());

	        SmallVector<ParmVarDecl*, 16> Params;
	        for (const auto &AI : FT->param_types()) {
	          ParmVarDecl *Param =
	            S.BuildParmVarDeclForTypedef(FD, Loc, AI);
	          Param->setScopeInfo(0, Params.size());
	          Params.push_back(Param);
	        }
	        NewFD->setParams(Params);
	        DRE->setDecl(NewFD);
	        VD = DRE->getDecl();
	      }
	    }

	    // A reference to an instance method is a bound member function.
	    if (CXXMethodDecl *MD = dyn_cast<CXXMethodDecl>(FD))
	      if (MD->isInstance()) {
	        ValueKind = VK_RValue;
	        Type = S.Context.BoundMemberTy;
	      }

	    // Function references aren't l-values in C.
	    if (!S.getLangOpts().CPlusPlus)
	      ValueKind = VK_RValue;

	  //  - variables
	  } else if (isa<VarDecl>(VD)) {
	    if (const ReferenceType *RefTy = Type->getAs<ReferenceType>()) {
	      Type = RefTy->getPointeeType();
	    } else if (Type->isFunctionType()) {
	      S.Diag(E->getExprLoc(), diag::err_unknown_any_var_function_type)
	        << VD << E->getSourceRange();
	      return ExprError();
	    }

	  //  - nothing else
	  } else {
	    S.Diag(E->getExprLoc(), diag::err_unsupported_unknown_any_decl)
	      << VD << E->getSourceRange();
	    return ExprError();
	  }

	  // Modifying the declaration like this is friendly to IR-gen but
	  // also really dangerous.
	  VD->setType(DestType);
	  E->setType(Type);
	  E->setValueKind(ValueKind);
	  return E;
	}

	/// Check a cast of an unknown-any type.  We intentionally only
	/// trigger this for C-style casts.
	///
	/// The operand is rebuilt against \p CastType; on success \p VK receives
	/// the rebuilt expression's value kind and \p CastKind is set to CK_NoOp.
	/// \p Path is not touched.
	ExprResult Sema::checkUnknownAnyCast(SourceRange TypeRange, QualType CastType,
	                                     Expr *CastExpr, CastKind &CastKind,
	                                     ExprValueKind &VK, CXXCastPath &Path) {
	  // The type we're casting to must be either void or complete.
	  const bool MustBeComplete = !CastType->isVoidType();
	  if (MustBeComplete &&
	      RequireCompleteType(TypeRange.getBegin(), CastType,
	                          diag::err_typecheck_cast_to_incomplete))
	    return ExprError();

	  // Rewrite the casted expression from scratch.
	  RebuildUnknownAnyExpr Rebuilder(*this, CastType);
	  ExprResult Rebuilt = Rebuilder.Visit(CastExpr);
	  if (!Rebuilt.isUsable())
	    return ExprError();

	  Expr *Resolved = Rebuilt.get();
	  VK = Resolved->getValueKind();
	  CastKind = CK_NoOp;
	  return Resolved;
	}

	/// Rebuild the unknown-any expression \p E against \p ToType and return
	/// whatever the rebuilding visitor produces.
	ExprResult Sema::forceUnknownAnyToType(Expr *E, QualType ToType) {
	  RebuildUnknownAnyExpr Rebuilder(*this, ToType);
	  return Rebuilder.Visit(E);
	}

	/// Check an argument passed in a call whose parameter types are unknown and
	/// determine the parameter type to use for it.
	///
	/// \param callLoc   location passed to copy-initialization.
	/// \param arg       the argument expression.
	/// \param paramType set to the promoted argument type, or to the type
	///                  written in an explicit cast when \p arg is one
	///                  (ignoring parentheses).
	/// \returns the converted argument, or ExprError() on failure.
	ExprResult Sema::checkUnknownAnyArg(SourceLocation callLoc,
	                                    Expr *arg, QualType &paramType) {
	  // If the syntactic form of the argument is not an explicit cast of
	  // any sort, just do default argument promotion.
	  ExplicitCastExpr *castArg = dyn_cast<ExplicitCastExpr>(arg->IgnoreParens());
	  if (!castArg) {
	    ExprResult result = DefaultArgumentPromotion(arg);
	    if (result.isInvalid()) return ExprError();
	    paramType = result.get()->getType();
	    return result;
	  }

	  // Otherwise, use the type that was written in the explicit cast.
	  assert(!arg->hasPlaceholderType());
	  paramType = castArg->getTypeAsWritten();

	  // Copy-initialize a parameter of that type.
	  InitializedEntity entity =
	    InitializedEntity::InitializeParameter(Context, paramType,
	                                           /*consumed*/ false);
	  return PerformCopyInitialization(entity, callLoc, arg);
	}

	/// Emit the diagnostic for an expression of unknown-any type that was used
	/// without being resolved.  Every path returns ExprError().
	static ExprResult diagnoseUnknownAnyExpr(Sema &S, Expr *E) {
	  Expr *const Original = E;

	  // Look through parentheses, implicit casts and any chain of calls down to
	  // the innermost callee.  Seeing a call selects the call diagnostic.
	  unsigned DiagID = diag::err_uncasted_use_of_unknown_any;
	  E = E->IgnoreParenImpCasts();
	  while (CallExpr *Call = dyn_cast<CallExpr>(E)) {
	    DiagID = diag::err_uncasted_call_of_unknown_any;
	    E = Call->getCallee()->IgnoreParenImpCasts();
	  }

	  SourceLocation Loc;
	  NamedDecl *D;
	  if (DeclRefExpr *Ref = dyn_cast<DeclRefExpr>(E)) {
	    Loc = Ref->getLocation();
	    D = Ref->getDecl();
	  } else if (MemberExpr *Mem = dyn_cast<MemberExpr>(E)) {
	    Loc = Mem->getMemberLoc();
	    D = Mem->getMemberDecl();
	  } else if (ObjCMessageExpr *Msg = dyn_cast<ObjCMessageExpr>(E)) {
	    DiagID = diag::err_uncasted_call_of_unknown_any;
	    Loc = Msg->getSelectorStartLoc();
	    D = Msg->getMethodDecl();
	    // Without a method declaration there is nothing to name; report the
	    // selector instead.
	    if (!D) {
	      S.Diag(Loc, diag::err_uncasted_send_to_unknown_any_method)
	        << static_cast<unsigned>(Msg->isClassMessage()) << Msg->getSelector()
	        << Original->getSourceRange();
	      return ExprError();
	    }
	  } else {
	    S.Diag(E->getExprLoc(), diag::err_unsupported_unknown_any_expr)
	      << E->getSourceRange();
	    return ExprError();
	  }

	  S.Diag(Loc, DiagID) << D << Original->getSourceRange();

	  // Never recoverable.
	  return ExprError();
	}

	/// Check for operands with placeholder types and complain if found.
	/// Returns ExprError() if there was an error and no recovery was possible.
	///
	/// Expressions whose type is not a placeholder type are returned unchanged.
	/// Outside C++, delayed typos in \p E are corrected first.
	ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
	  if (!getLangOpts().CPlusPlus) {
	    // C cannot handle TypoExpr nodes on either side of a binop because it
	    // doesn't handle dependent types properly, so make sure any TypoExprs have
	    // been dealt with before checking the operands.
	    ExprResult Result = CorrectDelayedTyposInExpr(E);
	    if (!Result.isUsable()) return ExprError();
	    E = Result.get();
	  }

	  const BuiltinType *placeholderType = E->getType()->getAsPlaceholderType();
	  if (!placeholderType) return E;

	  switch (placeholderType->getKind()) {

	  // Overloaded expressions.
	  case BuiltinType::Overload: {
	    // Try to resolve a single function template specialization.
	    // This is obligatory.
	    ExprResult Result = E;
	    if (ResolveAndFixSingleFunctionTemplateSpecialization(Result, false))
	      return Result;

	    // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization
	    // leaves Result unchanged on failure.
	    Result = E;
	    if (resolveAndFixAddressOfOnlyViableOverloadCandidate(Result))
	      return Result;

	    // If that failed, try to recover with a call.
	    tryToRecoverWithCall(Result, PDiag(diag::err_ovl_unresolvable),
	                         /*complain*/ true);
	    return Result;
	  }

	  // Bound member functions.
	  case BuiltinType::BoundMember: {
	    ExprResult result = E;
	    const Expr *BME = E->IgnoreParens();
	    PartialDiagnostic PD = PDiag(diag::err_bound_member_function);
	    // Try to give a nicer diagnostic if it is a bound member that we recognize.
	    if (isa<CXXPseudoDestructorExpr>(BME)) {
	      PD = PDiag(diag::err_dtor_expr_without_call) << /*pseudo-destructor*/ 1;
	    } else if (const auto *ME = dyn_cast<MemberExpr>(BME)) {
	      if (ME->getMemberNameInfo().getName().getNameKind() ==
	          DeclarationName::CXXDestructorName)
	        PD = PDiag(diag::err_dtor_expr_without_call) << /*destructor*/ 0;
	    }
	    tryToRecoverWithCall(result, PD,
	                         /*complain*/ true);
	    return result;
	  }

	  // ARC unbridged casts.
	  case BuiltinType::ARCUnbridgedCast: {
	    Expr *realCast = stripARCUnbridgedCast(E);
	    diagnoseARCUnbridgedCast(realCast);
	    return realCast;
	  }

	  // Expressions of unknown type.
	  case BuiltinType::UnknownAny:
	    return diagnoseUnknownAnyExpr(*this, E);

	  // Pseudo-objects.
	  case BuiltinType::PseudoObject:
	    return checkPseudoObjectRValue(E);

	  case BuiltinType::BuiltinFn: {
	    // Accept __noop without parens by implicitly converting it to a call expr.
	    auto *DRE = dyn_cast<DeclRefExpr>(E->IgnoreParenImpCasts());
	    if (DRE) {
	      auto *FD = cast<FunctionDecl>(DRE->getDecl());
	      if (FD->getBuiltinID() == Builtin::BI__noop) {
	        E = ImpCastExprToType(E, Context.getPointerType(FD->getType()),
	                              CK_BuiltinFnToFnPtr).get();
	        return new (Context) CallExpr(Context, E, None, Context.IntTy,
	                                      VK_RValue, SourceLocation());
	      }
	    }

	    Diag(E->getLocStart(), diag::err_builtin_fn_use);
	    return ExprError();
	  }

	  // OpenMP array sections used outside a supported context.
	  case BuiltinType::OMPArraySection:
	    Diag(E->getLocStart(), diag::err_omp_array_section_use);
	    return ExprError();

	  // Everything else should be impossible.
	#define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
	  case BuiltinType::Id:
	#include "clang/Basic/OpenCLImageTypes.def"
	#define BUILTIN_TYPE(Id, SingletonId) case BuiltinType::Id:
	#define PLACEHOLDER_TYPE(Id, SingletonId)
	#include "clang/AST/BuiltinTypes.def"
	    break;
	  }

	  llvm_unreachable("invalid placeholder type!");
	}

	/// Decide whether \p E is acceptable as a case expression.
	///
	/// Type-dependent expressions are accepted outright.  Value-dependent
	/// expressions and integer constant expressions are accepted when their
	/// type is an integral or enumeration type.  Everything else is rejected.
	bool Sema::CheckCaseExpression(Expr *E) {
	  if (E->isTypeDependent())
	    return true;
	  if (E->isValueDependent() || E->isIntegerConstantExpr(Context))
	    return E->getType()->isIntegralOrEnumerationType();
	  return false;
	}

	/// ActOnObjCBoolLiteral - Parse {__objc_yes,__objc_no} literals.
	///
	/// The literal is typed as BOOL when a BOOL typedef is known to the AST
	/// context (looking it up by name and caching it on first use), and as the
	/// context's ObjC builtin bool type otherwise.
	ExprResult
	Sema::ActOnObjCBoolLiteral(SourceLocation OpLoc, tok::TokenKind Kind) {
	  assert((Kind == tok::kw___objc_yes || Kind == tok::kw___objc_no) &&
	         "Unknown Objective-C Boolean value!");
	  QualType BoolT = Context.ObjCBuiltinBoolTy;
	  if (!Context.getBOOLDecl()) {
	    // Only a single, unambiguous typedef named BOOL is recorded.
	    LookupResult Result(*this, &Context.Idents.get("BOOL"), OpLoc,
	                        Sema::LookupOrdinaryName);
	    if (LookupName(Result, getCurScope()) && Result.isSingleResult()) {
	      NamedDecl *ND = Result.getFoundDecl();
	      if (TypedefDecl *TD = dyn_cast<TypedefDecl>(ND))
	        Context.setBOOLDecl(TD);
	    }
	  }
	  if (Context.getBOOLDecl())
	    BoolT = Context.getBOOLType();
	  return new (Context)
	      ObjCBoolLiteralExpr(Kind == tok::kw___objc_yes, BoolT, OpLoc);
	}

	/// Build an ObjCAvailabilityCheckExpr from a list of availability specs.
	///
	/// The version recorded in the expression comes from the first spec whose
	/// platform equals the target's platform name; when none matches, a
	/// default-constructed VersionTuple is used.
	ExprResult Sema::ActOnObjCAvailabilityCheckExpr(
	    llvm::ArrayRef<AvailabilitySpec> AvailSpecs, SourceLocation AtLoc,
	    SourceLocation RParen) {

	  StringRef Platform = getASTContext().getTargetInfo().getPlatformName();

	  // The lambda parameter has its own name so it does not shadow Spec.
	  auto Spec = std::find_if(AvailSpecs.begin(), AvailSpecs.end(),
	                           [&](const AvailabilitySpec &Candidate) {
	                             return Candidate.getPlatform() == Platform;
	                           });

	  VersionTuple Version;
	  if (Spec != AvailSpecs.end())
	    Version = Spec->getVersion();

	  // The use of `@available` in the enclosing function should be analyzed to
	  // warn when it's used inappropriately (i.e. not if(@available)).
	  if (getCurFunctionOrMethodDecl())
	    getEnclosingFunction()->HasPotentialAvailabilityViolations = true;
	  else if (getCurBlock() || getCurLambda())
	    getCurFunction()->HasPotentialAvailabilityViolations = true;

	  return new (Context)
	      ObjCAvailabilityCheckExpr(Version, AtLoc, RParen, Context.BoolTy);
	}
	Index: head/contrib/llvm/tools/clang/lib/Serialization/ASTReaderStmt.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Serialization/ASTReaderStmt.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Serialization/ASTReaderStmt.cpp (revision 322320)
	@@ -1,3971 +1,4011 @@
	//===--- ASTReaderStmt.cpp - Stmt/Expr Deserialization ----------*- C++ -*-===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// Statement/expression deserialization. This implements the
	// ASTReader::ReadStmt method.
	//
	//===----------------------------------------------------------------------===//

	#include "clang/Serialization/ASTReader.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/StmtVisitor.h"
	#include "clang/Lex/Token.h"
	#include "llvm/ADT/SmallString.h"
	using namespace clang;
	using namespace clang::serialization;

	namespace clang {

	  /// Statement visitor whose Visit* methods populate a statement or
	  /// expression node from the fields of an ASTRecordReader.
	  class ASTStmtReader : public StmtVisitor<ASTStmtReader> {
	    friend class OMPClauseReader;

	    ASTRecordReader &Record;
	    llvm::BitstreamCursor &DeclsCursor;

	    // Convenience wrappers; each one forwards to the record reader.

	    SourceLocation ReadSourceLocation() { return Record.readSourceLocation(); }

	    SourceRange ReadSourceRange() { return Record.readSourceRange(); }

	    std::string ReadString() { return Record.readString(); }

	    TypeSourceInfo *GetTypeSourceInfo() { return Record.getTypeSourceInfo(); }

	    Decl *ReadDecl() { return Record.readDecl(); }

	    template <typename T> T *ReadDeclAs() { return Record.readDeclAs<T>(); }

	    void ReadDeclarationNameLoc(DeclarationNameLoc &DNLoc,
	                                DeclarationName Name) {
	      Record.readDeclarationNameLoc(DNLoc, Name);
	    }

	    void ReadDeclarationNameInfo(DeclarationNameInfo &NameInfo) {
	      Record.readDeclarationNameInfo(NameInfo);
	    }

	  public:
	    ASTStmtReader(ASTRecordReader &Record, llvm::BitstreamCursor &Cursor)
	        : Record(Record), DeclsCursor(Cursor) {}

	    /// \brief Number of record fields consumed for the Stmt base class.
	    static const unsigned NumStmtFields = 0;

	    /// \brief Number of record fields consumed for the Expr base class,
	    /// including those of Stmt.
	    static const unsigned NumExprFields = NumStmtFields + 7;

	    /// \brief Read the template keyword location, angle-bracket locations
	    /// and \p NumTemplateArgs template arguments, and initialize \p Args
	    /// (storing the arguments in \p ArgsLocArray).
	    void ReadTemplateKWAndArgsInfo(ASTTemplateKWAndArgsInfo &Args,
	                                   TemplateArgumentLoc *ArgsLocArray,
	                                   unsigned NumTemplateArgs);

	    /// \brief Read and initialize a ExplicitTemplateArgumentList structure.
	    void ReadExplicitTemplateArgumentList(ASTTemplateArgumentListInfo &ArgList,
	                                          unsigned NumTemplateArgs);

	    void VisitStmt(Stmt *S);
	    // One Visit method per statement class listed in StmtNodes.inc.
	#define STMT(Type, Base) \
	    void Visit##Type(Type *);
	#include "clang/AST/StmtNodes.inc"
	  };
	}

	/// Read a template keyword location, the left and right angle-bracket
	/// locations and \p NumTemplateArgs template arguments (in that order), then
	/// initialize \p Args from them using \p ArgsLocArray as argument storage.
	void ASTStmtReader::ReadTemplateKWAndArgsInfo(ASTTemplateKWAndArgsInfo &Args,
	                                              TemplateArgumentLoc *ArgsLocArray,
	                                              unsigned NumTemplateArgs) {
	  const SourceLocation KWLoc = ReadSourceLocation();

	  TemplateArgumentListInfo Info;
	  Info.setLAngleLoc(ReadSourceLocation());
	  Info.setRAngleLoc(ReadSourceLocation());
	  for (unsigned Remaining = NumTemplateArgs; Remaining != 0; --Remaining)
	    Info.addArgument(Record.readTemplateArgumentLoc());

	  Args.initializeFrom(KWLoc, Info, ArgsLocArray);
	}

	/// Base case for every statement: reads nothing, and only checks that the
	/// record position equals NumStmtFields.
	void ASTStmtReader::VisitStmt(Stmt *S) {
	  assert(NumStmtFields == Record.getIdx() &&
	         "Incorrect statement field count");
	}

	/// Read a NullStmt: semicolon location, then the leading-empty-macro flag.
	void ASTStmtReader::VisitNullStmt(NullStmt *S) {
	  VisitStmt(S);

	  const SourceLocation SemiLoc = ReadSourceLocation();
	  S->setSemiLoc(SemiLoc);
	  S->HasLeadingEmptyMacro = Record.readInt();
	}

	/// Read a CompoundStmt: a statement count, that many sub-statements, and
	/// the left and right brace locations.
	void ASTStmtReader::VisitCompoundStmt(CompoundStmt *S) {
	  VisitStmt(S);

	  SmallVector<Stmt *, 16> Body;
	  const unsigned Count = Record.readInt();
	  for (unsigned I = 0; I != Count; ++I)
	    Body.push_back(Record.readSubStmt());
	  S->setStmts(Record.getContext(), Body);

	  S->LBraceLoc = ReadSourceLocation();
	  S->RBraceLoc = ReadSourceLocation();
	}

	/// Read the part common to case and default statements: the switch-case ID
	/// (registered with the record reader), then the keyword and colon
	/// locations.
	void ASTStmtReader::VisitSwitchCase(SwitchCase *S) {
	  VisitStmt(S);

	  auto CaseID = Record.readInt();
	  Record.recordSwitchCaseID(S, CaseID);

	  const SourceLocation KeywordLoc = ReadSourceLocation();
	  S->setKeywordLoc(KeywordLoc);
	  const SourceLocation ColonLoc = ReadSourceLocation();
	  S->setColonLoc(ColonLoc);
	}

	/// Read a CaseStmt: the shared switch-case fields, the LHS and RHS
	/// expressions, the sub-statement and the ellipsis location.
	void ASTStmtReader::VisitCaseStmt(CaseStmt *S) {
	  VisitSwitchCase(S);

	  auto *LHS = Record.readSubExpr();
	  S->setLHS(LHS);
	  auto *RHS = Record.readSubExpr();
	  S->setRHS(RHS);
	  auto *Body = Record.readSubStmt();
	  S->setSubStmt(Body);

	  const SourceLocation EllipsisLoc = ReadSourceLocation();
	  S->setEllipsisLoc(EllipsisLoc);
	}

	/// Read a DefaultStmt: the shared switch-case fields, then the
	/// sub-statement.
	void ASTStmtReader::VisitDefaultStmt(DefaultStmt *S) {
	  VisitSwitchCase(S);

	  auto *Body = Record.readSubStmt();
	  S->setSubStmt(Body);
	}

	/// Read a LabelStmt: the label declaration (linked to the statement in both
	/// directions), the sub-statement and the identifier location.
	void ASTStmtReader::VisitLabelStmt(LabelStmt *S) {
	  VisitStmt(S);

	  LabelDecl *Label = ReadDeclAs<LabelDecl>();
	  Label->setStmt(S);
	  S->setDecl(Label);

	  auto *Body = Record.readSubStmt();
	  S->setSubStmt(Body);

	  const SourceLocation IdentLoc = ReadSourceLocation();
	  S->setIdentLoc(IdentLoc);
	}

	/// Read an AttributedStmt: the attribute count, the attributes themselves
	/// (copied into the statement's attribute array), the sub-statement and the
	/// attribute location.
	void ASTStmtReader::VisitAttributedStmt(AttributedStmt *S) {
	  VisitStmt(S);

	  // The count is only cross-checked against the node and the list read.
	  uint64_t Count = Record.readInt();
	  AttrVec Read;
	  Record.readAttributes(Read);
	  (void)Count;
	  assert(Count == S->NumAttrs);
	  assert(Count == Read.size());

	  std::copy(Read.begin(), Read.end(), S->getAttrArrayPtr());
	  S->SubStmt = Record.readSubStmt();
	  S->AttrLoc = ReadSourceLocation();
	}

	/// Read an IfStmt: constexpr flag, init statement, condition variable,
	/// condition, then/else branches, and the 'if' and 'else' locations.
	void ASTStmtReader::VisitIfStmt(IfStmt *S) {
	  VisitStmt(S);

	  auto IsConstexpr = Record.readInt();
	  S->setConstexpr(IsConstexpr);

	  auto *Init = Record.readSubStmt();
	  S->setInit(Init);

	  VarDecl *CondVar = ReadDeclAs<VarDecl>();
	  S->setConditionVariable(Record.getContext(), CondVar);

	  auto *Cond = Record.readSubExpr();
	  S->setCond(Cond);
	  auto *Then = Record.readSubStmt();
	  S->setThen(Then);
	  auto *Else = Record.readSubStmt();
	  S->setElse(Else);

	  const SourceLocation IfLoc = ReadSourceLocation();
	  S->setIfLoc(IfLoc);
	  const SourceLocation ElseLoc = ReadSourceLocation();
	  S->setElseLoc(ElseLoc);
	}

	/// Read a SwitchStmt: init statement, condition variable, condition, body,
	/// 'switch' location and the all-enum-cases-covered flag.  The rest of the
	/// record is a list of switch-case IDs, which are chained together in the
	/// order read.
	void ASTStmtReader::VisitSwitchStmt(SwitchStmt *S) {
	  VisitStmt(S);

	  auto *Init = Record.readSubStmt();
	  S->setInit(Init);

	  VarDecl *CondVar = ReadDeclAs<VarDecl>();
	  S->setConditionVariable(Record.getContext(), CondVar);

	  auto *Cond = Record.readSubExpr();
	  S->setCond(Cond);
	  auto *Body = Record.readSubStmt();
	  S->setBody(Body);

	  const SourceLocation SwitchLoc = ReadSourceLocation();
	  S->setSwitchLoc(SwitchLoc);

	  const bool AllEnumCasesCovered = Record.readInt() != 0;
	  if (AllEnumCasesCovered)
	    S->setAllEnumCasesCovered();

	  // The first case becomes the head of the list; each later one is
	  // appended after its predecessor.
	  SwitchCase *Tail = nullptr;
	  const auto End = Record.size();
	  while (Record.getIdx() != End) {
	    SwitchCase *Next = Record.getSwitchCaseWithID(Record.readInt());
	    if (!Tail)
	      S->setSwitchCaseList(Next);
	    else
	      Tail->setNextSwitchCase(Next);
	    Tail = Next;
	  }
	}

	/// Read a WhileStmt: condition variable, condition, body and the 'while'
	/// location.
	void ASTStmtReader::VisitWhileStmt(WhileStmt *S) {
	  VisitStmt(S);

	  VarDecl *CondVar = ReadDeclAs<VarDecl>();
	  S->setConditionVariable(Record.getContext(), CondVar);

	  auto *Cond = Record.readSubExpr();
	  S->setCond(Cond);
	  auto *Body = Record.readSubStmt();
	  S->setBody(Body);

	  const SourceLocation WhileLoc = ReadSourceLocation();
	  S->setWhileLoc(WhileLoc);
	}

	/// Read a DoStmt: condition, body, and the 'do', 'while' and right-paren
	/// locations.
	void ASTStmtReader::VisitDoStmt(DoStmt *S) {
	  VisitStmt(S);

	  auto *Cond = Record.readSubExpr();
	  S->setCond(Cond);
	  auto *Body = Record.readSubStmt();
	  S->setBody(Body);

	  const SourceLocation DoLoc = ReadSourceLocation();
	  S->setDoLoc(DoLoc);
	  const SourceLocation WhileLoc = ReadSourceLocation();
	  S->setWhileLoc(WhileLoc);
	  const SourceLocation RParenLoc = ReadSourceLocation();
	  S->setRParenLoc(RParenLoc);
	}

	/// Read a ForStmt: init statement, condition, condition variable,
	/// increment, body, and the 'for', left-paren and right-paren locations.
	void ASTStmtReader::VisitForStmt(ForStmt *S) {
	  VisitStmt(S);

	  auto *Init = Record.readSubStmt();
	  S->setInit(Init);
	  auto *Cond = Record.readSubExpr();
	  S->setCond(Cond);

	  VarDecl *CondVar = ReadDeclAs<VarDecl>();
	  S->setConditionVariable(Record.getContext(), CondVar);

	  auto *Inc = Record.readSubExpr();
	  S->setInc(Inc);
	  auto *Body = Record.readSubStmt();
	  S->setBody(Body);

	  const SourceLocation ForLoc = ReadSourceLocation();
	  S->setForLoc(ForLoc);
	  const SourceLocation LParenLoc = ReadSourceLocation();
	  S->setLParenLoc(LParenLoc);
	  const SourceLocation RParenLoc = ReadSourceLocation();
	  S->setRParenLoc(RParenLoc);
	}

	/// Read a GotoStmt: the target label declaration, then the 'goto' and label
	/// locations.
	void ASTStmtReader::VisitGotoStmt(GotoStmt *S) {
	  VisitStmt(S);

	  LabelDecl *Target = ReadDeclAs<LabelDecl>();
	  S->setLabel(Target);

	  const SourceLocation GotoLoc = ReadSourceLocation();
	  S->setGotoLoc(GotoLoc);
	  const SourceLocation LabelLoc = ReadSourceLocation();
	  S->setLabelLoc(LabelLoc);
	}

	/// Read an IndirectGotoStmt: the 'goto' and star locations, then the target
	/// expression.
	void ASTStmtReader::VisitIndirectGotoStmt(IndirectGotoStmt *S) {
	  VisitStmt(S);

	  const SourceLocation GotoLoc = ReadSourceLocation();
	  S->setGotoLoc(GotoLoc);
	  const SourceLocation StarLoc = ReadSourceLocation();
	  S->setStarLoc(StarLoc);

	  auto *Target = Record.readSubExpr();
	  S->setTarget(Target);
	}

	/// Read a ContinueStmt: just the 'continue' location.
	void ASTStmtReader::VisitContinueStmt(ContinueStmt *S) {
	  VisitStmt(S);

	  const SourceLocation ContinueLoc = ReadSourceLocation();
	  S->setContinueLoc(ContinueLoc);
	}

	/// Read a BreakStmt: just the 'break' location.
	void ASTStmtReader::VisitBreakStmt(BreakStmt *S) {
	  VisitStmt(S);

	  const SourceLocation BreakLoc = ReadSourceLocation();
	  S->setBreakLoc(BreakLoc);
	}

	/// Read a ReturnStmt: the returned expression, the 'return' location and
	/// the NRVO candidate variable.
	void ASTStmtReader::VisitReturnStmt(ReturnStmt *S) {
	  VisitStmt(S);

	  auto *Value = Record.readSubExpr();
	  S->setRetValue(Value);

	  const SourceLocation ReturnLoc = ReadSourceLocation();
	  S->setReturnLoc(ReturnLoc);

	  VarDecl *Candidate = ReadDeclAs<VarDecl>();
	  S->setNRVOCandidate(Candidate);
	}

	/// Read a DeclStmt: the start and end locations, then every remaining
	/// record entry as a declaration.  A single remaining entry becomes a
	/// single-declaration group; otherwise a DeclGroup is created from all of
	/// them.
	void ASTStmtReader::VisitDeclStmt(DeclStmt *S) {
	  VisitStmt(S);

	  const SourceLocation StartLoc = ReadSourceLocation();
	  S->setStartLoc(StartLoc);
	  const SourceLocation EndLoc = ReadSourceLocation();
	  S->setEndLoc(EndLoc);

	  const auto Remaining = Record.size() - Record.getIdx();
	  if (Remaining == 1) {
	    // Single declaration
	    S->setDeclGroup(DeclGroupRef(ReadDecl()));
	    return;
	  }

	  // Zero or several declarations.
	  const int Count = Remaining;
	  SmallVector<Decl *, 16> Group;
	  Group.reserve(Count);
	  for (int Idx = 0; Idx < Count; ++Idx)
	    Group.push_back(ReadDecl());

	  DeclGroup *Created =
	      DeclGroup::Create(Record.getContext(), Group.data(), Group.size());
	  S->setDeclGroup(DeclGroupRef(Created));
	}

	/// Read the part common to all asm statements: output, input and clobber
	/// counts, the asm location, and the volatile and simple flags.
	void ASTStmtReader::VisitAsmStmt(AsmStmt *S) {
	  VisitStmt(S);

	  S->NumOutputs = Record.readInt();
	  S->NumInputs = Record.readInt();
	  S->NumClobbers = Record.readInt();

	  const SourceLocation AsmLoc = ReadSourceLocation();
	  S->setAsmLoc(AsmLoc);

	  auto IsVolatile = Record.readInt();
	  S->setVolatile(IsVolatile);
	  auto IsSimple = Record.readInt();
	  S->setSimple(IsSimple);
	}

	/// Read a GCCAsmStmt: the shared asm fields, the right-paren location and
	/// the asm string, then one (name, constraint, expression) triple per
	/// output and input operand, then the clobber strings.
	void ASTStmtReader::VisitGCCAsmStmt(GCCAsmStmt *S) {
	  VisitAsmStmt(S);

	  const SourceLocation RParenLoc = ReadSourceLocation();
	  S->setRParenLoc(RParenLoc);
	  S->setAsmString(cast_or_null<StringLiteral>(Record.readSubStmt()));

	  const unsigned OutputCount = S->getNumOutputs();
	  const unsigned InputCount = S->getNumInputs();
	  const unsigned ClobberCount = S->getNumClobbers();

	  // Output and input operands.
	  SmallVector<IdentifierInfo *, 16> OperandNames;
	  SmallVector<StringLiteral*, 16> OperandConstraints;
	  SmallVector<Stmt*, 16> OperandExprs;
	  const unsigned OperandCount = OutputCount + InputCount;
	  for (unsigned Op = 0; Op != OperandCount; ++Op) {
	    OperandNames.push_back(Record.getIdentifierInfo());
	    OperandConstraints.push_back(
	        cast_or_null<StringLiteral>(Record.readSubStmt()));
	    OperandExprs.push_back(Record.readSubStmt());
	  }

	  // Clobbers.
	  SmallVector<StringLiteral*, 16> ClobberList;
	  for (unsigned C = 0; C != ClobberCount; ++C)
	    ClobberList.push_back(cast_or_null<StringLiteral>(Record.readSubStmt()));

	  S->setOutputsAndInputsAndClobbers(Record.getContext(),
	                                    OperandNames.data(),
	                                    OperandConstraints.data(),
	                                    OperandExprs.data(),
	                                    OutputCount, InputCount,
	                                    ClobberList.data(), ClobberCount);
	}

	/// Read an MSAsmStmt: the shared asm fields, brace and end locations, the
	/// token count, the asm string, the tokens, the clobber strings and finally
	/// one (expression, constraint string) pair per operand.
	void ASTStmtReader::VisitMSAsmStmt(MSAsmStmt *S) {
	  VisitAsmStmt(S);
	  S->LBraceLoc = ReadSourceLocation();
	  S->EndLoc = ReadSourceLocation();
	  S->NumAsmToks = Record.readInt();
	  std::string AsmText = ReadString();

	  // Read the tokens.
	  const unsigned TokCount = S->NumAsmToks;
	  SmallVector<Token, 16> Toks;
	  Toks.reserve(TokCount);
	  for (unsigned T = 0; T != TokCount; ++T)
	    Toks.push_back(Record.readToken());

	  // The calls to reserve() for the FooData vectors are mandatory to
	  // prevent dead StringRefs in the Foo vectors.

	  // Read the clobbers.
	  const unsigned ClobberCount = S->NumClobbers;
	  SmallVector<std::string, 16> ClobberStorage;
	  SmallVector<StringRef, 16> ClobberRefs;
	  ClobberStorage.reserve(ClobberCount);
	  ClobberRefs.reserve(ClobberCount);
	  for (unsigned C = 0; C != ClobberCount; ++C) {
	    ClobberStorage.push_back(ReadString());
	    ClobberRefs.push_back(ClobberStorage.back());
	  }

	  // Read the operands.
	  const unsigned OperandCount = S->NumOutputs + S->NumInputs;
	  SmallVector<Expr*, 16> OperandExprs;
	  SmallVector<std::string, 16> ConstraintStorage;
	  SmallVector<StringRef, 16> ConstraintRefs;
	  OperandExprs.reserve(OperandCount);
	  ConstraintStorage.reserve(OperandCount);
	  ConstraintRefs.reserve(OperandCount);
	  for (unsigned Op = 0; Op != OperandCount; ++Op) {
	    OperandExprs.push_back(cast<Expr>(Record.readSubStmt()));
	    ConstraintStorage.push_back(ReadString());
	    ConstraintRefs.push_back(ConstraintStorage.back());
	  }

	  S->initialize(Record.getContext(), AsmText, Toks,
	                ConstraintRefs, OperandExprs, ClobberRefs);
	}

	void ASTStmtReader::VisitCoroutineBodyStmt(CoroutineBodyStmt *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+ VisitStmt(S);
	+ assert(Record.peekInt() == S->NumParams);
	+ Record.skipInts(1);
	+ auto *StoredStmts = S->getStoredStmts();
	+ for (unsigned i = 0;
	+ i < CoroutineBodyStmt::SubStmt::FirstParamMove + S->NumParams; ++i)
	+ StoredStmts[i] = Record.readSubStmt();
	}

	void ASTStmtReader::VisitCoreturnStmt(CoreturnStmt *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+ VisitStmt(S);
	+ S->CoreturnLoc = Record.readSourceLocation();
	+ for (auto &SubStmt: S->SubStmts)
	+ SubStmt = Record.readSubStmt();
	+ S->IsImplicit = Record.readInt() != 0;
	}

	-void ASTStmtReader::VisitCoawaitExpr(CoawaitExpr *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtReader::VisitCoawaitExpr(CoawaitExpr *E) {
	+ VisitExpr(E);
	+ E->KeywordLoc = ReadSourceLocation();
	+ for (auto &SubExpr: E->SubExprs)
	+ SubExpr = Record.readSubStmt();
	+ E->OpaqueValue = cast_or_null<OpaqueValueExpr>(Record.readSubStmt());
	+ E->setIsImplicit(Record.readInt() != 0);
	}

	-void ASTStmtReader::VisitDependentCoawaitExpr(DependentCoawaitExpr *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtReader::VisitCoyieldExpr(CoyieldExpr *E) {
	+ VisitExpr(E);
	+ E->KeywordLoc = ReadSourceLocation();
	+ for (auto &SubExpr: E->SubExprs)
	+ SubExpr = Record.readSubStmt();
	+ E->OpaqueValue = cast_or_null<OpaqueValueExpr>(Record.readSubStmt());
	}

	-void ASTStmtReader::VisitCoyieldExpr(CoyieldExpr *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtReader::VisitDependentCoawaitExpr(DependentCoawaitExpr *E) {
	+ VisitExpr(E);
	+ E->KeywordLoc = ReadSourceLocation();
	+ for (auto &SubExpr: E->SubExprs)
	+ SubExpr = Record.readSubStmt();
	}

	/// Read a CapturedStmt.  One integer is skipped, then come the captured
	/// declaration, the region kind, the captured record declaration, one
	/// initializer expression per capture-init slot, the body, and finally a
	/// (variable, kind, location) triple per capture.
	void ASTStmtReader::VisitCapturedStmt(CapturedStmt *S) {
	  VisitStmt(S);
	  Record.skipInts(1);

	  CapturedDecl *CD = ReadDeclAs<CapturedDecl>();
	  S->setCapturedDecl(CD);
	  const auto RegionKind = static_cast<CapturedRegionKind>(Record.readInt());
	  S->setCapturedRegionKind(RegionKind);
	  RecordDecl *RD = ReadDeclAs<RecordDecl>();
	  S->setCapturedRecordDecl(RD);

	  // Capture inits
	  for (auto Slot = S->capture_init_begin(), End = S->capture_init_end();
	       Slot != End; ++Slot)
	    *Slot = Record.readSubExpr();

	  // Body; also installed as the body of the captured declaration.
	  S->setCapturedStmt(Record.readSubStmt());
	  S->getCapturedDecl()->setBody(S->getCapturedStmt());

	  // Captures
	  for (auto &Capture : S->captures()) {
	    Capture.VarAndKind.setPointer(ReadDeclAs<VarDecl>());
	    const auto Kind =
	        static_cast<CapturedStmt::VariableCaptureKind>(Record.readInt());
	    Capture.VarAndKind.setInt(Kind);
	    Capture.Loc = ReadSourceLocation();
	  }
	}

	/// Read the fields shared by every expression: type, the three dependence
	/// flags, the unexpanded-parameter-pack flag, value kind and object kind.
	/// Afterwards the record position must equal NumExprFields.
	void ASTStmtReader::VisitExpr(Expr *E) {
	  VisitStmt(E);

	  E->setType(Record.readType());

	  auto TypeDependent = Record.readInt();
	  E->setTypeDependent(TypeDependent);
	  auto ValueDependent = Record.readInt();
	  E->setValueDependent(ValueDependent);
	  auto InstantiationDependent = Record.readInt();
	  E->setInstantiationDependent(InstantiationDependent);
	  E->ExprBits.ContainsUnexpandedParameterPack = Record.readInt();

	  const auto VK = static_cast<ExprValueKind>(Record.readInt());
	  E->setValueKind(VK);
	  const auto OK = static_cast<ExprObjectKind>(Record.readInt());
	  E->setObjectKind(OK);

	  assert(NumExprFields == Record.getIdx() &&
	         "Incorrect expression field count");
	}

	/// Read a PredefinedExpr: location, identifier type and the function-name
	/// string literal (which may be null).
	void ASTStmtReader::VisitPredefinedExpr(PredefinedExpr *E) {
	  VisitExpr(E);

	  const SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	  E->Type = (PredefinedExpr::IdentType)Record.readInt();

	  auto *NameExpr = Record.readSubExpr();
	  E->FnName = cast_or_null<StringLiteral>(NameExpr);
	}

	// Reads a DeclRefExpr. The five flag bits come first because they decide
	// which optional trailing objects (qualifier, found decl, template keyword
	// and argument info) are present; those are then filled in, followed by the
	// referenced decl, the location and the declaration-name location data.
	void ASTStmtReader::VisitDeclRefExpr(DeclRefExpr *E) {
	  VisitExpr(E);

	  E->DeclRefExprBits.HasQualifier = Record.readInt();
	  E->DeclRefExprBits.HasFoundDecl = Record.readInt();
	  E->DeclRefExprBits.HasTemplateKWAndArgsInfo = Record.readInt();
	  E->DeclRefExprBits.HadMultipleCandidates = Record.readInt();
	  E->DeclRefExprBits.RefersToEnclosingVariableOrCapture = Record.readInt();

	  // The template argument count is only stored when the info block exists.
	  unsigned NumTemplateArgs = 0;
	  if (E->hasTemplateKWAndArgsInfo())
	    NumTemplateArgs = Record.readInt();

	  // Construct the qualifier in place inside its trailing-object slot.
	  if (E->hasQualifier())
	    new (E->getTrailingObjects<NestedNameSpecifierLoc>())
	        NestedNameSpecifierLoc(Record.readNestedNameSpecifierLoc());

	  // The found decl is stored as a trailing NamedDecl pointer: dereference the
	  // slot and assign into it (assigning to the returned slot pointer itself
	  // would not compile).
	  if (E->hasFoundDecl())
	    *E->getTrailingObjects<NamedDecl *>() = ReadDeclAs<NamedDecl>();

	  if (E->hasTemplateKWAndArgsInfo())
	    ReadTemplateKWAndArgsInfo(
	        *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(),
	        E->getTrailingObjects<TemplateArgumentLoc>(), NumTemplateArgs);

	  E->setDecl(ReadDeclAs<ValueDecl>());
	  E->setLocation(ReadSourceLocation());
	  // Needs the decl set above: the name kind selects what is read.
	  ReadDeclarationNameLoc(E->DNLoc, E->getDecl()->getDeclName());
	}

	// Reads an IntegerLiteral: its location followed by its APInt value.
	void ASTStmtReader::VisitIntegerLiteral(IntegerLiteral *E) {
	  VisitExpr(E);
	  SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	  auto Value = Record.readAPInt();
	  E->setValue(Record.getContext(), Value);
	}

	// Reads a FloatingLiteral. The semantics must be installed before the value,
	// because the APFloat is decoded using E->getSemantics().
	void ASTStmtReader::VisitFloatingLiteral(FloatingLiteral *E) {
	  VisitExpr(E);
	  const auto Semantics = static_cast<Stmt::APFloatSemantics>(Record.readInt());
	  E->setRawSemantics(Semantics);
	  E->setExact(Record.readInt());
	  auto Value = Record.readAPFloat(E->getSemantics());
	  E->setValue(Record.getContext(), Value);
	  E->setLocation(ReadSourceLocation());
	}

	// Reads an ImaginaryLiteral, whose only own payload is its sub-expression.
	void ASTStmtReader::VisitImaginaryLiteral(ImaginaryLiteral *E) {
	  VisitExpr(E);
	  Expr *Operand = Record.readSubExpr();
	  E->setSubExpr(Operand);
	}

	// Reads a StringLiteral: length, (checked) token count, kind, Pascal flag, the
	// string data stored one record integer per element, and one source location
	// per concatenated token.
	void ASTStmtReader::VisitStringLiteral(StringLiteral *E) {
	  VisitExpr(E);
	  const unsigned NumElts = Record.readInt();
	  // The token count was fixed when the node was created; verify, then skip.
	  assert(Record.peekInt() == E->getNumConcatenated() &&
	         "Wrong number of concatenated tokens!");
	  Record.skipInts(1);
	  const auto Kind = static_cast<StringLiteral::StringKind>(Record.readInt());
	  const bool Pascal = Record.readInt();

	  // Copy the next NumElts record integers into a buffer, then step past them.
	  auto First = &Record.peekInt();
	  SmallString<16> Buffer(First, First + NumElts);
	  E->setString(Record.getContext(), Buffer, Kind, Pascal);
	  Record.skipInts(NumElts);

	  // Token locations.
	  const unsigned NumToks = E->getNumConcatenated();
	  unsigned Tok = 0;
	  while (Tok != NumToks) {
	    E->setStrTokenLoc(Tok, ReadSourceLocation());
	    ++Tok;
	  }
	}

	// Reads a CharacterLiteral: value, location, then character kind.
	void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) {
	  VisitExpr(E);
	  const auto Value = Record.readInt();
	  E->setValue(Value);
	  SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	  const auto Kind =
	      static_cast<CharacterLiteral::CharacterKind>(Record.readInt());
	  E->setKind(Kind);
	}

	// Reads a ParenExpr: both parenthesis locations, then the wrapped expression.
	void ASTStmtReader::VisitParenExpr(ParenExpr *E) {
	  VisitExpr(E);
	  SourceLocation Open = ReadSourceLocation();
	  SourceLocation Close = ReadSourceLocation();
	  E->setLParen(Open);
	  E->setRParen(Close);
	  E->setSubExpr(Record.readSubExpr());
	}

	// Reads a ParenListExpr: allocates the expression array in the AST context,
	// fills it from the sub-statement stream, then reads both paren locations.
	void ASTStmtReader::VisitParenListExpr(ParenListExpr *E) {
	  VisitExpr(E);
	  const unsigned Count = Record.readInt();
	  E->Exprs = new (Record.getContext()) Stmt*[Count];
	  unsigned Slot = 0;
	  while (Slot != Count) {
	    E->Exprs[Slot] = Record.readSubStmt();
	    ++Slot;
	  }
	  E->NumExprs = Count;
	  E->LParenLoc = ReadSourceLocation();
	  E->RParenLoc = ReadSourceLocation();
	}

	// Reads a UnaryOperator: operand, opcode, operator location.
	void ASTStmtReader::VisitUnaryOperator(UnaryOperator *E) {
	  VisitExpr(E);
	  Expr *Operand = Record.readSubExpr();
	  E->setSubExpr(Operand);
	  const auto Opc = static_cast<UnaryOperator::Opcode>(Record.readInt());
	  E->setOpcode(Opc);
	  E->setOperatorLoc(ReadSourceLocation());
	}

	// Reads an OffsetOfExpr: the (already known) component and expression counts
	// are verified and skipped, followed by the operator/rparen locations, the
	// type source info, every component, and finally the index expressions.
	void ASTStmtReader::VisitOffsetOfExpr(OffsetOfExpr *E) {
	VisitExpr(E);
	// Both counts are only checked against the node, never stored from here.
	assert(E->getNumComponents() == Record.peekInt());
	Record.skipInts(1);
	assert(E->getNumExpressions() == Record.peekInt());
	Record.skipInts(1);
	E->setOperatorLoc(ReadSourceLocation());
	E->setRParenLoc(ReadSourceLocation());
	E->setTypeSourceInfo(GetTypeSourceInfo());
	// Each component starts with its kind and a start/end location pair; the
	// kind decides what payload follows.
	for (unsigned I = 0, N = E->getNumComponents(); I != N; ++I) {
	OffsetOfNode::Kind Kind = static_cast<OffsetOfNode::Kind>(Record.readInt());
	SourceLocation Start = ReadSourceLocation();
	SourceLocation End = ReadSourceLocation();
	switch (Kind) {
	case OffsetOfNode::Array:
	// Payload: one integer.
	E->setComponent(I, OffsetOfNode(Start, Record.readInt(), End));
	break;

	case OffsetOfNode::Field:
	// Payload: a FieldDecl reference.
	E->setComponent(
	I, OffsetOfNode(Start, ReadDeclAs<FieldDecl>(), End));
	break;

	case OffsetOfNode::Identifier:
	// Payload: an identifier.
	E->setComponent(
	I,
	OffsetOfNode(Start, Record.getIdentifierInfo(), End));
	break;

	case OffsetOfNode::Base: {
	// Payload: a base specifier, copied into context-allocated storage. The
	// Start/End locations read above are not passed to this node.
	CXXBaseSpecifier *Base = new (Record.getContext()) CXXBaseSpecifier();
	*Base = Record.readCXXBaseSpecifier();
	E->setComponent(I, OffsetOfNode(Base));
	break;
	}
	}
	}

	// Index expressions come last.
	for (unsigned I = 0, N = E->getNumExpressions(); I != N; ++I)
	E->setIndexExpr(I, Record.readSubExpr());
	}

	// Reads a UnaryExprOrTypeTraitExpr. After the trait kind, the next record
	// integer tells the two argument forms apart: zero means the argument is an
	// expression (and the zero is then skipped), anything else means the argument
	// is read as type source info.
	void ASTStmtReader::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E) {
	  VisitExpr(E);
	  const auto Trait = static_cast<UnaryExprOrTypeTrait>(Record.readInt());
	  E->setKind(Trait);
	  if (Record.peekInt() != 0) {
	    E->setArgument(GetTypeSourceInfo());
	  } else {
	    E->setArgument(Record.readSubExpr());
	    Record.skipInts(1);
	  }
	  SourceLocation OpLoc = ReadSourceLocation();
	  E->setOperatorLoc(OpLoc);
	  E->setRParenLoc(ReadSourceLocation());
	}

	// Reads an ArraySubscriptExpr: LHS, RHS, then the right-bracket location.
	void ASTStmtReader::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
	  VisitExpr(E);
	  Expr *Left = Record.readSubExpr();
	  Expr *Right = Record.readSubExpr();
	  E->setLHS(Left);
	  E->setRHS(Right);
	  E->setRBracketLoc(ReadSourceLocation());
	}

	// Reads an OMPArraySectionExpr: base, lower bound and length expressions,
	// then the colon and right-bracket locations.
	void ASTStmtReader::VisitOMPArraySectionExpr(OMPArraySectionExpr *E) {
	  VisitExpr(E);
	  Expr *Base = Record.readSubExpr();
	  Expr *Lower = Record.readSubExpr();
	  Expr *Length = Record.readSubExpr();
	  E->setBase(Base);
	  E->setLowerBound(Lower);
	  E->setLength(Length);
	  SourceLocation Colon = ReadSourceLocation();
	  E->setColonLoc(Colon);
	  E->setRBracketLoc(ReadSourceLocation());
	}

	// Reads a CallExpr: argument count (which sizes the argument storage), the
	// right-paren location, the callee, and then each argument.
	void ASTStmtReader::VisitCallExpr(CallExpr *E) {
	  VisitExpr(E);
	  const auto ArgCount = Record.readInt();
	  E->setNumArgs(Record.getContext(), ArgCount);
	  E->setRParenLoc(ReadSourceLocation());
	  E->setCallee(Record.readSubExpr());
	  const unsigned NumArgs = E->getNumArgs();
	  unsigned Arg = 0;
	  while (Arg != NumArgs) {
	    E->setArg(Arg, Record.readSubExpr());
	    ++Arg;
	  }
	}

	// A CXXMemberCallExpr reads nothing beyond what VisitCallExpr reads.
	void ASTStmtReader::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) {
	  return VisitCallExpr(E);
	}

	// MemberExpr nodes are complete as soon as they are created, so nothing is
	// read here and VisitExpr is intentionally not called. The assertion guards
	// against a subclass being routed through this path.
	void ASTStmtReader::VisitMemberExpr(MemberExpr *E) {
	  assert(E->getStmtClass() == Stmt::MemberExprClass &&
	         "It's a subclass, we must advance Idx!");
	}

	// Reads an ObjCIsaExpr: base, isa-member location, operator location and the
	// arrow flag.
	void ASTStmtReader::VisitObjCIsaExpr(ObjCIsaExpr *E) {
	  VisitExpr(E);
	  Expr *Base = Record.readSubExpr();
	  E->setBase(Base);
	  SourceLocation IsaLoc = ReadSourceLocation();
	  SourceLocation OpLoc = ReadSourceLocation();
	  E->setIsaMemberLoc(IsaLoc);
	  E->setOpLoc(OpLoc);
	  E->setArrow(Record.readInt());
	}

	// Reads an ObjCIndirectCopyRestoreExpr: operand, then the should-copy flag.
	void ASTStmtReader::VisitObjCIndirectCopyRestoreExpr(
	    ObjCIndirectCopyRestoreExpr *E) {
	  VisitExpr(E);
	  Expr *Operand = Record.readSubExpr();
	  E->Operand = Operand;
	  E->setShouldCopy(Record.readInt());
	}

	// Reads an ObjCBridgedCastExpr: the explicit-cast data, then the left-paren
	// location, the bridge keyword location and the bridge kind.
	void ASTStmtReader::VisitObjCBridgedCastExpr(ObjCBridgedCastExpr *E) {
	  VisitExplicitCastExpr(E);
	  SourceLocation LParen = ReadSourceLocation();
	  SourceLocation Keyword = ReadSourceLocation();
	  E->LParenLoc = LParen;
	  E->BridgeKeywordLoc = Keyword;
	  E->Kind = Record.readInt();
	}

	// Reads the common cast data: base-path length (must match the node's),
	// operand, cast kind, and then one context-allocated CXXBaseSpecifier per
	// path entry.
	void ASTStmtReader::VisitCastExpr(CastExpr *E) {
	  VisitExpr(E);
	  const unsigned PathLength = Record.readInt();
	  assert(PathLength == E->path_size());
	  E->setSubExpr(Record.readSubExpr());
	  E->setCastKind(static_cast<CastKind>(Record.readInt()));

	  CastExpr::path_iterator Out = E->path_begin();
	  for (unsigned Left = PathLength; Left != 0; --Left) {
	    CXXBaseSpecifier *Spec = new (Record.getContext()) CXXBaseSpecifier;
	    *Spec = Record.readCXXBaseSpecifier();
	    *Out = Spec;
	    ++Out;
	  }
	}

	// Reads a BinaryOperator: LHS, RHS, opcode, operator location and the
	// floating-point options.
	void ASTStmtReader::VisitBinaryOperator(BinaryOperator *E) {
	  VisitExpr(E);
	  Expr *Left = Record.readSubExpr();
	  Expr *Right = Record.readSubExpr();
	  E->setLHS(Left);
	  E->setRHS(Right);
	  const auto Opc = static_cast<BinaryOperator::Opcode>(Record.readInt());
	  E->setOpcode(Opc);
	  E->setOperatorLoc(ReadSourceLocation());
	  E->setFPFeatures(FPOptions(Record.readInt()));
	}

	// Reads a CompoundAssignOperator: the binary-operator data followed by the
	// computation LHS type and the computation result type.
	void ASTStmtReader::VisitCompoundAssignOperator(CompoundAssignOperator *E) {
	  VisitBinaryOperator(E);
	  QualType LHSTy = Record.readType();
	  QualType ResultTy = Record.readType();
	  E->setComputationLHSType(LHSTy);
	  E->setComputationResultType(ResultTy);
	}

	// Reads a ConditionalOperator: condition, LHS and RHS sub-expressions, then
	// the '?' and ':' locations.
	void ASTStmtReader::VisitConditionalOperator(ConditionalOperator *E) {
	  VisitExpr(E);
	  Expr *Cond = Record.readSubExpr();
	  Expr *TrueArm = Record.readSubExpr();
	  Expr *FalseArm = Record.readSubExpr();
	  E->SubExprs[ConditionalOperator::COND] = Cond;
	  E->SubExprs[ConditionalOperator::LHS] = TrueArm;
	  E->SubExprs[ConditionalOperator::RHS] = FalseArm;
	  E->QuestionLoc = ReadSourceLocation();
	  E->ColonLoc = ReadSourceLocation();
	}

	// Reads a BinaryConditionalOperator: the opaque value first, then the
	// COMMON, COND, LHS and RHS sub-expressions, then the '?' and ':' locations.
	void
	ASTStmtReader::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) {
	  VisitExpr(E);
	  // cast<> (not cast_or_null<>): the opaque value must be present.
	  E->OpaqueValue = cast<OpaqueValueExpr>(Record.readSubExpr());

	  Expr *Common = Record.readSubExpr();
	  Expr *Cond = Record.readSubExpr();
	  Expr *TrueArm = Record.readSubExpr();
	  Expr *FalseArm = Record.readSubExpr();
	  E->SubExprs[BinaryConditionalOperator::COMMON] = Common;
	  E->SubExprs[BinaryConditionalOperator::COND] = Cond;
	  E->SubExprs[BinaryConditionalOperator::LHS] = TrueArm;
	  E->SubExprs[BinaryConditionalOperator::RHS] = FalseArm;

	  E->QuestionLoc = ReadSourceLocation();
	  E->ColonLoc = ReadSourceLocation();
	}

	// An ImplicitCastExpr reads nothing beyond what VisitCastExpr reads.
	void ASTStmtReader::VisitImplicitCastExpr(ImplicitCastExpr *E) {
	  return VisitCastExpr(E);
	}

	// Reads the data common to explicit casts: the cast data plus the type info
	// as written.
	void ASTStmtReader::VisitExplicitCastExpr(ExplicitCastExpr *E) {
	  VisitCastExpr(E);
	  TypeSourceInfo *Written = GetTypeSourceInfo();
	  E->setTypeInfoAsWritten(Written);
	}

	// Reads a CStyleCastExpr: explicit-cast data, then both paren locations.
	void ASTStmtReader::VisitCStyleCastExpr(CStyleCastExpr *E) {
	  VisitExplicitCastExpr(E);
	  SourceLocation Open = ReadSourceLocation();
	  SourceLocation Close = ReadSourceLocation();
	  E->setLParenLoc(Open);
	  E->setRParenLoc(Close);
	}

	// Reads a CompoundLiteralExpr: left-paren location, type source info,
	// initializer and the file-scope flag.
	void ASTStmtReader::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
	  VisitExpr(E);
	  SourceLocation LParen = ReadSourceLocation();
	  E->setLParenLoc(LParen);
	  TypeSourceInfo *TInfo = GetTypeSourceInfo();
	  E->setTypeSourceInfo(TInfo);
	  Expr *Init = Record.readSubExpr();
	  E->setInitializer(Init);
	  E->setFileScope(Record.readInt());
	}

	// Reads an ExtVectorElementExpr: base, accessor identifier, accessor location.
	void ASTStmtReader::VisitExtVectorElementExpr(ExtVectorElementExpr *E) {
	  VisitExpr(E);
	  Expr *Base = Record.readSubExpr();
	  E->setBase(Base);
	  IdentifierInfo *Accessor = Record.getIdentifierInfo();
	  E->setAccessor(Accessor);
	  E->setAccessorLoc(ReadSourceLocation());
	}

	// Reads an InitListExpr: optional syntactic form, brace locations, either an
	// array filler expression or a union field decl (selected by a flag), the
	// array-range-designator flag, and the initializers. When an array filler is
	// present, null initializer slots are replaced by the filler.
	void ASTStmtReader::VisitInitListExpr(InitListExpr *E) {
	  VisitExpr(E);
	  if (auto *Syntactic = cast_or_null<InitListExpr>(Record.readSubStmt()))
	    E->setSyntacticForm(Syntactic);
	  E->setLBraceLoc(ReadSourceLocation());
	  E->setRBraceLoc(ReadSourceLocation());

	  const bool HasArrayFiller = Record.readInt();
	  Expr *Filler = nullptr;
	  if (!HasArrayFiller) {
	    E->ArrayFillerOrUnionFieldInit = ReadDeclAs<FieldDecl>();
	  } else {
	    Filler = Record.readSubExpr();
	    E->ArrayFillerOrUnionFieldInit = Filler;
	  }

	  E->sawArrayRangeDesignator(Record.readInt());
	  const unsigned Count = Record.readInt();
	  E->reserveInits(Record.getContext(), Count);
	  for (unsigned Slot = 0; Slot != Count; ++Slot) {
	    Expr *Init = Record.readSubExpr();
	    // Only the array-filler form substitutes the filler for a null slot.
	    if (HasArrayFiller && !Init)
	      Init = Filler;
	    E->updateInit(Record.getContext(), Slot, Init);
	  }
	}

	// Reads a DesignatedInitExpr: the sub-expressions (count must match the
	// node), the '=' / ':' location, the GNU-syntax flag, and then designators
	// until the record is exhausted. Each designator starts with a tag that
	// selects its layout.
	void ASTStmtReader::VisitDesignatedInitExpr(DesignatedInitExpr *E) {
	typedef DesignatedInitExpr::Designator Designator;

	VisitExpr(E);
	unsigned NumSubExprs = Record.readInt();
	assert(NumSubExprs == E->getNumSubExprs() && "Wrong number of subexprs");
	for (unsigned I = 0; I != NumSubExprs; ++I)
	E->setSubExpr(I, Record.readSubExpr());
	E->setEqualOrColonLoc(ReadSourceLocation());
	E->setGNUSyntax(Record.readInt());

	// There is no stored designator count: everything left in the record is
	// designator data.
	SmallVector<Designator, 4> Designators;
	while (Record.getIdx() < Record.size()) {
	switch ((DesignatorTypes)Record.readInt()) {
	case DESIG_FIELD_DECL: {
	// Field designator with a resolved FieldDecl: built from the field's
	// identifier, then the decl itself is attached.
	FieldDecl *Field = ReadDeclAs<FieldDecl>();
	SourceLocation DotLoc = ReadSourceLocation();
	SourceLocation FieldLoc = ReadSourceLocation();
	Designators.push_back(Designator(Field->getIdentifier(), DotLoc,
	FieldLoc));
	Designators.back().setField(Field);
	break;
	}

	case DESIG_FIELD_NAME: {
	// Field designator known only by name.
	const IdentifierInfo *Name = Record.getIdentifierInfo();
	SourceLocation DotLoc = ReadSourceLocation();
	SourceLocation FieldLoc = ReadSourceLocation();
	Designators.push_back(Designator(Name, DotLoc, FieldLoc));
	break;
	}

	case DESIG_ARRAY: {
	// Array designator: an index value plus the two bracket locations.
	unsigned Index = Record.readInt();
	SourceLocation LBracketLoc = ReadSourceLocation();
	SourceLocation RBracketLoc = ReadSourceLocation();
	Designators.push_back(Designator(Index, LBracketLoc, RBracketLoc));
	break;
	}

	case DESIG_ARRAY_RANGE: {
	// Array-range designator: as above, with an additional ellipsis location.
	unsigned Index = Record.readInt();
	SourceLocation LBracketLoc = ReadSourceLocation();
	SourceLocation EllipsisLoc = ReadSourceLocation();
	SourceLocation RBracketLoc = ReadSourceLocation();
	Designators.push_back(Designator(Index, LBracketLoc, EllipsisLoc,
	RBracketLoc));
	break;
	}
	}
	}
	// Hand the collected designators to the node.
	E->setDesignators(Record.getContext(),
	Designators.data(), Designators.size());
	}

	// Reads a DesignatedInitUpdateExpr: the base expression, then the updater.
	void ASTStmtReader::VisitDesignatedInitUpdateExpr(DesignatedInitUpdateExpr *E) {
	  VisitExpr(E);
	  Expr *Base = Record.readSubExpr();
	  Expr *Updater = Record.readSubExpr();
	  E->setBase(Base);
	  E->setUpdater(Updater);
	}

	// A NoInitExpr reads nothing beyond the common expression fields.
	void ASTStmtReader::VisitNoInitExpr(NoInitExpr *E) {
	  return VisitExpr(E);
	}

	// Reads an ArrayInitLoopExpr: its two sub-expressions, in slot order.
	void ASTStmtReader::VisitArrayInitLoopExpr(ArrayInitLoopExpr *E) {
	  VisitExpr(E);
	  Expr *First = Record.readSubExpr();
	  Expr *Second = Record.readSubExpr();
	  E->SubExprs[0] = First;
	  E->SubExprs[1] = Second;
	}

	// An ArrayInitIndexExpr reads nothing beyond the common expression fields.
	void ASTStmtReader::VisitArrayInitIndexExpr(ArrayInitIndexExpr *E) {
	  return VisitExpr(E);
	}

	// An ImplicitValueInitExpr reads nothing beyond the common expression fields.
	void ASTStmtReader::VisitImplicitValueInitExpr(ImplicitValueInitExpr *E) {
	  return VisitExpr(E);
	}

	// Reads a VAArgExpr: operand, written type info, builtin and right-paren
	// locations, and the Microsoft-ABI flag.
	void ASTStmtReader::VisitVAArgExpr(VAArgExpr *E) {
	  VisitExpr(E);
	  Expr *Operand = Record.readSubExpr();
	  E->setSubExpr(Operand);
	  TypeSourceInfo *Written = GetTypeSourceInfo();
	  E->setWrittenTypeInfo(Written);
	  SourceLocation Builtin = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  E->setBuiltinLoc(Builtin);
	  E->setRParenLoc(RParen);
	  E->setIsMicrosoftABI(Record.readInt());
	}

	// Reads an AddrLabelExpr: the '&&' location, the label location, and the
	// referenced LabelDecl.
	void ASTStmtReader::VisitAddrLabelExpr(AddrLabelExpr *E) {
	  VisitExpr(E);
	  SourceLocation AmpAmp = ReadSourceLocation();
	  SourceLocation LabelLoc = ReadSourceLocation();
	  E->setAmpAmpLoc(AmpAmp);
	  E->setLabelLoc(LabelLoc);
	  LabelDecl *Label = ReadDeclAs<LabelDecl>();
	  E->setLabel(Label);
	}

	// Reads a StmtExpr: both paren locations, then the compound statement (which
	// may be null).
	void ASTStmtReader::VisitStmtExpr(StmtExpr *E) {
	  VisitExpr(E);
	  SourceLocation Open = ReadSourceLocation();
	  SourceLocation Close = ReadSourceLocation();
	  E->setLParenLoc(Open);
	  E->setRParenLoc(Close);
	  Stmt *Body = Record.readSubStmt();
	  E->setSubStmt(cast_or_null<CompoundStmt>(Body));
	}

	// Reads a ChooseExpr: condition, LHS, RHS, builtin and right-paren locations,
	// and the stored "condition is true" flag.
	void ASTStmtReader::VisitChooseExpr(ChooseExpr *E) {
	  VisitExpr(E);
	  Expr *Cond = Record.readSubExpr();
	  Expr *Left = Record.readSubExpr();
	  Expr *Right = Record.readSubExpr();
	  E->setCond(Cond);
	  E->setLHS(Left);
	  E->setRHS(Right);
	  SourceLocation Builtin = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  E->setBuiltinLoc(Builtin);
	  E->setRParenLoc(RParen);
	  E->setIsConditionTrue(Record.readInt());
	}

	// Reads a GNUNullExpr, whose only own payload is its token location.
	void ASTStmtReader::VisitGNUNullExpr(GNUNullExpr *E) {
	  VisitExpr(E);
	  SourceLocation TokLoc = ReadSourceLocation();
	  E->setTokenLocation(TokLoc);
	}

	// Reads a ShuffleVectorExpr: a count, that many sub-expressions (collected
	// and installed in one call), then the builtin and right-paren locations.
	void ASTStmtReader::VisitShuffleVectorExpr(ShuffleVectorExpr *E) {
	  VisitExpr(E);
	  SmallVector<Expr *, 16> Operands;
	  const unsigned Count = Record.readInt();
	  for (unsigned Read = 0; Read != Count; ++Read)
	    Operands.push_back(Record.readSubExpr());
	  E->setExprs(Record.getContext(), Operands);
	  E->setBuiltinLoc(ReadSourceLocation());
	  E->setRParenLoc(ReadSourceLocation());
	}

	// Reads a ConvertVectorExpr: builtin and right-paren locations, the type
	// source info, and the source expression.
	void ASTStmtReader::VisitConvertVectorExpr(ConvertVectorExpr *E) {
	  VisitExpr(E);
	  SourceLocation Builtin = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  E->BuiltinLoc = Builtin;
	  E->RParenLoc = RParen;
	  E->TInfo = GetTypeSourceInfo();
	  E->SrcExpr = Record.readSubExpr();
	}

	// Reads a BlockExpr, whose only own payload is the referenced BlockDecl.
	void ASTStmtReader::VisitBlockExpr(BlockExpr *E) {
	  VisitExpr(E);
	  BlockDecl *Block = ReadDeclAs<BlockDecl>();
	  E->setBlockDecl(Block);
	}

	// Reads a GenericSelectionExpr: the association count (which sizes the two
	// context-allocated arrays), the controlling expression, one type/expression
	// pair per association, the result index, and three locations.
	void ASTStmtReader::VisitGenericSelectionExpr(GenericSelectionExpr *E) {
	  VisitExpr(E);
	  E->NumAssocs = Record.readInt();
	  E->AssocTypes = new (Record.getContext()) TypeSourceInfo*[E->NumAssocs];
	  // Sub-expression storage: END_EXPR leading slots plus one per association.
	  E->SubExprs = new (Record.getContext())
	      Stmt*[GenericSelectionExpr::END_EXPR + E->NumAssocs];

	  E->SubExprs[GenericSelectionExpr::CONTROLLING] = Record.readSubExpr();
	  const unsigned NumAssocs = E->getNumAssocs();
	  unsigned Assoc = 0;
	  while (Assoc != NumAssocs) {
	    E->AssocTypes[Assoc] = GetTypeSourceInfo();
	    E->SubExprs[GenericSelectionExpr::END_EXPR + Assoc] = Record.readSubExpr();
	    ++Assoc;
	  }
	  E->ResultIndex = Record.readInt();

	  E->GenericLoc = ReadSourceLocation();
	  E->DefaultLoc = ReadSourceLocation();
	  E->RParenLoc = ReadSourceLocation();
	}

	// Reads a PseudoObjectExpr: the semantic-expression count (must be one less
	// than the node's sub-expression count), the result index, the syntactic
	// expression in slot 0, and the semantic expressions in slots 1..N.
	void ASTStmtReader::VisitPseudoObjectExpr(PseudoObjectExpr *E) {
	  VisitExpr(E);
	  const unsigned NumSemantic = Record.readInt();
	  assert(NumSemantic + 1 == E->PseudoObjectExprBits.NumSubExprs);
	  E->PseudoObjectExprBits.ResultIndex = Record.readInt();

	  // Slot 0 holds the syntactic form.
	  E->getSubExprsBuffer()[0] = Record.readSubExpr();

	  // Slots 1..NumSemantic hold the semantic forms.
	  for (unsigned Slot = 1; Slot != NumSemantic + 1; ++Slot)
	    E->getSubExprsBuffer()[Slot] = Record.readSubExpr();
	}

	// Reads an AtomicExpr: the operation (which determines the sub-expression
	// count), the sub-expressions, then the builtin and right-paren locations.
	void ASTStmtReader::VisitAtomicExpr(AtomicExpr *E) {
	  VisitExpr(E);
	  E->Op = AtomicExpr::AtomicOp(Record.readInt());
	  E->NumSubExprs = AtomicExpr::getNumSubExprs(E->Op);
	  unsigned Slot = 0;
	  while (Slot != E->NumSubExprs) {
	    E->SubExprs[Slot] = Record.readSubExpr();
	    ++Slot;
	  }
	  E->BuiltinLoc = ReadSourceLocation();
	  E->RParenLoc = ReadSourceLocation();
	}

	//===----------------------------------------------------------------------===//
	// Objective-C Expressions and Statements
	//===----------------------------------------------------------------------===//

	// Reads an ObjCStringLiteral: the wrapped StringLiteral (required), then the
	// '@' location.
	void ASTStmtReader::VisitObjCStringLiteral(ObjCStringLiteral *E) {
	  VisitExpr(E);
	  Stmt *Inner = Record.readSubStmt();
	  E->setString(cast<StringLiteral>(Inner));
	  E->setAtLoc(ReadSourceLocation());
	}

	// Reads an ObjCBoxedExpr: the boxed sub-statement, the boxing method, and the
	// source range.
	void ASTStmtReader::VisitObjCBoxedExpr(ObjCBoxedExpr *E) {
	  VisitExpr(E);
	  // The boxed operand may be any of several literal/expression kinds, so it
	  // is stored without a cast.
	  Stmt *Boxed = Record.readSubStmt();
	  E->SubExpr = Boxed;
	  ObjCMethodDecl *Method = ReadDeclAs<ObjCMethodDecl>();
	  E->BoxingMethod = Method;
	  E->Range = ReadSourceRange();
	}

	// Reads an ObjCArrayLiteral: element count (must match the node), the
	// elements, the array-with-objects method, and the source range.
	void ASTStmtReader::VisitObjCArrayLiteral(ObjCArrayLiteral *E) {
	  VisitExpr(E);
	  const unsigned Count = Record.readInt();
	  assert(Count == E->getNumElements() && "Wrong number of elements");
	  Expr **Slots = E->getElements();
	  unsigned Idx = 0;
	  while (Idx != Count) {
	    Slots[Idx] = Record.readSubExpr();
	    ++Idx;
	  }
	  E->ArrayWithObjectsMethod = ReadDeclAs<ObjCMethodDecl>();
	  E->Range = ReadSourceRange();
	}

	// Reads an ObjCDictionaryLiteral: element count and pack-expansion flag (both
	// must match the node), then per element a key/value pair and, when pack
	// expansions are present, the ellipsis location and expansion count. The
	// dictionary-with-objects method and the source range come last.
	void ASTStmtReader::VisitObjCDictionaryLiteral(ObjCDictionaryLiteral *E) {
	  VisitExpr(E);
	  const unsigned Count = Record.readInt();
	  assert(Count == E->getNumElements() && "Wrong number of elements");
	  const bool HasPacks = Record.readInt();
	  assert(HasPacks == E->HasPackExpansions &&"Pack expansion mismatch");

	  typedef ObjCDictionaryLiteral::KeyValuePair KeyValuePair;
	  typedef ObjCDictionaryLiteral::ExpansionData ExpansionData;
	  KeyValuePair *Pairs = E->getTrailingObjects<KeyValuePair>();
	  ExpansionData *Packs = E->getTrailingObjects<ExpansionData>();

	  unsigned Idx = 0;
	  while (Idx != Count) {
	    Pairs[Idx].Key = Record.readSubExpr();
	    Pairs[Idx].Value = Record.readSubExpr();
	    if (HasPacks) {
	      Packs[Idx].EllipsisLoc = ReadSourceLocation();
	      Packs[Idx].NumExpansionsPlusOne = Record.readInt();
	    }
	    ++Idx;
	  }
	  E->DictWithObjectsMethod = ReadDeclAs<ObjCMethodDecl>();
	  E->Range = ReadSourceRange();
	}

	// Reads an ObjCEncodeExpr: encoded type source info, '@' location and
	// right-paren location.
	void ASTStmtReader::VisitObjCEncodeExpr(ObjCEncodeExpr *E) {
	  VisitExpr(E);
	  TypeSourceInfo *Encoded = GetTypeSourceInfo();
	  E->setEncodedTypeSourceInfo(Encoded);
	  SourceLocation At = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  E->setAtLoc(At);
	  E->setRParenLoc(RParen);
	}

	// Reads an ObjCSelectorExpr: selector, '@' location and right-paren location.
	void ASTStmtReader::VisitObjCSelectorExpr(ObjCSelectorExpr *E) {
	  VisitExpr(E);
	  E->setSelector(Record.readSelector());
	  SourceLocation At = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  E->setAtLoc(At);
	  E->setRParenLoc(RParen);
	}

	// Reads an ObjCProtocolExpr: protocol decl, '@' location, protocol-name
	// location and right-paren location.
	void ASTStmtReader::VisitObjCProtocolExpr(ObjCProtocolExpr *E) {
	  VisitExpr(E);
	  ObjCProtocolDecl *Protocol = ReadDeclAs<ObjCProtocolDecl>();
	  E->setProtocol(Protocol);
	  SourceLocation At = ReadSourceLocation();
	  E->setAtLoc(At);
	  E->ProtoLoc = ReadSourceLocation();
	  E->setRParenLoc(ReadSourceLocation());
	}

	// Reads an ObjCIvarRefExpr: ivar decl, location, operator location, base
	// expression, and the arrow / free-ivar flags.
	void ASTStmtReader::VisitObjCIvarRefExpr(ObjCIvarRefExpr *E) {
	  VisitExpr(E);
	  ObjCIvarDecl *Ivar = ReadDeclAs<ObjCIvarDecl>();
	  E->setDecl(Ivar);
	  SourceLocation Loc = ReadSourceLocation();
	  SourceLocation OpLoc = ReadSourceLocation();
	  E->setLocation(Loc);
	  E->setOpLoc(OpLoc);
	  Expr *Base = Record.readSubExpr();
	  E->setBase(Base);
	  E->setIsArrow(Record.readInt());
	  E->setIsFreeIvar(Record.readInt());
	}

	// Reads an ObjCPropertyRefExpr: method-reference flags, an implicit/explicit
	// selector followed by either a getter/setter pair or a property decl, two
	// locations, and a receiver whose form is chosen by a tag (0 = base
	// expression, 1 = super receiver type, 2 = class receiver). Any other tag
	// value leaves the receiver untouched.
	void ASTStmtReader::VisitObjCPropertyRefExpr(ObjCPropertyRefExpr *E) {
	  VisitExpr(E);
	  const unsigned RefFlags = Record.readInt();
	  const bool IsImplicit = Record.readInt() != 0;
	  if (!IsImplicit) {
	    E->setExplicitProperty(ReadDeclAs<ObjCPropertyDecl>(), RefFlags);
	  } else {
	    ObjCMethodDecl *GetterDecl = ReadDeclAs<ObjCMethodDecl>();
	    ObjCMethodDecl *SetterDecl = ReadDeclAs<ObjCMethodDecl>();
	    E->setImplicitProperty(GetterDecl, SetterDecl, RefFlags);
	  }
	  E->setLocation(ReadSourceLocation());
	  E->setReceiverLocation(ReadSourceLocation());

	  const auto ReceiverTag = Record.readInt();
	  if (ReceiverTag == 0)
	    E->setBase(Record.readSubExpr());
	  else if (ReceiverTag == 1)
	    E->setSuperReceiver(Record.readType());
	  else if (ReceiverTag == 2)
	    E->setClassReceiver(ReadDeclAs<ObjCInterfaceDecl>());
	}

	// Reads an ObjCSubscriptRefExpr: right-bracket location, base and key
	// expressions, then the getter and setter method decls.
	void ASTStmtReader::VisitObjCSubscriptRefExpr(ObjCSubscriptRefExpr *E) {
	  VisitExpr(E);
	  SourceLocation RBracket = ReadSourceLocation();
	  E->setRBracket(RBracket);
	  Expr *Base = Record.readSubExpr();
	  Expr *Key = Record.readSubExpr();
	  E->setBaseExpr(Base);
	  E->setKeyExpr(Key);
	  E->GetAtIndexMethodDecl = ReadDeclAs<ObjCMethodDecl>();
	  E->SetAtIndexMethodDecl = ReadDeclAs<ObjCMethodDecl>();
	}

	// Reads an ObjCMessageExpr: the (verified) argument count, the number of
	// stored selector locations, a few flags, the receiver (whose layout depends
	// on its kind), either a method decl or a selector, the bracket locations,
	// the arguments, and finally the stored selector locations.
	void ASTStmtReader::VisitObjCMessageExpr(ObjCMessageExpr *E) {
	VisitExpr(E);
	// The argument count is only checked against the node, then skipped.
	assert(Record.peekInt() == E->getNumArgs());
	Record.skipInts(1);
	unsigned NumStoredSelLocs = Record.readInt();
	E->SelLocsKind = Record.readInt();
	E->setDelegateInitCall(Record.readInt());
	E->IsImplicit = Record.readInt();
	ObjCMessageExpr::ReceiverKind Kind
	= static_cast<ObjCMessageExpr::ReceiverKind>(Record.readInt());
	// The receiver kind selects what is read next.
	switch (Kind) {
	case ObjCMessageExpr::Instance:
	E->setInstanceReceiver(Record.readSubExpr());
	break;

	case ObjCMessageExpr::Class:
	E->setClassReceiver(GetTypeSourceInfo());
	break;

	case ObjCMessageExpr::SuperClass:
	case ObjCMessageExpr::SuperInstance: {
	// Both super forms share one layout: a type followed by the 'super'
	// location; the kind only decides the instance/class flag.
	QualType T = Record.readType();
	SourceLocation SuperLoc = ReadSourceLocation();
	E->setSuper(SuperLoc, T, Kind == ObjCMessageExpr::SuperInstance);
	break;
	}
	}

	// Installing the receiver must leave the node reporting the same kind.
	assert(Kind == E->getReceiverKind());

	// A non-zero flag means a method decl follows; otherwise a bare selector.
	if (Record.readInt())
	E->setMethodDecl(ReadDeclAs<ObjCMethodDecl>());
	else
	E->setSelector(Record.readSelector());

	E->LBracLoc = ReadSourceLocation();
	E->RBracLoc = ReadSourceLocation();

	for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
	E->setArg(I, Record.readSubExpr());

	// Selector locations are written into the node's own storage.
	SourceLocation *Locs = E->getStoredSelLocs();
	for (unsigned I = 0; I != NumStoredSelLocs; ++I)
	Locs[I] = ReadSourceLocation();
	}

	// Reads an ObjCForCollectionStmt: element, collection, body, then the 'for'
	// and right-paren locations.
	void ASTStmtReader::VisitObjCForCollectionStmt(ObjCForCollectionStmt *S) {
	  VisitStmt(S);
	  Stmt *Element = Record.readSubStmt();
	  S->setElement(Element);
	  Expr *Collection = Record.readSubExpr();
	  S->setCollection(Collection);
	  Stmt *Body = Record.readSubStmt();
	  S->setBody(Body);
	  SourceLocation ForLoc = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  S->setForLoc(ForLoc);
	  S->setRParenLoc(RParen);
	}

	// Reads an ObjCAtCatchStmt: catch body, catch parameter decl, then the
	// '@catch' and right-paren locations.
	void ASTStmtReader::VisitObjCAtCatchStmt(ObjCAtCatchStmt *S) {
	  VisitStmt(S);
	  Stmt *Body = Record.readSubStmt();
	  S->setCatchBody(Body);
	  VarDecl *Param = ReadDeclAs<VarDecl>();
	  S->setCatchParamDecl(Param);
	  SourceLocation AtCatch = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  S->setAtCatchLoc(AtCatch);
	  S->setRParenLoc(RParen);
	}

	// Reads an ObjCAtFinallyStmt: the finally body, then the '@finally' location.
	void ASTStmtReader::VisitObjCAtFinallyStmt(ObjCAtFinallyStmt *S) {
	  VisitStmt(S);
	  Stmt *Body = Record.readSubStmt();
	  S->setFinallyBody(Body);
	  S->setAtFinallyLoc(ReadSourceLocation());
	}

	// Reads an ObjCAutoreleasePoolStmt: the sub-statement, then the '@' location.
	void ASTStmtReader::VisitObjCAutoreleasePoolStmt(ObjCAutoreleasePoolStmt *S) {
	  VisitStmt(S);
	  Stmt *Body = Record.readSubStmt();
	  S->setSubStmt(Body);
	  S->setAtLoc(ReadSourceLocation());
	}

	// Reads an ObjCAtTryStmt: the (verified) catch count, the has-finally flag,
	// the try body, each catch statement, the finally statement when the flag is
	// set, and the '@try' location.
	void ASTStmtReader::VisitObjCAtTryStmt(ObjCAtTryStmt *S) {
	  VisitStmt(S);
	  // The catch count is only checked against the node, then skipped.
	  assert(Record.peekInt() == S->getNumCatchStmts());
	  Record.skipInts(1);
	  const bool FinallyPresent = Record.readInt();
	  S->setTryBody(Record.readSubStmt());

	  const unsigned NumCatches = S->getNumCatchStmts();
	  unsigned Idx = 0;
	  while (Idx != NumCatches) {
	    S->setCatchStmt(Idx, cast_or_null<ObjCAtCatchStmt>(Record.readSubStmt()));
	    ++Idx;
	  }

	  if (FinallyPresent)
	    S->setFinallyStmt(Record.readSubStmt());
	  S->setAtTryLoc(ReadSourceLocation());
	}

	// Reads an ObjCAtSynchronizedStmt: the synchronization expression, the body,
	// then the '@synchronized' location.
	void ASTStmtReader::VisitObjCAtSynchronizedStmt(ObjCAtSynchronizedStmt *S) {
	  VisitStmt(S);
	  Stmt *Lock = Record.readSubStmt();
	  Stmt *Body = Record.readSubStmt();
	  S->setSynchExpr(Lock);
	  S->setSynchBody(Body);
	  S->setAtSynchronizedLoc(ReadSourceLocation());
	}

	// Reads an ObjCAtThrowStmt: the thrown expression, then the throw location.
	void ASTStmtReader::VisitObjCAtThrowStmt(ObjCAtThrowStmt *S) {
	  VisitStmt(S);
	  Stmt *Thrown = Record.readSubStmt();
	  S->setThrowExpr(Thrown);
	  S->setThrowLoc(ReadSourceLocation());
	}

	// Reads an ObjCBoolLiteralExpr: the value, then its location.
	void ASTStmtReader::VisitObjCBoolLiteralExpr(ObjCBoolLiteralExpr *E) {
	  VisitExpr(E);
	  const auto Value = Record.readInt();
	  E->setValue(Value);
	  SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	}

	// Reads an ObjCAvailabilityCheckExpr: one source range, whose begin becomes
	// the '@' location and whose end becomes the right paren, followed by the
	// version tuple to check.
	void ASTStmtReader::VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *E) {
	  VisitExpr(E);
	  const SourceRange Whole = Record.readSourceRange();
	  E->AtLoc = Whole.getBegin();
	  E->RParen = Whole.getEnd();
	  E->VersionToCheck = Record.readVersionTuple();
	}

	//===----------------------------------------------------------------------===//
	// C++ Expressions and Statements
	//===----------------------------------------------------------------------===//

	// Reads a CXXCatchStmt: catch location, exception decl, handler block.
	void ASTStmtReader::VisitCXXCatchStmt(CXXCatchStmt *S) {
	  VisitStmt(S);
	  SourceLocation CatchLoc = ReadSourceLocation();
	  S->CatchLoc = CatchLoc;
	  VarDecl *Exception = ReadDeclAs<VarDecl>();
	  S->ExceptionDecl = Exception;
	  S->HandlerBlock = Record.readSubStmt();
	}

	// Reads a CXXTryStmt: the (verified) handler count, the 'try' location, the
	// try block in slot 0, and the handlers in slots 1..N.
	void ASTStmtReader::VisitCXXTryStmt(CXXTryStmt *S) {
	  VisitStmt(S);
	  // The handler count is only checked against the node, then skipped.
	  assert(Record.peekInt() == S->getNumHandlers() && "NumStmtFields is wrong ?");
	  Record.skipInts(1);
	  S->TryLoc = ReadSourceLocation();
	  S->getStmts()[0] = Record.readSubStmt();
	  const unsigned NumHandlers = S->getNumHandlers();
	  unsigned Handler = 0;
	  while (Handler != NumHandlers) {
	    S->getStmts()[Handler + 1] = Record.readSubStmt();
	    ++Handler;
	  }
	}

	// Reads a CXXForRangeStmt: four locations (for, co_await, colon, right
	// paren), then the range, begin and end statements, the condition and
	// increment expressions, the loop-variable statement, and the body.
	void ASTStmtReader::VisitCXXForRangeStmt(CXXForRangeStmt *S) {
	  VisitStmt(S);

	  SourceLocation For = ReadSourceLocation();
	  SourceLocation Coawait = ReadSourceLocation();
	  SourceLocation Colon = ReadSourceLocation();
	  SourceLocation RParen = ReadSourceLocation();
	  S->ForLoc = For;
	  S->CoawaitLoc = Coawait;
	  S->ColonLoc = Colon;
	  S->RParenLoc = RParen;

	  S->setRangeStmt(Record.readSubStmt());
	  S->setBeginStmt(Record.readSubStmt());
	  S->setEndStmt(Record.readSubStmt());

	  Expr *Cond = Record.readSubExpr();
	  Expr *Inc = Record.readSubExpr();
	  S->setCond(Cond);
	  S->setInc(Inc);

	  S->setLoopVarStmt(Record.readSubStmt());
	  S->setBody(Record.readSubStmt());
	}

	// Reads an MSDependentExistsStmt: keyword location, the if-exists flag, the
	// qualifier, the name info, and the sub-statement.
	void ASTStmtReader::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) {
	  VisitStmt(S);
	  SourceLocation Keyword = ReadSourceLocation();
	  S->KeywordLoc = Keyword;
	  S->IsIfExists = Record.readInt();
	  S->QualifierLoc = Record.readNestedNameSpecifierLoc();
	  ReadDeclarationNameInfo(S->NameInfo);
	  Stmt *Body = Record.readSubStmt();
	  S->SubStmt = Body;
	}

	// Reads a CXXOperatorCallExpr: the call data, then the overloaded operator
	// kind, the source range and the floating-point options.
	void ASTStmtReader::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) {
	  VisitCallExpr(E);
	  const auto Op = static_cast<OverloadedOperatorKind>(Record.readInt());
	  E->Operator = Op;
	  E->Range = Record.readSourceRange();
	  E->setFPFeatures(FPOptions(Record.readInt()));
	}

	// Reads a CXXConstructExpr: argument count (allocating argument storage in
	// the AST context when non-zero), the arguments, the constructor decl, the
	// location, five boolean properties, the construction kind, and the
	// paren/brace range.
	void ASTStmtReader::VisitCXXConstructExpr(CXXConstructExpr *E) {
	  VisitExpr(E);
	  E->NumArgs = Record.readInt();
	  // No storage is allocated for an argument-less construction.
	  if (E->NumArgs != 0)
	    E->Args = new (Record.getContext()) Stmt*[E->NumArgs];
	  const unsigned ArgCount = E->getNumArgs();
	  unsigned Arg = 0;
	  while (Arg != ArgCount) {
	    E->setArg(Arg, Record.readSubExpr());
	    ++Arg;
	  }

	  CXXConstructorDecl *Ctor = ReadDeclAs<CXXConstructorDecl>();
	  E->setConstructor(Ctor);
	  E->setLocation(ReadSourceLocation());

	  // Flags, in record order.
	  E->setElidable(Record.readInt());
	  E->setHadMultipleCandidates(Record.readInt());
	  E->setListInitialization(Record.readInt());
	  E->setStdInitListInitialization(Record.readInt());
	  E->setRequiresZeroInitialization(Record.readInt());

	  const auto Kind =
	      static_cast<CXXConstructExpr::ConstructionKind>(Record.readInt());
	  E->setConstructionKind(Kind);
	  E->ParenOrBraceRange = ReadSourceRange();
	}

	// Reads a CXXInheritedCtorInitExpr: constructor decl, location, and the two
	// virtual-base flags.
	void ASTStmtReader::VisitCXXInheritedCtorInitExpr(CXXInheritedCtorInitExpr *E) {
	  VisitExpr(E);
	  CXXConstructorDecl *Ctor = ReadDeclAs<CXXConstructorDecl>();
	  E->Constructor = Ctor;
	  SourceLocation Loc = ReadSourceLocation();
	  E->Loc = Loc;
	  E->ConstructsVirtualBase = Record.readInt();
	  E->InheritedFromVirtualBase = Record.readInt();
	}

	// Reads a CXXTemporaryObjectExpr: the construct-expression data plus the
	// type source info.
	void ASTStmtReader::VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *E) {
	  VisitCXXConstructExpr(E);
	  TypeSourceInfo *TInfo = GetTypeSourceInfo();
	  E->Type = TInfo;
	}

	// Reads a LambdaExpr: the (verified) capture count, introducer range, capture
	// default and its location, the explicit-params / explicit-result flags, the
	// closing brace location, and one initializer per capture.
	void ASTStmtReader::VisitLambdaExpr(LambdaExpr *E) {
	  VisitExpr(E);
	  unsigned CaptureCount = Record.readInt();
	  assert(CaptureCount == E->NumCaptures);
	  (void)CaptureCount; // Only used by the assertion above.
	  E->IntroducerRange = ReadSourceRange();
	  E->CaptureDefault = static_cast<LambdaCaptureDefault>(Record.readInt());
	  E->CaptureDefaultLoc = ReadSourceLocation();
	  E->ExplicitParams = Record.readInt();
	  E->ExplicitResultType = Record.readInt();
	  E->ClosingBrace = ReadSourceLocation();

	  // Capture initializers.
	  LambdaExpr::capture_init_iterator It = E->capture_init_begin();
	  LambdaExpr::capture_init_iterator End = E->capture_init_end();
	  while (It != End) {
	    *It = Record.readSubExpr();
	    ++It;
	  }
	}

	// Reads a CXXStdInitializerListExpr, whose only own payload is its
	// sub-expression.
	void
	ASTStmtReader::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E) {
	  VisitExpr(E);
	  Expr *Inner = Record.readSubExpr();
	  E->SubExpr = Inner;
	}

	// Reads the data common to the named casts: the explicit-cast data, a range
	// whose begin/end become the keyword and right-paren locations, and a second
	// range for the angle brackets.
	void ASTStmtReader::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) {
	  VisitExplicitCastExpr(E);
	  const SourceRange Outer = ReadSourceRange();
	  E->Loc = Outer.getBegin();
	  E->RParenLoc = Outer.getEnd();
	  const SourceRange Angles = ReadSourceRange();
	  E->AngleBrackets = Angles;
	}

	// static_cast reads nothing beyond the named-cast data.
	void ASTStmtReader::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) {
	  VisitCXXNamedCastExpr(E);
	}

	// dynamic_cast reads nothing beyond the named-cast data.
	void ASTStmtReader::VisitCXXDynamicCastExpr(CXXDynamicCastExpr *E) {
	  VisitCXXNamedCastExpr(E);
	}

	// reinterpret_cast reads nothing beyond the named-cast data.
	void ASTStmtReader::VisitCXXReinterpretCastExpr(CXXReinterpretCastExpr *E) {
	  VisitCXXNamedCastExpr(E);
	}

	// const_cast reads nothing beyond the named-cast data.
	void ASTStmtReader::VisitCXXConstCastExpr(CXXConstCastExpr *E) {
	  VisitCXXNamedCastExpr(E);
	}

	// Reads a CXXFunctionalCastExpr: explicit-cast data, then both paren
	// locations.
	void ASTStmtReader::VisitCXXFunctionalCastExpr(CXXFunctionalCastExpr *E) {
	  VisitExplicitCastExpr(E);
	  SourceLocation Open = ReadSourceLocation();
	  SourceLocation Close = ReadSourceLocation();
	  E->setLParenLoc(Open);
	  E->setRParenLoc(Close);
	}

	// Reads a UserDefinedLiteral: the call data plus the ud-suffix location.
	void ASTStmtReader::VisitUserDefinedLiteral(UserDefinedLiteral *E) {
	  VisitCallExpr(E);
	  SourceLocation Suffix = ReadSourceLocation();
	  E->UDSuffixLoc = Suffix;
	}

	// Reads a CXXBoolLiteralExpr: the value, then its location.
	void ASTStmtReader::VisitCXXBoolLiteralExpr(CXXBoolLiteralExpr *E) {
	  VisitExpr(E);
	  const auto Value = Record.readInt();
	  E->setValue(Value);
	  SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	}

	// Reads a CXXNullPtrLiteralExpr, whose only own payload is its location.
	void ASTStmtReader::VisitCXXNullPtrLiteralExpr(CXXNullPtrLiteralExpr *E) {
	  VisitExpr(E);
	  SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	}

	// Reads a CXXTypeidExpr: the source range, then the operand. Whether the
	// operand is a type or an expression is asked of the node itself
	// (E->isTypeOperand()), not read from the record here.
	void ASTStmtReader::VisitCXXTypeidExpr(CXXTypeidExpr *E) {
	  VisitExpr(E);
	  E->setSourceRange(ReadSourceRange());
	  if (!E->isTypeOperand()) {
	    // typeid(42+2)
	    E->setExprOperand(Record.readSubExpr());
	  } else {
	    // typeid(int)
	    E->setTypeOperandSourceInfo(GetTypeSourceInfo());
	  }
	}

	// Reads a CXXThisExpr: its location, then the implicit flag.
	void ASTStmtReader::VisitCXXThisExpr(CXXThisExpr *E) {
	  VisitExpr(E);
	  SourceLocation Loc = ReadSourceLocation();
	  E->setLocation(Loc);
	  E->setImplicit(Record.readInt());
	}

	// Reads a CXXThrowExpr: throw location, operand, and the
	// thrown-variable-in-scope flag.
	void ASTStmtReader::VisitCXXThrowExpr(CXXThrowExpr *E) {
	  VisitExpr(E);
	  SourceLocation Throw = ReadSourceLocation();
	  E->ThrowLoc = Throw;
	  Expr *Operand = Record.readSubExpr();
	  E->Op = Operand;
	  E->IsThrownVariableInScope = Record.readInt();
	}

	// Reads a CXXDefaultArgExpr: the parameter decl, then the location.
	void ASTStmtReader::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) {
	  VisitExpr(E);
	  ParmVarDecl *Param = ReadDeclAs<ParmVarDecl>();
	  E->Param = Param;
	  SourceLocation Loc = ReadSourceLocation();
	  E->Loc = Loc;
	}

	// Reads a CXXDefaultInitExpr: the field decl, then the location.
	void ASTStmtReader::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *E) {
	  VisitExpr(E);
	  FieldDecl *Field = ReadDeclAs<FieldDecl>();
	  E->Field = Field;
	  SourceLocation Loc = ReadSourceLocation();
	  E->Loc = Loc;
	}

	// Reads a CXXBindTemporaryExpr: the temporary, then the sub-expression.
	void ASTStmtReader::VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
	  VisitExpr(E);
	  E->setTemporary(Record.readCXXTemporary());
	  Expr *Inner = Record.readSubExpr();
	  E->setSubExpr(Inner);
	}

	// Reads a CXXScalarValueInitExpr: type source info, then the right-paren
	// location.
	void ASTStmtReader::VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *E) {
	  VisitExpr(E);
	  TypeSourceInfo *TInfo = GetTypeSourceInfo();
	  E->TypeInfo = TInfo;
	  SourceLocation RParen = ReadSourceLocation();
	  E->RParenLoc = RParen;
	}

	// Reads a CXXNewExpr: the scalar flags and counts, operator new/delete decls,
	// allocated type info and three source ranges. The argument array is then
	// allocated from the values just read, and every raw sub-expression slot is
	// filled in.
	void ASTStmtReader::VisitCXXNewExpr(CXXNewExpr *E) {
	  VisitExpr(E);
	  E->GlobalNew = Record.readInt();
	  const bool IsArrayNew = Record.readInt();
	  E->PassAlignment = Record.readInt();
	  E->UsualArrayDeleteWantsSize = Record.readInt();
	  const unsigned PlacementArgCount = Record.readInt();
	  E->StoredInitializationStyle = Record.readInt();

	  FunctionDecl *OpNew = ReadDeclAs<FunctionDecl>();
	  E->setOperatorNew(OpNew);
	  FunctionDecl *OpDelete = ReadDeclAs<FunctionDecl>();
	  E->setOperatorDelete(OpDelete);

	  E->AllocatedTypeInfo = GetTypeSourceInfo();
	  E->TypeIdParens = ReadSourceRange();
	  E->Range = ReadSourceRange();
	  E->DirectInitRange = ReadSourceRange();

	  // A non-zero stored initialization style means an initializer slot exists.
	  const bool HasInitializer = E->StoredInitializationStyle != 0;
	  E->AllocateArgsArray(Record.getContext(), IsArrayNew, PlacementArgCount,
	                       HasInitializer);

	  // Fill every sub-expression slot.
	  CXXNewExpr::raw_arg_iterator Slot = E->raw_arg_begin();
	  CXXNewExpr::raw_arg_iterator SlotEnd = E->raw_arg_end();
	  while (Slot != SlotEnd) {
	    *Slot = Record.readSubStmt();
	    ++Slot;
	  }
	}

	// Reads a CXXDeleteExpr: four flags, the operator delete decl, the argument
	// expression, and the location.
	void ASTStmtReader::VisitCXXDeleteExpr(CXXDeleteExpr *E) {
	  VisitExpr(E);

	  // Flags, in record order.
	  E->GlobalDelete = Record.readInt();
	  E->ArrayForm = Record.readInt();
	  E->ArrayFormAsWritten = Record.readInt();
	  E->UsualArrayDeleteWantsSize = Record.readInt();

	  FunctionDecl *OpDelete = ReadDeclAs<FunctionDecl>();
	  E->OperatorDelete = OpDelete;
	  Expr *Operand = Record.readSubExpr();
	  E->Argument = Operand;
	  E->Loc = ReadSourceLocation();
	}

	// Reads a CXXPseudoDestructorExpr: base, arrow flag, operator location,
	// qualifier, scope type, '::' and '~' locations, and the destroyed type. The
	// destroyed type is an identifier plus location when the identifier read is
	// non-null, and type source info otherwise.
	void ASTStmtReader::VisitCXXPseudoDestructorExpr(CXXPseudoDestructorExpr *E) {
	  VisitExpr(E);

	  E->Base = Record.readSubExpr();
	  E->IsArrow = Record.readInt();
	  E->OperatorLoc = ReadSourceLocation();
	  E->QualifierLoc = Record.readNestedNameSpecifierLoc();
	  E->ScopeType = GetTypeSourceInfo();
	  E->ColonColonLoc = ReadSourceLocation();
	  E->TildeLoc = ReadSourceLocation();

	  IdentifierInfo *DestroyedName = Record.getIdentifierInfo();
	  if (!DestroyedName) {
	    E->setDestroyedType(GetTypeSourceInfo());
	  } else {
	    SourceLocation NameLoc = ReadSourceLocation();
	    E->setDestroyedType(DestroyedName, NameLoc);
	  }
	}

	// Reads an ExprWithCleanups: the object count (must match the node), one
	// BlockDecl per cleanup object, the side-effects flag and the sub-expression.
	void ASTStmtReader::VisitExprWithCleanups(ExprWithCleanups *E) {
	  VisitExpr(E);

	  const unsigned Count = Record.readInt();
	  assert(Count == E->getNumObjects());
	  unsigned Idx = 0;
	  while (Idx != Count) {
	    E->getTrailingObjects<BlockDecl *>()[Idx] = ReadDeclAs<BlockDecl>();
	    ++Idx;
	  }

	  E->ExprWithCleanupsBits.CleanupsHaveSideEffects = Record.readInt();
	  E->SubExpr = Record.readSubExpr();
	}

	// Reads a CXXDependentScopeMemberExpr: an optional template keyword/argument
	// block (guarded by a flag and prefixed with its argument count), then the
	// base expression and type, the arrow flag, operator location, qualifier,
	// first-qualifier decl and the member name info.
	void
	ASTStmtReader::VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *E){
	  VisitExpr(E);

	  if (Record.readInt()) { // HasTemplateKWAndArgsInfo
	    // Read the count into a named local; the inline argument comment that
	    // used to label it had lost its '*' delimiters and no longer parsed.
	    const unsigned NumTemplateArgs = Record.readInt();
	    ReadTemplateKWAndArgsInfo(
	        *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(),
	        E->getTrailingObjects<TemplateArgumentLoc>(), NumTemplateArgs);
	  }

	  E->Base = Record.readSubExpr();
	  E->BaseType = Record.readType();
	  E->IsArrow = Record.readInt();
	  E->OperatorLoc = ReadSourceLocation();
	  E->QualifierLoc = Record.readNestedNameSpecifierLoc();
	  E->FirstQualifierFoundInScope = ReadDeclAs<NamedDecl>();
	  ReadDeclarationNameInfo(E->MemberNameInfo);
	}

	// Deserializes a DependentScopeDeclRefExpr: optional template
	// keyword/argument info (guarded by a leading integer flag, followed by the
	// argument count), then the qualifier and the declaration name info.
	void
	ASTStmtReader::VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) {
	  VisitExpr(E);

	  if (Record.readInt()) // HasTemplateKWAndArgsInfo
	    ReadTemplateKWAndArgsInfo(
	        *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(),
	        E->getTrailingObjects<TemplateArgumentLoc>(),
	        /*NumTemplateArgs=*/Record.readInt());

	  E->QualifierLoc = Record.readNestedNameSpecifierLoc();
	  ReadDeclarationNameInfo(E->NameInfo);
	}

	// Deserializes a CXXUnresolvedConstructExpr. The argument count at the front
	// of the record is only checked against the already-allocated node and then
	// skipped; the arguments, type info and paren locations follow.
	void
	ASTStmtReader::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) {
	  VisitExpr(E);
	  assert(Record.peekInt() == E->arg_size() &&
	         "Read wrong record during creation ?");
	  Record.skipInts(1);

	  const unsigned ArgCount = E->arg_size();
	  for (unsigned ArgIdx = 0; ArgIdx < ArgCount; ++ArgIdx) {
	    E->setArg(ArgIdx, Record.readSubExpr());
	  }

	  E->Type = GetTypeSourceInfo();
	  E->setLParenLoc(ReadSourceLocation());
	  E->setRParenLoc(ReadSourceLocation());
	}

	// Deserializes the OverloadExpr part shared by unresolved lookup/member
	// expressions: optional template keyword/argument info, the set of
	// candidate declarations with their access specifiers, the name info and
	// the qualifier.
	void ASTStmtReader::VisitOverloadExpr(OverloadExpr *E) {
	  VisitExpr(E);

	  if (Record.readInt()) // HasTemplateKWAndArgsInfo
	    ReadTemplateKWAndArgsInfo(*E->getTrailingASTTemplateKWAndArgsInfo(),
	                              E->getTrailingTemplateArgumentLoc(),
	                              /*NumTemplateArgs=*/Record.readInt());

	  // Collect (declaration, access) pairs, then hand them to the node at once.
	  unsigned NumDecls = Record.readInt();
	  UnresolvedSet<8> Decls;
	  for (unsigned i = 0; i != NumDecls; ++i) {
	    NamedDecl *D = ReadDeclAs<NamedDecl>();
	    AccessSpecifier AS = (AccessSpecifier)Record.readInt();
	    Decls.addDecl(D, AS);
	  }
	  E->initializeResults(Record.getContext(), Decls.begin(), Decls.end());

	  ReadDeclarationNameInfo(E->NameInfo);
	  E->QualifierLoc = Record.readNestedNameSpecifierLoc();
	}

	// Deserializes an UnresolvedMemberExpr: the shared OverloadExpr part first,
	// then the arrow flag, the unresolved-using flag, the base expression, the
	// base type and the operator location, in this order.
	void ASTStmtReader::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) {
	VisitOverloadExpr(E);
	E->IsArrow = Record.readInt();
	E->HasUnresolvedUsing = Record.readInt();
	E->Base = Record.readSubExpr();
	E->BaseType = Record.readType();
	E->OperatorLoc = ReadSourceLocation();
	}

	// Deserializes an UnresolvedLookupExpr: the shared OverloadExpr part first,
	// then the RequiresADL and Overloaded flags and the naming class.
	void ASTStmtReader::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) {
	VisitOverloadExpr(E);
	E->RequiresADL = Record.readInt();
	E->Overloaded = Record.readInt();
	E->NamingClass = ReadDeclAs<CXXRecordDecl>();
	}

	// Deserializes a TypeTraitExpr: argument count, trait kind and value, the
	// source range (begin is the trait location, end is the right paren), and
	// then one TypeSourceInfo pointer per argument into the trailing storage.
	void ASTStmtReader::VisitTypeTraitExpr(TypeTraitExpr *E) {
	  VisitExpr(E);
	  E->TypeTraitExprBits.NumArgs = Record.readInt();
	  E->TypeTraitExprBits.Kind = Record.readInt();
	  E->TypeTraitExprBits.Value = Record.readInt();
	  SourceRange Range = ReadSourceRange();
	  E->Loc = Range.getBegin();
	  E->RParenLoc = Range.getEnd();

	  // The trailing objects are TypeSourceInfo pointers, so the array has
	  // pointer-to-pointer type and each slot receives one pointer.
	  TypeSourceInfo **Args = E->getTrailingObjects<TypeSourceInfo *>();
	  for (unsigned I = 0, N = E->getNumArgs(); I != N; ++I)
	    Args[I] = GetTypeSourceInfo();
	}

	// Deserializes an ArrayTypeTraitExpr: trait kind, value, source range
	// (begin is the trait location, end is the right paren), queried type and
	// dimension expression.
	void ASTStmtReader::VisitArrayTypeTraitExpr(ArrayTypeTraitExpr *E) {
	  VisitExpr(E);
	  E->ATT = static_cast<ArrayTypeTrait>(Record.readInt());
	  E->Value = static_cast<unsigned int>(Record.readInt());

	  const SourceRange Extent = ReadSourceRange();
	  E->Loc = Extent.getBegin();
	  E->RParen = Extent.getEnd();

	  E->QueriedType = GetTypeSourceInfo();
	  E->Dimension = Record.readSubExpr();
	}

	// Deserializes an ExpressionTraitExpr: trait kind, boolean value, source
	// range and queried expression. The range is read before the queried
	// expression and is only split into Loc/RParen afterwards.
	void ASTStmtReader::VisitExpressionTraitExpr(ExpressionTraitExpr *E) {
	  VisitExpr(E);
	  E->ET = static_cast<ExpressionTrait>(Record.readInt());
	  E->Value = static_cast<bool>(Record.readInt());

	  const SourceRange Extent = ReadSourceRange();
	  E->QueriedExpression = Record.readSubExpr();

	  E->Loc = Extent.getBegin();
	  E->RParen = Extent.getEnd();
	}

	// Deserializes a CXXNoexceptExpr: the boolean result, the source range and
	// the operand expression.
	void ASTStmtReader::VisitCXXNoexceptExpr(CXXNoexceptExpr *E) {
	  VisitExpr(E);

	  E->Value = static_cast<bool>(Record.readInt());
	  E->Range = ReadSourceRange();
	  E->Operand = Record.readSubExpr();
	}

	// Deserializes a PackExpansionExpr: ellipsis location, the stored
	// NumExpansions integer and the pattern sub-expression, in this order.
	void ASTStmtReader::VisitPackExpansionExpr(PackExpansionExpr *E) {
	VisitExpr(E);
	E->EllipsisLoc = ReadSourceLocation();
	E->NumExpansions = Record.readInt();
	E->Pattern = Record.readSubExpr();
	}

	// Deserializes a SizeOfPackExpr: the partial-argument count, three source
	// locations and the pack declaration. A partially substituted node then has
	// its trailing TemplateArguments constructed in place; otherwise, if the
	// node is not value-dependent, its length is read.
	void ASTStmtReader::VisitSizeOfPackExpr(SizeOfPackExpr *E) {
	  VisitExpr(E);
	  unsigned NumPartialArgs = Record.readInt();
	  E->OperatorLoc = ReadSourceLocation();
	  E->PackLoc = ReadSourceLocation();
	  E->RParenLoc = ReadSourceLocation();
	  E->Pack = Record.readDeclAs<NamedDecl>();

	  if (E->isPartiallySubstituted()) {
	    assert(E->Length == NumPartialArgs);
	    // Placement-new each argument into the node's trailing storage.
	    auto *Slot = E->getTrailingObjects<TemplateArgument>();
	    auto *const SlotEnd = Slot + NumPartialArgs;
	    while (Slot != SlotEnd) {
	      new (Slot) TemplateArgument(Record.readTemplateArgument());
	      ++Slot;
	    }
	    return;
	  }

	  if (!E->isValueDependent())
	    E->Length = Record.readInt();
	}

	// Deserializes a SubstNonTypeTemplateParmExpr: the substituted parameter
	// declaration, the name location and the replacement expression.
	void ASTStmtReader::VisitSubstNonTypeTemplateParmExpr(
	SubstNonTypeTemplateParmExpr *E) {
	VisitExpr(E);
	E->Param = ReadDeclAs<NonTypeTemplateParmDecl>();
	E->NameLoc = ReadSourceLocation();
	E->Replacement = Record.readSubExpr();
	}

	// Deserializes a SubstNonTypeTemplateParmPackExpr: the parameter
	// declaration followed by a template argument. Only when that argument is
	// a pack are the argument range and the name location filled in.
	// NOTE(review): when the argument is not a pack, the name location is not
	// read at all; confirm that leaving the record unread here is intended.
	void ASTStmtReader::VisitSubstNonTypeTemplateParmPackExpr(
	    SubstNonTypeTemplateParmPackExpr *E) {
	  VisitExpr(E);
	  E->Param = ReadDeclAs<NonTypeTemplateParmDecl>();

	  TemplateArgument PackArg = Record.readTemplateArgument();
	  if (PackArg.getKind() == TemplateArgument::Pack) {
	    E->Arguments = PackArg.pack_begin();
	    E->NumArguments = PackArg.pack_size();
	    E->NameLoc = ReadSourceLocation();
	  }
	}

	// Deserializes a FunctionParmPackExpr: the parameter count, the pack
	// declaration, the name location and then one ParmVarDecl pointer per
	// expanded parameter into the node's trailing storage.
	void ASTStmtReader::VisitFunctionParmPackExpr(FunctionParmPackExpr *E) {
	  VisitExpr(E);
	  E->NumParameters = Record.readInt();
	  E->ParamPack = ReadDeclAs<ParmVarDecl>();
	  E->NameLoc = ReadSourceLocation();

	  // The trailing objects are ParmVarDecl pointers, so the array has
	  // pointer-to-pointer type and each slot receives one pointer.
	  ParmVarDecl **Parms = E->getTrailingObjects<ParmVarDecl *>();
	  for (unsigned i = 0, n = E->NumParameters; i != n; ++i)
	    Parms[i] = ReadDeclAs<ParmVarDecl>();
	}

	// Deserializes a MaterializeTemporaryExpr: the state sub-expression, then
	// the extending declaration and its mangling number, which are installed
	// together through setExtendingDecl.
	void ASTStmtReader::VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) {
	  VisitExpr(E);
	  E->State = Record.readSubExpr();

	  // Read both values into locals first so their order in the record is
	  // explicit, independent of argument evaluation order.
	  auto *ExtendingDecl = ReadDeclAs<ValueDecl>();
	  const unsigned Mangling = Record.readInt();
	  E->setExtendingDecl(ExtendingDecl, Mangling);
	}

	// Deserializes a CXXFoldExpr: the left paren, ellipsis and right paren
	// locations, the two operand sub-expressions and the operator kind.
	void ASTStmtReader::VisitCXXFoldExpr(CXXFoldExpr *E) {
	  VisitExpr(E);

	  E->LParenLoc = ReadSourceLocation();
	  E->EllipsisLoc = ReadSourceLocation();
	  E->RParenLoc = ReadSourceLocation();

	  E->SubExprs[0] = Record.readSubExpr();
	  E->SubExprs[1] = Record.readSubExpr();

	  E->Opcode = static_cast<BinaryOperatorKind>(Record.readInt());
	}

	// Deserializes an OpaqueValueExpr: the source expression followed by the
	// location.
	void ASTStmtReader::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
	VisitExpr(E);
	E->SourceExpr = Record.readSubExpr();
	E->Loc = ReadSourceLocation();
	}

	// TypoExpr nodes are not readable; reaching this visitor is treated as an
	// unreachable code path.
	void ASTStmtReader::VisitTypoExpr(TypoExpr *E) {
	llvm_unreachable("Cannot read TypoExpr nodes");
	}

	//===----------------------------------------------------------------------===//
	// Microsoft Expressions and Statements
	//===----------------------------------------------------------------------===//
	// Deserializes an MSPropertyRefExpr: arrow flag (any non-zero integer means
	// true), base expression, qualifier, member location and the property
	// declaration, in this order.
	void ASTStmtReader::VisitMSPropertyRefExpr(MSPropertyRefExpr *E) {
	VisitExpr(E);
	E->IsArrow = (Record.readInt() != 0);
	E->BaseExpr = Record.readSubExpr();
	E->QualifierLoc = Record.readNestedNameSpecifierLoc();
	E->MemberLoc = ReadSourceLocation();
	E->TheDecl = ReadDeclAs<MSPropertyDecl>();
	}

	// Deserializes an MSPropertySubscriptExpr: base expression, index
	// expression and the right-bracket location.
	void ASTStmtReader::VisitMSPropertySubscriptExpr(MSPropertySubscriptExpr *E) {
	VisitExpr(E);
	E->setBase(Record.readSubExpr());
	E->setIdx(Record.readSubExpr());
	E->setRBracketLoc(ReadSourceLocation());
	}

	// Deserializes a CXXUuidofExpr: the source range, the UUID string (copied
	// into storage obtained from the record's context) and then either a type
	// operand or an expression operand, depending on what the node holds.
	void ASTStmtReader::VisitCXXUuidofExpr(CXXUuidofExpr *E) {
	  VisitExpr(E);
	  E->setSourceRange(ReadSourceRange());

	  std::string Uuid = ReadString();
	  E->setUuidStr(StringRef(Uuid).copy(Record.getContext()));

	  if (!E->isTypeOperand()) {
	    // __uuidof(expr)
	    E->setExprOperand(Record.readSubExpr());
	    return;
	  }

	  // __uuidof(ComType)
	  E->setTypeOperandSourceInfo(GetTypeSourceInfo());
	}

	// Deserializes an SEHLeaveStmt: only the leave location is read.
	void ASTStmtReader::VisitSEHLeaveStmt(SEHLeaveStmt *S) {
	VisitStmt(S);
	S->setLeaveLoc(ReadSourceLocation());
	}

	// Deserializes an SEHExceptStmt: the location, then the filter expression
	// and the block, stored in the FILTER_EXPR and BLOCK child slots.
	void ASTStmtReader::VisitSEHExceptStmt(SEHExceptStmt *S) {
	VisitStmt(S);
	S->Loc = ReadSourceLocation();
	S->Children[SEHExceptStmt::FILTER_EXPR] = Record.readSubStmt();
	S->Children[SEHExceptStmt::BLOCK] = Record.readSubStmt();
	}

	// Deserializes an SEHFinallyStmt: the location followed by the block.
	void ASTStmtReader::VisitSEHFinallyStmt(SEHFinallyStmt *S) {
	VisitStmt(S);
	S->Loc = ReadSourceLocation();
	S->Block = Record.readSubStmt();
	}

	// Deserializes an SEHTryStmt: the IsCXXTry flag, the try location, then the
	// try block and the handler, stored in the TRY and HANDLER child slots.
	void ASTStmtReader::VisitSEHTryStmt(SEHTryStmt *S) {
	VisitStmt(S);
	S->IsCXXTry = Record.readInt();
	S->TryLoc = ReadSourceLocation();
	S->Children[SEHTryStmt::TRY] = Record.readSubStmt();
	S->Children[SEHTryStmt::HANDLER] = Record.readSubStmt();
	}

	//===----------------------------------------------------------------------===//
	// CUDA Expressions and Statements
	//===----------------------------------------------------------------------===//

	// Deserializes a CUDAKernelCallExpr: the CallExpr part first, then the
	// configuration sub-expression, which is cast<> to CallExpr.
	void ASTStmtReader::VisitCUDAKernelCallExpr(CUDAKernelCallExpr *E) {
	VisitCallExpr(E);
	E->setConfig(cast<CallExpr>(Record.readSubExpr()));
	}

	//===----------------------------------------------------------------------===//
	// OpenCL Expressions and Statements.
	//===----------------------------------------------------------------------===//
	// Deserializes an AsTypeExpr: the builtin location, the right-paren
	// location and the source expression.
	void ASTStmtReader::VisitAsTypeExpr(AsTypeExpr *E) {
	VisitExpr(E);
	E->BuiltinLoc = ReadSourceLocation();
	E->RParenLoc = ReadSourceLocation();
	E->SrcExpr = Record.readSubExpr();
	}

	//===----------------------------------------------------------------------===//
	// OpenMP Clauses.
	//===----------------------------------------------------------------------===//

	namespace clang {
	// Visitor that populates OpenMP clause objects using the record owned by an
	// ASTStmtReader. It keeps a pointer to that reader and a reference to the
	// ASTContext obtained from the record.
	class OMPClauseReader : public OMPClauseVisitor<OMPClauseReader> {
	ASTStmtReader *Reader;
	ASTContext &Context;
	public:
	OMPClauseReader(ASTStmtReader *R, ASTRecordReader &Record)
	: Reader(R), Context(Record.getContext()) {}
	// Declares one Visit<Class>(Class *) method for every OPENMP_CLAUSE entry
	// in OpenMPKinds.def.
	#define OPENMP_CLAUSE(Name, Class) void Visit##Class(Class *C);
	#include "clang/Basic/OpenMPKinds.def"
	// Creates a clause of the kind stored in the record, populates it and
	// returns it.
	OMPClause *readClause();
	// Helpers for the pre-init / post-update parts that several clause
	// visitors read first.
	void VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C);
	void VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C);
	};
	}

	// Reads one OpenMP clause from the reader's record.
	//
	// The first integer selects the clause kind. Clauses without a variable
	// list are allocated directly in the ASTContext; list-carrying clauses read
	// their element count(s) first and are created through CreateEmpty so that
	// the right amount of storage exists. The matching Visit method then fills
	// in the clause, and finally the start and end locations are read.
	//
	// Returns the populated clause. An unrecognized kind leaves the pointer
	// null and trips the assertion, instead of dispatching on an uninitialized
	// pointer.
	OMPClause *OMPClauseReader::readClause() {
	  OMPClause *C = nullptr;
	  switch (Reader->Record.readInt()) {
	  case OMPC_if:
	    C = new (Context) OMPIfClause();
	    break;
	  case OMPC_final:
	    C = new (Context) OMPFinalClause();
	    break;
	  case OMPC_num_threads:
	    C = new (Context) OMPNumThreadsClause();
	    break;
	  case OMPC_safelen:
	    C = new (Context) OMPSafelenClause();
	    break;
	  case OMPC_simdlen:
	    C = new (Context) OMPSimdlenClause();
	    break;
	  case OMPC_collapse:
	    C = new (Context) OMPCollapseClause();
	    break;
	  case OMPC_default:
	    C = new (Context) OMPDefaultClause();
	    break;
	  case OMPC_proc_bind:
	    C = new (Context) OMPProcBindClause();
	    break;
	  case OMPC_schedule:
	    C = new (Context) OMPScheduleClause();
	    break;
	  case OMPC_ordered:
	    C = new (Context) OMPOrderedClause();
	    break;
	  case OMPC_nowait:
	    C = new (Context) OMPNowaitClause();
	    break;
	  case OMPC_untied:
	    C = new (Context) OMPUntiedClause();
	    break;
	  case OMPC_mergeable:
	    C = new (Context) OMPMergeableClause();
	    break;
	  case OMPC_read:
	    C = new (Context) OMPReadClause();
	    break;
	  case OMPC_write:
	    C = new (Context) OMPWriteClause();
	    break;
	  case OMPC_update:
	    C = new (Context) OMPUpdateClause();
	    break;
	  case OMPC_capture:
	    C = new (Context) OMPCaptureClause();
	    break;
	  case OMPC_seq_cst:
	    C = new (Context) OMPSeqCstClause();
	    break;
	  case OMPC_threads:
	    C = new (Context) OMPThreadsClause();
	    break;
	  case OMPC_simd:
	    C = new (Context) OMPSIMDClause();
	    break;
	  case OMPC_nogroup:
	    C = new (Context) OMPNogroupClause();
	    break;
	  // Variable-list clauses: the next integer is the number of variables.
	  case OMPC_private:
	    C = OMPPrivateClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_firstprivate:
	    C = OMPFirstprivateClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_lastprivate:
	    C = OMPLastprivateClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_shared:
	    C = OMPSharedClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_reduction:
	    C = OMPReductionClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_task_reduction:
	    C = OMPTaskReductionClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_linear:
	    C = OMPLinearClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_aligned:
	    C = OMPAlignedClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_copyin:
	    C = OMPCopyinClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_copyprivate:
	    C = OMPCopyprivateClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_flush:
	    C = OMPFlushClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_depend:
	    C = OMPDependClause::CreateEmpty(Context, Reader->Record.readInt());
	    break;
	  case OMPC_device:
	    C = new (Context) OMPDeviceClause();
	    break;
	  // Mappable-expression clauses: four counts are read into locals first so
	  // that their order in the record does not depend on argument evaluation
	  // order.
	  case OMPC_map: {
	    unsigned NumVars = Reader->Record.readInt();
	    unsigned NumDeclarations = Reader->Record.readInt();
	    unsigned NumLists = Reader->Record.readInt();
	    unsigned NumComponents = Reader->Record.readInt();
	    C = OMPMapClause::CreateEmpty(Context, NumVars, NumDeclarations, NumLists,
	                                  NumComponents);
	    break;
	  }
	  case OMPC_num_teams:
	    C = new (Context) OMPNumTeamsClause();
	    break;
	  case OMPC_thread_limit:
	    C = new (Context) OMPThreadLimitClause();
	    break;
	  case OMPC_priority:
	    C = new (Context) OMPPriorityClause();
	    break;
	  case OMPC_grainsize:
	    C = new (Context) OMPGrainsizeClause();
	    break;
	  case OMPC_num_tasks:
	    C = new (Context) OMPNumTasksClause();
	    break;
	  case OMPC_hint:
	    C = new (Context) OMPHintClause();
	    break;
	  case OMPC_dist_schedule:
	    C = new (Context) OMPDistScheduleClause();
	    break;
	  case OMPC_defaultmap:
	    C = new (Context) OMPDefaultmapClause();
	    break;
	  case OMPC_to: {
	    unsigned NumVars = Reader->Record.readInt();
	    unsigned NumDeclarations = Reader->Record.readInt();
	    unsigned NumLists = Reader->Record.readInt();
	    unsigned NumComponents = Reader->Record.readInt();
	    C = OMPToClause::CreateEmpty(Context, NumVars, NumDeclarations, NumLists,
	                                 NumComponents);
	    break;
	  }
	  case OMPC_from: {
	    unsigned NumVars = Reader->Record.readInt();
	    unsigned NumDeclarations = Reader->Record.readInt();
	    unsigned NumLists = Reader->Record.readInt();
	    unsigned NumComponents = Reader->Record.readInt();
	    C = OMPFromClause::CreateEmpty(Context, NumVars, NumDeclarations, NumLists,
	                                   NumComponents);
	    break;
	  }
	  case OMPC_use_device_ptr: {
	    unsigned NumVars = Reader->Record.readInt();
	    unsigned NumDeclarations = Reader->Record.readInt();
	    unsigned NumLists = Reader->Record.readInt();
	    unsigned NumComponents = Reader->Record.readInt();
	    C = OMPUseDevicePtrClause::CreateEmpty(Context, NumVars, NumDeclarations,
	                                           NumLists, NumComponents);
	    break;
	  }
	  case OMPC_is_device_ptr: {
	    unsigned NumVars = Reader->Record.readInt();
	    unsigned NumDeclarations = Reader->Record.readInt();
	    unsigned NumLists = Reader->Record.readInt();
	    unsigned NumComponents = Reader->Record.readInt();
	    C = OMPIsDevicePtrClause::CreateEmpty(Context, NumVars, NumDeclarations,
	                                          NumLists, NumComponents);
	    break;
	  }
	  }
	  // No case matched: the record holds a clause kind this reader cannot build.
	  assert(C && "Unknown OpenMP clause kind in record");

	  Visit(C);
	  C->setLocStart(Reader->ReadSourceLocation());
	  C->setLocEnd(Reader->ReadSourceLocation());

	  return C;
	}

	// Reads the pre-init part of a clause: a sub-statement and an integer that
	// is converted to OpenMPDirectiveKind, both passed to setPreInitStmt.
	// NOTE(review): both arguments are read calls on Reader->Record and C++
	// leaves the evaluation order of function arguments unspecified; confirm
	// that the two reads are independent of each other.
	void OMPClauseReader::VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C) {
	C->setPreInitStmt(Reader->Record.readSubStmt(),
	static_cast<OpenMPDirectiveKind>(Reader->Record.readInt()));
	}

	// Reads the post-update part of a clause: the pre-init part first, then the
	// post-update expression.
	void OMPClauseReader::VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C) {
	VisitOMPClauseWithPreInit(C);
	C->setPostUpdateExpr(Reader->Record.readSubExpr());
	}

	// Reads an OMPIfClause: pre-init part, name modifier (as
	// OpenMPDirectiveKind), name-modifier location, colon location, condition
	// expression and left-paren location, in this order.
	void OMPClauseReader::VisitOMPIfClause(OMPIfClause *C) {
	VisitOMPClauseWithPreInit(C);
	C->setNameModifier(static_cast<OpenMPDirectiveKind>(Reader->Record.readInt()));
	C->setNameModifierLoc(Reader->ReadSourceLocation());
	C->setColonLoc(Reader->ReadSourceLocation());
	C->setCondition(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPFinalClause: the condition expression, then the left-paren
	// location.
	void OMPClauseReader::VisitOMPFinalClause(OMPFinalClause *C) {
	C->setCondition(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPNumThreadsClause: pre-init part, the num-threads expression
	// and the left-paren location.
	void OMPClauseReader::VisitOMPNumThreadsClause(OMPNumThreadsClause *C) {
	VisitOMPClauseWithPreInit(C);
	C->setNumThreads(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPSafelenClause: the safelen expression, then the left-paren
	// location.
	void OMPClauseReader::VisitOMPSafelenClause(OMPSafelenClause *C) {
	C->setSafelen(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPSimdlenClause: the simdlen expression, then the left-paren
	// location.
	void OMPClauseReader::VisitOMPSimdlenClause(OMPSimdlenClause *C) {
	C->setSimdlen(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPCollapseClause: the loop-count expression, then the
	// left-paren location.
	void OMPClauseReader::VisitOMPCollapseClause(OMPCollapseClause *C) {
	C->setNumForLoops(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPDefaultClause: the default kind (an integer converted to
	// OpenMPDefaultClauseKind), the left-paren location and the kind keyword
	// location.
	void OMPClauseReader::VisitOMPDefaultClause(OMPDefaultClause *C) {
	C->setDefaultKind(
	static_cast<OpenMPDefaultClauseKind>(Reader->Record.readInt()));
	C->setLParenLoc(Reader->ReadSourceLocation());
	C->setDefaultKindKwLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPProcBindClause: the proc-bind kind (an integer converted to
	// OpenMPProcBindClauseKind), the left-paren location and the kind keyword
	// location.
	void OMPClauseReader::VisitOMPProcBindClause(OMPProcBindClause *C) {
	C->setProcBindKind(
	static_cast<OpenMPProcBindClauseKind>(Reader->Record.readInt()));
	C->setLParenLoc(Reader->ReadSourceLocation());
	C->setProcBindKindKwLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPScheduleClause: pre-init part, schedule kind, first and
	// second schedule modifiers, chunk-size expression, then five source
	// locations (left paren, first modifier, second modifier, schedule kind,
	// comma), in this order.
	void OMPClauseReader::VisitOMPScheduleClause(OMPScheduleClause *C) {
	VisitOMPClauseWithPreInit(C);
	C->setScheduleKind(
	static_cast<OpenMPScheduleClauseKind>(Reader->Record.readInt()));
	C->setFirstScheduleModifier(
	static_cast<OpenMPScheduleClauseModifier>(Reader->Record.readInt()));
	C->setSecondScheduleModifier(
	static_cast<OpenMPScheduleClauseModifier>(Reader->Record.readInt()));
	C->setChunkSize(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	C->setFirstScheduleModifierLoc(Reader->ReadSourceLocation());
	C->setSecondScheduleModifierLoc(Reader->ReadSourceLocation());
	C->setScheduleKindLoc(Reader->ReadSourceLocation());
	C->setCommaLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPOrderedClause: the loop-count expression, then the
	// left-paren location.
	void OMPClauseReader::VisitOMPOrderedClause(OMPOrderedClause *C) {
	C->setNumForLoops(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Nothing clause-specific is read for OMPNowaitClause; intentionally empty.
	void OMPClauseReader::VisitOMPNowaitClause(OMPNowaitClause *) {}

	// Nothing clause-specific is read for OMPUntiedClause; intentionally empty.
	void OMPClauseReader::VisitOMPUntiedClause(OMPUntiedClause *) {}

	// Nothing clause-specific is read for OMPMergeableClause; intentionally
	// empty.
	void OMPClauseReader::VisitOMPMergeableClause(OMPMergeableClause *) {}

	// Nothing clause-specific is read for OMPReadClause; intentionally empty.
	void OMPClauseReader::VisitOMPReadClause(OMPReadClause *) {}

	// Nothing clause-specific is read for OMPWriteClause; intentionally empty.
	void OMPClauseReader::VisitOMPWriteClause(OMPWriteClause *) {}

	// Nothing clause-specific is read for OMPUpdateClause; intentionally empty.
	void OMPClauseReader::VisitOMPUpdateClause(OMPUpdateClause *) {}

	// Nothing clause-specific is read for OMPCaptureClause; intentionally empty.
	void OMPClauseReader::VisitOMPCaptureClause(OMPCaptureClause *) {}

	// Nothing clause-specific is read for OMPSeqCstClause; intentionally empty.
	void OMPClauseReader::VisitOMPSeqCstClause(OMPSeqCstClause *) {}

	// Nothing clause-specific is read for OMPThreadsClause; intentionally empty.
	void OMPClauseReader::VisitOMPThreadsClause(OMPThreadsClause *) {}

	// Nothing clause-specific is read for OMPSIMDClause; intentionally empty.
	void OMPClauseReader::VisitOMPSIMDClause(OMPSIMDClause *) {}

	// Nothing clause-specific is read for OMPNogroupClause; intentionally empty.
	void OMPClauseReader::VisitOMPNogroupClause(OMPNogroupClause *) {}

	// Reads an OMPPrivateClause: the left-paren location, then two expression
	// lists of varlist_size() entries each (variable references, private
	// copies).
	void OMPClauseReader::VisitOMPPrivateClause(OMPPrivateClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setPrivateCopies(Buffer);
	}

	// Reads an OMPFirstprivateClause: pre-init part, the left-paren location,
	// then three expression lists of varlist_size() entries each (variable
	// references, private copies, initializers).
	void OMPClauseReader::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
	  VisitOMPClauseWithPreInit(C);
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setPrivateCopies(Buffer);
	  Refill();
	  C->setInits(Buffer);
	}

	// Reads an OMPLastprivateClause: post-update part, the left-paren location,
	// then five expression lists of varlist_size() entries each (variable
	// references, private copies, source expressions, destination expressions,
	// assignment operations).
	void OMPClauseReader::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
	  VisitOMPClauseWithPostUpdate(C);
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setPrivateCopies(Buffer);
	  Refill();
	  C->setSourceExprs(Buffer);
	  Refill();
	  C->setDestinationExprs(Buffer);
	  Refill();
	  C->setAssignmentOps(Buffer);
	}

	// Reads an OMPSharedClause: the left-paren location followed by
	// varlist_size() variable-reference expressions.
	void OMPClauseReader::VisitOMPSharedClause(OMPSharedClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(Count);
	  for (unsigned Idx = 0; Idx < Count; ++Idx) {
	    VarRefs.push_back(Reader->Record.readSubExpr());
	  }
	  C->setVarRefs(VarRefs);
	}

	// Reads an OMPReductionClause: post-update part, left-paren and colon
	// locations, the qualifier and name info, then five expression lists of
	// varlist_size() entries each (variable references, privates, LHS
	// expressions, RHS expressions, reduction operations).
	void OMPClauseReader::VisitOMPReductionClause(OMPReductionClause *C) {
	  VisitOMPClauseWithPostUpdate(C);
	  C->setLParenLoc(Reader->ReadSourceLocation());
	  C->setColonLoc(Reader->ReadSourceLocation());

	  // Both pieces are read before either is installed on the clause.
	  NestedNameSpecifierLoc QualLoc = Reader->Record.readNestedNameSpecifierLoc();
	  DeclarationNameInfo NameInfo;
	  Reader->ReadDeclarationNameInfo(NameInfo);
	  C->setQualifierLoc(QualLoc);
	  C->setNameInfo(NameInfo);

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setPrivates(Buffer);
	  Refill();
	  C->setLHSExprs(Buffer);
	  Refill();
	  C->setRHSExprs(Buffer);
	  Refill();
	  C->setReductionOps(Buffer);
	}

	// Reads an OMPTaskReductionClause: post-update part, left-paren and colon
	// locations, the qualifier and name info, then five expression lists of
	// varlist_size() entries each (variable references, privates, LHS
	// expressions, RHS expressions, reduction operations).
	void OMPClauseReader::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) {
	  VisitOMPClauseWithPostUpdate(C);
	  C->setLParenLoc(Reader->ReadSourceLocation());
	  C->setColonLoc(Reader->ReadSourceLocation());

	  // Both pieces are read before either is installed on the clause.
	  NestedNameSpecifierLoc QualLoc = Reader->Record.readNestedNameSpecifierLoc();
	  DeclarationNameInfo NameInfo;
	  Reader->ReadDeclarationNameInfo(NameInfo);
	  C->setQualifierLoc(QualLoc);
	  C->setNameInfo(NameInfo);

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setPrivates(Buffer);
	  Refill();
	  C->setLHSExprs(Buffer);
	  Refill();
	  C->setRHSExprs(Buffer);
	  Refill();
	  C->setReductionOps(Buffer);
	}

	// Reads an OMPLinearClause: post-update part, left-paren and colon
	// locations, the modifier and its location, five expression lists of
	// varlist_size() entries each (variable references, privates, inits,
	// updates, finals), and finally the step and calc-step expressions.
	void OMPClauseReader::VisitOMPLinearClause(OMPLinearClause *C) {
	  VisitOMPClauseWithPostUpdate(C);
	  C->setLParenLoc(Reader->ReadSourceLocation());
	  C->setColonLoc(Reader->ReadSourceLocation());
	  C->setModifier(static_cast<OpenMPLinearClauseKind>(Reader->Record.readInt()));
	  C->setModifierLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setPrivates(Buffer);
	  Refill();
	  C->setInits(Buffer);
	  Refill();
	  C->setUpdates(Buffer);
	  Refill();
	  C->setFinals(Buffer);

	  C->setStep(Reader->Record.readSubExpr());
	  C->setCalcStep(Reader->Record.readSubExpr());
	}

	// Reads an OMPAlignedClause: left-paren and colon locations,
	// varlist_size() variable-reference expressions and then the alignment
	// expression.
	void OMPClauseReader::VisitOMPAlignedClause(OMPAlignedClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());
	  C->setColonLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(Count);
	  for (unsigned Idx = 0; Idx < Count; ++Idx) {
	    VarRefs.push_back(Reader->Record.readSubExpr());
	  }
	  C->setVarRefs(VarRefs);

	  C->setAlignment(Reader->Record.readSubExpr());
	}

	// Reads an OMPCopyinClause: the left-paren location, then four expression
	// lists of varlist_size() entries each (variable references, source
	// expressions, destination expressions, assignment operations).
	void OMPClauseReader::VisitOMPCopyinClause(OMPCopyinClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setSourceExprs(Buffer);
	  Refill();
	  C->setDestinationExprs(Buffer);
	  Refill();
	  C->setAssignmentOps(Buffer);
	}

	// Reads an OMPCopyprivateClause: the left-paren location, then four
	// expression lists of varlist_size() entries each (variable references,
	// source expressions, destination expressions, assignment operations).
	void OMPClauseReader::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> Buffer;
	  Buffer.reserve(Count);
	  // Replaces the buffer contents with the next Count sub-expressions.
	  auto Refill = [&]() {
	    Buffer.clear();
	    for (unsigned Idx = 0; Idx < Count; ++Idx)
	      Buffer.push_back(Reader->Record.readSubExpr());
	  };

	  Refill();
	  C->setVarRefs(Buffer);
	  Refill();
	  C->setSourceExprs(Buffer);
	  Refill();
	  C->setDestinationExprs(Buffer);
	  Refill();
	  C->setAssignmentOps(Buffer);
	}

	// Reads an OMPFlushClause: the left-paren location followed by
	// varlist_size() variable-reference expressions.
	void OMPClauseReader::VisitOMPFlushClause(OMPFlushClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(Count);
	  for (unsigned Idx = 0; Idx < Count; ++Idx) {
	    VarRefs.push_back(Reader->Record.readSubExpr());
	  }
	  C->setVarRefs(VarRefs);
	}

	// Reads an OMPDependClause: the left-paren location, the dependency kind
	// and its location, the colon location, varlist_size() variable-reference
	// expressions and finally the counter-value expression.
	void OMPClauseReader::VisitOMPDependClause(OMPDependClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());
	  C->setDependencyKind(
	      static_cast<OpenMPDependClauseKind>(Reader->Record.readInt()));
	  C->setDependencyLoc(Reader->ReadSourceLocation());
	  C->setColonLoc(Reader->ReadSourceLocation());

	  const unsigned Count = C->varlist_size();
	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(Count);
	  for (unsigned Idx = 0; Idx < Count; ++Idx) {
	    VarRefs.push_back(Reader->Record.readSubExpr());
	  }
	  C->setVarRefs(VarRefs);

	  C->setCounterValue(Reader->Record.readSubExpr());
	}

	// Reads an OMPDeviceClause: the device expression, then the left-paren
	// location.
	void OMPClauseReader::VisitOMPDeviceClause(OMPDeviceClause *C) {
	C->setDevice(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPMapClause: the left-paren location, map-type modifier, map
	// type, map and colon locations, and then, sized by counts queried from the
	// clause itself, the variable references, the unique declarations, the
	// number of lists per declaration, the component-list sizes and the
	// (expression, declaration) components.
	void OMPClauseReader::VisitOMPMapClause(OMPMapClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());
	  C->setMapTypeModifier(
	      static_cast<OpenMPMapClauseKind>(Reader->Record.readInt()));
	  C->setMapType(static_cast<OpenMPMapClauseKind>(Reader->Record.readInt()));
	  C->setMapLoc(Reader->ReadSourceLocation());
	  C->setColonLoc(Reader->ReadSourceLocation());

	  const auto VarCount = C->varlist_size();
	  const auto DeclCount = C->getUniqueDeclarationsNum();
	  const auto ListCount = C->getTotalComponentListNum();
	  const auto ComponentCount = C->getTotalComponentsNum();

	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(VarCount);
	  for (unsigned Idx = 0; Idx != VarCount; ++Idx) {
	    VarRefs.push_back(Reader->Record.readSubExpr());
	  }
	  C->setVarRefs(VarRefs);

	  SmallVector<ValueDecl *, 16> UniqueDecls;
	  UniqueDecls.reserve(DeclCount);
	  for (unsigned Idx = 0; Idx < DeclCount; ++Idx) {
	    UniqueDecls.push_back(Reader->Record.readDeclAs<ValueDecl>());
	  }
	  C->setUniqueDecls(UniqueDecls);

	  SmallVector<unsigned, 16> ListsPerDecl;
	  ListsPerDecl.reserve(DeclCount);
	  for (unsigned Idx = 0; Idx < DeclCount; ++Idx) {
	    ListsPerDecl.push_back(Reader->Record.readInt());
	  }
	  C->setDeclNumLists(ListsPerDecl);

	  SmallVector<unsigned, 32> ListSizes;
	  ListSizes.reserve(ListCount);
	  for (unsigned Idx = 0; Idx < ListCount; ++Idx) {
	    ListSizes.push_back(Reader->Record.readInt());
	  }
	  C->setComponentListSizes(ListSizes);

	  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
	  Components.reserve(ComponentCount);
	  for (unsigned Idx = 0; Idx < ComponentCount; ++Idx) {
	    // Expression first, then declaration; kept as separate statements so
	    // the read order is fixed.
	    Expr *CompExpr = Reader->Record.readSubExpr();
	    ValueDecl *CompDecl = Reader->Record.readDeclAs<ValueDecl>();
	    Components.emplace_back(CompExpr, CompDecl);
	  }
	  C->setComponents(Components, ListSizes);
	}

	// Reads an OMPNumTeamsClause: pre-init part, the num-teams expression and
	// the left-paren location.
	void OMPClauseReader::VisitOMPNumTeamsClause(OMPNumTeamsClause *C) {
	VisitOMPClauseWithPreInit(C);
	C->setNumTeams(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPThreadLimitClause: pre-init part, the thread-limit
	// expression and the left-paren location.
	void OMPClauseReader::VisitOMPThreadLimitClause(OMPThreadLimitClause *C) {
	VisitOMPClauseWithPreInit(C);
	C->setThreadLimit(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPPriorityClause: the priority expression, then the left-paren
	// location.
	void OMPClauseReader::VisitOMPPriorityClause(OMPPriorityClause *C) {
	C->setPriority(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPGrainsizeClause: the grainsize expression, then the
	// left-paren location.
	void OMPClauseReader::VisitOMPGrainsizeClause(OMPGrainsizeClause *C) {
	C->setGrainsize(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPNumTasksClause: the num-tasks expression, then the
	// left-paren location.
	void OMPClauseReader::VisitOMPNumTasksClause(OMPNumTasksClause *C) {
	C->setNumTasks(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPHintClause: the hint expression, then the left-paren
	// location.
	void OMPClauseReader::VisitOMPHintClause(OMPHintClause *C) {
	C->setHint(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPDistScheduleClause: pre-init part, the dist-schedule kind,
	// the chunk-size expression, then the left-paren, kind and comma
	// locations, in this order.
	void OMPClauseReader::VisitOMPDistScheduleClause(OMPDistScheduleClause *C) {
	VisitOMPClauseWithPreInit(C);
	C->setDistScheduleKind(
	static_cast<OpenMPDistScheduleClauseKind>(Reader->Record.readInt()));
	C->setChunkSize(Reader->Record.readSubExpr());
	C->setLParenLoc(Reader->ReadSourceLocation());
	C->setDistScheduleKindLoc(Reader->ReadSourceLocation());
	C->setCommaLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPDefaultmapClause: the defaultmap kind, the defaultmap
	// modifier, then the left-paren, modifier and kind locations, in this
	// order.
	void OMPClauseReader::VisitOMPDefaultmapClause(OMPDefaultmapClause *C) {
	C->setDefaultmapKind(
	static_cast<OpenMPDefaultmapClauseKind>(Reader->Record.readInt()));
	C->setDefaultmapModifier(
	static_cast<OpenMPDefaultmapClauseModifier>(Reader->Record.readInt()));
	C->setLParenLoc(Reader->ReadSourceLocation());
	C->setDefaultmapModifierLoc(Reader->ReadSourceLocation());
	C->setDefaultmapKindLoc(Reader->ReadSourceLocation());
	}

	// Reads an OMPToClause: the left-paren location and then, sized by counts
	// queried from the clause itself, the variable references, the unique
	// declarations, the number of lists per declaration, the component-list
	// sizes and the (expression, declaration) components.
	void OMPClauseReader::VisitOMPToClause(OMPToClause *C) {
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const auto VarCount = C->varlist_size();
	  const auto DeclCount = C->getUniqueDeclarationsNum();
	  const auto ListCount = C->getTotalComponentListNum();
	  const auto ComponentCount = C->getTotalComponentsNum();

	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(VarCount);
	  for (unsigned Idx = 0; Idx != VarCount; ++Idx) {
	    VarRefs.push_back(Reader->Record.readSubExpr());
	  }
	  C->setVarRefs(VarRefs);

	  SmallVector<ValueDecl *, 16> UniqueDecls;
	  UniqueDecls.reserve(DeclCount);
	  for (unsigned Idx = 0; Idx < DeclCount; ++Idx) {
	    UniqueDecls.push_back(Reader->Record.readDeclAs<ValueDecl>());
	  }
	  C->setUniqueDecls(UniqueDecls);

	  SmallVector<unsigned, 16> ListsPerDecl;
	  ListsPerDecl.reserve(DeclCount);
	  for (unsigned Idx = 0; Idx < DeclCount; ++Idx) {
	    ListsPerDecl.push_back(Reader->Record.readInt());
	  }
	  C->setDeclNumLists(ListsPerDecl);

	  SmallVector<unsigned, 32> ListSizes;
	  ListSizes.reserve(ListCount);
	  for (unsigned Idx = 0; Idx < ListCount; ++Idx) {
	    ListSizes.push_back(Reader->Record.readInt());
	  }
	  C->setComponentListSizes(ListSizes);

	  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Components;
	  Components.reserve(ComponentCount);
	  for (unsigned Idx = 0; Idx < ComponentCount; ++Idx) {
	    // Expression first, then declaration; kept as separate statements so
	    // the read order is fixed.
	    Expr *CompExpr = Reader->Record.readSubExpr();
	    ValueDecl *CompDecl = Reader->Record.readDeclAs<ValueDecl>();
	    Components.emplace_back(CompExpr, CompDecl);
	  }
	  C->setComponents(Components, ListSizes);
	}

	// Fills in a 'from' clause from the record. The element counts come from the
	// clause object itself; the record then supplies, in order: the '(' location,
	// the variable references, the unique declarations, the number of component
	// lists per declaration, the size of each component list, and finally the
	// (expression, declaration) pairs that make up the components.
	void OMPClauseReader::VisitOMPFromClause(OMPFromClause *C) {
	  auto &Rec = Reader->Record;
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const auto VarCount = C->varlist_size();
	  const auto DeclCount = C->getUniqueDeclarationsNum();
	  const auto ListCount = C->getTotalComponentListNum();
	  const auto ComponentCount = C->getTotalComponentsNum();

	  // Variable references.
	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(VarCount);
	  for (auto Left = VarCount; Left != 0; --Left)
	    VarRefs.push_back(Rec.readSubExpr());
	  C->setVarRefs(VarRefs);

	  // Unique declarations.
	  SmallVector<ValueDecl *, 16> UniqueDs;
	  UniqueDs.reserve(DeclCount);
	  for (auto Left = DeclCount; Left != 0; --Left)
	    UniqueDs.push_back(Rec.readDeclAs<ValueDecl>());
	  C->setUniqueDecls(UniqueDs);

	  // One list count per unique declaration.
	  SmallVector<unsigned, 16> DeclListCounts;
	  DeclListCounts.reserve(DeclCount);
	  for (auto Left = DeclCount; Left != 0; --Left)
	    DeclListCounts.push_back(Rec.readInt());
	  C->setDeclNumLists(DeclListCounts);

	  // One size per component list.
	  SmallVector<unsigned, 32> Sizes;
	  Sizes.reserve(ListCount);
	  for (auto Left = ListCount; Left != 0; --Left)
	    Sizes.push_back(Rec.readInt());
	  C->setComponentListSizes(Sizes);

	  // Components: the expression is stored before its declaration.
	  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Parts;
	  Parts.reserve(ComponentCount);
	  for (auto Left = ComponentCount; Left != 0; --Left) {
	    Expr *PartExpr = Rec.readSubExpr();
	    ValueDecl *PartDecl = Rec.readDeclAs<ValueDecl>();
	    Parts.push_back(
	        OMPClauseMappableExprCommon::MappableComponent(PartExpr, PartDecl));
	  }
	  C->setComponents(Parts, Sizes);
	}

	// Fills in a 'use_device_ptr' clause from the record. After the '(' location
	// the record holds three expression lists of varlist_size() entries each
	// (variable references, private copies, initializers), followed by the
	// unique declarations, the per-declaration list counts, the component list
	// sizes and the (expression, declaration) component pairs.
	void OMPClauseReader::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
	  auto &Rec = Reader->Record;
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const auto VarCount = C->varlist_size();
	  const auto DeclCount = C->getUniqueDeclarationsNum();
	  const auto ListCount = C->getTotalComponentListNum();
	  const auto ComponentCount = C->getTotalComponentsNum();

	  // A single buffer is reused for the three expression lists.
	  SmallVector<Expr *, 16> Exprs;
	  Exprs.reserve(VarCount);
	  for (auto Left = VarCount; Left != 0; --Left)
	    Exprs.push_back(Rec.readSubExpr());
	  C->setVarRefs(Exprs);

	  Exprs.clear();
	  for (auto Left = VarCount; Left != 0; --Left)
	    Exprs.push_back(Rec.readSubExpr());
	  C->setPrivateCopies(Exprs);

	  Exprs.clear();
	  for (auto Left = VarCount; Left != 0; --Left)
	    Exprs.push_back(Rec.readSubExpr());
	  C->setInits(Exprs);

	  // Unique declarations.
	  SmallVector<ValueDecl *, 16> UniqueDs;
	  UniqueDs.reserve(DeclCount);
	  for (auto Left = DeclCount; Left != 0; --Left)
	    UniqueDs.push_back(Rec.readDeclAs<ValueDecl>());
	  C->setUniqueDecls(UniqueDs);

	  // One list count per unique declaration.
	  SmallVector<unsigned, 16> DeclListCounts;
	  DeclListCounts.reserve(DeclCount);
	  for (auto Left = DeclCount; Left != 0; --Left)
	    DeclListCounts.push_back(Rec.readInt());
	  C->setDeclNumLists(DeclListCounts);

	  // One size per component list.
	  SmallVector<unsigned, 32> Sizes;
	  Sizes.reserve(ListCount);
	  for (auto Left = ListCount; Left != 0; --Left)
	    Sizes.push_back(Rec.readInt());
	  C->setComponentListSizes(Sizes);

	  // Components: the expression is stored before its declaration.
	  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Parts;
	  Parts.reserve(ComponentCount);
	  for (auto Left = ComponentCount; Left != 0; --Left) {
	    Expr *PartExpr = Rec.readSubExpr();
	    ValueDecl *PartDecl = Rec.readDeclAs<ValueDecl>();
	    Parts.push_back(
	        OMPClauseMappableExprCommon::MappableComponent(PartExpr, PartDecl));
	  }
	  C->setComponents(Parts, Sizes);
	}

	// Fills in an 'is_device_ptr' clause from the record. After the '(' location
	// the record holds the variable references, the unique declarations, the
	// per-declaration list counts, the component list sizes and the
	// (expression, declaration) component pairs.
	void OMPClauseReader::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
	  auto &Rec = Reader->Record;
	  C->setLParenLoc(Reader->ReadSourceLocation());

	  const auto VarCount = C->varlist_size();
	  const auto DeclCount = C->getUniqueDeclarationsNum();
	  const auto ListCount = C->getTotalComponentListNum();
	  const auto ComponentCount = C->getTotalComponentsNum();

	  // Variable references.
	  SmallVector<Expr *, 16> VarRefs;
	  VarRefs.reserve(VarCount);
	  for (auto Left = VarCount; Left != 0; --Left)
	    VarRefs.push_back(Rec.readSubExpr());
	  C->setVarRefs(VarRefs);
	  VarRefs.clear();

	  // Unique declarations.
	  SmallVector<ValueDecl *, 16> UniqueDs;
	  UniqueDs.reserve(DeclCount);
	  for (auto Left = DeclCount; Left != 0; --Left)
	    UniqueDs.push_back(Rec.readDeclAs<ValueDecl>());
	  C->setUniqueDecls(UniqueDs);

	  // One list count per unique declaration.
	  SmallVector<unsigned, 16> DeclListCounts;
	  DeclListCounts.reserve(DeclCount);
	  for (auto Left = DeclCount; Left != 0; --Left)
	    DeclListCounts.push_back(Rec.readInt());
	  C->setDeclNumLists(DeclListCounts);

	  // One size per component list.
	  SmallVector<unsigned, 32> Sizes;
	  Sizes.reserve(ListCount);
	  for (auto Left = ListCount; Left != 0; --Left)
	    Sizes.push_back(Rec.readInt());
	  C->setComponentListSizes(Sizes);

	  // Components: the expression is stored before its declaration.
	  SmallVector<OMPClauseMappableExprCommon::MappableComponent, 32> Parts;
	  Parts.reserve(ComponentCount);
	  for (auto Left = ComponentCount; Left != 0; --Left) {
	    Expr *PartExpr = Rec.readSubExpr();
	    ValueDecl *PartDecl = Rec.readDeclAs<ValueDecl>();
	    Parts.push_back(
	        OMPClauseMappableExprCommon::MappableComponent(PartExpr, PartDecl));
	  }
	  C->setComponents(Parts, Sizes);
	}

	//===----------------------------------------------------------------------===//
	// OpenMP Directives.
	//===----------------------------------------------------------------------===//
	// Reads the data common to all OpenMP executable directives: the start and
	// end locations, getNumClauses() clauses, and - only when the directive
	// reports an associated statement - that statement.
	void ASTStmtReader::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
	  E->setLocStart(ReadSourceLocation());
	  E->setLocEnd(ReadSourceLocation());

	  OMPClauseReader ClauseReader(this, Record);
	  SmallVector<OMPClause *, 5> ReadClauses;
	  for (unsigned Idx = 0, Count = E->getNumClauses(); Idx != Count; ++Idx)
	    ReadClauses.push_back(ClauseReader.readClause());
	  E->setClauses(ReadClauses);

	  if (!E->hasAssociatedStmt())
	    return;
	  E->setAssociatedStmt(Record.readSubStmt());
	}

	// Reads the data shared by all OpenMP loop directives.
	//
	// Stream order: base statement data, two integers that are skipped, the
	// common executable-directive data, the helper expressions every loop
	// directive has, the bound/stride helpers (worksharing, taskloop and
	// distribute directives only), the combined helpers (loop-bound-sharing
	// directives only) and finally five lists of getCollapsedNumber()
	// expressions: counters, private counters, inits, updates and finals.
	void ASTStmtReader::VisitOMPLoopDirective(OMPLoopDirective *D) {
	  VisitStmt(D);
	  // Two fields (NumClauses and CollapsedNum) were read in ReadStmtFromStream.
	  Record.skipInts(2);
	  VisitOMPExecutableDirective(D);

	  // Helper expressions present for every loop directive.
	  D->setIterationVariable(Record.readSubExpr());
	  D->setLastIteration(Record.readSubExpr());
	  D->setCalcLastIteration(Record.readSubExpr());
	  D->setPreCond(Record.readSubExpr());
	  D->setCond(Record.readSubExpr());
	  D->setInit(Record.readSubExpr());
	  D->setInc(Record.readSubExpr());
	  D->setPreInits(Record.readSubStmt());

	  const auto Kind = D->getDirectiveKind();
	  if (isOpenMPWorksharingDirective(Kind) || isOpenMPTaskLoopDirective(Kind) ||
	      isOpenMPDistributeDirective(Kind)) {
	    D->setIsLastIterVariable(Record.readSubExpr());
	    D->setLowerBoundVariable(Record.readSubExpr());
	    D->setUpperBoundVariable(Record.readSubExpr());
	    D->setStrideVariable(Record.readSubExpr());
	    D->setEnsureUpperBound(Record.readSubExpr());
	    D->setNextLowerBound(Record.readSubExpr());
	    D->setNextUpperBound(Record.readSubExpr());
	    D->setNumIterations(Record.readSubExpr());
	  }
	  if (isOpenMPLoopBoundSharingDirective(Kind)) {
	    D->setPrevLowerBoundVariable(Record.readSubExpr());
	    D->setPrevUpperBoundVariable(Record.readSubExpr());
	    D->setDistInc(Record.readSubExpr());
	    D->setPrevEnsureUpperBound(Record.readSubExpr());
	    D->setCombinedLowerBoundVariable(Record.readSubExpr());
	    D->setCombinedUpperBoundVariable(Record.readSubExpr());
	    D->setCombinedEnsureUpperBound(Record.readSubExpr());
	    D->setCombinedInit(Record.readSubExpr());
	    D->setCombinedCond(Record.readSubExpr());
	    D->setCombinedNextLowerBound(Record.readSubExpr());
	    D->setCombinedNextUpperBound(Record.readSubExpr());
	  }

	  // Each of the five trailing lists holds one expression per collapsed loop;
	  // the same buffer is refilled for every list.
	  SmallVector<Expr *, 4> Sub;
	  const unsigned CollapsedNum = D->getCollapsedNumber();
	  Sub.reserve(CollapsedNum);
	  auto RefillSub = [&] {
	    Sub.clear();
	    for (unsigned i = 0; i < CollapsedNum; ++i)
	      Sub.push_back(Record.readSubExpr());
	  };

	  RefillSub();
	  D->setCounters(Sub);
	  RefillSub();
	  D->setPrivateCounters(Sub);
	  RefillSub();
	  D->setInits(Sub);
	  RefillSub();
	  D->setUpdates(Sub);
	  RefillSub();
	  D->setFinals(Sub);
	}

	// Reads an OMPParallelDirective: base statement data, the common
	// executable-directive data, then the has-cancel flag.
	void ASTStmtReader::VisitOMPParallelDirective(OMPParallelDirective *D) {
	  VisitStmt(D);
	  // NumClauses was already read in ReadStmtFromStream; step over it.
	  Record.skipInts(1);
	  VisitOMPExecutableDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPSimdDirective(OMPSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPForDirective: the shared loop-directive data followed by the
	// has-cancel flag.
	void ASTStmtReader::VisitOMPForDirective(OMPForDirective *D) {
	  VisitOMPLoopDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPForSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPForSimdDirective(OMPForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPSectionsDirective: base statement data, the common
	// executable-directive data, then the has-cancel flag.
	void ASTStmtReader::VisitOMPSectionsDirective(OMPSectionsDirective *D) {
	  VisitStmt(D);
	  // NumClauses was already read in ReadStmtFromStream; step over it.
	  Record.skipInts(1);
	  VisitOMPExecutableDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPSectionDirective: base statement data, the common
	// executable-directive data, then the has-cancel flag. No integer is
	// skipped here, unlike the directives that skip a NumClauses field.
	void ASTStmtReader::VisitOMPSectionDirective(OMPSectionDirective *D) {
	  VisitStmt(D);
	  VisitOMPExecutableDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPSingleDirective: base statement data followed by the common
	// executable-directive data.
	void ASTStmtReader::VisitOMPSingleDirective(OMPSingleDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPMasterDirective: base statement data followed by the common
	// executable-directive data. No integer is skipped; ReadStmtFromStream
	// creates this node without reading a clause count.
	void ASTStmtReader::VisitOMPMasterDirective(OMPMasterDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPCriticalDirective: base statement data, the common
	// executable-directive data, then the directive's name info (DirName).
	void ASTStmtReader::VisitOMPCriticalDirective(OMPCriticalDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	// Reads directly into the directive's DirName member.
	ReadDeclarationNameInfo(D->DirName);
	}

	// Reads an OMPParallelForDirective: the shared loop-directive data followed
	// by the has-cancel flag.
	void ASTStmtReader::VisitOMPParallelForDirective(OMPParallelForDirective *D) {
	  VisitOMPLoopDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPParallelForSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPParallelForSimdDirective(
	OMPParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPParallelSectionsDirective: base statement data, the common
	// executable-directive data, then the has-cancel flag.
	void ASTStmtReader::VisitOMPParallelSectionsDirective(
	    OMPParallelSectionsDirective *D) {
	  VisitStmt(D);
	  // NumClauses was already read in ReadStmtFromStream; step over it.
	  Record.skipInts(1);
	  VisitOMPExecutableDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPTaskDirective: base statement data, the common
	// executable-directive data, then the has-cancel flag.
	void ASTStmtReader::VisitOMPTaskDirective(OMPTaskDirective *D) {
	  VisitStmt(D);
	  // NumClauses was already read in ReadStmtFromStream; step over it.
	  Record.skipInts(1);
	  VisitOMPExecutableDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPTaskyieldDirective: base statement data followed by the common
	// executable-directive data. No integer is skipped; ReadStmtFromStream
	// creates this node without reading a clause count.
	void ASTStmtReader::VisitOMPTaskyieldDirective(OMPTaskyieldDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPBarrierDirective: base statement data followed by the common
	// executable-directive data. No integer is skipped; ReadStmtFromStream
	// creates this node without reading a clause count.
	void ASTStmtReader::VisitOMPBarrierDirective(OMPBarrierDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTaskwaitDirective: base statement data followed by the common
	// executable-directive data. No integer is skipped; ReadStmtFromStream
	// creates this node without reading a clause count.
	void ASTStmtReader::VisitOMPTaskwaitDirective(OMPTaskwaitDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTaskgroupDirective: base statement data followed by the common
	// executable-directive data.
	void ASTStmtReader::VisitOMPTaskgroupDirective(OMPTaskgroupDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPFlushDirective: base statement data followed by the common
	// executable-directive data.
	void ASTStmtReader::VisitOMPFlushDirective(OMPFlushDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPOrderedDirective: base statement data followed by the common
	// executable-directive data.
	void ASTStmtReader::VisitOMPOrderedDirective(OMPOrderedDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPAtomicDirective: base statement data, the common
	// executable-directive data, four sub-expressions (x, v, expr and the update
	// expression, in that order) and two flags stored as integers.
	void ASTStmtReader::VisitOMPAtomicDirective(OMPAtomicDirective *D) {
	  VisitStmt(D);
	  // NumClauses was already read in ReadStmtFromStream; step over it.
	  Record.skipInts(1);
	  VisitOMPExecutableDirective(D);

	  Expr *XPart = Record.readSubExpr();
	  D->setX(XPart);
	  Expr *VPart = Record.readSubExpr();
	  D->setV(VPart);
	  Expr *ExprPart = Record.readSubExpr();
	  D->setExpr(ExprPart);
	  Expr *UpdatePart = Record.readSubExpr();
	  D->setUpdateExpr(UpdatePart);

	  // Any nonzero integer means the flag is set.
	  const bool XIsLHSInRHS = Record.readInt() != 0;
	  D->IsXLHSInRHSPart = XIsLHSInRHS;
	  const bool PostfixUpdate = Record.readInt() != 0;
	  D->IsPostfixUpdate = PostfixUpdate;
	}

	// Reads an OMPTargetDirective: base statement data followed by the common
	// executable-directive data.
	void ASTStmtReader::VisitOMPTargetDirective(OMPTargetDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTargetDataDirective: base statement data followed by the
	// common executable-directive data.
	void ASTStmtReader::VisitOMPTargetDataDirective(OMPTargetDataDirective *D) {
	VisitStmt(D);
	// Skip one integer. ReadStmtFromStream passes Record[NumStmtFields] to
	// CreateEmpty for this directive, so this is presumably the NumClauses
	// field, as the sibling visitors note.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTargetEnterDataDirective: base statement data followed by the
	// common executable-directive data.
	void ASTStmtReader::VisitOMPTargetEnterDataDirective(
	OMPTargetEnterDataDirective *D) {
	VisitStmt(D);
	// Skip one integer. ReadStmtFromStream passes Record[NumStmtFields] to
	// CreateEmpty for this directive, so this is presumably the NumClauses
	// field, as the sibling visitors note.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTargetExitDataDirective: base statement data followed by the
	// common executable-directive data.
	void ASTStmtReader::VisitOMPTargetExitDataDirective(
	OMPTargetExitDataDirective *D) {
	VisitStmt(D);
	// Skip one integer. ReadStmtFromStream passes Record[NumStmtFields] to
	// CreateEmpty for this directive, so this is presumably the NumClauses
	// field, as the sibling visitors note.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTargetParallelDirective: base statement data followed by the
	// common executable-directive data.
	void ASTStmtReader::VisitOMPTargetParallelDirective(
	OMPTargetParallelDirective *D) {
	VisitStmt(D);
	// Skip one integer. ReadStmtFromStream passes Record[NumStmtFields] to
	// CreateEmpty for this directive, so this is presumably the NumClauses
	// field, as the sibling visitors note.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTargetParallelForDirective: the shared loop-directive data
	// followed by the has-cancel flag.
	void ASTStmtReader::VisitOMPTargetParallelForDirective(
	    OMPTargetParallelForDirective *D) {
	  VisitOMPLoopDirective(D);

	  const bool HasCancel = Record.readInt() != 0;
	  D->setHasCancel(HasCancel);
	}

	// Reads an OMPTeamsDirective: base statement data followed by the common
	// executable-directive data.
	void ASTStmtReader::VisitOMPTeamsDirective(OMPTeamsDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPCancellationPointDirective: base statement data, the common
	// executable-directive data, then the cancel region stored as an integer
	// directive kind. No integer is skipped before the common data.
	void ASTStmtReader::VisitOMPCancellationPointDirective(
	    OMPCancellationPointDirective *D) {
	  VisitStmt(D);
	  VisitOMPExecutableDirective(D);

	  const auto Region = static_cast<OpenMPDirectiveKind>(Record.readInt());
	  D->setCancelRegion(Region);
	}

	// Reads an OMPCancelDirective: base statement data, the common
	// executable-directive data, then the cancel region stored as an integer
	// directive kind.
	void ASTStmtReader::VisitOMPCancelDirective(OMPCancelDirective *D) {
	  VisitStmt(D);
	  // NumClauses was already read in ReadStmtFromStream; step over it.
	  Record.skipInts(1);
	  VisitOMPExecutableDirective(D);

	  const auto Region = static_cast<OpenMPDirectiveKind>(Record.readInt());
	  D->setCancelRegion(Region);
	}

	// Reads an OMPTaskLoopDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTaskLoopDirective(OMPTaskLoopDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTaskLoopSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTaskLoopSimdDirective(OMPTaskLoopSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPDistributeDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPDistributeDirective(OMPDistributeDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetUpdateDirective: base statement data followed by the
	// common executable-directive data.
	void ASTStmtReader::VisitOMPTargetUpdateDirective(OMPTargetUpdateDirective *D) {
	VisitStmt(D);
	// Skip one integer. ReadStmtFromStream passes Record[NumStmtFields] to
	// CreateEmpty for this directive, so this is presumably the NumClauses
	// field, as the sibling visitors note.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}
	// Reads an OMPDistributeParallelForDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPDistributeParallelForDirective(
	OMPDistributeParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPDistributeParallelForSimdDirective. Nothing is read beyond
	// what VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPDistributeParallelForSimdDirective(
	OMPDistributeParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPDistributeSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPDistributeSimdDirective(
	OMPDistributeSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetParallelForSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTargetParallelForSimdDirective(
	OMPTargetParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTargetSimdDirective(OMPTargetSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTeamsDistributeDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTeamsDistributeDirective(
	OMPTeamsDistributeDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTeamsDistributeSimdDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTeamsDistributeSimdDirective(
	OMPTeamsDistributeSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTeamsDistributeParallelForSimdDirective. Nothing is read
	// beyond what VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTeamsDistributeParallelForSimdDirective(
	OMPTeamsDistributeParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTeamsDistributeParallelForDirective. Nothing is read beyond
	// what VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTeamsDistributeParallelForDirective(
	OMPTeamsDistributeParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetTeamsDirective: base statement data followed by the
	// common executable-directive data.
	void ASTStmtReader::VisitOMPTargetTeamsDirective(OMPTargetTeamsDirective *D) {
	VisitStmt(D);
	// The NumClauses field was read in ReadStmtFromStream.
	Record.skipInts(1);
	VisitOMPExecutableDirective(D);
	}

	// Reads an OMPTargetTeamsDistributeDirective. Nothing is read beyond what
	// VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTargetTeamsDistributeDirective(
	OMPTargetTeamsDistributeDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetTeamsDistributeParallelForDirective. Nothing is read
	// beyond what VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTargetTeamsDistributeParallelForDirective(
	OMPTargetTeamsDistributeParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetTeamsDistributeParallelForSimdDirective. Nothing is read
	// beyond what VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTargetTeamsDistributeParallelForSimdDirective(
	OMPTargetTeamsDistributeParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	// Reads an OMPTargetTeamsDistributeSimdDirective. Nothing is read beyond
	// what VisitOMPLoopDirective consumes.
	void ASTStmtReader::VisitOMPTargetTeamsDistributeSimdDirective(
	OMPTargetTeamsDistributeSimdDirective *D) {
	VisitOMPLoopDirective(D);
	}

	//===----------------------------------------------------------------------===//
	// ASTReader Implementation
	//===----------------------------------------------------------------------===//

	// Reads a statement, dispatching on the current ReadingKind:
	//  - Read_Stmt: return the result of ReadSubStmt().
	//  - Read_Decl / Read_Type: read a whole statement from F via
	//    ReadStmtFromStream.
	//  - Read_None: calling this function is a programming error.
	Stmt *ASTReader::ReadStmt(ModuleFile &F) {
	  switch (ReadingKind) {
	  case Read_Stmt:
	    return ReadSubStmt();

	  case Read_Decl:
	  case Read_Type:
	    return ReadStmtFromStream(F);

	  case Read_None:
	    llvm_unreachable("should not call this when not reading anything");
	  }

	  // Reached only if ReadingKind holds a value outside the enumerators above.
	  llvm_unreachable("ReadingKind not set ?");
	}

	// Reads a statement via ReadStmt(F) and returns it as an Expr. A null
	// statement is passed through as a null expression (cast_or_null).
	Expr *ASTReader::ReadExpr(ModuleFile &F) {
	  Stmt *Read = ReadStmt(F);
	  return cast_or_null<Expr>(Read);
	}

	// Returns the result of ReadSubStmt() as an Expr. A null statement is
	// passed through as a null expression (cast_or_null).
	Expr *ASTReader::ReadSubExpr() {
	  Stmt *Sub = ReadSubStmt();
	  return cast_or_null<Expr>(Sub);
	}

	// Within the bitstream, expressions are stored in Reverse Polish
	// Notation, with each of the subexpressions preceding the
	// expression they are stored in. Subexpressions are stored from last to first.
	// To evaluate expressions, we continue reading expressions and placing them on
	// the stack, with expressions having operands removing those operands from the
	// stack. Evaluation terminates when we see a STMT_STOP record, and
	// the single remaining expression on the stack is our result.
	Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {

	ReadingKindTracker ReadingKind(Read_Stmt, *this);
	llvm::BitstreamCursor &Cursor = F.DeclsCursor;

	// Map of offset to previously deserialized stmt. The offset points
	// just after the stmt record.
	llvm::DenseMap<uint64_t, Stmt *> StmtEntries;

	#ifndef NDEBUG
	unsigned PrevNumStmts = StmtStack.size();
	#endif

	ASTRecordReader Record(*this, F);
	ASTStmtReader Reader(Record, Cursor);
	Stmt::EmptyShell Empty;

	while (true) {
	llvm::BitstreamEntry Entry = Cursor.advanceSkippingSubblocks();

	switch (Entry.Kind) {
	case llvm::BitstreamEntry::SubBlock: // Handled for us already.
	case llvm::BitstreamEntry::Error:
	Error("malformed block record in AST file");
	return nullptr;
	case llvm::BitstreamEntry::EndBlock:
	goto Done;
	case llvm::BitstreamEntry::Record:
	// The interesting case.
	break;
	}

	ASTContext &Context = getContext();
	Stmt *S = nullptr;
	bool Finished = false;
	bool IsStmtReference = false;
	switch ((StmtCode)Record.readRecord(Cursor, Entry.ID)) {
	case STMT_STOP:
	Finished = true;
	break;

	case STMT_REF_PTR:
	IsStmtReference = true;
	assert(StmtEntries.find(Record[0]) != StmtEntries.end() &&
	"No stmt was recorded for this offset reference!");
	S = StmtEntries[Record.readInt()];
	break;

	case STMT_NULL_PTR:
	S = nullptr;
	break;

	case STMT_NULL:
	S = new (Context) NullStmt(Empty);
	break;

	case STMT_COMPOUND:
	S = new (Context) CompoundStmt(Empty);
	break;

	case STMT_CASE:
	S = new (Context) CaseStmt(Empty);
	break;

	case STMT_DEFAULT:
	S = new (Context) DefaultStmt(Empty);
	break;

	case STMT_LABEL:
	S = new (Context) LabelStmt(Empty);
	break;

	case STMT_ATTRIBUTED:
	S = AttributedStmt::CreateEmpty(
	Context,
	/NumAttrs/Record[ASTStmtReader::NumStmtFields]);
	break;

	case STMT_IF:
	S = new (Context) IfStmt(Empty);
	break;

	case STMT_SWITCH:
	S = new (Context) SwitchStmt(Empty);
	break;

	case STMT_WHILE:
	S = new (Context) WhileStmt(Empty);
	break;

	case STMT_DO:
	S = new (Context) DoStmt(Empty);
	break;

	case STMT_FOR:
	S = new (Context) ForStmt(Empty);
	break;

	case STMT_GOTO:
	S = new (Context) GotoStmt(Empty);
	break;

	case STMT_INDIRECT_GOTO:
	S = new (Context) IndirectGotoStmt(Empty);
	break;

	case STMT_CONTINUE:
	S = new (Context) ContinueStmt(Empty);
	break;

	case STMT_BREAK:
	S = new (Context) BreakStmt(Empty);
	break;

	case STMT_RETURN:
	S = new (Context) ReturnStmt(Empty);
	break;

	case STMT_DECL:
	S = new (Context) DeclStmt(Empty);
	break;

	case STMT_GCCASM:
	S = new (Context) GCCAsmStmt(Empty);
	break;

	case STMT_MSASM:
	S = new (Context) MSAsmStmt(Empty);
	break;

	case STMT_CAPTURED:
	S = CapturedStmt::CreateDeserialized(Context,
	Record[ASTStmtReader::NumStmtFields]);
	break;

	case EXPR_PREDEFINED:
	S = new (Context) PredefinedExpr(Empty);
	break;

	case EXPR_DECL_REF:
	S = DeclRefExpr::CreateEmpty(
	Context,
	/HasQualifier=/Record[ASTStmtReader::NumExprFields],
	/HasFoundDecl=/Record[ASTStmtReader::NumExprFields + 1],
	/HasTemplateKWAndArgsInfo=/Record[ASTStmtReader::NumExprFields + 2],
	/NumTemplateArgs=/Record[ASTStmtReader::NumExprFields + 2] ?
	Record[ASTStmtReader::NumExprFields + 5] : 0);
	break;

	case EXPR_INTEGER_LITERAL:
	S = IntegerLiteral::Create(Context, Empty);
	break;

	case EXPR_FLOATING_LITERAL:
	S = FloatingLiteral::Create(Context, Empty);
	break;

	case EXPR_IMAGINARY_LITERAL:
	S = new (Context) ImaginaryLiteral(Empty);
	break;

	case EXPR_STRING_LITERAL:
	S = StringLiteral::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields + 1]);
	break;

	case EXPR_CHARACTER_LITERAL:
	S = new (Context) CharacterLiteral(Empty);
	break;

	case EXPR_PAREN:
	S = new (Context) ParenExpr(Empty);
	break;

	case EXPR_PAREN_LIST:
	S = new (Context) ParenListExpr(Empty);
	break;

	case EXPR_UNARY_OPERATOR:
	S = new (Context) UnaryOperator(Empty);
	break;

	case EXPR_OFFSETOF:
	S = OffsetOfExpr::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields],
	Record[ASTStmtReader::NumExprFields + 1]);
	break;

	case EXPR_SIZEOF_ALIGN_OF:
	S = new (Context) UnaryExprOrTypeTraitExpr(Empty);
	break;

	case EXPR_ARRAY_SUBSCRIPT:
	S = new (Context) ArraySubscriptExpr(Empty);
	break;

	case EXPR_OMP_ARRAY_SECTION:
	S = new (Context) OMPArraySectionExpr(Empty);
	break;

	case EXPR_CALL:
	S = new (Context) CallExpr(Context, Stmt::CallExprClass, Empty);
	break;

	case EXPR_MEMBER: {
	// We load everything here and fully initialize it at creation.
	// That way we can use MemberExpr::Create and don't have to duplicate its
	// logic with a MemberExpr::CreateEmpty.

	assert(Record.getIdx() == 0);
	NestedNameSpecifierLoc QualifierLoc;
	if (Record.readInt()) { // HasQualifier.
	QualifierLoc = Record.readNestedNameSpecifierLoc();
	}

	SourceLocation TemplateKWLoc;
	TemplateArgumentListInfo ArgInfo;
	bool HasTemplateKWAndArgsInfo = Record.readInt();
	if (HasTemplateKWAndArgsInfo) {
	TemplateKWLoc = Record.readSourceLocation();
	unsigned NumTemplateArgs = Record.readInt();
	ArgInfo.setLAngleLoc(Record.readSourceLocation());
	ArgInfo.setRAngleLoc(Record.readSourceLocation());
	for (unsigned i = 0; i != NumTemplateArgs; ++i)
	ArgInfo.addArgument(Record.readTemplateArgumentLoc());
	}

	bool HadMultipleCandidates = Record.readInt();

	NamedDecl *FoundD = Record.readDeclAs<NamedDecl>();
	AccessSpecifier AS = (AccessSpecifier)Record.readInt();
	DeclAccessPair FoundDecl = DeclAccessPair::make(FoundD, AS);

	QualType T = Record.readType();
	ExprValueKind VK = static_cast<ExprValueKind>(Record.readInt());
	ExprObjectKind OK = static_cast<ExprObjectKind>(Record.readInt());
	Expr *Base = ReadSubExpr();
	ValueDecl *MemberD = Record.readDeclAs<ValueDecl>();
	SourceLocation MemberLoc = Record.readSourceLocation();
	DeclarationNameInfo MemberNameInfo(MemberD->getDeclName(), MemberLoc);
	bool IsArrow = Record.readInt();
	SourceLocation OperatorLoc = Record.readSourceLocation();

	S = MemberExpr::Create(Context, Base, IsArrow, OperatorLoc, QualifierLoc,
	TemplateKWLoc, MemberD, FoundDecl, MemberNameInfo,
	HasTemplateKWAndArgsInfo ? &ArgInfo : nullptr, T,
	VK, OK);
	Record.readDeclarationNameLoc(cast<MemberExpr>(S)->MemberDNLoc,
	MemberD->getDeclName());
	if (HadMultipleCandidates)
	cast<MemberExpr>(S)->setHadMultipleCandidates(true);
	break;
	}

	case EXPR_BINARY_OPERATOR:
	S = new (Context) BinaryOperator(Empty);
	break;

	case EXPR_COMPOUND_ASSIGN_OPERATOR:
	S = new (Context) CompoundAssignOperator(Empty);
	break;

	case EXPR_CONDITIONAL_OPERATOR:
	S = new (Context) ConditionalOperator(Empty);
	break;

	case EXPR_BINARY_CONDITIONAL_OPERATOR:
	S = new (Context) BinaryConditionalOperator(Empty);
	break;

	case EXPR_IMPLICIT_CAST:
	S = ImplicitCastExpr::CreateEmpty(Context,
	/PathSize/ Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_CSTYLE_CAST:
	S = CStyleCastExpr::CreateEmpty(Context,
	/PathSize/ Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_COMPOUND_LITERAL:
	S = new (Context) CompoundLiteralExpr(Empty);
	break;

	case EXPR_EXT_VECTOR_ELEMENT:
	S = new (Context) ExtVectorElementExpr(Empty);
	break;

	case EXPR_INIT_LIST:
	S = new (Context) InitListExpr(Empty);
	break;

	case EXPR_DESIGNATED_INIT:
	S = DesignatedInitExpr::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields] - 1);

	break;

	case EXPR_DESIGNATED_INIT_UPDATE:
	S = new (Context) DesignatedInitUpdateExpr(Empty);
	break;

	case EXPR_IMPLICIT_VALUE_INIT:
	S = new (Context) ImplicitValueInitExpr(Empty);
	break;

	case EXPR_NO_INIT:
	S = new (Context) NoInitExpr(Empty);
	break;

	case EXPR_ARRAY_INIT_LOOP:
	S = new (Context) ArrayInitLoopExpr(Empty);
	break;

	case EXPR_ARRAY_INIT_INDEX:
	S = new (Context) ArrayInitIndexExpr(Empty);
	break;

	case EXPR_VA_ARG:
	S = new (Context) VAArgExpr(Empty);
	break;

	case EXPR_ADDR_LABEL:
	S = new (Context) AddrLabelExpr(Empty);
	break;

	case EXPR_STMT:
	S = new (Context) StmtExpr(Empty);
	break;

	case EXPR_CHOOSE:
	S = new (Context) ChooseExpr(Empty);
	break;

	case EXPR_GNU_NULL:
	S = new (Context) GNUNullExpr(Empty);
	break;

	case EXPR_SHUFFLE_VECTOR:
	S = new (Context) ShuffleVectorExpr(Empty);
	break;

	case EXPR_CONVERT_VECTOR:
	S = new (Context) ConvertVectorExpr(Empty);
	break;

	case EXPR_BLOCK:
	S = new (Context) BlockExpr(Empty);
	break;

	case EXPR_GENERIC_SELECTION:
	S = new (Context) GenericSelectionExpr(Empty);
	break;

	case EXPR_OBJC_STRING_LITERAL:
	S = new (Context) ObjCStringLiteral(Empty);
	break;
	case EXPR_OBJC_BOXED_EXPRESSION:
	S = new (Context) ObjCBoxedExpr(Empty);
	break;
	case EXPR_OBJC_ARRAY_LITERAL:
	S = ObjCArrayLiteral::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields]);
	break;
	case EXPR_OBJC_DICTIONARY_LITERAL:
	S = ObjCDictionaryLiteral::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields],
	Record[ASTStmtReader::NumExprFields + 1]);
	break;
	case EXPR_OBJC_ENCODE:
	S = new (Context) ObjCEncodeExpr(Empty);
	break;
	case EXPR_OBJC_SELECTOR_EXPR:
	S = new (Context) ObjCSelectorExpr(Empty);
	break;
	case EXPR_OBJC_PROTOCOL_EXPR:
	S = new (Context) ObjCProtocolExpr(Empty);
	break;
	case EXPR_OBJC_IVAR_REF_EXPR:
	S = new (Context) ObjCIvarRefExpr(Empty);
	break;
	case EXPR_OBJC_PROPERTY_REF_EXPR:
	S = new (Context) ObjCPropertyRefExpr(Empty);
	break;
	case EXPR_OBJC_SUBSCRIPT_REF_EXPR:
	S = new (Context) ObjCSubscriptRefExpr(Empty);
	break;
	case EXPR_OBJC_KVC_REF_EXPR:
	llvm_unreachable("mismatching AST file");
	case EXPR_OBJC_MESSAGE_EXPR:
	S = ObjCMessageExpr::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields],
	Record[ASTStmtReader::NumExprFields + 1]);
	break;
	case EXPR_OBJC_ISA:
	S = new (Context) ObjCIsaExpr(Empty);
	break;
	case EXPR_OBJC_INDIRECT_COPY_RESTORE:
	S = new (Context) ObjCIndirectCopyRestoreExpr(Empty);
	break;
	case EXPR_OBJC_BRIDGED_CAST:
	S = new (Context) ObjCBridgedCastExpr(Empty);
	break;
	case STMT_OBJC_FOR_COLLECTION:
	S = new (Context) ObjCForCollectionStmt(Empty);
	break;
	case STMT_OBJC_CATCH:
	S = new (Context) ObjCAtCatchStmt(Empty);
	break;
	case STMT_OBJC_FINALLY:
	S = new (Context) ObjCAtFinallyStmt(Empty);
	break;
	case STMT_OBJC_AT_TRY:
	S = ObjCAtTryStmt::CreateEmpty(Context,
	Record[ASTStmtReader::NumStmtFields],
	Record[ASTStmtReader::NumStmtFields + 1]);
	break;
	case STMT_OBJC_AT_SYNCHRONIZED:
	S = new (Context) ObjCAtSynchronizedStmt(Empty);
	break;
	case STMT_OBJC_AT_THROW:
	S = new (Context) ObjCAtThrowStmt(Empty);
	break;
	case STMT_OBJC_AUTORELEASE_POOL:
	S = new (Context) ObjCAutoreleasePoolStmt(Empty);
	break;
	case EXPR_OBJC_BOOL_LITERAL:
	S = new (Context) ObjCBoolLiteralExpr(Empty);
	break;
	case EXPR_OBJC_AVAILABILITY_CHECK:
	S = new (Context) ObjCAvailabilityCheckExpr(Empty);
	break;
	case STMT_SEH_LEAVE:
	S = new (Context) SEHLeaveStmt(Empty);
	break;
	case STMT_SEH_EXCEPT:
	S = new (Context) SEHExceptStmt(Empty);
	break;
	case STMT_SEH_FINALLY:
	S = new (Context) SEHFinallyStmt(Empty);
	break;
	case STMT_SEH_TRY:
	S = new (Context) SEHTryStmt(Empty);
	break;
	case STMT_CXX_CATCH:
	S = new (Context) CXXCatchStmt(Empty);
	break;

	case STMT_CXX_TRY:
	S = CXXTryStmt::Create(Context, Empty,
	/NumHandlers=/Record[ASTStmtReader::NumStmtFields]);
	break;

	case STMT_CXX_FOR_RANGE:
	S = new (Context) CXXForRangeStmt(Empty);
	break;

	case STMT_MS_DEPENDENT_EXISTS:
	S = new (Context) MSDependentExistsStmt(SourceLocation(), true,
	NestedNameSpecifierLoc(),
	DeclarationNameInfo(),
	nullptr);
	break;

	case STMT_OMP_PARALLEL_DIRECTIVE:
	S =
	OMPParallelDirective::CreateEmpty(Context,
	Record[ASTStmtReader::NumStmtFields],
	Empty);
	break;

	case STMT_OMP_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_FOR_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPForDirective::CreateEmpty(Context, NumClauses, CollapsedNum,
	Empty);
	break;
	}

	case STMT_OMP_FOR_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPForSimdDirective::CreateEmpty(Context, NumClauses, CollapsedNum,
	Empty);
	break;
	}

	case STMT_OMP_SECTIONS_DIRECTIVE:
	S = OMPSectionsDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_SECTION_DIRECTIVE:
	S = OMPSectionDirective::CreateEmpty(Context, Empty);
	break;

	case STMT_OMP_SINGLE_DIRECTIVE:
	S = OMPSingleDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_MASTER_DIRECTIVE:
	S = OMPMasterDirective::CreateEmpty(Context, Empty);
	break;

	case STMT_OMP_CRITICAL_DIRECTIVE:
	S = OMPCriticalDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_PARALLEL_FOR_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPParallelForDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_PARALLEL_FOR_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPParallelForSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_PARALLEL_SECTIONS_DIRECTIVE:
	S = OMPParallelSectionsDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TASK_DIRECTIVE:
	S = OMPTaskDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TASKYIELD_DIRECTIVE:
	S = OMPTaskyieldDirective::CreateEmpty(Context, Empty);
	break;

	case STMT_OMP_BARRIER_DIRECTIVE:
	S = OMPBarrierDirective::CreateEmpty(Context, Empty);
	break;

	case STMT_OMP_TASKWAIT_DIRECTIVE:
	S = OMPTaskwaitDirective::CreateEmpty(Context, Empty);
	break;

	case STMT_OMP_TASKGROUP_DIRECTIVE:
	S = OMPTaskgroupDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_FLUSH_DIRECTIVE:
	S = OMPFlushDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_ORDERED_DIRECTIVE:
	S = OMPOrderedDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_ATOMIC_DIRECTIVE:
	S = OMPAtomicDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TARGET_DIRECTIVE:
	S = OMPTargetDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TARGET_DATA_DIRECTIVE:
	S = OMPTargetDataDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE:
	S = OMPTargetEnterDataDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE:
	S = OMPTargetExitDataDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TARGET_PARALLEL_DIRECTIVE:
	S = OMPTargetParallelDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetParallelForDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_UPDATE_DIRECTIVE:
	S = OMPTargetUpdateDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TEAMS_DIRECTIVE:
	S = OMPTeamsDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_CANCELLATION_POINT_DIRECTIVE:
	S = OMPCancellationPointDirective::CreateEmpty(Context, Empty);
	break;

	case STMT_OMP_CANCEL_DIRECTIVE:
	S = OMPCancelDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;

	case STMT_OMP_TASKLOOP_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTaskLoopDirective::CreateEmpty(Context, NumClauses, CollapsedNum,
	Empty);
	break;
	}

	case STMT_OMP_TASKLOOP_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTaskLoopSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_DISTRIBUTE_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPDistributeDirective::CreateEmpty(Context, NumClauses, CollapsedNum,
	Empty);
	break;
	}

	case STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPDistributeParallelForDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPDistributeParallelForSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum,
	Empty);
	break;
	}

	case STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPDistributeSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetParallelForSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_SIMD_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetSimdDirective::CreateEmpty(Context, NumClauses, CollapsedNum,
	Empty);
	break;
	}

	case STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTeamsDistributeDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE: {
	unsigned NumClauses = Record[ASTStmtReader::NumStmtFields];
	unsigned CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTeamsDistributeSimdDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTeamsDistributeParallelForSimdDirective::CreateEmpty(
	Context, NumClauses, CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTeamsDistributeParallelForDirective::CreateEmpty(
	Context, NumClauses, CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_TEAMS_DIRECTIVE: {
	S = OMPTargetTeamsDirective::CreateEmpty(
	Context, Record[ASTStmtReader::NumStmtFields], Empty);
	break;
	}

	case STMT_OMP_TARGET_TEAMS_DISTRIBUTE_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetTeamsDistributeDirective::CreateEmpty(Context, NumClauses,
	CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetTeamsDistributeParallelForDirective::CreateEmpty(
	Context, NumClauses, CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetTeamsDistributeParallelForSimdDirective::CreateEmpty(
	Context, NumClauses, CollapsedNum, Empty);
	break;
	}

	case STMT_OMP_TARGET_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE: {
	auto NumClauses = Record[ASTStmtReader::NumStmtFields];
	auto CollapsedNum = Record[ASTStmtReader::NumStmtFields + 1];
	S = OMPTargetTeamsDistributeSimdDirective::CreateEmpty(
	Context, NumClauses, CollapsedNum, Empty);
	break;
	}

	case EXPR_CXX_OPERATOR_CALL:
	S = new (Context) CXXOperatorCallExpr(Context, Empty);
	break;

	case EXPR_CXX_MEMBER_CALL:
	S = new (Context) CXXMemberCallExpr(Context, Empty);
	break;

	case EXPR_CXX_CONSTRUCT:
	S = new (Context) CXXConstructExpr(Empty);
	break;

	case EXPR_CXX_INHERITED_CTOR_INIT:
	S = new (Context) CXXInheritedCtorInitExpr(Empty);
	break;

	case EXPR_CXX_TEMPORARY_OBJECT:
	S = new (Context) CXXTemporaryObjectExpr(Empty);
	break;

	case EXPR_CXX_STATIC_CAST:
	S = CXXStaticCastExpr::CreateEmpty(Context,
	/*PathSize*/ Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_CXX_DYNAMIC_CAST:
	S = CXXDynamicCastExpr::CreateEmpty(Context,
	/*PathSize*/ Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_CXX_REINTERPRET_CAST:
	S = CXXReinterpretCastExpr::CreateEmpty(Context,
	/*PathSize*/ Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_CXX_CONST_CAST:
	S = CXXConstCastExpr::CreateEmpty(Context);
	break;

	case EXPR_CXX_FUNCTIONAL_CAST:
	S = CXXFunctionalCastExpr::CreateEmpty(Context,
	/*PathSize*/ Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_USER_DEFINED_LITERAL:
	S = new (Context) UserDefinedLiteral(Context, Empty);
	break;

	case EXPR_CXX_STD_INITIALIZER_LIST:
	S = new (Context) CXXStdInitializerListExpr(Empty);
	break;

	case EXPR_CXX_BOOL_LITERAL:
	S = new (Context) CXXBoolLiteralExpr(Empty);
	break;

	case EXPR_CXX_NULL_PTR_LITERAL:
	S = new (Context) CXXNullPtrLiteralExpr(Empty);
	break;
	case EXPR_CXX_TYPEID_EXPR:
	S = new (Context) CXXTypeidExpr(Empty, true);
	break;
	case EXPR_CXX_TYPEID_TYPE:
	S = new (Context) CXXTypeidExpr(Empty, false);
	break;
	case EXPR_CXX_UUIDOF_EXPR:
	S = new (Context) CXXUuidofExpr(Empty, true);
	break;
	case EXPR_CXX_PROPERTY_REF_EXPR:
	S = new (Context) MSPropertyRefExpr(Empty);
	break;
	case EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR:
	S = new (Context) MSPropertySubscriptExpr(Empty);
	break;
	case EXPR_CXX_UUIDOF_TYPE:
	S = new (Context) CXXUuidofExpr(Empty, false);
	break;
	case EXPR_CXX_THIS:
	S = new (Context) CXXThisExpr(Empty);
	break;
	case EXPR_CXX_THROW:
	S = new (Context) CXXThrowExpr(Empty);
	break;
	case EXPR_CXX_DEFAULT_ARG:
	S = new (Context) CXXDefaultArgExpr(Empty);
	break;
	case EXPR_CXX_DEFAULT_INIT:
	S = new (Context) CXXDefaultInitExpr(Empty);
	break;
	case EXPR_CXX_BIND_TEMPORARY:
	S = new (Context) CXXBindTemporaryExpr(Empty);
	break;

	case EXPR_CXX_SCALAR_VALUE_INIT:
	S = new (Context) CXXScalarValueInitExpr(Empty);
	break;
	case EXPR_CXX_NEW:
	S = new (Context) CXXNewExpr(Empty);
	break;
	case EXPR_CXX_DELETE:
	S = new (Context) CXXDeleteExpr(Empty);
	break;
	case EXPR_CXX_PSEUDO_DESTRUCTOR:
	S = new (Context) CXXPseudoDestructorExpr(Empty);
	break;

	case EXPR_EXPR_WITH_CLEANUPS:
	S = ExprWithCleanups::Create(Context, Empty,
	Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_CXX_DEPENDENT_SCOPE_MEMBER:
	S = CXXDependentScopeMemberExpr::CreateEmpty(Context,
	/*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields],
	/*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields]
	? Record[ASTStmtReader::NumExprFields + 1]
	: 0);
	break;

	case EXPR_CXX_DEPENDENT_SCOPE_DECL_REF:
	S = DependentScopeDeclRefExpr::CreateEmpty(Context,
	/*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields],
	/*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields]
	? Record[ASTStmtReader::NumExprFields + 1]
	: 0);
	break;

	case EXPR_CXX_UNRESOLVED_CONSTRUCT:
	S = CXXUnresolvedConstructExpr::CreateEmpty(Context,
	/*NumArgs=*/Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_CXX_UNRESOLVED_MEMBER:
	S = UnresolvedMemberExpr::CreateEmpty(Context,
	/*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields],
	/*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields]
	? Record[ASTStmtReader::NumExprFields + 1]
	: 0);
	break;

	case EXPR_CXX_UNRESOLVED_LOOKUP:
	S = UnresolvedLookupExpr::CreateEmpty(Context,
	/*HasTemplateKWAndArgsInfo=*/Record[ASTStmtReader::NumExprFields],
	/*NumTemplateArgs=*/Record[ASTStmtReader::NumExprFields]
	? Record[ASTStmtReader::NumExprFields + 1]
	: 0);
	break;

	case EXPR_TYPE_TRAIT:
	S = TypeTraitExpr::CreateDeserialized(Context,
	Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_ARRAY_TYPE_TRAIT:
	S = new (Context) ArrayTypeTraitExpr(Empty);
	break;

	case EXPR_CXX_EXPRESSION_TRAIT:
	S = new (Context) ExpressionTraitExpr(Empty);
	break;

	case EXPR_CXX_NOEXCEPT:
	S = new (Context) CXXNoexceptExpr(Empty);
	break;

	case EXPR_PACK_EXPANSION:
	S = new (Context) PackExpansionExpr(Empty);
	break;

	case EXPR_SIZEOF_PACK:
	S = SizeOfPackExpr::CreateDeserialized(
	Context,
	/*NumPartialArgs=*/Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_SUBST_NON_TYPE_TEMPLATE_PARM:
	S = new (Context) SubstNonTypeTemplateParmExpr(Empty);
	break;

	case EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK:
	S = new (Context) SubstNonTypeTemplateParmPackExpr(Empty);
	break;

	case EXPR_FUNCTION_PARM_PACK:
	S = FunctionParmPackExpr::CreateEmpty(Context,
	Record[ASTStmtReader::NumExprFields]);
	break;

	case EXPR_MATERIALIZE_TEMPORARY:
	S = new (Context) MaterializeTemporaryExpr(Empty);
	break;

	case EXPR_CXX_FOLD:
	S = new (Context) CXXFoldExpr(Empty);
	break;

	case EXPR_OPAQUE_VALUE:
	S = new (Context) OpaqueValueExpr(Empty);
	break;

	case EXPR_CUDA_KERNEL_CALL:
	S = new (Context) CUDAKernelCallExpr(Context, Empty);
	break;

	case EXPR_ASTYPE:
	S = new (Context) AsTypeExpr(Empty);
	break;

	case EXPR_PSEUDO_OBJECT: {
	unsigned numSemanticExprs = Record[ASTStmtReader::NumExprFields];
	S = PseudoObjectExpr::Create(Context, Empty, numSemanticExprs);
	break;
	}

	case EXPR_ATOMIC:
	S = new (Context) AtomicExpr(Empty);
	break;

	case EXPR_LAMBDA: {
	unsigned NumCaptures = Record[ASTStmtReader::NumExprFields];
	S = LambdaExpr::CreateDeserialized(Context, NumCaptures);
	break;
	}
	+
	+ case STMT_COROUTINE_BODY: {
	+ unsigned NumParams = Record[ASTStmtReader::NumStmtFields];
	+ S = CoroutineBodyStmt::Create(Context, Empty, NumParams);
	+ break;
	+ }
	+
	+ case STMT_CORETURN:
	+ S = new (Context) CoreturnStmt(Empty);
	+ break;
	+
	+ case EXPR_COAWAIT:
	+ S = new (Context) CoawaitExpr(Empty);
	+ break;
	+
	+ case EXPR_COYIELD:
	+ S = new (Context) CoyieldExpr(Empty);
	+ break;
	+
	+ case EXPR_DEPENDENT_COAWAIT:
	+ S = new (Context) DependentCoawaitExpr(Empty);
	+ break;
	+
	}

	// We hit a STMT_STOP, so we're done with this expression.
	if (Finished)
	break;

	++NumStatementsRead;

	if (S && !IsStmtReference) {
	Reader.Visit(S);
	StmtEntries[Cursor.GetCurrentBitNo()] = S;
	}

	assert(Record.getIdx() == Record.size() &&
	"Invalid deserialization of statement");
	StmtStack.push_back(S);
	}
	Done:
	assert(StmtStack.size() > PrevNumStmts && "Read too many sub-stmts!");
	assert(StmtStack.size() == PrevNumStmts + 1 && "Extra expressions on stack!");
	return StmtStack.pop_back_val();
	}
	Index: head/contrib/llvm/tools/clang/lib/Serialization/ASTWriterStmt.cpp
	===================================================================
	--- head/contrib/llvm/tools/clang/lib/Serialization/ASTWriterStmt.cpp (revision 322319)
	+++ head/contrib/llvm/tools/clang/lib/Serialization/ASTWriterStmt.cpp (revision 322320)
	@@ -1,2712 +1,2731 @@
	//===--- ASTWriterStmt.cpp - Statement and Expression Serialization -------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	///
	/// \file
	/// \brief Implements serialization for Statements and Expressions.
	///
	//===----------------------------------------------------------------------===//

	#include "clang/Serialization/ASTWriter.h"
	#include "clang/AST/ASTContext.h"
	#include "clang/AST/DeclCXX.h"
	#include "clang/AST/DeclObjC.h"
	#include "clang/AST/DeclTemplate.h"
	#include "clang/AST/StmtVisitor.h"
	#include "clang/Lex/Token.h"
	#include "llvm/Bitcode/BitstreamWriter.h"
	using namespace clang;

	//===----------------------------------------------------------------------===//
	// Statement/expression serialization
	//===----------------------------------------------------------------------===//

	namespace clang {

	/// Statement visitor that appends the fields of each visited node to an
	/// ASTRecordWriter and remembers which record code and abbreviation the
	/// finished record should be emitted with.
	class ASTStmtWriter : public StmtVisitor<ASTStmtWriter, void> {
	ASTWriter &Writer;
	ASTRecordWriter Record;

	// Record code chosen by the Visit method that handled the node. It starts
	// out as STMT_NULL_PTR (see the constructor), which Emit() treats as
	// "no Visit method handled this node".
	serialization::StmtCode Code;
	// Abbreviation passed to EmitStmt; initialized to 0 by the constructor.
	unsigned AbbrevToUse;

	public:
	ASTStmtWriter(ASTWriter &Writer, ASTWriter::RecordData &Record)
	: Writer(Writer), Record(Writer, Record),
	Code(serialization::STMT_NULL_PTR), AbbrevToUse(0) {}

	ASTStmtWriter(const ASTStmtWriter&) = delete;

	/// Emits the accumulated record through Record.EmitStmt using the stored
	/// Code and AbbrevToUse, and returns that call's result. Asserts that some
	/// Visit method replaced the initial STMT_NULL_PTR code.
	uint64_t Emit() {
	assert(Code != serialization::STMT_NULL_PTR &&
	"unhandled sub-statement writing AST file");
	return Record.EmitStmt(Code, AbbrevToUse);
	}

	void AddTemplateKWAndArgsInfo(const ASTTemplateKWAndArgsInfo &ArgInfo,
	const TemplateArgumentLoc *Args);

	void VisitStmt(Stmt *S);
	// Declare one Visit<Type>(Type *) member for every STMT(...) entry that
	// StmtNodes.inc expands.
	#define STMT(Type, Base) \
	void Visit##Type(Type *);
	#include "clang/AST/StmtNodes.inc"
	};
	}

	/// Writes the template-keyword, left-angle and right-angle locations stored
	/// in \p ArgInfo, followed by the first ArgInfo.NumTemplateArgs elements of
	/// \p Args.
	void ASTStmtWriter::AddTemplateKWAndArgsInfo(
	    const ASTTemplateKWAndArgsInfo &ArgInfo, const TemplateArgumentLoc *Args) {
	  Record.AddSourceLocation(ArgInfo.TemplateKWLoc);
	  Record.AddSourceLocation(ArgInfo.LAngleLoc);
	  Record.AddSourceLocation(ArgInfo.RAngleLoc);

	  const unsigned NumArgs = ArgInfo.NumTemplateArgs;
	  for (unsigned Idx = 0; Idx < NumArgs; ++Idx) {
	    Record.AddTemplateArgumentLoc(Args[Idx]);
	  }
	}

	/// Base visitor step: writes nothing to the record for Stmt itself.
	void ASTStmtWriter::VisitStmt(Stmt *S) {}

	/// Writes a NullStmt: its semicolon location, then its
	/// HasLeadingEmptyMacro flag. Record code: STMT_NULL.
	void ASTStmtWriter::VisitNullStmt(NullStmt *S) {
	  VisitStmt(S);

	  const auto SemiLoc = S->getSemiLoc();
	  Record.AddSourceLocation(SemiLoc);

	  const auto LeadingEmptyMacro = S->HasLeadingEmptyMacro;
	  Record.push_back(LeadingEmptyMacro);

	  Code = serialization::STMT_NULL;
	}

	/// Writes a CompoundStmt: the number of child statements, each child, and
	/// finally the left and right brace locations. Record code: STMT_COMPOUND.
	void ASTStmtWriter::VisitCompoundStmt(CompoundStmt *S) {
	  VisitStmt(S);

	  // The count is written ahead of the children themselves.
	  Record.push_back(S->size());
	  for (auto *Child : S->body()) {
	    Record.AddStmt(Child);
	  }

	  const auto LBraceLoc = S->getLBracLoc();
	  const auto RBraceLoc = S->getRBracLoc();
	  Record.AddSourceLocation(LBraceLoc);
	  Record.AddSourceLocation(RBraceLoc);

	  Code = serialization::STMT_COMPOUND;
	}

	/// Writes the fields shared by switch cases: the ID the writer reports for
	/// \p S, then the keyword and colon locations. Does not set the record code.
	void ASTStmtWriter::VisitSwitchCase(SwitchCase *S) {
	  VisitStmt(S);

	  const auto CaseID = Writer.getSwitchCaseID(S);
	  Record.push_back(CaseID);

	  const auto KeywordLoc = S->getKeywordLoc();
	  const auto ColonLoc = S->getColonLoc();
	  Record.AddSourceLocation(KeywordLoc);
	  Record.AddSourceLocation(ColonLoc);
	}

	/// Writes a CaseStmt after the common SwitchCase fields: LHS, RHS,
	/// sub-statement, then the ellipsis location. Record code: STMT_CASE.
	void ASTStmtWriter::VisitCaseStmt(CaseStmt *S) {
	  VisitSwitchCase(S);

	  auto *LHS = S->getLHS();
	  auto *RHS = S->getRHS();
	  auto *Body = S->getSubStmt();
	  Record.AddStmt(LHS);
	  Record.AddStmt(RHS);
	  Record.AddStmt(Body);

	  Record.AddSourceLocation(S->getEllipsisLoc());
	  Code = serialization::STMT_CASE;
	}

	/// Writes a DefaultStmt: the common SwitchCase fields followed by its
	/// sub-statement. Record code: STMT_DEFAULT.
	void ASTStmtWriter::VisitDefaultStmt(DefaultStmt *S) {
	  VisitSwitchCase(S);

	  auto *Body = S->getSubStmt();
	  Record.AddStmt(Body);

	  Code = serialization::STMT_DEFAULT;
	}

	/// Writes a LabelStmt: the label declaration, the labelled sub-statement,
	/// then the identifier location. Record code: STMT_LABEL.
	void ASTStmtWriter::VisitLabelStmt(LabelStmt *S) {
	  VisitStmt(S);

	  auto *Label = S->getDecl();
	  auto *Body = S->getSubStmt();
	  const auto IdentLoc = S->getIdentLoc();

	  Record.AddDeclRef(Label);
	  Record.AddStmt(Body);
	  Record.AddSourceLocation(IdentLoc);

	  Code = serialization::STMT_LABEL;
	}

	/// Writes an AttributedStmt: the attribute count, the attributes, the
	/// sub-statement, then the attribute location. Record code: STMT_ATTRIBUTED.
	void ASTStmtWriter::VisitAttributedStmt(AttributedStmt *S) {
	  VisitStmt(S);

	  // Count first, then the attributes themselves.
	  Record.push_back(S->getAttrs().size());
	  Record.AddAttributes(S->getAttrs());

	  auto *Body = S->getSubStmt();
	  Record.AddStmt(Body);

	  const auto AttrLoc = S->getAttrLoc();
	  Record.AddSourceLocation(AttrLoc);

	  Code = serialization::STMT_ATTRIBUTED;
	}

	/// Writes an IfStmt: constexpr flag, init statement, condition variable,
	/// condition, then-branch, else-branch, and the `if` / `else` locations.
	/// Record code: STMT_IF.
	void ASTStmtWriter::VisitIfStmt(IfStmt *S) {
	  VisitStmt(S);

	  const bool IsConstexpr = S->isConstexpr();
	  Record.push_back(IsConstexpr);

	  auto *Init = S->getInit();
	  auto *CondVar = S->getConditionVariable();
	  auto *Cond = S->getCond();
	  auto *Then = S->getThen();
	  auto *Else = S->getElse();
	  Record.AddStmt(Init);
	  Record.AddDeclRef(CondVar);
	  Record.AddStmt(Cond);
	  Record.AddStmt(Then);
	  Record.AddStmt(Else);

	  const auto IfLoc = S->getIfLoc();
	  const auto ElseLoc = S->getElseLoc();
	  Record.AddSourceLocation(IfLoc);
	  Record.AddSourceLocation(ElseLoc);

	  Code = serialization::STMT_IF;
	}

	/// Writes a SwitchStmt: init statement, condition variable, condition, body,
	/// `switch` location, the all-enum-cases-covered flag, and then one ID per
	/// entry of the statement's switch-case list. Record code: STMT_SWITCH.
	void ASTStmtWriter::VisitSwitchStmt(SwitchStmt *S) {
	  VisitStmt(S);

	  auto *Init = S->getInit();
	  auto *CondVar = S->getConditionVariable();
	  auto *Cond = S->getCond();
	  auto *Body = S->getBody();
	  Record.AddStmt(Init);
	  Record.AddDeclRef(CondVar);
	  Record.AddStmt(Cond);
	  Record.AddStmt(Body);

	  Record.AddSourceLocation(S->getSwitchLoc());
	  Record.push_back(S->isAllEnumCasesCovered());

	  // Walk the linked list of cases, writing the ID the writer hands back
	  // for each one.
	  SwitchCase *Case = S->getSwitchCaseList();
	  while (Case) {
	    Record.push_back(Writer.RecordSwitchCaseID(Case));
	    Case = Case->getNextSwitchCase();
	  }

	  Code = serialization::STMT_SWITCH;
	}

	/// Writes a WhileStmt: condition variable, condition, body, then the
	/// `while` location. Record code: STMT_WHILE.
	void ASTStmtWriter::VisitWhileStmt(WhileStmt *S) {
	  VisitStmt(S);

	  auto *CondVar = S->getConditionVariable();
	  auto *Cond = S->getCond();
	  auto *Body = S->getBody();
	  Record.AddDeclRef(CondVar);
	  Record.AddStmt(Cond);
	  Record.AddStmt(Body);

	  const auto WhileLoc = S->getWhileLoc();
	  Record.AddSourceLocation(WhileLoc);

	  Code = serialization::STMT_WHILE;
	}

	/// Writes a DoStmt: condition, body, then the `do`, `while` and
	/// right-paren locations. Record code: STMT_DO.
	void ASTStmtWriter::VisitDoStmt(DoStmt *S) {
	  VisitStmt(S);

	  auto *Cond = S->getCond();
	  auto *Body = S->getBody();
	  Record.AddStmt(Cond);
	  Record.AddStmt(Body);

	  const auto DoLoc = S->getDoLoc();
	  const auto WhileLoc = S->getWhileLoc();
	  const auto RParenLoc = S->getRParenLoc();
	  Record.AddSourceLocation(DoLoc);
	  Record.AddSourceLocation(WhileLoc);
	  Record.AddSourceLocation(RParenLoc);

	  Code = serialization::STMT_DO;
	}

	/// Writes a ForStmt: init, condition, condition variable, increment, body,
	/// then the `for`, left-paren and right-paren locations.
	/// Record code: STMT_FOR.
	void ASTStmtWriter::VisitForStmt(ForStmt *S) {
	  VisitStmt(S);

	  auto *Init = S->getInit();
	  auto *Cond = S->getCond();
	  auto *CondVar = S->getConditionVariable();
	  auto *Inc = S->getInc();
	  auto *Body = S->getBody();
	  Record.AddStmt(Init);
	  Record.AddStmt(Cond);
	  Record.AddDeclRef(CondVar);
	  Record.AddStmt(Inc);
	  Record.AddStmt(Body);

	  const auto ForLoc = S->getForLoc();
	  const auto LParenLoc = S->getLParenLoc();
	  const auto RParenLoc = S->getRParenLoc();
	  Record.AddSourceLocation(ForLoc);
	  Record.AddSourceLocation(LParenLoc);
	  Record.AddSourceLocation(RParenLoc);

	  Code = serialization::STMT_FOR;
	}

	/// Writes a GotoStmt: the target label declaration, then the `goto` and
	/// label locations. Record code: STMT_GOTO.
	void ASTStmtWriter::VisitGotoStmt(GotoStmt *S) {
	  VisitStmt(S);

	  auto *Label = S->getLabel();
	  Record.AddDeclRef(Label);

	  const auto GotoLoc = S->getGotoLoc();
	  const auto LabelLoc = S->getLabelLoc();
	  Record.AddSourceLocation(GotoLoc);
	  Record.AddSourceLocation(LabelLoc);

	  Code = serialization::STMT_GOTO;
	}

	/// Writes an IndirectGotoStmt: the `goto` and star locations, then the
	/// target expression. Record code: STMT_INDIRECT_GOTO.
	void ASTStmtWriter::VisitIndirectGotoStmt(IndirectGotoStmt *S) {
	  VisitStmt(S);

	  const auto GotoLoc = S->getGotoLoc();
	  const auto StarLoc = S->getStarLoc();
	  Record.AddSourceLocation(GotoLoc);
	  Record.AddSourceLocation(StarLoc);

	  auto *Target = S->getTarget();
	  Record.AddStmt(Target);

	  Code = serialization::STMT_INDIRECT_GOTO;
	}

	/// Writes a ContinueStmt: just its `continue` location.
	/// Record code: STMT_CONTINUE.
	void ASTStmtWriter::VisitContinueStmt(ContinueStmt *S) {
	  VisitStmt(S);

	  const auto ContinueLoc = S->getContinueLoc();
	  Record.AddSourceLocation(ContinueLoc);

	  Code = serialization::STMT_CONTINUE;
	}

	/// Writes a BreakStmt: just its `break` location. Record code: STMT_BREAK.
	void ASTStmtWriter::VisitBreakStmt(BreakStmt *S) {
	  VisitStmt(S);

	  const auto BreakLoc = S->getBreakLoc();
	  Record.AddSourceLocation(BreakLoc);

	  Code = serialization::STMT_BREAK;
	}

	/// Writes a ReturnStmt: the returned value, the `return` location, then the
	/// NRVO candidate declaration. Record code: STMT_RETURN.
	void ASTStmtWriter::VisitReturnStmt(ReturnStmt *S) {
	  VisitStmt(S);

	  auto *RetValue = S->getRetValue();
	  const auto ReturnLoc = S->getReturnLoc();
	  auto *NRVOCandidate = S->getNRVOCandidate();

	  Record.AddStmt(RetValue);
	  Record.AddSourceLocation(ReturnLoc);
	  Record.AddDeclRef(NRVOCandidate);

	  Code = serialization::STMT_RETURN;
	}

	/// Writes a DeclStmt: start and end locations followed by a reference to
	/// every declaration in its declaration group. Record code: STMT_DECL.
	void ASTStmtWriter::VisitDeclStmt(DeclStmt *S) {
	  VisitStmt(S);

	  const auto StartLoc = S->getStartLoc();
	  const auto EndLoc = S->getEndLoc();
	  Record.AddSourceLocation(StartLoc);
	  Record.AddSourceLocation(EndLoc);

	  DeclGroupRef Group = S->getDeclGroup();
	  for (auto *D : Group) {
	    Record.AddDeclRef(D);
	  }

	  Code = serialization::STMT_DECL;
	}

	/// Writes the fields common to asm statements: output, input and clobber
	/// counts, the `asm` location, then the volatile and simple flags.
	/// Does not set the record code.
	void ASTStmtWriter::VisitAsmStmt(AsmStmt *S) {
	  VisitStmt(S);

	  const unsigned NumOutputs = S->getNumOutputs();
	  const unsigned NumInputs = S->getNumInputs();
	  const unsigned NumClobbers = S->getNumClobbers();
	  Record.push_back(NumOutputs);
	  Record.push_back(NumInputs);
	  Record.push_back(NumClobbers);

	  Record.AddSourceLocation(S->getAsmLoc());

	  const bool IsVolatile = S->isVolatile();
	  const bool IsSimple = S->isSimple();
	  Record.push_back(IsVolatile);
	  Record.push_back(IsSimple);
	}

	/// Writes a GCCAsmStmt after the common asm fields: right-paren location,
	/// asm string, then (identifier, constraint literal, expression) for every
	/// output and every input, and finally each clobber string literal.
	/// Record code: STMT_GCCASM.
	void ASTStmtWriter::VisitGCCAsmStmt(GCCAsmStmt *S) {
	  VisitAsmStmt(S);

	  Record.AddSourceLocation(S->getRParenLoc());
	  Record.AddStmt(S->getAsmString());

	  // Output operands.
	  const unsigned NumOutputs = S->getNumOutputs();
	  for (unsigned Idx = 0; Idx < NumOutputs; ++Idx) {
	    Record.AddIdentifierRef(S->getOutputIdentifier(Idx));
	    Record.AddStmt(S->getOutputConstraintLiteral(Idx));
	    Record.AddStmt(S->getOutputExpr(Idx));
	  }

	  // Input operands.
	  const unsigned NumInputs = S->getNumInputs();
	  for (unsigned Idx = 0; Idx < NumInputs; ++Idx) {
	    Record.AddIdentifierRef(S->getInputIdentifier(Idx));
	    Record.AddStmt(S->getInputConstraintLiteral(Idx));
	    Record.AddStmt(S->getInputExpr(Idx));
	  }

	  // Clobber list.
	  const unsigned NumClobbers = S->getNumClobbers();
	  for (unsigned Idx = 0; Idx < NumClobbers; ++Idx) {
	    Record.AddStmt(S->getClobberStringLiteral(Idx));
	  }

	  Code = serialization::STMT_GCCASM;
	}

	// Writes an MSAsmStmt after the common asm fields: left-brace and end
	// locations, the asm token count, the asm string, every asm token, every
	// clobber string, then (expression, constraint string) for each output and
	// each input. Record code: STMT_MSASM.
	void ASTStmtWriter::VisitMSAsmStmt(MSAsmStmt *S) {
	VisitAsmStmt(S);
	Record.AddSourceLocation(S->getLBraceLoc());
	Record.AddSourceLocation(S->getEndLoc());
	Record.push_back(S->getNumAsmToks());
	Record.AddString(S->getAsmString());

	// Tokens
	for (unsigned I = 0, N = S->getNumAsmToks(); I != N; ++I) {
	// FIXME: Move this to ASTRecordWriter?
	Writer.AddToken(S->getAsmToks()[I], Record.getRecordData());
	}

	// Clobbers
	for (unsigned I = 0, N = S->getNumClobbers(); I != N; ++I) {
	Record.AddString(S->getClobber(I));
	}

	// Outputs
	// (The -/+ pair below is a diff hunk whose two sides read the same here.)
	- for (unsigned I = 0, N = S->getNumOutputs(); I != N; ++I) {
	+ for (unsigned I = 0, N = S->getNumOutputs(); I != N; ++I) {
	Record.AddStmt(S->getOutputExpr(I));
	Record.AddString(S->getOutputConstraint(I));
	}

	// Inputs
	for (unsigned I = 0, N = S->getNumInputs(); I != N; ++I) {
	Record.AddStmt(S->getInputExpr(I));
	Record.AddString(S->getInputConstraint(I));
	}

	Code = serialization::STMT_MSASM;
	}

	// Diff hunk: the removed (-) lines are the old stub that hit
	// llvm_unreachable; the added (+) lines write the size of getParamMoves(),
	// then every child statement, and set the record code to
	// STMT_COROUTINE_BODY.
	-void ASTStmtWriter::VisitCoroutineBodyStmt(CoroutineBodyStmt *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtWriter::VisitCoroutineBodyStmt(CoroutineBodyStmt *CoroStmt) {
	+ VisitStmt(CoroStmt);
	+ Record.push_back(CoroStmt->getParamMoves().size());
	+ for (Stmt *S : CoroStmt->children())
	+ Record.AddStmt(S);
	+ Code = serialization::STMT_COROUTINE_BODY;
	}

	// Diff hunk: the removed (-) lines are the old unimplemented stub; the
	// added (+) lines write the keyword location, the operand, the promise
	// call and the implicit flag, and set the record code to STMT_CORETURN.
	void ASTStmtWriter::VisitCoreturnStmt(CoreturnStmt *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+ VisitStmt(S);
	+ Record.AddSourceLocation(S->getKeywordLoc());
	+ Record.AddStmt(S->getOperand());
	+ Record.AddStmt(S->getPromiseCall());
	+ Record.push_back(S->isImplicit());
	+ Code = serialization::STMT_CORETURN;
	}

	// Diff hunk: the old VisitCoawaitExpr stub (-) is replaced at this position
	// by VisitCoroutineSuspendExpr (+), which writes the common Expr fields,
	// the keyword location, every child statement and then the opaque value.
	// It does not set the record code itself.
	-void ASTStmtWriter::VisitCoawaitExpr(CoawaitExpr *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtWriter::VisitCoroutineSuspendExpr(CoroutineSuspendExpr *E) {
	+ VisitExpr(E);
	+ Record.AddSourceLocation(E->getKeywordLoc());
	+ for (Stmt *S : E->children())
	+ Record.AddStmt(S);
	+ Record.AddStmt(E->getOpaqueValue());
	}

	// Diff hunk: the old VisitDependentCoawaitExpr stub (-) is replaced at this
	// position by VisitCoawaitExpr (+), which writes the suspend-expression
	// fields, then the implicit flag, and sets the record code to EXPR_COAWAIT.
	-void ASTStmtWriter::VisitDependentCoawaitExpr(DependentCoawaitExpr *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtWriter::VisitCoawaitExpr(CoawaitExpr *E) {
	+ VisitCoroutineSuspendExpr(E);
	+ Record.push_back(E->isImplicit());
	+ Code = serialization::EXPR_COAWAIT;
	}

	// Diff hunk: the old VisitCoyieldExpr stub (-) is replaced by two added (+)
	// definitions. VisitCoyieldExpr writes the suspend-expression fields and
	// sets the record code to EXPR_COYIELD. VisitDependentCoawaitExpr writes
	// the common Expr fields, the keyword location and every child statement,
	// and sets the record code to EXPR_DEPENDENT_COAWAIT.
	-void ASTStmtWriter::VisitCoyieldExpr(CoyieldExpr *S) {
	- // FIXME: Implement coroutine serialization.
	- llvm_unreachable("unimplemented");
	+void ASTStmtWriter::VisitCoyieldExpr(CoyieldExpr *E) {
	+ VisitCoroutineSuspendExpr(E);
	+ Code = serialization::EXPR_COYIELD;
	+}
	+
	+void ASTStmtWriter::VisitDependentCoawaitExpr(DependentCoawaitExpr *E) {
	+ VisitExpr(E);
	+ Record.AddSourceLocation(E->getKeywordLoc());
	+ for (Stmt *S : E->children())
	+ Record.AddStmt(S);
	+ Code = serialization::EXPR_DEPENDENT_COAWAIT;
	}

	/// Writes a CapturedStmt: the capture count, the captured declaration and
	/// region kind, the captured record declaration, every capture initializer,
	/// the captured body, and then (variable, kind, location) for each capture.
	/// Record code: STMT_CAPTURED.
	void ASTStmtWriter::VisitCapturedStmt(CapturedStmt *S) {
	  VisitStmt(S);
	  // NumCaptures
	  Record.push_back(std::distance(S->capture_begin(), S->capture_end()));

	  // CapturedDecl and captured region kind
	  Record.AddDeclRef(S->getCapturedDecl());
	  Record.push_back(S->getCapturedRegionKind());

	  Record.AddDeclRef(S->getCapturedRecordDecl());

	  // Capture inits
	  for (auto *I : S->capture_inits())
	    Record.AddStmt(I);

	  // Body
	  Record.AddStmt(S->getCapturedStmt());

	  // Captures
	  for (const auto &I : S->captures()) {
	    // Captures of `this` or of a variable array type get a null
	    // declaration reference in place of a captured variable.
	    if (I.capturesThis() || I.capturesVariableArrayType())
	      Record.AddDeclRef(nullptr);
	    else
	      Record.AddDeclRef(I.getCapturedVar());
	    Record.push_back(I.getCaptureKind());
	    Record.AddSourceLocation(I.getLocation());
	  }

	  Code = serialization::STMT_CAPTURED;
	}

	/// Writes the fields common to expressions: the type, the type-, value-
	/// and instantiation-dependence flags, the unexpanded-parameter-pack flag,
	/// then the value kind and object kind. Does not set the record code.
	void ASTStmtWriter::VisitExpr(Expr *E) {
	  VisitStmt(E);

	  Record.AddTypeRef(E->getType());

	  const bool TypeDependent = E->isTypeDependent();
	  const bool ValueDependent = E->isValueDependent();
	  const bool InstantiationDependent = E->isInstantiationDependent();
	  const bool HasUnexpandedPack = E->containsUnexpandedParameterPack();
	  Record.push_back(TypeDependent);
	  Record.push_back(ValueDependent);
	  Record.push_back(InstantiationDependent);
	  Record.push_back(HasUnexpandedPack);

	  const auto ValueKind = E->getValueKind();
	  const auto ObjectKind = E->getObjectKind();
	  Record.push_back(ValueKind);
	  Record.push_back(ObjectKind);
	}

	/// Writes a PredefinedExpr: location, identifier type, then the function
	/// name. Record code: EXPR_PREDEFINED.
	void ASTStmtWriter::VisitPredefinedExpr(PredefinedExpr *E) {
	  VisitExpr(E);

	  const auto Loc = E->getLocation();
	  Record.AddSourceLocation(Loc);

	  const auto IdentType = E->getIdentType();
	  Record.push_back(IdentType); // FIXME: stable encoding

	  auto *FunctionName = E->getFunctionName();
	  Record.AddStmt(FunctionName);

	  Code = serialization::EXPR_PREDEFINED;
	}

	/// Writes a DeclRefExpr. Five flags come first (has qualifier, found decl
	/// differs from decl, has template info, had multiple candidates, refers to
	/// enclosing variable or capture), then the template argument count when
	/// template info is present. The optional pieces those flags announce
	/// follow, then the declaration, location and declaration-name location.
	/// Record code: EXPR_DECL_REF.
	void ASTStmtWriter::VisitDeclRefExpr(DeclRefExpr *E) {
	  VisitExpr(E);

	  const bool HasQualifier = E->hasQualifier();
	  const bool HasDistinctFoundDecl = E->getDecl() != E->getFoundDecl();
	  const bool HasTemplateInfo = E->hasTemplateKWAndArgsInfo();

	  Record.push_back(HasQualifier);
	  Record.push_back(HasDistinctFoundDecl);
	  Record.push_back(HasTemplateInfo);
	  Record.push_back(E->hadMultipleCandidates());
	  Record.push_back(E->refersToEnclosingVariableOrCapture());

	  if (HasTemplateInfo) {
	    const unsigned NumTemplateArgs = E->getNumTemplateArgs();
	    Record.push_back(NumTemplateArgs);
	  }

	  const DeclarationName::NameKind Kind =
	      E->getDecl()->getDeclName().getNameKind();

	  // The abbreviation is selected only when none of the optional pieces
	  // is present and the name is a plain identifier.
	  const bool IsPlainReference =
	      !HasTemplateInfo && !HasQualifier && !HasDistinctFoundDecl;
	  if (IsPlainReference && Kind == DeclarationName::Identifier) {
	    AbbrevToUse = Writer.getDeclRefExprAbbrev();
	  }

	  if (HasQualifier) {
	    Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	  }

	  if (HasDistinctFoundDecl) {
	    Record.AddDeclRef(E->getFoundDecl());
	  }

	  if (HasTemplateInfo) {
	    AddTemplateKWAndArgsInfo(*E->getTrailingObjects<ASTTemplateKWAndArgsInfo>(),
	                             E->getTrailingObjects<TemplateArgumentLoc>());
	  }

	  Record.AddDeclRef(E->getDecl());
	  Record.AddSourceLocation(E->getLocation());
	  Record.AddDeclarationNameLoc(E->DNLoc, E->getDecl()->getDeclName());
	  Code = serialization::EXPR_DECL_REF;
	}

	/// Writes an IntegerLiteral: its location, then its value. Literals whose
	/// value is 32 bits wide use the writer's integer-literal abbreviation.
	/// Record code: EXPR_INTEGER_LITERAL.
	void ASTStmtWriter::VisitIntegerLiteral(IntegerLiteral *E) {
	  VisitExpr(E);

	  Record.AddSourceLocation(E->getLocation());
	  Record.AddAPInt(E->getValue());

	  const bool Is32Bit = E->getValue().getBitWidth() == 32;
	  if (Is32Bit)
	    AbbrevToUse = Writer.getIntegerLiteralAbbrev();

	  Code = serialization::EXPR_INTEGER_LITERAL;
	}

	/// Writes a FloatingLiteral: raw semantics, exactness flag, value, then
	/// location. Record code: EXPR_FLOATING_LITERAL.
	void ASTStmtWriter::VisitFloatingLiteral(FloatingLiteral *E) {
	  VisitExpr(E);

	  const auto RawSemantics = E->getRawSemantics();
	  const bool IsExact = E->isExact();
	  Record.push_back(RawSemantics);
	  Record.push_back(IsExact);

	  Record.AddAPFloat(E->getValue());
	  Record.AddSourceLocation(E->getLocation());

	  Code = serialization::EXPR_FLOATING_LITERAL;
	}

	/// Writes an ImaginaryLiteral: just its sub-expression.
	/// Record code: EXPR_IMAGINARY_LITERAL.
	void ASTStmtWriter::VisitImaginaryLiteral(ImaginaryLiteral *E) {
	  VisitExpr(E);

	  auto *SubExpr = E->getSubExpr();
	  Record.AddStmt(SubExpr);

	  Code = serialization::EXPR_IMAGINARY_LITERAL;
	}

	/// Writes a StringLiteral: byte length, number of concatenated tokens,
	/// kind, Pascal flag, the raw bytes, then one location per concatenated
	/// token. Record code: EXPR_STRING_LITERAL.
	void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) {
	  VisitExpr(E);

	  const unsigned NumConcatenated = E->getNumConcatenated();
	  Record.push_back(E->getByteLength());
	  Record.push_back(NumConcatenated);
	  Record.push_back(E->getKind());
	  Record.push_back(E->isPascal());

	  // FIXME: String data should be stored as a blob at the end of the
	  // StringLiteral. However, we can't do so now because we have no
	  // provision for coping with abbreviations when we're jumping around
	  // the AST file during deserialization.
	  const auto Bytes = E->getBytes();
	  Record.append(Bytes.begin(), Bytes.end());

	  for (unsigned Tok = 0; Tok < NumConcatenated; ++Tok) {
	    Record.AddSourceLocation(E->getStrTokenLoc(Tok));
	  }

	  Code = serialization::EXPR_STRING_LITERAL;
	}

	/// Writes a CharacterLiteral: value, location, then kind, always using the
	/// writer's character-literal abbreviation.
	/// Record code: EXPR_CHARACTER_LITERAL.
	void ASTStmtWriter::VisitCharacterLiteral(CharacterLiteral *E) {
	  VisitExpr(E);

	  const auto Value = E->getValue();
	  const auto Loc = E->getLocation();
	  const auto Kind = E->getKind();
	  Record.push_back(Value);
	  Record.AddSourceLocation(Loc);
	  Record.push_back(Kind);

	  AbbrevToUse = Writer.getCharacterLiteralAbbrev();
	  Code = serialization::EXPR_CHARACTER_LITERAL;
	}

	/// Writes a ParenExpr: left and right paren locations, then the
	/// sub-expression. Record code: EXPR_PAREN.
	void ASTStmtWriter::VisitParenExpr(ParenExpr *E) {
	  VisitExpr(E);

	  const auto LParenLoc = E->getLParen();
	  const auto RParenLoc = E->getRParen();
	  Record.AddSourceLocation(LParenLoc);
	  Record.AddSourceLocation(RParenLoc);

	  auto *SubExpr = E->getSubExpr();
	  Record.AddStmt(SubExpr);

	  Code = serialization::EXPR_PAREN;
	}

	/// Writes a ParenListExpr: the expression count, each expression, then the
	/// left and right paren locations. Record code: EXPR_PAREN_LIST.
	void ASTStmtWriter::VisitParenListExpr(ParenListExpr *E) {
	  VisitExpr(E);

	  Record.push_back(E->NumExprs);
	  for (unsigned Idx = 0; Idx < E->NumExprs; ++Idx) {
	    Record.AddStmt(E->Exprs[Idx]);
	  }

	  Record.AddSourceLocation(E->LParenLoc);
	  Record.AddSourceLocation(E->RParenLoc);

	  Code = serialization::EXPR_PAREN_LIST;
	}

	/// Writes a UnaryOperator: the operand, the opcode, then the operator
	/// location. Record code: EXPR_UNARY_OPERATOR.
	void ASTStmtWriter::VisitUnaryOperator(UnaryOperator *E) {
	  VisitExpr(E);

	  auto *Operand = E->getSubExpr();
	  Record.AddStmt(Operand);

	  const auto Opcode = E->getOpcode();
	  Record.push_back(Opcode); // FIXME: stable encoding

	  const auto OperatorLoc = E->getOperatorLoc();
	  Record.AddSourceLocation(OperatorLoc);

	  Code = serialization::EXPR_UNARY_OPERATOR;
	}

	/// Writes an OffsetOfExpr: component and expression counts, operator and
	/// right-paren locations, the type source info, one entry per component
	/// (kind, source range begin/end, and a kind-specific payload), and finally
	/// every index expression. Record code: EXPR_OFFSETOF.
	void ASTStmtWriter::VisitOffsetOfExpr(OffsetOfExpr *E) {
	  VisitExpr(E);

	  const unsigned NumComponents = E->getNumComponents();
	  const unsigned NumExpressions = E->getNumExpressions();
	  Record.push_back(NumComponents);
	  Record.push_back(NumExpressions);
	  Record.AddSourceLocation(E->getOperatorLoc());
	  Record.AddSourceLocation(E->getRParenLoc());
	  Record.AddTypeSourceInfo(E->getTypeSourceInfo());

	  for (unsigned Idx = 0; Idx < NumComponents; ++Idx) {
	    const OffsetOfNode &Node = E->getComponent(Idx);
	    Record.push_back(Node.getKind()); // FIXME: Stable encoding
	    Record.AddSourceLocation(Node.getSourceRange().getBegin());
	    Record.AddSourceLocation(Node.getSourceRange().getEnd());

	    // The payload that follows depends on the component kind.
	    switch (Node.getKind()) {
	    case OffsetOfNode::Array:
	      Record.push_back(Node.getArrayExprIndex());
	      break;

	    case OffsetOfNode::Field:
	      Record.AddDeclRef(Node.getField());
	      break;

	    case OffsetOfNode::Identifier:
	      Record.AddIdentifierRef(Node.getFieldName());
	      break;

	    case OffsetOfNode::Base:
	      Record.AddCXXBaseSpecifier(*Node.getBase());
	      break;
	    }
	  }

	  for (unsigned Idx = 0; Idx < NumExpressions; ++Idx) {
	    Record.AddStmt(E->getIndexExpr(Idx));
	  }

	  Code = serialization::EXPR_OFFSETOF;
	}

	/// Writes a UnaryExprOrTypeTraitExpr: the kind, then either the argument's
	/// type source info (type argument) or a 0 followed by the argument
	/// expression, and finally the operator and right-paren locations.
	/// Record code: EXPR_SIZEOF_ALIGN_OF.
	void ASTStmtWriter::VisitUnaryExprOrTypeTraitExpr(UnaryExprOrTypeTraitExpr *E) {
	  VisitExpr(E);

	  Record.push_back(E->getKind());

	  if (!E->isArgumentType()) {
	    // Expression argument: a 0 is written before the expression.
	    Record.push_back(0);
	    Record.AddStmt(E->getArgumentExpr());
	  } else {
	    Record.AddTypeSourceInfo(E->getArgumentTypeInfo());
	  }

	  const auto OperatorLoc = E->getOperatorLoc();
	  const auto RParenLoc = E->getRParenLoc();
	  Record.AddSourceLocation(OperatorLoc);
	  Record.AddSourceLocation(RParenLoc);

	  Code = serialization::EXPR_SIZEOF_ALIGN_OF;
	}

	/// Writes an ArraySubscriptExpr: LHS, RHS, then the right-bracket location.
	/// Record code: EXPR_ARRAY_SUBSCRIPT.
	void ASTStmtWriter::VisitArraySubscriptExpr(ArraySubscriptExpr *E) {
	  VisitExpr(E);

	  auto *LHS = E->getLHS();
	  auto *RHS = E->getRHS();
	  Record.AddStmt(LHS);
	  Record.AddStmt(RHS);

	  const auto RBracketLoc = E->getRBracketLoc();
	  Record.AddSourceLocation(RBracketLoc);

	  Code = serialization::EXPR_ARRAY_SUBSCRIPT;
	}

	/// Writes an OMPArraySectionExpr: base, lower bound, length, then the colon
	/// and right-bracket locations. Record code: EXPR_OMP_ARRAY_SECTION.
	void ASTStmtWriter::VisitOMPArraySectionExpr(OMPArraySectionExpr *E) {
	  VisitExpr(E);

	  auto *Base = E->getBase();
	  auto *LowerBound = E->getLowerBound();
	  auto *Length = E->getLength();
	  Record.AddStmt(Base);
	  Record.AddStmt(LowerBound);
	  Record.AddStmt(Length);

	  const auto ColonLoc = E->getColonLoc();
	  const auto RBracketLoc = E->getRBracketLoc();
	  Record.AddSourceLocation(ColonLoc);
	  Record.AddSourceLocation(RBracketLoc);

	  Code = serialization::EXPR_OMP_ARRAY_SECTION;
	}

	/// Writes a CallExpr: argument count, right-paren location, callee, then
	/// every argument. Record code: EXPR_CALL.
	void ASTStmtWriter::VisitCallExpr(CallExpr *E) {
	  VisitExpr(E);

	  Record.push_back(E->getNumArgs());
	  Record.AddSourceLocation(E->getRParenLoc());
	  Record.AddStmt(E->getCallee());

	  CallExpr::arg_iterator Cur = E->arg_begin();
	  const CallExpr::arg_iterator Last = E->arg_end();
	  while (Cur != Last) {
	    Record.AddStmt(*Cur);
	    ++Cur;
	  }

	  Code = serialization::EXPR_CALL;
	}

	/// Writes a MemberExpr without going through VisitExpr. Layout: qualifier
	/// flag (plus qualifier), template-info flag (plus keyword location,
	/// argument count, angle locations and arguments), the multiple-candidates
	/// flag, the found declaration and its access, type, value kind, object
	/// kind, base, member declaration, member location, arrow flag, operator
	/// location and member name location. Record code: EXPR_MEMBER.
	void ASTStmtWriter::VisitMemberExpr(MemberExpr *E) {
	  // Don't call VisitExpr, we'll write everything here.

	  const bool HasQualifier = E->hasQualifier();
	  Record.push_back(HasQualifier);
	  if (HasQualifier) {
	    Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	  }

	  const bool HasTemplateInfo = E->HasTemplateKWAndArgsInfo;
	  Record.push_back(HasTemplateInfo);
	  if (HasTemplateInfo) {
	    Record.AddSourceLocation(E->getTemplateKeywordLoc());
	    const unsigned NumTemplateArgs = E->getNumTemplateArgs();
	    Record.push_back(NumTemplateArgs);
	    Record.AddSourceLocation(E->getLAngleLoc());
	    Record.AddSourceLocation(E->getRAngleLoc());
	    for (unsigned Idx = 0; Idx < NumTemplateArgs; ++Idx) {
	      Record.AddTemplateArgumentLoc(E->getTemplateArgs()[Idx]);
	    }
	  }

	  Record.push_back(E->hadMultipleCandidates());

	  // Found declaration together with its access specifier.
	  DeclAccessPair Found = E->getFoundDecl();
	  Record.AddDeclRef(Found.getDecl());
	  Record.push_back(Found.getAccess());

	  // The Expr-level fields this function writes by hand.
	  Record.AddTypeRef(E->getType());
	  Record.push_back(E->getValueKind());
	  Record.push_back(E->getObjectKind());

	  Record.AddStmt(E->getBase());
	  Record.AddDeclRef(E->getMemberDecl());
	  Record.AddSourceLocation(E->getMemberLoc());
	  Record.push_back(E->isArrow());
	  Record.AddSourceLocation(E->getOperatorLoc());
	  Record.AddDeclarationNameLoc(E->MemberDNLoc,
	                               E->getMemberDecl()->getDeclName());

	  Code = serialization::EXPR_MEMBER;
	}

	/// Serialize an ObjCIsaExpr: VisitExpr output, the base expression, the
	/// isa-member and operator locations, then the isArrow flag.
	void ASTStmtWriter::VisitObjCIsaExpr(ObjCIsaExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getBase());
	Record.AddSourceLocation(E->getIsaMemberLoc());
	Record.AddSourceLocation(E->getOpLoc());
	Record.push_back(E->isArrow());
	Code = serialization::EXPR_OBJC_ISA;
	}

	/// Serialize an ObjCIndirectCopyRestoreExpr: VisitExpr output, the
	/// sub-expression, then the shouldCopy flag.
	void ASTStmtWriter::
	VisitObjCIndirectCopyRestoreExpr(ObjCIndirectCopyRestoreExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getSubExpr());
	Record.push_back(E->shouldCopy());
	Code = serialization::EXPR_OBJC_INDIRECT_COPY_RESTORE;
	}

	/// Serialize an ObjCBridgedCastExpr: VisitExplicitCastExpr output, the
	/// l-paren and bridge-keyword locations, then the bridge kind.
	void ASTStmtWriter::VisitObjCBridgedCastExpr(ObjCBridgedCastExpr *E) {
	VisitExplicitCastExpr(E);
	Record.AddSourceLocation(E->getLParenLoc());
	Record.AddSourceLocation(E->getBridgeKeywordLoc());
	// The bridge kind is written as its raw value (see FIXME).
	Record.push_back(E->getBridgeKind()); // FIXME: Stable encoding
	Code = serialization::EXPR_OBJC_BRIDGED_CAST;
	}

	/// Write the data shared by all cast expressions: VisitExpr output, the
	/// base-path size, the sub-expression, the cast kind, and each base
	/// specifier on the path. Code is left untouched here; callers in this
	/// file such as VisitImplicitCastExpr assign it.
	void ASTStmtWriter::VisitCastExpr(CastExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->path_size());
	  Record.AddStmt(E->getSubExpr());
	  Record.push_back(E->getCastKind()); // FIXME: stable encoding

	  // Base-specifier path, in iteration order.
	  CastExpr::path_iterator PathIt = E->path_begin(), PathEnd = E->path_end();
	  for (; PathIt != PathEnd; ++PathIt) {
	    Record.AddCXXBaseSpecifier(**PathIt);
	  }
	}

	/// Serialize a BinaryOperator: VisitExpr output, the LHS and RHS operands,
	/// the opcode, the operator location, then the integer form of the FP
	/// features. Also invoked by VisitCompoundAssignOperator, which overwrites
	/// Code afterwards.
	void ASTStmtWriter::VisitBinaryOperator(BinaryOperator *E) {
	VisitExpr(E);
	Record.AddStmt(E->getLHS());
	Record.AddStmt(E->getRHS());
	Record.push_back(E->getOpcode()); // FIXME: stable encoding
	Record.AddSourceLocation(E->getOperatorLoc());
	Record.push_back(E->getFPFeatures().getInt());
	Code = serialization::EXPR_BINARY_OPERATOR;
	}

	/// Serialize a CompoundAssignOperator: VisitBinaryOperator output followed
	/// by the computation LHS type and the computation result type.
	void ASTStmtWriter::VisitCompoundAssignOperator(CompoundAssignOperator *E) {
	VisitBinaryOperator(E);
	Record.AddTypeRef(E->getComputationLHSType());
	Record.AddTypeRef(E->getComputationResultType());
	// Replaces the code set by VisitBinaryOperator.
	Code = serialization::EXPR_COMPOUND_ASSIGN_OPERATOR;
	}

	/// Serialize a ConditionalOperator: VisitExpr output, the condition, LHS
	/// and RHS sub-expressions, then the '?' and ':' locations.
	void ASTStmtWriter::VisitConditionalOperator(ConditionalOperator *E) {
	VisitExpr(E);
	Record.AddStmt(E->getCond());
	Record.AddStmt(E->getLHS());
	Record.AddStmt(E->getRHS());
	Record.AddSourceLocation(E->getQuestionLoc());
	Record.AddSourceLocation(E->getColonLoc());
	Code = serialization::EXPR_CONDITIONAL_OPERATOR;
	}

	/// Serialize a BinaryConditionalOperator: VisitExpr output, the opaque
	/// value, the common, condition, true and false expressions, then the '?'
	/// and ':' locations.
	void
	ASTStmtWriter::VisitBinaryConditionalOperator(BinaryConditionalOperator *E) {
	VisitExpr(E);
	Record.AddStmt(E->getOpaqueValue());
	Record.AddStmt(E->getCommon());
	Record.AddStmt(E->getCond());
	Record.AddStmt(E->getTrueExpr());
	Record.AddStmt(E->getFalseExpr());
	Record.AddSourceLocation(E->getQuestionLoc());
	Record.AddSourceLocation(E->getColonLoc());
	Code = serialization::EXPR_BINARY_CONDITIONAL_OPERATOR;
	}

	/// Serialize an ImplicitCastExpr. The shared cast data comes from
	/// VisitCastExpr; when the cast has an empty base path, the writer's
	/// implicit-cast abbreviation is selected.
	void ASTStmtWriter::VisitImplicitCastExpr(ImplicitCastExpr *E) {
	  VisitCastExpr(E);

	  const bool PathIsEmpty = (E->path_size() == 0);
	  if (PathIsEmpty) {
	    AbbrevToUse = Writer.getExprImplicitCastAbbrev();
	  }

	  Code = serialization::EXPR_IMPLICIT_CAST;
	}

	/// Write the data shared by explicit casts: VisitCastExpr output followed
	/// by the type info as written. Code is not set here; callers in this file
	/// (e.g. VisitCStyleCastExpr) assign it.
	void ASTStmtWriter::VisitExplicitCastExpr(ExplicitCastExpr *E) {
	VisitCastExpr(E);
	Record.AddTypeSourceInfo(E->getTypeInfoAsWritten());
	}

	/// Serialize a CStyleCastExpr: VisitExplicitCastExpr output followed by
	/// the l-paren and r-paren locations.
	void ASTStmtWriter::VisitCStyleCastExpr(CStyleCastExpr *E) {
	VisitExplicitCastExpr(E);
	Record.AddSourceLocation(E->getLParenLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_CSTYLE_CAST;
	}

	/// Serialize a CompoundLiteralExpr: VisitExpr output, the l-paren
	/// location, the type source info, the initializer, then the file-scope
	/// flag.
	void ASTStmtWriter::VisitCompoundLiteralExpr(CompoundLiteralExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getLParenLoc());
	Record.AddTypeSourceInfo(E->getTypeSourceInfo());
	Record.AddStmt(E->getInitializer());
	Record.push_back(E->isFileScope());
	Code = serialization::EXPR_COMPOUND_LITERAL;
	}

	/// Serialize an ExtVectorElementExpr: VisitExpr output, the base
	/// expression, the accessor identifier, then the accessor location.
	void ASTStmtWriter::VisitExtVectorElementExpr(ExtVectorElementExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getBase());
	Record.AddIdentifierRef(&E->getAccessor());
	Record.AddSourceLocation(E->getAccessorLoc());
	Code = serialization::EXPR_EXT_VECTOR_ELEMENT;
	}

	/// Serialize an InitListExpr: VisitExpr output, the syntactic form, the
	/// brace locations, either the array filler or the initialized union field
	/// (preceded by a flag saying which), the array-range-designator flag, the
	/// initializer count, and finally every initializer.
	void ASTStmtWriter::VisitInitListExpr(InitListExpr *E) {
	  VisitExpr(E);
	  // NOTE: only add the (possibly null) syntactic form.
	  // No need to serialize the isSemanticForm flag and the semantic form.
	  Record.AddStmt(E->getSyntacticForm());
	  Record.AddSourceLocation(E->getLBraceLoc());
	  Record.AddSourceLocation(E->getRBraceLoc());

	  const bool HasFiller = E->ArrayFillerOrUnionFieldInit.is<Expr*>();
	  Record.push_back(HasFiller);
	  if (!HasFiller) {
	    Record.AddDeclRef(E->getInitializedFieldInUnion());
	  } else {
	    Record.AddStmt(E->getArrayFiller());
	  }

	  Record.push_back(E->hadArrayRangeDesignator());
	  Record.push_back(E->getNumInits());

	  // ArrayFiller may have filled "holes" due to designated initializer.
	  // Such slots are written as null to indicate that the filler goes in
	  // that place; without a filler every initializer is written as-is.
	  Expr *Filler = HasFiller ? E->getArrayFiller() : nullptr;
	  for (unsigned Idx = 0, Count = E->getNumInits(); Idx != Count; ++Idx) {
	    Expr *Init = E->getInit(Idx);
	    Record.AddStmt(HasFiller && Init == Filler ? nullptr : Init);
	  }

	  Code = serialization::EXPR_INIT_LIST;
	}

	/// Serialize a DesignatedInitExpr: VisitExpr output, the sub-expression
	/// count and sub-expressions, the '=' / ':' location, the GNU-syntax flag,
	/// and then one tagged entry per designator.
	void ASTStmtWriter::VisitDesignatedInitExpr(DesignatedInitExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumSubExprs());
	  for (unsigned Idx = 0, Count = E->getNumSubExprs(); Idx != Count; ++Idx) {
	    Record.AddStmt(E->getSubExpr(Idx));
	  }
	  Record.AddSourceLocation(E->getEqualOrColonLoc());
	  Record.push_back(E->usesGNUSyntax());

	  for (const DesignatedInitExpr::Designator &Desig : E->designators()) {
	    // Field designator: written by declaration when one is available,
	    // otherwise by name.
	    if (Desig.isFieldDesignator()) {
	      if (FieldDecl *Field = Desig.getField()) {
	        Record.push_back(serialization::DESIG_FIELD_DECL);
	        Record.AddDeclRef(Field);
	      } else {
	        Record.push_back(serialization::DESIG_FIELD_NAME);
	        Record.AddIdentifierRef(Desig.getFieldName());
	      }
	      Record.AddSourceLocation(Desig.getDotLoc());
	      Record.AddSourceLocation(Desig.getFieldLoc());
	      continue;
	    }

	    // Single-element array designator.
	    if (Desig.isArrayDesignator()) {
	      Record.push_back(serialization::DESIG_ARRAY);
	      Record.push_back(Desig.getFirstExprIndex());
	      Record.AddSourceLocation(Desig.getLBracketLoc());
	      Record.AddSourceLocation(Desig.getRBracketLoc());
	      continue;
	    }

	    // Anything else must be an array-range designator.
	    assert(Desig.isArrayRangeDesignator() && "Unknown designator");
	    Record.push_back(serialization::DESIG_ARRAY_RANGE);
	    Record.push_back(Desig.getFirstExprIndex());
	    Record.AddSourceLocation(Desig.getLBracketLoc());
	    Record.AddSourceLocation(Desig.getEllipsisLoc());
	    Record.AddSourceLocation(Desig.getRBracketLoc());
	  }

	  Code = serialization::EXPR_DESIGNATED_INIT;
	}

	/// Serialize a DesignatedInitUpdateExpr: VisitExpr output followed by the
	/// base and updater sub-expressions.
	void ASTStmtWriter::VisitDesignatedInitUpdateExpr(DesignatedInitUpdateExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getBase());
	Record.AddStmt(E->getUpdater());
	Code = serialization::EXPR_DESIGNATED_INIT_UPDATE;
	}

	/// Serialize a NoInitExpr; nothing is written beyond what VisitExpr emits.
	void ASTStmtWriter::VisitNoInitExpr(NoInitExpr *E) {
	VisitExpr(E);
	Code = serialization::EXPR_NO_INIT;
	}

	/// Serialize an ArrayInitLoopExpr: VisitExpr output followed by the two
	/// entries of its SubExprs array, in index order.
	void ASTStmtWriter::VisitArrayInitLoopExpr(ArrayInitLoopExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->SubExprs[0]);
	Record.AddStmt(E->SubExprs[1]);
	Code = serialization::EXPR_ARRAY_INIT_LOOP;
	}

	/// Serialize an ArrayInitIndexExpr; nothing is written beyond what
	/// VisitExpr emits.
	void ASTStmtWriter::VisitArrayInitIndexExpr(ArrayInitIndexExpr *E) {
	VisitExpr(E);
	Code = serialization::EXPR_ARRAY_INIT_INDEX;
	}

	/// Serialize an ImplicitValueInitExpr; nothing is written beyond what
	/// VisitExpr emits.
	void ASTStmtWriter::VisitImplicitValueInitExpr(ImplicitValueInitExpr *E) {
	VisitExpr(E);
	Code = serialization::EXPR_IMPLICIT_VALUE_INIT;
	}

	/// Serialize a VAArgExpr: VisitExpr output, the sub-expression, the
	/// written type info, the builtin and r-paren locations, then the
	/// Microsoft-ABI flag.
	void ASTStmtWriter::VisitVAArgExpr(VAArgExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getSubExpr());
	Record.AddTypeSourceInfo(E->getWrittenTypeInfo());
	Record.AddSourceLocation(E->getBuiltinLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Record.push_back(E->isMicrosoftABI());
	Code = serialization::EXPR_VA_ARG;
	}

	/// Serialize an AddrLabelExpr: VisitExpr output, the '&&' and label
	/// locations, then a reference to the label declaration.
	void ASTStmtWriter::VisitAddrLabelExpr(AddrLabelExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getAmpAmpLoc());
	Record.AddSourceLocation(E->getLabelLoc());
	Record.AddDeclRef(E->getLabel());
	Code = serialization::EXPR_ADDR_LABEL;
	}

	/// Serialize a StmtExpr: VisitExpr output, the sub-statement, then the
	/// l-paren and r-paren locations.
	void ASTStmtWriter::VisitStmtExpr(StmtExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getSubStmt());
	Record.AddSourceLocation(E->getLParenLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_STMT;
	}

	/// Serialize a ChooseExpr: VisitExpr output, the condition, LHS and RHS
	/// sub-expressions, the builtin and r-paren locations, then whether the
	/// condition is true.
	void ASTStmtWriter::VisitChooseExpr(ChooseExpr *E) {
	  VisitExpr(E);
	  Record.AddStmt(E->getCond());
	  Record.AddStmt(E->getLHS());
	  Record.AddStmt(E->getRHS());
	  Record.AddSourceLocation(E->getBuiltinLoc());
	  Record.AddSourceLocation(E->getRParenLoc());

	  // A dependent condition is always recorded as false; isConditionTrue()
	  // is only consulted when the condition is not dependent.
	  const bool CondIsTrue = !E->isConditionDependent() && E->isConditionTrue();
	  Record.push_back(CondIsTrue);

	  Code = serialization::EXPR_CHOOSE;
	}

	/// Serialize a GNUNullExpr: VisitExpr output followed by the token
	/// location.
	void ASTStmtWriter::VisitGNUNullExpr(GNUNullExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getTokenLocation());
	Code = serialization::EXPR_GNU_NULL;
	}

	/// Serialize a ShuffleVectorExpr: VisitExpr output, the sub-expression
	/// count, each sub-expression in index order, then the builtin and r-paren
	/// locations.
	void ASTStmtWriter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumSubExprs());

	  unsigned Idx = 0;
	  unsigned Count = E->getNumSubExprs();
	  while (Idx != Count) {
	    Record.AddStmt(E->getExpr(Idx));
	    ++Idx;
	  }

	  Record.AddSourceLocation(E->getBuiltinLoc());
	  Record.AddSourceLocation(E->getRParenLoc());
	  Code = serialization::EXPR_SHUFFLE_VECTOR;
	}

	/// Serialize a ConvertVectorExpr: VisitExpr output, the builtin and
	/// r-paren locations, the type source info, then the source expression.
	void ASTStmtWriter::VisitConvertVectorExpr(ConvertVectorExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getBuiltinLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Record.AddTypeSourceInfo(E->getTypeSourceInfo());
	Record.AddStmt(E->getSrcExpr());
	Code = serialization::EXPR_CONVERT_VECTOR;
	}

	/// Serialize a BlockExpr: VisitExpr output followed by a reference to the
	/// block declaration.
	void ASTStmtWriter::VisitBlockExpr(BlockExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getBlockDecl());
	Code = serialization::EXPR_BLOCK;
	}

	/// Serialize a GenericSelectionExpr: VisitExpr output, the association
	/// count, the controlling expression, each (type, expression) association,
	/// the result index, then the generic, default and r-paren locations.
	void ASTStmtWriter::VisitGenericSelectionExpr(GenericSelectionExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumAssocs());

	  Record.AddStmt(E->getControllingExpr());
	  for (unsigned Assoc = 0, NumAssocs = E->getNumAssocs(); Assoc != NumAssocs;
	       ++Assoc) {
	    Record.AddTypeSourceInfo(E->getAssocTypeSourceInfo(Assoc));
	    Record.AddStmt(E->getAssocExpr(Assoc));
	  }

	  // A result-dependent selection is recorded with the sentinel -1U.
	  if (E->isResultDependent()) {
	    Record.push_back(-1U);
	  } else {
	    Record.push_back(E->getResultIndex());
	  }

	  Record.AddSourceLocation(E->getGenericLoc());
	  Record.AddSourceLocation(E->getDefaultLoc());
	  Record.AddSourceLocation(E->getRParenLoc());
	  Code = serialization::EXPR_GENERIC_SELECTION;
	}

	/// Serialize a PseudoObjectExpr: VisitExpr output, the semantic-expression
	/// count, the encoded result index, the syntactic form, then each semantic
	/// expression in iteration order.
	void ASTStmtWriter::VisitPseudoObjectExpr(PseudoObjectExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumSemanticExprs());

	  // Push the result index. Currently, this needs to exactly match
	  // the encoding used internally for ResultIndex: 0 stands for "no
	  // result", any other index is stored shifted up by one.
	  const unsigned RawIndex = E->getResultExprIndex();
	  unsigned EncodedIndex = 0;
	  if (RawIndex != PseudoObjectExpr::NoResult) {
	    EncodedIndex = RawIndex + 1;
	  }
	  Record.push_back(EncodedIndex);

	  Record.AddStmt(E->getSyntacticForm());
	  PseudoObjectExpr::semantics_iterator Sem = E->semantics_begin(),
	                                       SemEnd = E->semantics_end();
	  for (; Sem != SemEnd; ++Sem) {
	    Record.AddStmt(*Sem);
	  }

	  Code = serialization::EXPR_PSEUDO_OBJECT;
	}

	/// Serialize an AtomicExpr: VisitExpr output, the atomic operation, each
	/// sub-expression in index order, then the builtin and r-paren locations.
	void ASTStmtWriter::VisitAtomicExpr(AtomicExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getOp());

	  unsigned Idx = 0;
	  unsigned Count = E->getNumSubExprs();
	  while (Idx != Count) {
	    Record.AddStmt(E->getSubExprs()[Idx]);
	    ++Idx;
	  }

	  Record.AddSourceLocation(E->getBuiltinLoc());
	  Record.AddSourceLocation(E->getRParenLoc());
	  Code = serialization::EXPR_ATOMIC;
	}

	//===----------------------------------------------------------------------===//
	// Objective-C Expressions and Statements.
	//===----------------------------------------------------------------------===//

	/// Serialize an ObjCStringLiteral: VisitExpr output, the wrapped string
	/// expression, then the '@' location.
	void ASTStmtWriter::VisitObjCStringLiteral(ObjCStringLiteral *E) {
	VisitExpr(E);
	Record.AddStmt(E->getString());
	Record.AddSourceLocation(E->getAtLoc());
	Code = serialization::EXPR_OBJC_STRING_LITERAL;
	}

	/// Serialize an ObjCBoxedExpr: VisitExpr output, the sub-expression, the
	/// boxing method declaration, then the source range.
	void ASTStmtWriter::VisitObjCBoxedExpr(ObjCBoxedExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getSubExpr());
	Record.AddDeclRef(E->getBoxingMethod());
	Record.AddSourceRange(E->getSourceRange());
	Code = serialization::EXPR_OBJC_BOXED_EXPRESSION;
	}

	/// Serialize an ObjCArrayLiteral: VisitExpr output, the element count,
	/// each element in index order, the array-with-objects method
	/// declaration, then the source range.
	void ASTStmtWriter::VisitObjCArrayLiteral(ObjCArrayLiteral *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumElements());
	  for (unsigned Idx = 0, Count = E->getNumElements(); Idx != Count; ++Idx) {
	    Record.AddStmt(E->getElement(Idx));
	  }
	  Record.AddDeclRef(E->getArrayWithObjectsMethod());
	  Record.AddSourceRange(E->getSourceRange());
	  Code = serialization::EXPR_OBJC_ARRAY_LITERAL;
	}

	/// Serialize an ObjCDictionaryLiteral: VisitExpr output, the element
	/// count, the pack-expansion flag, each key/value pair (with expansion
	/// data when the flag is set), the dictionary-with-objects method
	/// declaration, then the source range.
	void ASTStmtWriter::VisitObjCDictionaryLiteral(ObjCDictionaryLiteral *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumElements());
	  Record.push_back(E->HasPackExpansions);

	  for (unsigned Idx = 0, Count = E->getNumElements(); Idx != Count; ++Idx) {
	    ObjCDictionaryElement KeyValue = E->getKeyValueElement(Idx);
	    Record.AddStmt(KeyValue.Key);
	    Record.AddStmt(KeyValue.Value);
	    if (!E->HasPackExpansions)
	      continue;

	    // Expansion data: the ellipsis location, then the expansion count
	    // shifted up by one so that 0 can stand for "no count".
	    Record.AddSourceLocation(KeyValue.EllipsisLoc);
	    const unsigned EncodedExpansions =
	        KeyValue.NumExpansions ? *KeyValue.NumExpansions + 1 : 0;
	    Record.push_back(EncodedExpansions);
	  }

	  Record.AddDeclRef(E->getDictWithObjectsMethod());
	  Record.AddSourceRange(E->getSourceRange());
	  Code = serialization::EXPR_OBJC_DICTIONARY_LITERAL;
	}

	/// Serialize an ObjCEncodeExpr: VisitExpr output, the encoded type's
	/// source info, then the '@' and r-paren locations.
	void ASTStmtWriter::VisitObjCEncodeExpr(ObjCEncodeExpr *E) {
	VisitExpr(E);
	Record.AddTypeSourceInfo(E->getEncodedTypeSourceInfo());
	Record.AddSourceLocation(E->getAtLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_OBJC_ENCODE;
	}

	/// Serialize an ObjCSelectorExpr: VisitExpr output, the selector, then
	/// the '@' and r-paren locations.
	void ASTStmtWriter::VisitObjCSelectorExpr(ObjCSelectorExpr *E) {
	VisitExpr(E);
	Record.AddSelectorRef(E->getSelector());
	Record.AddSourceLocation(E->getAtLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_OBJC_SELECTOR_EXPR;
	}

	/// Serialize an ObjCProtocolExpr: VisitExpr output, the protocol
	/// declaration, then the '@', protocol-name and r-paren locations.
	void ASTStmtWriter::VisitObjCProtocolExpr(ObjCProtocolExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getProtocol());
	Record.AddSourceLocation(E->getAtLoc());
	// ProtoLoc is read as a data member rather than through an accessor.
	Record.AddSourceLocation(E->ProtoLoc);
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_OBJC_PROTOCOL_EXPR;
	}

	/// Serialize an ObjCIvarRefExpr: VisitExpr output, the referenced
	/// declaration, the expression and operator locations, the base
	/// expression, then the isArrow and isFreeIvar flags.
	void ASTStmtWriter::VisitObjCIvarRefExpr(ObjCIvarRefExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getDecl());
	Record.AddSourceLocation(E->getLocation());
	Record.AddSourceLocation(E->getOpLoc());
	Record.AddStmt(E->getBase());
	Record.push_back(E->isArrow());
	Record.push_back(E->isFreeIvar());
	Code = serialization::EXPR_OBJC_IVAR_REF_EXPR;
	}

	/// Serialize an ObjCPropertyRefExpr: VisitExpr output, the method-ref
	/// flags, the implicit-property flag with the matching declaration(s), the
	/// expression and receiver locations, and a tagged receiver (0 = object,
	/// 1 = super, 2 = class).
	void ASTStmtWriter::VisitObjCPropertyRefExpr(ObjCPropertyRefExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->SetterAndMethodRefFlags.getInt());
	  Record.push_back(E->isImplicitProperty());

	  if (!E->isImplicitProperty()) {
	    // Explicit property: a single declaration reference.
	    Record.AddDeclRef(E->getExplicitProperty());
	  } else {
	    // Implicit property: getter first, then setter.
	    Record.AddDeclRef(E->getImplicitPropertyGetter());
	    Record.AddDeclRef(E->getImplicitPropertySetter());
	  }

	  Record.AddSourceLocation(E->getLocation());
	  Record.AddSourceLocation(E->getReceiverLocation());

	  // The receiver is written as a small integer tag followed by its payload.
	  if (E->isObjectReceiver()) {
	    Record.push_back(0);
	    Record.AddStmt(E->getBase());
	  } else if (E->isSuperReceiver()) {
	    Record.push_back(1);
	    Record.AddTypeRef(E->getSuperReceiverType());
	  } else {
	    Record.push_back(2);
	    Record.AddDeclRef(E->getClassReceiver());
	  }

	  Code = serialization::EXPR_OBJC_PROPERTY_REF_EXPR;
	}

	/// Serialize an ObjCSubscriptRefExpr: VisitExpr output, the r-bracket
	/// location, the base and key expressions, then two method declaration
	/// references.
	void ASTStmtWriter::VisitObjCSubscriptRefExpr(ObjCSubscriptRefExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getRBracket());
	Record.AddStmt(E->getBaseExpr());
	Record.AddStmt(E->getKeyExpr());
	Record.AddDeclRef(E->getAtIndexMethodDecl());
	// NOTE(review): despite its "set" prefix, setAtIndexMethodDecl() is called
	// with no arguments and its result is written, i.e. it is used as an
	// accessor here.
	Record.AddDeclRef(E->setAtIndexMethodDecl());

	Code = serialization::EXPR_OBJC_SUBSCRIPT_REF_EXPR;
	}

	/// Serialize an ObjCMessageExpr: VisitExpr output, counts and flags, the
	/// receiver (whose payload depends on the receiver kind), the method
	/// declaration or selector, the left and right locations, the arguments,
	/// then the stored selector locations.
	void ASTStmtWriter::VisitObjCMessageExpr(ObjCMessageExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumArgs());
	  Record.push_back(E->getNumStoredSelLocs());
	  Record.push_back(E->SelLocsKind);
	  Record.push_back(E->isDelegateInitCall());
	  Record.push_back(E->IsImplicit);
	  Record.push_back((unsigned)E->getReceiverKind()); // FIXME: stable encoding

	  // Receiver payload, selected by the kind written just above.
	  switch (E->getReceiverKind()) {
	  case ObjCMessageExpr::Instance:
	    Record.AddStmt(E->getInstanceReceiver());
	    break;

	  case ObjCMessageExpr::Class:
	    Record.AddTypeSourceInfo(E->getClassReceiverTypeInfo());
	    break;

	  case ObjCMessageExpr::SuperClass:
	  case ObjCMessageExpr::SuperInstance:
	    Record.AddTypeRef(E->getSuperType());
	    Record.AddSourceLocation(E->getSuperLoc());
	    break;
	  }

	  // Flag 1 + method declaration when one is known, else flag 0 + selector.
	  if (!E->getMethodDecl()) {
	    Record.push_back(0);
	    Record.AddSelectorRef(E->getSelector());
	  } else {
	    Record.push_back(1);
	    Record.AddDeclRef(E->getMethodDecl());
	  }

	  Record.AddSourceLocation(E->getLeftLoc());
	  Record.AddSourceLocation(E->getRightLoc());

	  // Arguments, first to last.
	  CallExpr::arg_iterator Cur = E->arg_begin(), Last = E->arg_end();
	  while (Cur != Last) {
	    Record.AddStmt(*Cur);
	    ++Cur;
	  }

	  // Stored selector locations, in index order.
	  SourceLocation *SelLocs = E->getStoredSelLocs();
	  for (unsigned Idx = 0, Count = E->getNumStoredSelLocs(); Idx != Count;
	       ++Idx) {
	    Record.AddSourceLocation(SelLocs[Idx]);
	  }

	  Code = serialization::EXPR_OBJC_MESSAGE_EXPR;
	}

	/// Serialize an ObjCForCollectionStmt: VisitStmt output, the element,
	/// collection and body sub-statements, then the 'for' and r-paren
	/// locations.
	void ASTStmtWriter::VisitObjCForCollectionStmt(ObjCForCollectionStmt *S) {
	VisitStmt(S);
	Record.AddStmt(S->getElement());
	Record.AddStmt(S->getCollection());
	Record.AddStmt(S->getBody());
	Record.AddSourceLocation(S->getForLoc());
	Record.AddSourceLocation(S->getRParenLoc());
	Code = serialization::STMT_OBJC_FOR_COLLECTION;
	}

	/// Serialize an ObjCAtCatchStmt: the catch body, the catch parameter
	/// declaration, then the '@catch' and r-paren locations. Unlike
	/// VisitObjCForCollectionStmt, VisitStmt is not called here.
	void ASTStmtWriter::VisitObjCAtCatchStmt(ObjCAtCatchStmt *S) {
	Record.AddStmt(S->getCatchBody());
	Record.AddDeclRef(S->getCatchParamDecl());
	Record.AddSourceLocation(S->getAtCatchLoc());
	Record.AddSourceLocation(S->getRParenLoc());
	Code = serialization::STMT_OBJC_CATCH;
	}

	/// Serialize an ObjCAtFinallyStmt: the finally body followed by the
	/// '@finally' location. VisitStmt is not called here.
	void ASTStmtWriter::VisitObjCAtFinallyStmt(ObjCAtFinallyStmt *S) {
	Record.AddStmt(S->getFinallyBody());
	Record.AddSourceLocation(S->getAtFinallyLoc());
	Code = serialization::STMT_OBJC_FINALLY;
	}

	/// Serialize an ObjCAutoreleasePoolStmt: the sub-statement followed by the
	/// '@' location. VisitStmt is not called here.
	void ASTStmtWriter::VisitObjCAutoreleasePoolStmt(ObjCAutoreleasePoolStmt *S) {
	Record.AddStmt(S->getSubStmt());
	Record.AddSourceLocation(S->getAtLoc());
	Code = serialization::STMT_OBJC_AUTORELEASE_POOL;
	}

	/// Serialize an ObjCAtTryStmt: the catch-statement count, a flag saying
	/// whether a finally statement exists, the try body, each catch statement,
	/// the finally statement (only when present), then the '@try' location.
	void ASTStmtWriter::VisitObjCAtTryStmt(ObjCAtTryStmt *S) {
	  Record.push_back(S->getNumCatchStmts());
	  Record.push_back(S->getFinallyStmt() != nullptr);
	  Record.AddStmt(S->getTryBody());

	  unsigned CatchIdx = 0;
	  unsigned NumCatches = S->getNumCatchStmts();
	  while (CatchIdx != NumCatches) {
	    Record.AddStmt(S->getCatchStmt(CatchIdx));
	    ++CatchIdx;
	  }

	  // The finally statement is omitted entirely when there is none; the
	  // flag written above tells which case applies.
	  if (S->getFinallyStmt()) {
	    Record.AddStmt(S->getFinallyStmt());
	  }

	  Record.AddSourceLocation(S->getAtTryLoc());
	  Code = serialization::STMT_OBJC_AT_TRY;
	}

	/// Serialize an ObjCAtSynchronizedStmt: the synchronization expression,
	/// the body, then the '@synchronized' location. VisitStmt is not called
	/// here.
	void ASTStmtWriter::VisitObjCAtSynchronizedStmt(ObjCAtSynchronizedStmt *S) {
	Record.AddStmt(S->getSynchExpr());
	Record.AddStmt(S->getSynchBody());
	Record.AddSourceLocation(S->getAtSynchronizedLoc());
	Code = serialization::STMT_OBJC_AT_SYNCHRONIZED;
	}

	/// Serialize an ObjCAtThrowStmt: the thrown expression followed by the
	/// throw location. VisitStmt is not called here.
	void ASTStmtWriter::VisitObjCAtThrowStmt(ObjCAtThrowStmt *S) {
	Record.AddStmt(S->getThrowExpr());
	Record.AddSourceLocation(S->getThrowLoc());
	Code = serialization::STMT_OBJC_AT_THROW;
	}

	/// Serialize an ObjCBoolLiteralExpr: VisitExpr output, the literal value,
	/// then its location.
	void ASTStmtWriter::VisitObjCBoolLiteralExpr(ObjCBoolLiteralExpr *E) {
	VisitExpr(E);
	Record.push_back(E->getValue());
	Record.AddSourceLocation(E->getLocation());
	Code = serialization::EXPR_OBJC_BOOL_LITERAL;
	}

	/// Serialize an ObjCAvailabilityCheckExpr: VisitExpr output, the source
	/// range, then the version tuple.
	void ASTStmtWriter::VisitObjCAvailabilityCheckExpr(ObjCAvailabilityCheckExpr *E) {
	VisitExpr(E);
	Record.AddSourceRange(E->getSourceRange());
	Record.AddVersionTuple(E->getVersion());
	Code = serialization::EXPR_OBJC_AVAILABILITY_CHECK;
	}

	//===----------------------------------------------------------------------===//
	// C++ Expressions and Statements.
	//===----------------------------------------------------------------------===//

	/// Serialize a CXXCatchStmt: VisitStmt output, the 'catch' location, the
	/// exception declaration, then the handler block.
	void ASTStmtWriter::VisitCXXCatchStmt(CXXCatchStmt *S) {
	VisitStmt(S);
	Record.AddSourceLocation(S->getCatchLoc());
	Record.AddDeclRef(S->getExceptionDecl());
	Record.AddStmt(S->getHandlerBlock());
	Code = serialization::STMT_CXX_CATCH;
	}

	/// Serialize a CXXTryStmt: VisitStmt output, the handler count, the 'try'
	/// location, the try block, then each handler in index order.
	void ASTStmtWriter::VisitCXXTryStmt(CXXTryStmt *S) {
	  VisitStmt(S);
	  Record.push_back(S->getNumHandlers());
	  Record.AddSourceLocation(S->getTryLoc());
	  Record.AddStmt(S->getTryBlock());

	  unsigned HandlerIdx = 0;
	  unsigned NumHandlers = S->getNumHandlers();
	  while (HandlerIdx != NumHandlers) {
	    Record.AddStmt(S->getHandler(HandlerIdx));
	    ++HandlerIdx;
	  }

	  Code = serialization::STMT_CXX_TRY;
	}

	/// Serialize a CXXForRangeStmt: VisitStmt output, the 'for', co_await,
	/// colon and r-paren locations, then the range, begin, end, condition,
	/// increment, loop-variable and body sub-statements in that order.
	void ASTStmtWriter::VisitCXXForRangeStmt(CXXForRangeStmt *S) {
	VisitStmt(S);
	Record.AddSourceLocation(S->getForLoc());
	Record.AddSourceLocation(S->getCoawaitLoc());
	Record.AddSourceLocation(S->getColonLoc());
	Record.AddSourceLocation(S->getRParenLoc());
	Record.AddStmt(S->getRangeStmt());
	Record.AddStmt(S->getBeginStmt());
	Record.AddStmt(S->getEndStmt());
	Record.AddStmt(S->getCond());
	Record.AddStmt(S->getInc());
	Record.AddStmt(S->getLoopVarStmt());
	Record.AddStmt(S->getBody());
	Code = serialization::STMT_CXX_FOR_RANGE;
	}

	/// Serialize an MSDependentExistsStmt: VisitStmt output, the keyword
	/// location, the isIfExists flag, the qualifier, the name info, then the
	/// sub-statement.
	void ASTStmtWriter::VisitMSDependentExistsStmt(MSDependentExistsStmt *S) {
	VisitStmt(S);
	Record.AddSourceLocation(S->getKeywordLoc());
	Record.push_back(S->isIfExists());
	Record.AddNestedNameSpecifierLoc(S->getQualifierLoc());
	Record.AddDeclarationNameInfo(S->getNameInfo());
	Record.AddStmt(S->getSubStmt());
	Code = serialization::STMT_MS_DEPENDENT_EXISTS;
	}

	/// Serialize a CXXOperatorCallExpr: VisitCallExpr output, the overloaded
	/// operator, the stored source range, then the integer form of the FP
	/// features.
	void ASTStmtWriter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *E) {
	VisitCallExpr(E);
	Record.push_back(E->getOperator());
	Record.AddSourceRange(E->Range);
	Record.push_back(E->getFPFeatures().getInt());
	// Replaces the code set by VisitCallExpr.
	Code = serialization::EXPR_CXX_OPERATOR_CALL;
	}

	/// Serialize a CXXMemberCallExpr: identical to VisitCallExpr output except
	/// for the record code.
	void ASTStmtWriter::VisitCXXMemberCallExpr(CXXMemberCallExpr *E) {
	VisitCallExpr(E);
	Code = serialization::EXPR_CXX_MEMBER_CALL;
	}

	/// Serialize a CXXConstructExpr: VisitExpr output, the argument count and
	/// arguments, the constructor declaration, the location, a run of boolean
	/// flags, the construction kind, then the paren-or-brace range. Also
	/// invoked by VisitCXXTemporaryObjectExpr, which overwrites Code.
	void ASTStmtWriter::VisitCXXConstructExpr(CXXConstructExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumArgs());
	  for (unsigned ArgNo = 0, NumArgs = E->getNumArgs(); ArgNo != NumArgs;
	       ++ArgNo) {
	    Record.AddStmt(E->getArg(ArgNo));
	  }

	  Record.AddDeclRef(E->getConstructor());
	  Record.AddSourceLocation(E->getLocation());

	  // Flags, in the order they are laid out in the record.
	  Record.push_back(E->isElidable());
	  Record.push_back(E->hadMultipleCandidates());
	  Record.push_back(E->isListInitialization());
	  Record.push_back(E->isStdInitListInitialization());
	  Record.push_back(E->requiresZeroInitialization());

	  Record.push_back(E->getConstructionKind()); // FIXME: stable encoding
	  Record.AddSourceRange(E->getParenOrBraceRange());
	  Code = serialization::EXPR_CXX_CONSTRUCT;
	}

	/// Serialize a CXXInheritedCtorInitExpr: VisitExpr output, the constructor
	/// declaration, the location, then the constructsVBase and
	/// inheritedFromVBase flags.
	void ASTStmtWriter::VisitCXXInheritedCtorInitExpr(CXXInheritedCtorInitExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getConstructor());
	Record.AddSourceLocation(E->getLocation());
	Record.push_back(E->constructsVBase());
	Record.push_back(E->inheritedFromVBase());
	Code = serialization::EXPR_CXX_INHERITED_CTOR_INIT;
	}

	/// Serialize a CXXTemporaryObjectExpr: VisitCXXConstructExpr output
	/// followed by the type source info.
	void ASTStmtWriter::VisitCXXTemporaryObjectExpr(CXXTemporaryObjectExpr *E) {
	VisitCXXConstructExpr(E);
	Record.AddTypeSourceInfo(E->getTypeSourceInfo());
	// Replaces the code set by VisitCXXConstructExpr.
	Code = serialization::EXPR_CXX_TEMPORARY_OBJECT;
	}

	/// Serialize a LambdaExpr: VisitExpr output, the capture count, the
	/// introducer range, the capture default and its location, the
	/// explicit-params and explicit-result-type flags, the closing brace
	/// location, then each capture initializer.
	void ASTStmtWriter::VisitLambdaExpr(LambdaExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->NumCaptures);
	  Record.AddSourceRange(E->IntroducerRange);
	  Record.push_back(E->CaptureDefault); // FIXME: stable encoding
	  Record.AddSourceLocation(E->CaptureDefaultLoc);
	  Record.push_back(E->ExplicitParams);
	  Record.push_back(E->ExplicitResultType);
	  Record.AddSourceLocation(E->ClosingBrace);

	  // Add capture initializers.
	  LambdaExpr::capture_init_iterator Init = E->capture_init_begin(),
	                                    InitEnd = E->capture_init_end();
	  while (Init != InitEnd) {
	    Record.AddStmt(*Init);
	    ++Init;
	  }

	  Code = serialization::EXPR_LAMBDA;
	}

	/// Serialize a CXXStdInitializerListExpr: VisitExpr output followed by the
	/// sub-expression.
	void ASTStmtWriter::VisitCXXStdInitializerListExpr(CXXStdInitializerListExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getSubExpr());
	Code = serialization::EXPR_CXX_STD_INITIALIZER_LIST;
	}

	/// Write the data shared by the C++ named casts: VisitExplicitCastExpr
	/// output, a source range spanning the operator location to the r-paren
	/// location, then the angle-bracket range. Code is not set here; callers
	/// in this file (e.g. VisitCXXStaticCastExpr) assign it.
	void ASTStmtWriter::VisitCXXNamedCastExpr(CXXNamedCastExpr *E) {
	VisitExplicitCastExpr(E);
	Record.AddSourceRange(SourceRange(E->getOperatorLoc(), E->getRParenLoc()));
	Record.AddSourceRange(E->getAngleBrackets());
	}

	/// Serialize a CXXStaticCastExpr: VisitCXXNamedCastExpr output tagged with
	/// the static_cast record code.
	void ASTStmtWriter::VisitCXXStaticCastExpr(CXXStaticCastExpr *E) {
	VisitCXXNamedCastExpr(E);
	Code = serialization::EXPR_CXX_STATIC_CAST;
	}

	/// Serialize a CXXDynamicCastExpr: VisitCXXNamedCastExpr output tagged
	/// with the dynamic_cast record code.
	void ASTStmtWriter::VisitCXXDynamicCastExpr(CXXDynamicCastExpr *E) {
	VisitCXXNamedCastExpr(E);
	Code = serialization::EXPR_CXX_DYNAMIC_CAST;
	}

	/// Serialize a CXXReinterpretCastExpr: VisitCXXNamedCastExpr output tagged
	/// with the reinterpret_cast record code.
	void ASTStmtWriter::VisitCXXReinterpretCastExpr(CXXReinterpretCastExpr *E) {
	VisitCXXNamedCastExpr(E);
	Code = serialization::EXPR_CXX_REINTERPRET_CAST;
	}

	/// Serialize a CXXConstCastExpr: VisitCXXNamedCastExpr output tagged with
	/// the const_cast record code.
	void ASTStmtWriter::VisitCXXConstCastExpr(CXXConstCastExpr *E) {
	VisitCXXNamedCastExpr(E);
	Code = serialization::EXPR_CXX_CONST_CAST;
	}

	/// Serialize a CXXFunctionalCastExpr: VisitExplicitCastExpr output
	/// followed by the l-paren and r-paren locations.
	void ASTStmtWriter::VisitCXXFunctionalCastExpr(CXXFunctionalCastExpr *E) {
	VisitExplicitCastExpr(E);
	Record.AddSourceLocation(E->getLParenLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_CXX_FUNCTIONAL_CAST;
	}

	/// Serialize a UserDefinedLiteral: VisitCallExpr output followed by the
	/// ud-suffix location.
	void ASTStmtWriter::VisitUserDefinedLiteral(UserDefinedLiteral *E) {
	VisitCallExpr(E);
	Record.AddSourceLocation(E->UDSuffixLoc);
	// Replaces the code set by VisitCallExpr.
	Code = serialization::EXPR_USER_DEFINED_LITERAL;
	}

	/// Serialize a CXXBoolLiteralExpr: VisitExpr output, the literal value,
	/// then its location.
	void ASTStmtWriter::VisitCXXBoolLiteralExpr(CXXBoolLiteralExpr *E) {
	VisitExpr(E);
	Record.push_back(E->getValue());
	Record.AddSourceLocation(E->getLocation());
	Code = serialization::EXPR_CXX_BOOL_LITERAL;
	}

	/// Serialize a CXXNullPtrLiteralExpr: VisitExpr output followed by the
	/// literal's location.
	void ASTStmtWriter::VisitCXXNullPtrLiteralExpr(CXXNullPtrLiteralExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getLocation());
	Code = serialization::EXPR_CXX_NULL_PTR_LITERAL;
	}

	/// Serialize a CXXTypeidExpr: VisitExpr output, the source range, then
	/// the operand. The record code depends on whether the operand is a type
	/// or an expression.
	void ASTStmtWriter::VisitCXXTypeidExpr(CXXTypeidExpr *E) {
	  VisitExpr(E);
	  Record.AddSourceRange(E->getSourceRange());

	  if (!E->isTypeOperand()) {
	    // typeid(expression)
	    Record.AddStmt(E->getExprOperand());
	    Code = serialization::EXPR_CXX_TYPEID_EXPR;
	    return;
	  }

	  // typeid(type)
	  Record.AddTypeSourceInfo(E->getTypeOperandSourceInfo());
	  Code = serialization::EXPR_CXX_TYPEID_TYPE;
	}

	/// Serialize a CXXThisExpr: VisitExpr output, the location, then the
	/// isImplicit flag.
	void ASTStmtWriter::VisitCXXThisExpr(CXXThisExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getLocation());
	Record.push_back(E->isImplicit());
	Code = serialization::EXPR_CXX_THIS;
	}

	/// Serialize a CXXThrowExpr: VisitExpr output, the 'throw' location, the
	/// sub-expression, then the isThrownVariableInScope flag.
	void ASTStmtWriter::VisitCXXThrowExpr(CXXThrowExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getThrowLoc());
	Record.AddStmt(E->getSubExpr());
	Record.push_back(E->isThrownVariableInScope());
	Code = serialization::EXPR_CXX_THROW;
	}

	/// Serialize a CXXDefaultArgExpr: VisitExpr output, the parameter
	/// declaration, then the used location.
	void ASTStmtWriter::VisitCXXDefaultArgExpr(CXXDefaultArgExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getParam());
	Record.AddSourceLocation(E->getUsedLocation());
	Code = serialization::EXPR_CXX_DEFAULT_ARG;
	}

	/// Serialize a CXXDefaultInitExpr: VisitExpr output, the field
	/// declaration, then the expression location.
	void ASTStmtWriter::VisitCXXDefaultInitExpr(CXXDefaultInitExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getField());
	Record.AddSourceLocation(E->getExprLoc());
	Code = serialization::EXPR_CXX_DEFAULT_INIT;
	}

	/// Serialize a CXXBindTemporaryExpr: VisitExpr output, the temporary,
	/// then the sub-expression.
	void ASTStmtWriter::VisitCXXBindTemporaryExpr(CXXBindTemporaryExpr *E) {
	VisitExpr(E);
	Record.AddCXXTemporary(E->getTemporary());
	Record.AddStmt(E->getSubExpr());
	Code = serialization::EXPR_CXX_BIND_TEMPORARY;
	}

	/// Serialize a CXXScalarValueInitExpr: VisitExpr output, the type source
	/// info, then the r-paren location.
	void ASTStmtWriter::VisitCXXScalarValueInitExpr(CXXScalarValueInitExpr *E) {
	VisitExpr(E);
	Record.AddTypeSourceInfo(E->getTypeSourceInfo());
	Record.AddSourceLocation(E->getRParenLoc());
	Code = serialization::EXPR_CXX_SCALAR_VALUE_INIT;
	}

	/// Serialize a CXXNewExpr: VisitExpr output, a run of flags and counts,
	/// the operator new / delete declarations, the allocated type info, three
	/// source ranges, then every raw argument in iteration order.
	void ASTStmtWriter::VisitCXXNewExpr(CXXNewExpr *E) {
	  VisitExpr(E);

	  // Flags and small integers, in record order.
	  Record.push_back(E->isGlobalNew());
	  Record.push_back(E->isArray());
	  Record.push_back(E->passAlignment());
	  Record.push_back(E->doesUsualArrayDeleteWantSize());
	  Record.push_back(E->getNumPlacementArgs());
	  Record.push_back(E->StoredInitializationStyle);

	  Record.AddDeclRef(E->getOperatorNew());
	  Record.AddDeclRef(E->getOperatorDelete());
	  Record.AddTypeSourceInfo(E->getAllocatedTypeSourceInfo());
	  Record.AddSourceRange(E->getTypeIdParens());
	  Record.AddSourceRange(E->getSourceRange());
	  Record.AddSourceRange(E->getDirectInitRange());

	  // Everything in the raw argument range.
	  CXXNewExpr::arg_iterator Cur = E->raw_arg_begin(), Last = E->raw_arg_end();
	  while (Cur != Last) {
	    Record.AddStmt(*Cur);
	    ++Cur;
	  }

	  Code = serialization::EXPR_CXX_NEW;
	}

	/// Serialize a CXXDeleteExpr: VisitExpr output, four boolean flags, the
	/// operator delete declaration, the argument expression, then the start
	/// of the source range.
	void ASTStmtWriter::VisitCXXDeleteExpr(CXXDeleteExpr *E) {
	VisitExpr(E);
	Record.push_back(E->isGlobalDelete());
	Record.push_back(E->isArrayForm());
	Record.push_back(E->isArrayFormAsWritten());
	Record.push_back(E->doesUsualArrayDeleteWantSize());
	Record.AddDeclRef(E->getOperatorDelete());
	Record.AddStmt(E->getArgument());
	// Only the beginning of the source range is written.
	Record.AddSourceLocation(E->getSourceRange().getBegin());

	Code = serialization::EXPR_CXX_DELETE;
	}

	/// Serialize a CXXPseudoDestructorExpr: VisitExpr output, the base, the
	/// isArrow flag, the operator location, the qualifier, the scope type
	/// info, the '::' and '~' locations, then the destroyed type (either as an
	/// identifier plus location or as type source info).
	void ASTStmtWriter::VisitCXXPseudoDestructorExpr(CXXPseudoDestructorExpr *E) {
	  VisitExpr(E);

	  Record.AddStmt(E->getBase());
	  Record.push_back(E->isArrow());
	  Record.AddSourceLocation(E->getOperatorLoc());
	  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	  Record.AddTypeSourceInfo(E->getScopeTypeInfo());
	  Record.AddSourceLocation(E->getColonColonLoc());
	  Record.AddSourceLocation(E->getTildeLoc());

	  // PseudoDestructorTypeStorage. The identifier reference is always
	  // written (possibly null); what follows depends on whether it is set.
	  Record.AddIdentifierRef(E->getDestroyedTypeIdentifier());
	  if (!E->getDestroyedTypeIdentifier()) {
	    Record.AddTypeSourceInfo(E->getDestroyedTypeInfo());
	  } else {
	    Record.AddSourceLocation(E->getDestroyedTypeLoc());
	  }

	  Code = serialization::EXPR_CXX_PSEUDO_DESTRUCTOR;
	}

	/// Serialize an ExprWithCleanups: VisitExpr output, the cleanup-object
	/// count and each object, the cleanupsHaveSideEffects flag, then the
	/// sub-expression.
	void ASTStmtWriter::VisitExprWithCleanups(ExprWithCleanups *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumObjects());

	  unsigned ObjIdx = 0;
	  unsigned NumObjects = E->getNumObjects();
	  while (ObjIdx != NumObjects) {
	    Record.AddDeclRef(E->getObject(ObjIdx));
	    ++ObjIdx;
	  }

	  Record.push_back(E->cleanupsHaveSideEffects());
	  Record.AddStmt(E->getSubExpr());
	  Code = serialization::EXPR_EXPR_WITH_CLEANUPS;
	}

	/// Serialize a CXXDependentScopeMemberExpr: VisitExpr output, the optional
	/// template keyword / argument info (flag first), the base (null for an
	/// implicit access), the base type, the isArrow flag, the operator
	/// location, the qualifier, the first qualifier found in scope, then the
	/// member name info.
	void
	ASTStmtWriter::VisitCXXDependentScopeMemberExpr(CXXDependentScopeMemberExpr *E) {
	  VisitExpr(E);

	  // Don't emit anything here, HasTemplateKWAndArgsInfo must be
	  // emitted first.

	  Record.push_back(E->HasTemplateKWAndArgsInfo);
	  if (E->HasTemplateKWAndArgsInfo) {
	    const ASTTemplateKWAndArgsInfo &KWAndArgs =
	        *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>();
	    Record.push_back(KWAndArgs.NumTemplateArgs);
	    AddTemplateKWAndArgsInfo(KWAndArgs,
	                             E->getTrailingObjects<TemplateArgumentLoc>());
	  }

	  // An implicit access has no base expression to write; emit null instead.
	  if (E->isImplicitAccess()) {
	    Record.AddStmt(nullptr);
	  } else {
	    Record.AddStmt(E->getBase());
	  }

	  Record.AddTypeRef(E->getBaseType());
	  Record.push_back(E->isArrow());
	  Record.AddSourceLocation(E->getOperatorLoc());
	  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	  Record.AddDeclRef(E->getFirstQualifierFoundInScope());
	  Record.AddDeclarationNameInfo(E->MemberNameInfo);
	  Code = serialization::EXPR_CXX_DEPENDENT_SCOPE_MEMBER;
	}

	/// Serialize a DependentScopeDeclRefExpr: VisitExpr output, the optional
	/// template keyword / argument info (flag first), the qualifier, then the
	/// name info.
	void
	ASTStmtWriter::VisitDependentScopeDeclRefExpr(DependentScopeDeclRefExpr *E) {
	  VisitExpr(E);

	  // Don't emit anything here, HasTemplateKWAndArgsInfo must be
	  // emitted first.

	  Record.push_back(E->HasTemplateKWAndArgsInfo);
	  if (E->HasTemplateKWAndArgsInfo) {
	    const ASTTemplateKWAndArgsInfo &KWAndArgs =
	        *E->getTrailingObjects<ASTTemplateKWAndArgsInfo>();
	    Record.push_back(KWAndArgs.NumTemplateArgs);
	    AddTemplateKWAndArgsInfo(KWAndArgs,
	                             E->getTrailingObjects<TemplateArgumentLoc>());
	  }

	  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	  Record.AddDeclarationNameInfo(E->NameInfo);
	  Code = serialization::EXPR_CXX_DEPENDENT_SCOPE_DECL_REF;
	}

	/// Serialize a CXXUnresolvedConstructExpr: VisitExpr output, the argument
	/// count and arguments, the type source info, then the l-paren and
	/// r-paren locations.
	void
	ASTStmtWriter::VisitCXXUnresolvedConstructExpr(CXXUnresolvedConstructExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->arg_size());

	  CXXUnresolvedConstructExpr::arg_iterator Cur = E->arg_begin(),
	                                           Last = E->arg_end();
	  while (Cur != Last) {
	    Record.AddStmt(*Cur);
	    ++Cur;
	  }

	  Record.AddTypeSourceInfo(E->getTypeSourceInfo());
	  Record.AddSourceLocation(E->getLParenLoc());
	  Record.AddSourceLocation(E->getRParenLoc());
	  Code = serialization::EXPR_CXX_UNRESOLVED_CONSTRUCT;
	}

	/// Write the data shared by overload expressions: VisitExpr output, the
	/// optional template keyword / argument info (flag first), the candidate
	/// count with each (declaration, access) pair, the name info, then the
	/// qualifier. Code is not set here; callers in this file
	/// (VisitUnresolvedMemberExpr, VisitUnresolvedLookupExpr) assign it.
	void ASTStmtWriter::VisitOverloadExpr(OverloadExpr *E) {
	  VisitExpr(E);

	  // Don't emit anything here, HasTemplateKWAndArgsInfo must be
	  // emitted first.

	  Record.push_back(E->HasTemplateKWAndArgsInfo);
	  if (E->HasTemplateKWAndArgsInfo) {
	    const ASTTemplateKWAndArgsInfo &KWAndArgs =
	        *E->getTrailingASTTemplateKWAndArgsInfo();
	    Record.push_back(KWAndArgs.NumTemplateArgs);
	    AddTemplateKWAndArgsInfo(KWAndArgs, E->getTrailingTemplateArgumentLoc());
	  }

	  Record.push_back(E->getNumDecls());
	  OverloadExpr::decls_iterator Cand = E->decls_begin(),
	                               CandEnd = E->decls_end();
	  while (Cand != CandEnd) {
	    Record.AddDeclRef(Cand.getDecl());
	    Record.push_back(Cand.getAccess());
	    ++Cand;
	  }

	  Record.AddDeclarationNameInfo(E->NameInfo);
	  Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	}

	/// Serialize an UnresolvedMemberExpr: VisitOverloadExpr output, the
	/// isArrow and hasUnresolvedUsing flags, the base (null for an implicit
	/// access), the base type, then the operator location.
	void ASTStmtWriter::VisitUnresolvedMemberExpr(UnresolvedMemberExpr *E) {
	  VisitOverloadExpr(E);
	  Record.push_back(E->isArrow());
	  Record.push_back(E->hasUnresolvedUsing());

	  // An implicit access has no base expression to write; emit null instead.
	  if (E->isImplicitAccess()) {
	    Record.AddStmt(nullptr);
	  } else {
	    Record.AddStmt(E->getBase());
	  }

	  Record.AddTypeRef(E->getBaseType());
	  Record.AddSourceLocation(E->getOperatorLoc());
	  Code = serialization::EXPR_CXX_UNRESOLVED_MEMBER;
	}

	/// Serialize an UnresolvedLookupExpr: VisitOverloadExpr output, the
	/// requiresADL and isOverloaded flags, then the naming class declaration.
	void ASTStmtWriter::VisitUnresolvedLookupExpr(UnresolvedLookupExpr *E) {
	VisitOverloadExpr(E);
	Record.push_back(E->requiresADL());
	Record.push_back(E->isOverloaded());
	Record.AddDeclRef(E->getNamingClass());
	Code = serialization::EXPR_CXX_UNRESOLVED_LOOKUP;
	}

	/// Serializes a TypeTraitExpr: after VisitExpr(E) come the NumArgs, Kind and
	/// Value bit-fields, the source range and one type source info per argument.
	void ASTStmtWriter::VisitTypeTraitExpr(TypeTraitExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->TypeTraitExprBits.NumArgs);
	  Record.push_back(E->TypeTraitExprBits.Kind); // FIXME: Stable encoding
	  Record.push_back(E->TypeTraitExprBits.Value);
	  Record.AddSourceRange(E->getSourceRange());

	  const unsigned NumArgs = E->getNumArgs();
	  for (unsigned Idx = 0; Idx < NumArgs; ++Idx)
	    Record.AddTypeSourceInfo(E->getArg(Idx));

	  Code = serialization::EXPR_TYPE_TRAIT;
	}

	/// Serializes an ArrayTypeTraitExpr: VisitExpr(E), then the trait, its value,
	/// the source range, the queried type's source info and the dimension
	/// expression. The record is tagged EXPR_ARRAY_TYPE_TRAIT.
	void ASTStmtWriter::VisitArrayTypeTraitExpr(ArrayTypeTraitExpr *E) {
	VisitExpr(E);
	Record.push_back(E->getTrait());
	Record.push_back(E->getValue());
	Record.AddSourceRange(E->getSourceRange());
	Record.AddTypeSourceInfo(E->getQueriedTypeSourceInfo());
	Record.AddStmt(E->getDimensionExpression());
	Code = serialization::EXPR_ARRAY_TYPE_TRAIT;
	}

	/// Serializes an ExpressionTraitExpr: VisitExpr(E), then the trait, its value,
	/// the source range and the queried expression. The record is tagged
	/// EXPR_CXX_EXPRESSION_TRAIT.
	void ASTStmtWriter::VisitExpressionTraitExpr(ExpressionTraitExpr *E) {
	VisitExpr(E);
	Record.push_back(E->getTrait());
	Record.push_back(E->getValue());
	Record.AddSourceRange(E->getSourceRange());
	Record.AddStmt(E->getQueriedExpression());
	Code = serialization::EXPR_CXX_EXPRESSION_TRAIT;
	}

	/// Serializes a CXXNoexceptExpr: VisitExpr(E), then the value, the source
	/// range and the operand. The record is tagged EXPR_CXX_NOEXCEPT.
	void ASTStmtWriter::VisitCXXNoexceptExpr(CXXNoexceptExpr *E) {
	VisitExpr(E);
	Record.push_back(E->getValue());
	Record.AddSourceRange(E->getSourceRange());
	Record.AddStmt(E->getOperand());
	Code = serialization::EXPR_CXX_NOEXCEPT;
	}

	/// Serializes a PackExpansionExpr: VisitExpr(E), then the ellipsis location,
	/// the NumExpansions field and the pattern. The record is tagged
	/// EXPR_PACK_EXPANSION.
	void ASTStmtWriter::VisitPackExpansionExpr(PackExpansionExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getEllipsisLoc());
	Record.push_back(E->NumExpansions);
	Record.AddStmt(E->getPattern());
	Code = serialization::EXPR_PACK_EXPANSION;
	}

	/// Serializes a SizeOfPackExpr.
	///
	/// After VisitExpr(E) the record holds the number of partial arguments (0 when
	/// the expression is not partially substituted), the operator, pack and
	/// right-paren locations, and the pack declaration. The tail is either the
	/// partial template arguments or, for an expression that is not
	/// value-dependent, the pack length.
	void ASTStmtWriter::VisitSizeOfPackExpr(SizeOfPackExpr *E) {
	  VisitExpr(E);

	  const bool PartiallySubstituted = E->isPartiallySubstituted();
	  if (PartiallySubstituted)
	    Record.push_back(E->getPartialArguments().size());
	  else
	    Record.push_back(0);

	  Record.AddSourceLocation(E->OperatorLoc);
	  Record.AddSourceLocation(E->PackLoc);
	  Record.AddSourceLocation(E->RParenLoc);
	  Record.AddDeclRef(E->Pack);

	  if (PartiallySubstituted) {
	    for (const auto &Arg : E->getPartialArguments())
	      Record.AddTemplateArgument(Arg);
	  } else if (!E->isValueDependent()) {
	    Record.push_back(E->getPackLength());
	  }

	  Code = serialization::EXPR_SIZEOF_PACK;
	}

	/// Serializes a SubstNonTypeTemplateParmExpr: VisitExpr(E), then the
	/// parameter declaration, the name location and the replacement expression.
	/// The record is tagged EXPR_SUBST_NON_TYPE_TEMPLATE_PARM.
	void ASTStmtWriter::VisitSubstNonTypeTemplateParmExpr(
	SubstNonTypeTemplateParmExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getParameter());
	Record.AddSourceLocation(E->getNameLoc());
	Record.AddStmt(E->getReplacement());
	Code = serialization::EXPR_SUBST_NON_TYPE_TEMPLATE_PARM;
	}

	/// Serializes a SubstNonTypeTemplateParmPackExpr: VisitExpr(E), then the
	/// parameter pack declaration, the argument pack and the parameter pack
	/// location. The record is tagged EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK.
	void ASTStmtWriter::VisitSubstNonTypeTemplateParmPackExpr(
	SubstNonTypeTemplateParmPackExpr *E) {
	VisitExpr(E);
	Record.AddDeclRef(E->getParameterPack());
	Record.AddTemplateArgument(E->getArgumentPack());
	Record.AddSourceLocation(E->getParameterPackLocation());
	Code = serialization::EXPR_SUBST_NON_TYPE_TEMPLATE_PARM_PACK;
	}

	/// Serializes a FunctionParmPackExpr: VisitExpr(E), then the expansion count,
	/// the parameter pack declaration, its location and every declaration the
	/// expression iterates over.
	void ASTStmtWriter::VisitFunctionParmPackExpr(FunctionParmPackExpr *E) {
	  VisitExpr(E);
	  Record.push_back(E->getNumExpansions());
	  Record.AddDeclRef(E->getParameterPack());
	  Record.AddSourceLocation(E->getParameterPackLocation());

	  // Walks the same begin()/end() range as an explicit iterator loop.
	  for (auto *Expansion : *E)
	    Record.AddDeclRef(Expansion);

	  Code = serialization::EXPR_FUNCTION_PARM_PACK;
	}

	/// Serializes a MaterializeTemporaryExpr: VisitExpr(E), then the temporary
	/// expression, the extending declaration and the mangling number. The record
	/// is tagged EXPR_MATERIALIZE_TEMPORARY.
	void ASTStmtWriter::VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getTemporary());
	Record.AddDeclRef(E->getExtendingDecl());
	Record.push_back(E->getManglingNumber());
	Code = serialization::EXPR_MATERIALIZE_TEMPORARY;
	}

	/// Serializes a CXXFoldExpr: VisitExpr(E), then the left-paren, ellipsis and
	/// right-paren locations, SubExprs[0], SubExprs[1] and the opcode. The record
	/// is tagged EXPR_CXX_FOLD.
	void ASTStmtWriter::VisitCXXFoldExpr(CXXFoldExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->LParenLoc);
	Record.AddSourceLocation(E->EllipsisLoc);
	Record.AddSourceLocation(E->RParenLoc);
	Record.AddStmt(E->SubExprs[0]);
	Record.AddStmt(E->SubExprs[1]);
	Record.push_back(E->Opcode);
	Code = serialization::EXPR_CXX_FOLD;
	}

	/// Serializes an OpaqueValueExpr: VisitExpr(E), then the source expression
	/// and the location. The record is tagged EXPR_OPAQUE_VALUE.
	void ASTStmtWriter::VisitOpaqueValueExpr(OpaqueValueExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getSourceExpr());
	Record.AddSourceLocation(E->getLocation());
	Code = serialization::EXPR_OPAQUE_VALUE;
	}

	/// TypoExpr nodes have no serialized form: after VisitExpr(E) this reaches
	/// llvm_unreachable and never sets a record code.
	void ASTStmtWriter::VisitTypoExpr(TypoExpr *E) {
	VisitExpr(E);
	// TODO: Figure out sane writer behavior for a TypoExpr, if necessary
	llvm_unreachable("Cannot write TypoExpr nodes");
	}

	//===----------------------------------------------------------------------===//
	// CUDA Expressions and Statements.
	//===----------------------------------------------------------------------===//

	/// Serializes a CUDAKernelCallExpr: the VisitCallExpr(E) data followed by the
	/// config expression. The record is tagged EXPR_CUDA_KERNEL_CALL.
	void ASTStmtWriter::VisitCUDAKernelCallExpr(CUDAKernelCallExpr *E) {
	VisitCallExpr(E);
	Record.AddStmt(E->getConfig());
	Code = serialization::EXPR_CUDA_KERNEL_CALL;
	}

	//===----------------------------------------------------------------------===//
	// OpenCL Expressions and Statements.
	//===----------------------------------------------------------------------===//
	/// Serializes an AsTypeExpr: VisitExpr(E), then the builtin location, the
	/// right-paren location and the source expression. The record is tagged
	/// EXPR_ASTYPE.
	void ASTStmtWriter::VisitAsTypeExpr(AsTypeExpr *E) {
	VisitExpr(E);
	Record.AddSourceLocation(E->getBuiltinLoc());
	Record.AddSourceLocation(E->getRParenLoc());
	Record.AddStmt(E->getSrcExpr());
	Code = serialization::EXPR_ASTYPE;
	}

	//===----------------------------------------------------------------------===//
	// Microsoft Expressions and Statements.
	//===----------------------------------------------------------------------===//
	/// Serializes an MSPropertyRefExpr: VisitExpr(E), then the arrow flag, the
	/// base expression, the qualifier, the member location and the property
	/// declaration. The record is tagged EXPR_CXX_PROPERTY_REF_EXPR.
	void ASTStmtWriter::VisitMSPropertyRefExpr(MSPropertyRefExpr *E) {
	VisitExpr(E);
	Record.push_back(E->isArrow());
	Record.AddStmt(E->getBaseExpr());
	Record.AddNestedNameSpecifierLoc(E->getQualifierLoc());
	Record.AddSourceLocation(E->getMemberLoc());
	Record.AddDeclRef(E->getPropertyDecl());
	Code = serialization::EXPR_CXX_PROPERTY_REF_EXPR;
	}

	/// Serializes an MSPropertySubscriptExpr: VisitExpr(E), then the base, the
	/// index and the right-bracket location. The record is tagged
	/// EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR.
	void ASTStmtWriter::VisitMSPropertySubscriptExpr(MSPropertySubscriptExpr *E) {
	VisitExpr(E);
	Record.AddStmt(E->getBase());
	Record.AddStmt(E->getIdx());
	Record.AddSourceLocation(E->getRBracketLoc());
	Code = serialization::EXPR_CXX_PROPERTY_SUBSCRIPT_EXPR;
	}

	/// Serializes a CXXUuidofExpr. VisitExpr(E), the source range and the UUID
	/// string are always written; the operand and the record code depend on
	/// whether the operand is a type or an expression.
	void ASTStmtWriter::VisitCXXUuidofExpr(CXXUuidofExpr *E) {
	  VisitExpr(E);
	  Record.AddSourceRange(E->getSourceRange());
	  Record.AddString(E->getUuidStr());

	  // Expression operand: stored as a statement.
	  if (!E->isTypeOperand()) {
	    Record.AddStmt(E->getExprOperand());
	    Code = serialization::EXPR_CXX_UUIDOF_EXPR;
	    return;
	  }

	  // Type operand: stored as type source info.
	  Record.AddTypeSourceInfo(E->getTypeOperandSourceInfo());
	  Code = serialization::EXPR_CXX_UUIDOF_TYPE;
	}

	/// Serializes an SEHExceptStmt: VisitStmt(S), then the except location, the
	/// filter expression and the block. The record is tagged STMT_SEH_EXCEPT.
	void ASTStmtWriter::VisitSEHExceptStmt(SEHExceptStmt *S) {
	VisitStmt(S);
	Record.AddSourceLocation(S->getExceptLoc());
	Record.AddStmt(S->getFilterExpr());
	Record.AddStmt(S->getBlock());
	Code = serialization::STMT_SEH_EXCEPT;
	}

	/// Serializes an SEHFinallyStmt: VisitStmt(S), then the finally location and
	/// the block. The record is tagged STMT_SEH_FINALLY.
	void ASTStmtWriter::VisitSEHFinallyStmt(SEHFinallyStmt *S) {
	VisitStmt(S);
	Record.AddSourceLocation(S->getFinallyLoc());
	Record.AddStmt(S->getBlock());
	Code = serialization::STMT_SEH_FINALLY;
	}

	/// Serializes an SEHTryStmt: VisitStmt(S), then the getIsCXXTry() flag, the
	/// try location, the try block and the handler. The record is tagged
	/// STMT_SEH_TRY.
	void ASTStmtWriter::VisitSEHTryStmt(SEHTryStmt *S) {
	VisitStmt(S);
	Record.push_back(S->getIsCXXTry());
	Record.AddSourceLocation(S->getTryLoc());
	Record.AddStmt(S->getTryBlock());
	Record.AddStmt(S->getHandler());
	Code = serialization::STMT_SEH_TRY;
	}

	/// Serializes an SEHLeaveStmt: VisitStmt(S) followed by the leave location.
	/// The record is tagged STMT_SEH_LEAVE.
	void ASTStmtWriter::VisitSEHLeaveStmt(SEHLeaveStmt *S) {
	VisitStmt(S);
	Record.AddSourceLocation(S->getLeaveLoc());
	Code = serialization::STMT_SEH_LEAVE;
	}

	//===----------------------------------------------------------------------===//
	// OpenMP Clauses.
	//===----------------------------------------------------------------------===//

	namespace clang {
	/// Visitor that appends the contents of OpenMP clauses to an ASTRecordWriter.
	///
	/// The OPENMP_CLAUSE macro defined below declares one Visit<Class> method per
	/// entry that the included OpenMPKinds.def expands it for. writeClause() is
	/// the entry point for a single clause; the two VisitOMPClauseWith* methods
	/// are extra visitors declared alongside the generated ones.
	class OMPClauseWriter : public OMPClauseVisitor<OMPClauseWriter> {
	// Record that every visitor appends to; held by reference.
	ASTRecordWriter &Record;
	public:
	OMPClauseWriter(ASTRecordWriter &Record) : Record(Record) {}
	// Declare a visitor for every clause class listed in the .def file.
	#define OPENMP_CLAUSE(Name, Class) \
	void Visit##Class(Class *S);
	#include "clang/Basic/OpenMPKinds.def"
	void writeClause(OMPClause *C);
	void VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C);
	void VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C);
	};
	}

	/// Writes one clause: its kind first, then whatever the clause-specific
	/// visitor dispatched by Visit(C) emits, and finally the clause's start and
	/// end locations.
	void OMPClauseWriter::writeClause(OMPClause *C) {
	Record.push_back(C->getClauseKind());
	Visit(C);
	Record.AddSourceLocation(C->getLocStart());
	Record.AddSourceLocation(C->getLocEnd());
	}

	/// Writes the pre-init part of a clause: the capture region followed by the
	/// pre-init statement.
	void OMPClauseWriter::VisitOMPClauseWithPreInit(OMPClauseWithPreInit *C) {
	Record.push_back(C->getCaptureRegion());
	Record.AddStmt(C->getPreInitStmt());
	}

	/// Writes the post-update part of a clause: the pre-init data emitted by
	/// VisitOMPClauseWithPreInit(C), then the post-update expression.
	void OMPClauseWriter::VisitOMPClauseWithPostUpdate(OMPClauseWithPostUpdate *C) {
	VisitOMPClauseWithPreInit(C);
	Record.AddStmt(C->getPostUpdateExpr());
	}

	/// Writes an OMPIfClause: the pre-init data, the name modifier and its
	/// location, the colon location, the condition and the left-paren location.
	void OMPClauseWriter::VisitOMPIfClause(OMPIfClause *C) {
	VisitOMPClauseWithPreInit(C);
	Record.push_back(C->getNameModifier());
	Record.AddSourceLocation(C->getNameModifierLoc());
	Record.AddSourceLocation(C->getColonLoc());
	Record.AddStmt(C->getCondition());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPFinalClause: the condition, then the left-paren location.
	void OMPClauseWriter::VisitOMPFinalClause(OMPFinalClause *C) {
	Record.AddStmt(C->getCondition());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPNumThreadsClause: the pre-init data, the thread-count
	/// expression and the left-paren location.
	void OMPClauseWriter::VisitOMPNumThreadsClause(OMPNumThreadsClause *C) {
	VisitOMPClauseWithPreInit(C);
	Record.AddStmt(C->getNumThreads());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPSafelenClause: the safelen expression, then the left-paren
	/// location.
	void OMPClauseWriter::VisitOMPSafelenClause(OMPSafelenClause *C) {
	Record.AddStmt(C->getSafelen());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPSimdlenClause: the simdlen expression, then the left-paren
	/// location.
	void OMPClauseWriter::VisitOMPSimdlenClause(OMPSimdlenClause *C) {
	Record.AddStmt(C->getSimdlen());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPCollapseClause: the loop-count expression, then the
	/// left-paren location.
	void OMPClauseWriter::VisitOMPCollapseClause(OMPCollapseClause *C) {
	Record.AddStmt(C->getNumForLoops());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPDefaultClause: the default kind, the left-paren location and
	/// the location of the kind keyword.
	void OMPClauseWriter::VisitOMPDefaultClause(OMPDefaultClause *C) {
	Record.push_back(C->getDefaultKind());
	Record.AddSourceLocation(C->getLParenLoc());
	Record.AddSourceLocation(C->getDefaultKindKwLoc());
	}

	/// Writes an OMPProcBindClause: the proc_bind kind, the left-paren location
	/// and the location of the kind keyword.
	void OMPClauseWriter::VisitOMPProcBindClause(OMPProcBindClause *C) {
	Record.push_back(C->getProcBindKind());
	Record.AddSourceLocation(C->getLParenLoc());
	Record.AddSourceLocation(C->getProcBindKindKwLoc());
	}

	/// Writes an OMPScheduleClause: the pre-init data, the schedule kind, both
	/// schedule modifiers and the chunk size, followed by the left-paren,
	/// first-modifier, second-modifier, kind and comma locations.
	void OMPClauseWriter::VisitOMPScheduleClause(OMPScheduleClause *C) {
	VisitOMPClauseWithPreInit(C);
	Record.push_back(C->getScheduleKind());
	Record.push_back(C->getFirstScheduleModifier());
	Record.push_back(C->getSecondScheduleModifier());
	Record.AddStmt(C->getChunkSize());
	Record.AddSourceLocation(C->getLParenLoc());
	Record.AddSourceLocation(C->getFirstScheduleModifierLoc());
	Record.AddSourceLocation(C->getSecondScheduleModifierLoc());
	Record.AddSourceLocation(C->getScheduleKindLoc());
	Record.AddSourceLocation(C->getCommaLoc());
	}

	/// Writes an OMPOrderedClause: the loop-count expression, then the left-paren
	/// location.
	void OMPClauseWriter::VisitOMPOrderedClause(OMPOrderedClause *C) {
	Record.AddStmt(C->getNumForLoops());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Empty visitor: nothing is added to the record for an OMPNowaitClause here.
	void OMPClauseWriter::VisitOMPNowaitClause(OMPNowaitClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPUntiedClause here.
	void OMPClauseWriter::VisitOMPUntiedClause(OMPUntiedClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPMergeableClause
	/// here.
	void OMPClauseWriter::VisitOMPMergeableClause(OMPMergeableClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPReadClause here.
	void OMPClauseWriter::VisitOMPReadClause(OMPReadClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPWriteClause here.
	void OMPClauseWriter::VisitOMPWriteClause(OMPWriteClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPUpdateClause here.
	void OMPClauseWriter::VisitOMPUpdateClause(OMPUpdateClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPCaptureClause here.
	void OMPClauseWriter::VisitOMPCaptureClause(OMPCaptureClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPSeqCstClause here.
	void OMPClauseWriter::VisitOMPSeqCstClause(OMPSeqCstClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPThreadsClause here.
	void OMPClauseWriter::VisitOMPThreadsClause(OMPThreadsClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPSIMDClause here.
	void OMPClauseWriter::VisitOMPSIMDClause(OMPSIMDClause *) {}

	/// Empty visitor: nothing is added to the record for an OMPNogroupClause here.
	void OMPClauseWriter::VisitOMPNogroupClause(OMPNogroupClause *) {}

	/// Writes an OMPPrivateClause: the variable count, the left-paren location,
	/// every listed variable and then every private copy.
	void OMPClauseWriter::VisitOMPPrivateClause(OMPPrivateClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());
	  // Listed variables first, their private copies second.
	  for (auto *Var : C->varlists())
	    Record.AddStmt(Var);
	  for (auto *Copy : C->private_copies())
	    Record.AddStmt(Copy);
	}

	/// Writes an OMPFirstprivateClause: the variable count, the pre-init data,
	/// the left-paren location, then three parallel lists: the variables, their
	/// private copies and their initializers.
	void OMPClauseWriter::VisitOMPFirstprivateClause(OMPFirstprivateClause *C) {
	  Record.push_back(C->varlist_size());
	  VisitOMPClauseWithPreInit(C);
	  Record.AddSourceLocation(C->getLParenLoc());
	  for (auto *Var : C->varlists())
	    Record.AddStmt(Var);
	  for (auto *Copy : C->private_copies())
	    Record.AddStmt(Copy);
	  for (auto *Init : C->inits())
	    Record.AddStmt(Init);
	}

	/// Writes an OMPLastprivateClause: the variable count, the post-update data,
	/// the left-paren location, then the variables, private copies, source
	/// expressions, destination expressions and assignment operations.
	void OMPClauseWriter::VisitOMPLastprivateClause(OMPLastprivateClause *C) {
	  Record.push_back(C->varlist_size());
	  VisitOMPClauseWithPostUpdate(C);
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *Copy : C->private_copies()) {
	    Record.AddStmt(Copy);
	  }
	  for (auto *Src : C->source_exprs()) {
	    Record.AddStmt(Src);
	  }
	  for (auto *Dst : C->destination_exprs()) {
	    Record.AddStmt(Dst);
	  }
	  for (auto *Assign : C->assignment_ops()) {
	    Record.AddStmt(Assign);
	  }
	}

	/// Writes an OMPSharedClause: the variable count, the left-paren location and
	/// every listed variable.
	void OMPClauseWriter::VisitOMPSharedClause(OMPSharedClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());
	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	}

	/// Writes an OMPReductionClause: the variable count, the post-update data,
	/// the left-paren and colon locations, the qualifier and the name info, then
	/// the variables, privates, LHS expressions, RHS expressions and reduction
	/// operations.
	void OMPClauseWriter::VisitOMPReductionClause(OMPReductionClause *C) {
	  Record.push_back(C->varlist_size());
	  VisitOMPClauseWithPostUpdate(C);
	  Record.AddSourceLocation(C->getLParenLoc());
	  Record.AddSourceLocation(C->getColonLoc());
	  Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
	  Record.AddDeclarationNameInfo(C->getNameInfo());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *Private : C->privates()) {
	    Record.AddStmt(Private);
	  }
	  for (auto *LHS : C->lhs_exprs()) {
	    Record.AddStmt(LHS);
	  }
	  for (auto *RHS : C->rhs_exprs()) {
	    Record.AddStmt(RHS);
	  }
	  for (auto *Op : C->reduction_ops()) {
	    Record.AddStmt(Op);
	  }
	}

	/// Writes an OMPTaskReductionClause: the variable count, the post-update
	/// data, the left-paren and colon locations, the qualifier and the name info,
	/// then the variables, privates, LHS expressions, RHS expressions and
	/// reduction operations.
	void OMPClauseWriter::VisitOMPTaskReductionClause(OMPTaskReductionClause *C) {
	  Record.push_back(C->varlist_size());
	  VisitOMPClauseWithPostUpdate(C);
	  Record.AddSourceLocation(C->getLParenLoc());
	  Record.AddSourceLocation(C->getColonLoc());
	  Record.AddNestedNameSpecifierLoc(C->getQualifierLoc());
	  Record.AddDeclarationNameInfo(C->getNameInfo());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *Private : C->privates()) {
	    Record.AddStmt(Private);
	  }
	  for (auto *LHS : C->lhs_exprs()) {
	    Record.AddStmt(LHS);
	  }
	  for (auto *RHS : C->rhs_exprs()) {
	    Record.AddStmt(RHS);
	  }
	  for (auto *Op : C->reduction_ops()) {
	    Record.AddStmt(Op);
	  }
	}

	/// Writes an OMPLinearClause: the variable count, the post-update data, the
	/// left-paren and colon locations, the modifier and its location, then the
	/// variables, privates, inits, updates and finals, and last the step and the
	/// calculated step.
	void OMPClauseWriter::VisitOMPLinearClause(OMPLinearClause *C) {
	  Record.push_back(C->varlist_size());
	  VisitOMPClauseWithPostUpdate(C);
	  Record.AddSourceLocation(C->getLParenLoc());
	  Record.AddSourceLocation(C->getColonLoc());
	  Record.push_back(C->getModifier());
	  Record.AddSourceLocation(C->getModifierLoc());

	  for (auto *Var : C->varlists())
	    Record.AddStmt(Var);
	  for (auto *Private : C->privates())
	    Record.AddStmt(Private);
	  for (auto *Init : C->inits())
	    Record.AddStmt(Init);
	  for (auto *Update : C->updates())
	    Record.AddStmt(Update);
	  for (auto *Final : C->finals())
	    Record.AddStmt(Final);

	  // The two step expressions close the record, after all the lists.
	  Record.AddStmt(C->getStep());
	  Record.AddStmt(C->getCalcStep());
	}

	/// Writes an OMPAlignedClause: the variable count, the left-paren and colon
	/// locations, every listed variable and finally the alignment expression.
	void OMPClauseWriter::VisitOMPAlignedClause(OMPAlignedClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());
	  Record.AddSourceLocation(C->getColonLoc());
	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  Record.AddStmt(C->getAlignment());
	}

	/// Writes an OMPCopyinClause: the variable count, the left-paren location,
	/// then the variables, source expressions, destination expressions and
	/// assignment operations.
	void OMPClauseWriter::VisitOMPCopyinClause(OMPCopyinClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *Src : C->source_exprs()) {
	    Record.AddStmt(Src);
	  }
	  for (auto *Dst : C->destination_exprs()) {
	    Record.AddStmt(Dst);
	  }
	  for (auto *Assign : C->assignment_ops()) {
	    Record.AddStmt(Assign);
	  }
	}

	/// Writes an OMPCopyprivateClause: the variable count, the left-paren
	/// location, then the variables, source expressions, destination expressions
	/// and assignment operations.
	void OMPClauseWriter::VisitOMPCopyprivateClause(OMPCopyprivateClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *Src : C->source_exprs()) {
	    Record.AddStmt(Src);
	  }
	  for (auto *Dst : C->destination_exprs()) {
	    Record.AddStmt(Dst);
	  }
	  for (auto *Assign : C->assignment_ops()) {
	    Record.AddStmt(Assign);
	  }
	}

	/// Writes an OMPFlushClause: the variable count, the left-paren location and
	/// every listed variable.
	void OMPClauseWriter::VisitOMPFlushClause(OMPFlushClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());
	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	}

	/// Writes an OMPDependClause: the variable count, the left-paren location,
	/// the dependency kind and its location, the colon location, every listed
	/// variable and finally the counter value.
	void OMPClauseWriter::VisitOMPDependClause(OMPDependClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.AddSourceLocation(C->getLParenLoc());
	  Record.push_back(C->getDependencyKind());
	  Record.AddSourceLocation(C->getDependencyLoc());
	  Record.AddSourceLocation(C->getColonLoc());
	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  Record.AddStmt(C->getCounterValue());
	}

	/// Writes an OMPDeviceClause: the device expression, then the left-paren
	/// location.
	void OMPClauseWriter::VisitOMPDeviceClause(OMPDeviceClause *C) {
	Record.AddStmt(C->getDevice());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPMapClause.
	///
	/// Four counts come first (variables, unique declarations, component lists,
	/// components), then the left-paren location, the map type modifier, the map
	/// type, the map location and the colon location. The lists follow: variables,
	/// declarations, per-declaration list counts, list sizes, and one
	/// (expression, declaration) pair per component.
	void OMPClauseWriter::VisitOMPMapClause(OMPMapClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.push_back(C->getUniqueDeclarationsNum());
	  Record.push_back(C->getTotalComponentListNum());
	  Record.push_back(C->getTotalComponentsNum());
	  Record.AddSourceLocation(C->getLParenLoc());
	  Record.push_back(C->getMapTypeModifier());
	  Record.push_back(C->getMapType());
	  Record.AddSourceLocation(C->getMapLoc());
	  Record.AddSourceLocation(C->getColonLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *VD : C->all_decls()) {
	    Record.AddDeclRef(VD);
	  }
	  for (auto NumLists : C->all_num_lists()) {
	    Record.push_back(NumLists);
	  }
	  for (auto ListSize : C->all_lists_sizes()) {
	    Record.push_back(ListSize);
	  }
	  for (auto &Component : C->all_components()) {
	    Record.AddStmt(Component.getAssociatedExpression());
	    Record.AddDeclRef(Component.getAssociatedDeclaration());
	  }
	}

	/// Writes an OMPNumTeamsClause: the pre-init data, the team-count expression
	/// and the left-paren location.
	void OMPClauseWriter::VisitOMPNumTeamsClause(OMPNumTeamsClause *C) {
	VisitOMPClauseWithPreInit(C);
	Record.AddStmt(C->getNumTeams());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPThreadLimitClause: the pre-init data, the thread-limit
	/// expression and the left-paren location.
	void OMPClauseWriter::VisitOMPThreadLimitClause(OMPThreadLimitClause *C) {
	VisitOMPClauseWithPreInit(C);
	Record.AddStmt(C->getThreadLimit());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPPriorityClause: the priority expression, then the left-paren
	/// location.
	void OMPClauseWriter::VisitOMPPriorityClause(OMPPriorityClause *C) {
	Record.AddStmt(C->getPriority());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPGrainsizeClause: the grainsize expression, then the
	/// left-paren location.
	void OMPClauseWriter::VisitOMPGrainsizeClause(OMPGrainsizeClause *C) {
	Record.AddStmt(C->getGrainsize());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPNumTasksClause: the task-count expression, then the
	/// left-paren location.
	void OMPClauseWriter::VisitOMPNumTasksClause(OMPNumTasksClause *C) {
	Record.AddStmt(C->getNumTasks());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPHintClause: the hint expression, then the left-paren
	/// location.
	void OMPClauseWriter::VisitOMPHintClause(OMPHintClause *C) {
	Record.AddStmt(C->getHint());
	Record.AddSourceLocation(C->getLParenLoc());
	}

	/// Writes an OMPDistScheduleClause: the pre-init data, the dist_schedule kind
	/// and the chunk size, followed by the left-paren, kind and comma locations.
	void OMPClauseWriter::VisitOMPDistScheduleClause(OMPDistScheduleClause *C) {
	VisitOMPClauseWithPreInit(C);
	Record.push_back(C->getDistScheduleKind());
	Record.AddStmt(C->getChunkSize());
	Record.AddSourceLocation(C->getLParenLoc());
	Record.AddSourceLocation(C->getDistScheduleKindLoc());
	Record.AddSourceLocation(C->getCommaLoc());
	}

	/// Writes an OMPDefaultmapClause: the defaultmap kind and modifier, followed
	/// by the left-paren, modifier and kind locations.
	void OMPClauseWriter::VisitOMPDefaultmapClause(OMPDefaultmapClause *C) {
	Record.push_back(C->getDefaultmapKind());
	Record.push_back(C->getDefaultmapModifier());
	Record.AddSourceLocation(C->getLParenLoc());
	Record.AddSourceLocation(C->getDefaultmapModifierLoc());
	Record.AddSourceLocation(C->getDefaultmapKindLoc());
	}

	/// Writes an OMPToClause: four counts (variables, unique declarations,
	/// component lists, components), the left-paren location, then the variables,
	/// declarations, per-declaration list counts, list sizes, and one
	/// (expression, declaration) pair per component.
	void OMPClauseWriter::VisitOMPToClause(OMPToClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.push_back(C->getUniqueDeclarationsNum());
	  Record.push_back(C->getTotalComponentListNum());
	  Record.push_back(C->getTotalComponentsNum());
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *VD : C->all_decls()) {
	    Record.AddDeclRef(VD);
	  }
	  for (auto NumLists : C->all_num_lists()) {
	    Record.push_back(NumLists);
	  }
	  for (auto ListSize : C->all_lists_sizes()) {
	    Record.push_back(ListSize);
	  }
	  for (auto &Component : C->all_components()) {
	    Record.AddStmt(Component.getAssociatedExpression());
	    Record.AddDeclRef(Component.getAssociatedDeclaration());
	  }
	}

	/// Writes an OMPFromClause: four counts (variables, unique declarations,
	/// component lists, components), the left-paren location, then the variables,
	/// declarations, per-declaration list counts, list sizes, and one
	/// (expression, declaration) pair per component.
	void OMPClauseWriter::VisitOMPFromClause(OMPFromClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.push_back(C->getUniqueDeclarationsNum());
	  Record.push_back(C->getTotalComponentListNum());
	  Record.push_back(C->getTotalComponentsNum());
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *VD : C->all_decls()) {
	    Record.AddDeclRef(VD);
	  }
	  for (auto NumLists : C->all_num_lists()) {
	    Record.push_back(NumLists);
	  }
	  for (auto ListSize : C->all_lists_sizes()) {
	    Record.push_back(ListSize);
	  }
	  for (auto &Component : C->all_components()) {
	    Record.AddStmt(Component.getAssociatedExpression());
	    Record.AddDeclRef(Component.getAssociatedDeclaration());
	  }
	}

	/// Writes an OMPUseDevicePtrClause: four counts (variables, unique
	/// declarations, component lists, components), the left-paren location, then
	/// the variables, private copies, inits, declarations, per-declaration list
	/// counts, list sizes, and one (expression, declaration) pair per component.
	void OMPClauseWriter::VisitOMPUseDevicePtrClause(OMPUseDevicePtrClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.push_back(C->getUniqueDeclarationsNum());
	  Record.push_back(C->getTotalComponentListNum());
	  Record.push_back(C->getTotalComponentsNum());
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *Copy : C->private_copies()) {
	    Record.AddStmt(Copy);
	  }
	  for (auto *Init : C->inits()) {
	    Record.AddStmt(Init);
	  }
	  for (auto *VD : C->all_decls()) {
	    Record.AddDeclRef(VD);
	  }
	  for (auto NumLists : C->all_num_lists()) {
	    Record.push_back(NumLists);
	  }
	  for (auto ListSize : C->all_lists_sizes()) {
	    Record.push_back(ListSize);
	  }
	  for (auto &Component : C->all_components()) {
	    Record.AddStmt(Component.getAssociatedExpression());
	    Record.AddDeclRef(Component.getAssociatedDeclaration());
	  }
	}

	/// Writes an OMPIsDevicePtrClause: four counts (variables, unique
	/// declarations, component lists, components), the left-paren location, then
	/// the variables, declarations, per-declaration list counts, list sizes, and
	/// one (expression, declaration) pair per component.
	void OMPClauseWriter::VisitOMPIsDevicePtrClause(OMPIsDevicePtrClause *C) {
	  Record.push_back(C->varlist_size());
	  Record.push_back(C->getUniqueDeclarationsNum());
	  Record.push_back(C->getTotalComponentListNum());
	  Record.push_back(C->getTotalComponentsNum());
	  Record.AddSourceLocation(C->getLParenLoc());

	  for (auto *Var : C->varlists()) {
	    Record.AddStmt(Var);
	  }
	  for (auto *VD : C->all_decls()) {
	    Record.AddDeclRef(VD);
	  }
	  for (auto NumLists : C->all_num_lists()) {
	    Record.push_back(NumLists);
	  }
	  for (auto ListSize : C->all_lists_sizes()) {
	    Record.push_back(ListSize);
	  }
	  for (auto &Component : C->all_components()) {
	    Record.AddStmt(Component.getAssociatedExpression());
	    Record.AddDeclRef(Component.getAssociatedDeclaration());
	  }
	}

	//===----------------------------------------------------------------------===//
	// OpenMP Directives.
	//===----------------------------------------------------------------------===//
	/// Writes the data common to OpenMP executable directives: the start and end
	/// locations, every clause (through OMPClauseWriter::writeClause) and, when
	/// the directive has one, the associated statement. It neither calls
	/// VisitStmt nor sets the record code; the directive visitors in this file
	/// do that around it.
	void ASTStmtWriter::VisitOMPExecutableDirective(OMPExecutableDirective *E) {
	  Record.AddSourceLocation(E->getLocStart());
	  Record.AddSourceLocation(E->getLocEnd());

	  OMPClauseWriter ClauseWriter(Record);
	  for (unsigned Idx = 0, NumClauses = E->getNumClauses(); Idx != NumClauses;
	       ++Idx)
	    ClauseWriter.writeClause(E->getClause(Idx));

	  if (E->hasAssociatedStmt())
	    Record.AddStmt(E->getAssociatedStmt());
	}

	/// Writes the state shared by the OpenMP loop directives.
	///
	/// Record order: VisitStmt(D), the clause count, the collapsed loop count, the
	/// VisitOMPExecutableDirective(D) data, the loop helper expressions that are
	/// always written, two optional helper groups selected by the directive kind,
	/// and finally the counters, private counters, inits, updates and finals.
	/// The record code is not set here; each directive-specific visitor that
	/// calls this function assigns it afterwards.
	void ASTStmtWriter::VisitOMPLoopDirective(OMPLoopDirective *D) {
	  VisitStmt(D);
	  Record.push_back(D->getNumClauses());
	  Record.push_back(D->getCollapsedNumber());
	  VisitOMPExecutableDirective(D);

	  // Helpers written for every loop directive.
	  Record.AddStmt(D->getIterationVariable());
	  Record.AddStmt(D->getLastIteration());
	  Record.AddStmt(D->getCalcLastIteration());
	  Record.AddStmt(D->getPreCond());
	  Record.AddStmt(D->getCond());
	  Record.AddStmt(D->getInit());
	  Record.AddStmt(D->getInc());
	  Record.AddStmt(D->getPreInits());

	  const auto DKind = D->getDirectiveKind();

	  // Bound and stride helpers, written only for worksharing, taskloop and
	  // distribute directives.
	  if (isOpenMPWorksharingDirective(DKind) || isOpenMPTaskLoopDirective(DKind) ||
	      isOpenMPDistributeDirective(DKind)) {
	    Record.AddStmt(D->getIsLastIterVariable());
	    Record.AddStmt(D->getLowerBoundVariable());
	    Record.AddStmt(D->getUpperBoundVariable());
	    Record.AddStmt(D->getStrideVariable());
	    Record.AddStmt(D->getEnsureUpperBound());
	    Record.AddStmt(D->getNextLowerBound());
	    Record.AddStmt(D->getNextUpperBound());
	    Record.AddStmt(D->getNumIterations());
	  }

	  // Previous-bound and combined helpers, written only for loop-bound-sharing
	  // directives.
	  if (isOpenMPLoopBoundSharingDirective(DKind)) {
	    Record.AddStmt(D->getPrevLowerBoundVariable());
	    Record.AddStmt(D->getPrevUpperBoundVariable());
	    Record.AddStmt(D->getDistInc());
	    Record.AddStmt(D->getPrevEnsureUpperBound());
	    Record.AddStmt(D->getCombinedLowerBoundVariable());
	    Record.AddStmt(D->getCombinedUpperBoundVariable());
	    Record.AddStmt(D->getCombinedEnsureUpperBound());
	    Record.AddStmt(D->getCombinedInit());
	    Record.AddStmt(D->getCombinedCond());
	    Record.AddStmt(D->getCombinedNextLowerBound());
	    Record.AddStmt(D->getCombinedNextUpperBound());
	  }

	  // Expression lists, one group after another.
	  for (auto Counter : D->counters())
	    Record.AddStmt(Counter);
	  for (auto PrivateCounter : D->private_counters())
	    Record.AddStmt(PrivateCounter);
	  for (auto Init : D->inits())
	    Record.AddStmt(Init);
	  for (auto Update : D->updates())
	    Record.AddStmt(Update);
	  for (auto Final : D->finals())
	    Record.AddStmt(Final);
	}

	/// Writes an OMPParallelDirective: VisitStmt(D), the clause count, the
	/// VisitOMPExecutableDirective(D) data and a 0/1 flag for hasCancel(). The
	/// record is tagged STMT_OMP_PARALLEL_DIRECTIVE.
	void ASTStmtWriter::VisitOMPParallelDirective(OMPParallelDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_PARALLEL_DIRECTIVE;
	}

	/// Writes an OMPSimdDirective: all data comes from VisitOMPLoopDirective(D);
	/// this only sets the record code to STMT_OMP_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPSimdDirective(OMPSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_SIMD_DIRECTIVE;
	}

	/// Writes an OMPForDirective: the VisitOMPLoopDirective(D) data followed by a
	/// 0/1 flag for hasCancel(). The record is tagged STMT_OMP_FOR_DIRECTIVE.
	void ASTStmtWriter::VisitOMPForDirective(OMPForDirective *D) {
	VisitOMPLoopDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_FOR_DIRECTIVE;
	}

	/// Writes an OMPForSimdDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_FOR_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPForSimdDirective(OMPForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_FOR_SIMD_DIRECTIVE;
	}

	/// Writes an OMPSectionsDirective: VisitStmt(D), the clause count, the
	/// VisitOMPExecutableDirective(D) data and a 0/1 flag for hasCancel(). The
	/// record is tagged STMT_OMP_SECTIONS_DIRECTIVE.
	void ASTStmtWriter::VisitOMPSectionsDirective(OMPSectionsDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_SECTIONS_DIRECTIVE;
	}

	/// Writes an OMPSectionDirective: VisitStmt(D), the
	/// VisitOMPExecutableDirective(D) data and a 0/1 flag for hasCancel(). No
	/// clause count is written for this directive. The record is tagged
	/// STMT_OMP_SECTION_DIRECTIVE.
	void ASTStmtWriter::VisitOMPSectionDirective(OMPSectionDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_SECTION_DIRECTIVE;
	}

	/// Writes an OMPSingleDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_SINGLE_DIRECTIVE.
	void ASTStmtWriter::VisitOMPSingleDirective(OMPSingleDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_SINGLE_DIRECTIVE;
	}

	/// Writes an OMPMasterDirective: VisitStmt(D) and the
	/// VisitOMPExecutableDirective(D) data, with no clause count. The record is
	/// tagged STMT_OMP_MASTER_DIRECTIVE.
	void ASTStmtWriter::VisitOMPMasterDirective(OMPMasterDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_MASTER_DIRECTIVE;
	}

	/// Writes an OMPCriticalDirective: VisitStmt(D), the clause count, the
	/// VisitOMPExecutableDirective(D) data and the directive name info. The
	/// record is tagged STMT_OMP_CRITICAL_DIRECTIVE.
	void ASTStmtWriter::VisitOMPCriticalDirective(OMPCriticalDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.AddDeclarationNameInfo(D->getDirectiveName());
	Code = serialization::STMT_OMP_CRITICAL_DIRECTIVE;
	}

	/// Writes an OMPParallelForDirective: the VisitOMPLoopDirective(D) data
	/// followed by a 0/1 flag for hasCancel(). The record is tagged
	/// STMT_OMP_PARALLEL_FOR_DIRECTIVE.
	void ASTStmtWriter::VisitOMPParallelForDirective(OMPParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_PARALLEL_FOR_DIRECTIVE;
	}

	/// Writes an OMPParallelForSimdDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_PARALLEL_FOR_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPParallelForSimdDirective(
	OMPParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_PARALLEL_FOR_SIMD_DIRECTIVE;
	}

	/// Writes an OMPParallelSectionsDirective: VisitStmt(D), the clause count,
	/// the VisitOMPExecutableDirective(D) data and a 0/1 flag for hasCancel().
	/// The record is tagged STMT_OMP_PARALLEL_SECTIONS_DIRECTIVE.
	void ASTStmtWriter::VisitOMPParallelSectionsDirective(
	OMPParallelSectionsDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_PARALLEL_SECTIONS_DIRECTIVE;
	}

	/// Writes an OMPTaskDirective: VisitStmt(D), the clause count, the
	/// VisitOMPExecutableDirective(D) data and a 0/1 flag for hasCancel(). The
	/// record is tagged STMT_OMP_TASK_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTaskDirective(OMPTaskDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_TASK_DIRECTIVE;
	}

	/// Writes an OMPAtomicDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data, then the X, V, Expr and UpdateExpr
	/// expressions and 0/1 flags for isXLHSInRHSPart() and isPostfixUpdate().
	/// The record is tagged STMT_OMP_ATOMIC_DIRECTIVE.
	void ASTStmtWriter::VisitOMPAtomicDirective(OMPAtomicDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.AddStmt(D->getX());
	Record.AddStmt(D->getV());
	Record.AddStmt(D->getExpr());
	Record.AddStmt(D->getUpdateExpr());
	Record.push_back(D->isXLHSInRHSPart() ? 1 : 0);
	Record.push_back(D->isPostfixUpdate() ? 1 : 0);
	Code = serialization::STMT_OMP_ATOMIC_DIRECTIVE;
	}

	/// Writes an OMPTargetDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TARGET_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetDirective(OMPTargetDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_DIRECTIVE;
	}

	/// Writes an OMPTargetDataDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TARGET_DATA_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetDataDirective(OMPTargetDataDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_DATA_DIRECTIVE;
	}

	/// Writes an OMPTargetEnterDataDirective: VisitStmt(D), the clause count and
	/// the VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetEnterDataDirective(
	OMPTargetEnterDataDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_ENTER_DATA_DIRECTIVE;
	}

	/// Writes an OMPTargetExitDataDirective: VisitStmt(D), the clause count and
	/// the VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetExitDataDirective(
	OMPTargetExitDataDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_EXIT_DATA_DIRECTIVE;
	}

	/// Writes an OMPTargetParallelDirective: VisitStmt(D), the clause count and
	/// the VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TARGET_PARALLEL_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetParallelDirective(
	OMPTargetParallelDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_PARALLEL_DIRECTIVE;
	}

	/// Writes an OMPTargetParallelForDirective: the VisitOMPLoopDirective(D) data
	/// followed by a 0/1 flag for hasCancel(). The record is tagged
	/// STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetParallelForDirective(
	OMPTargetParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	Record.push_back(D->hasCancel() ? 1 : 0);
	Code = serialization::STMT_OMP_TARGET_PARALLEL_FOR_DIRECTIVE;
	}

	/// Writes an OMPTaskyieldDirective: VisitStmt(D) and the
	/// VisitOMPExecutableDirective(D) data, with no clause count. The record is
	/// tagged STMT_OMP_TASKYIELD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTaskyieldDirective(OMPTaskyieldDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TASKYIELD_DIRECTIVE;
	}

	/// Writes an OMPBarrierDirective: VisitStmt(D) and the
	/// VisitOMPExecutableDirective(D) data, with no clause count. The record is
	/// tagged STMT_OMP_BARRIER_DIRECTIVE.
	void ASTStmtWriter::VisitOMPBarrierDirective(OMPBarrierDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_BARRIER_DIRECTIVE;
	}

	/// Writes an OMPTaskwaitDirective: VisitStmt(D) and the
	/// VisitOMPExecutableDirective(D) data, with no clause count. The record is
	/// tagged STMT_OMP_TASKWAIT_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTaskwaitDirective(OMPTaskwaitDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TASKWAIT_DIRECTIVE;
	}

	/// Writes an OMPTaskgroupDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TASKGROUP_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTaskgroupDirective(OMPTaskgroupDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TASKGROUP_DIRECTIVE;
	}

	/// Writes an OMPFlushDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_FLUSH_DIRECTIVE.
	void ASTStmtWriter::VisitOMPFlushDirective(OMPFlushDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_FLUSH_DIRECTIVE;
	}

	/// Writes an OMPOrderedDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_ORDERED_DIRECTIVE.
	void ASTStmtWriter::VisitOMPOrderedDirective(OMPOrderedDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_ORDERED_DIRECTIVE;
	}

	/// Writes an OMPTeamsDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TEAMS_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTeamsDirective(OMPTeamsDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TEAMS_DIRECTIVE;
	}

	/// Writes an OMPCancellationPointDirective: VisitStmt(D), the
	/// VisitOMPExecutableDirective(D) data and the cancel region, with no clause
	/// count. The record is tagged STMT_OMP_CANCELLATION_POINT_DIRECTIVE.
	void ASTStmtWriter::VisitOMPCancellationPointDirective(
	OMPCancellationPointDirective *D) {
	VisitStmt(D);
	VisitOMPExecutableDirective(D);
	Record.push_back(D->getCancelRegion());
	Code = serialization::STMT_OMP_CANCELLATION_POINT_DIRECTIVE;
	}

	/// Writes an OMPCancelDirective: VisitStmt(D), the clause count, the
	/// VisitOMPExecutableDirective(D) data and the cancel region. The record is
	/// tagged STMT_OMP_CANCEL_DIRECTIVE.
	void ASTStmtWriter::VisitOMPCancelDirective(OMPCancelDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Record.push_back(D->getCancelRegion());
	Code = serialization::STMT_OMP_CANCEL_DIRECTIVE;
	}

	/// Writes an OMPTaskLoopDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_TASKLOOP_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTaskLoopDirective(OMPTaskLoopDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TASKLOOP_DIRECTIVE;
	}

	/// Writes an OMPTaskLoopSimdDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_TASKLOOP_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTaskLoopSimdDirective(OMPTaskLoopSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TASKLOOP_SIMD_DIRECTIVE;
	}

	/// Writes an OMPDistributeDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_DISTRIBUTE_DIRECTIVE.
	void ASTStmtWriter::VisitOMPDistributeDirective(OMPDistributeDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_DISTRIBUTE_DIRECTIVE;
	}

	/// Writes an OMPTargetUpdateDirective: VisitStmt(D), the clause count and the
	/// VisitOMPExecutableDirective(D) data. The record is tagged
	/// STMT_OMP_TARGET_UPDATE_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetUpdateDirective(OMPTargetUpdateDirective *D) {
	VisitStmt(D);
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_UPDATE_DIRECTIVE;
	}

	/// Writes an OMPDistributeParallelForDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE.
	void ASTStmtWriter::VisitOMPDistributeParallelForDirective(
	OMPDistributeParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE;
	}

	/// Writes an OMPDistributeParallelForSimdDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPDistributeParallelForSimdDirective(
	OMPDistributeParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE;
	}

	/// Writes an OMPDistributeSimdDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPDistributeSimdDirective(
	OMPDistributeSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_DISTRIBUTE_SIMD_DIRECTIVE;
	}

	/// Writes an OMPTargetParallelForSimdDirective: all data comes from
	/// VisitOMPLoopDirective(D); this only sets the record code to
	/// STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetParallelForSimdDirective(
	OMPTargetParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TARGET_PARALLEL_FOR_SIMD_DIRECTIVE;
	}

	/// Write an OMPTargetSimdDirective: delegates to VisitOMPLoopDirective and
	/// sets the record code to STMT_OMP_TARGET_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetSimdDirective(OMPTargetSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TARGET_SIMD_DIRECTIVE;
	}

	/// Write an OMPTeamsDistributeDirective: delegates to VisitOMPLoopDirective
	/// and sets the record code to STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTeamsDistributeDirective(
	OMPTeamsDistributeDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TEAMS_DISTRIBUTE_DIRECTIVE;
	}

	/// Write an OMPTeamsDistributeSimdDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTeamsDistributeSimdDirective(
	OMPTeamsDistributeSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE;
	}

	/// Write an OMPTeamsDistributeParallelForSimdDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTeamsDistributeParallelForSimdDirective(
	OMPTeamsDistributeParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE;
	}

	/// Write an OMPTeamsDistributeParallelForDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTeamsDistributeParallelForDirective(
	OMPTeamsDistributeParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE;
	}

	/// Write an OMPTargetTeamsDirective: calls VisitStmt, appends the clause
	/// count to Record, calls VisitOMPExecutableDirective, and sets the record
	/// code to STMT_OMP_TARGET_TEAMS_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetTeamsDirective(OMPTargetTeamsDirective *D) {
	VisitStmt(D);
	// The clause count is pushed before the executable-directive data.
	Record.push_back(D->getNumClauses());
	VisitOMPExecutableDirective(D);
	Code = serialization::STMT_OMP_TARGET_TEAMS_DIRECTIVE;
	}

	/// Write an OMPTargetTeamsDistributeDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TARGET_TEAMS_DISTRIBUTE_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetTeamsDistributeDirective(
	OMPTargetTeamsDistributeDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TARGET_TEAMS_DISTRIBUTE_DIRECTIVE;
	}

	/// Write an OMPTargetTeamsDistributeParallelForDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetTeamsDistributeParallelForDirective(
	OMPTargetTeamsDistributeParallelForDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_DIRECTIVE;
	}

	/// Write an OMPTargetTeamsDistributeParallelForSimdDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetTeamsDistributeParallelForSimdDirective(
	OMPTargetTeamsDistributeParallelForSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::
	STMT_OMP_TARGET_TEAMS_DISTRIBUTE_PARALLEL_FOR_SIMD_DIRECTIVE;
	}

	/// Write an OMPTargetTeamsDistributeSimdDirective: delegates to
	/// VisitOMPLoopDirective and sets the record code to
	/// STMT_OMP_TARGET_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE.
	void ASTStmtWriter::VisitOMPTargetTeamsDistributeSimdDirective(
	OMPTargetTeamsDistributeSimdDirective *D) {
	VisitOMPLoopDirective(D);
	Code = serialization::STMT_OMP_TARGET_TEAMS_DISTRIBUTE_SIMD_DIRECTIVE;
	}

	//===----------------------------------------------------------------------===//
	// ASTWriter Implementation
	//===----------------------------------------------------------------------===//

	/// Assign the next free ID to the switch case \p S and return it.
	///
	/// The ID is the number of entries held in SwitchCaseIDs before \p S is
	/// added. \p S must not have been recorded already (checked by assertion).
	unsigned ASTWriter::RecordSwitchCaseID(SwitchCase *S) {
	  assert(SwitchCaseIDs.count(S) == 0 && "SwitchCase recorded twice");

	  // Read the size before inserting so the new entry does not count itself.
	  const unsigned AssignedID = SwitchCaseIDs.size();
	  SwitchCaseIDs[S] = AssignedID;
	  return AssignedID;
	}

	/// Return the ID stored in SwitchCaseIDs for the switch case \p S.
	///
	/// \p S is expected to already have an entry (checked by assertion).
	unsigned ASTWriter::getSwitchCaseID(SwitchCase *S) {
	  assert(SwitchCaseIDs.count(S) != 0 && "SwitchCase hasn't been seen yet");

	  unsigned RecordedID = SwitchCaseIDs[S];
	  return RecordedID;
	}

	/// Remove every entry from SwitchCaseIDs.
	void ASTWriter::ClearSwitchCaseIDs() { SwitchCaseIDs.clear(); }

	/// \brief Write the given substatement or subexpression to the
	/// bitstream.
	///
	/// A null \p S is emitted as a STMT_NULL_PTR record. A statement that already
	/// has an entry in SubStmtEntries is emitted as a STMT_REF_PTR record carrying
	/// the stored value instead of being visited again. Otherwise the statement
	/// is visited, and the value returned by Emit() is stored in SubStmtEntries
	/// under \p S. NumStatements is incremented on every call, whichever path is
	/// taken.
	void ASTWriter::WriteSubStmt(Stmt *S) {
	  RecordData Record;
	  ASTStmtWriter Writer(*this, Record);
	  ++NumStatements;

	  if (!S) {
	    Stream.EmitRecord(serialization::STMT_NULL_PTR, Record);
	    return;
	  }

	  llvm::DenseMap<Stmt *, uint64_t>::iterator I = SubStmtEntries.find(S);
	  if (I != SubStmtEntries.end()) {
	    Record.push_back(I->second);
	    Stream.EmitRecord(serialization::STMT_REF_PTR, Record);
	    return;
	  }

	#ifndef NDEBUG
	  assert(!ParentStmts.count(S) && "There is a Stmt cycle!");

	  // Inserts S into ParentStmts on construction and erases it again on
	  // destruction, so S is in the set while it is being visited below.
	  struct ParentStmtInserterRAII {
	    Stmt *S;
	    llvm::DenseSet<Stmt *> &ParentStmts;

	    // The parameter types must match the members above: a Stmt pointer and
	    // a set of Stmt pointers.
	    ParentStmtInserterRAII(Stmt *S, llvm::DenseSet<Stmt *> &ParentStmts)
	        : S(S), ParentStmts(ParentStmts) {
	      ParentStmts.insert(S);
	    }
	    ~ParentStmtInserterRAII() {
	      ParentStmts.erase(S);
	    }
	  };

	  ParentStmtInserterRAII ParentStmtInserter(S, ParentStmts);
	#endif

	  Writer.Visit(S);

	  uint64_t Offset = Writer.Emit();
	  SubStmtEntries[S] = Offset;
	}

	/// \brief Flush all of the statements that have been added to the
	/// queue via AddStmt().
	///
	/// Each entry of StmtsToEmit is written with WriteSubStmt, followed by a
	/// STMT_STOP record; the writer's SubStmtEntries and ParentStmts are cleared
	/// after each entry. StmtsToEmit is emptied at the end.
	void ASTRecordWriter::FlushStmts() {
	  // Nothing else is expected to use the two temporary statement maps, so
	  // both have to be empty when we start.
	  assert(Writer->SubStmtEntries.empty() && "unexpected entries in sub-stmt map");
	  assert(Writer->ParentStmts.empty() && "unexpected entries in parent stmt map");

	  const unsigned NumQueued = StmtsToEmit.size();
	  unsigned Idx = 0;
	  while (Idx != NumQueued) {
	    Writer->WriteSubStmt(StmtsToEmit[Idx]);

	    assert(NumQueued == StmtsToEmit.size() &&
	           "record modified while being written!");

	    // This marks the end of a full expression: whatever expression records
	    // come next belong to a different expression.
	    Writer->Stream.EmitRecord(serialization::STMT_STOP, ArrayRef<uint32_t>());

	    Writer->SubStmtEntries.clear();
	    Writer->ParentStmts.clear();
	    ++Idx;
	  }

	  StmtsToEmit.clear();
	}

	/// Write the queued substatements of a nested statement, then empty the
	/// queue.
	///
	/// The entries of StmtsToEmit are written last-to-first (so that a simple
	/// stack machine can be used when loading), and no STMT_STOP record is
	/// emitted after each one.
	void ASTRecordWriter::FlushSubStmts() {
	  const unsigned NumQueued = StmtsToEmit.size();

	  // Count down from the back of the queue; Remaining - 1 is the index of the
	  // next entry to write.
	  for (unsigned Remaining = NumQueued; Remaining != 0; --Remaining) {
	    Writer->WriteSubStmt(StmtsToEmit[Remaining - 1]);
	    assert(NumQueued == StmtsToEmit.size() &&
	           "record modified while being written!");
	  }

	  StmtsToEmit.clear();
	}
	Index: head/contrib/llvm/tools/clang
	===================================================================
	--- head/contrib/llvm/tools/clang (revision 322319)
	+++ head/contrib/llvm/tools/clang (revision 322320)

	Property changes on: head/contrib/llvm/tools/clang
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/clang/dist:r321713-322299
	Index: head/contrib/llvm/tools/lld
	===================================================================
	--- head/contrib/llvm/tools/lld (revision 322319)
	+++ head/contrib/llvm/tools/lld (revision 322320)

	Property changes on: head/contrib/llvm/tools/lld
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lld/dist:r321713-322299
	Index: head/contrib/llvm/tools/lldb
	===================================================================
	--- head/contrib/llvm/tools/lldb (revision 322319)
	+++ head/contrib/llvm/tools/lldb (revision 322320)

	Property changes on: head/contrib/llvm/tools/lldb
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/lldb/dist:r321713-322300
	Index: head/contrib/llvm
	===================================================================
	--- head/contrib/llvm (revision 322319)
	+++ head/contrib/llvm (revision 322320)

	Property changes on: head/contrib/llvm
	___________________________________________________________________
	Modified: svn:mergeinfo
	## -0,0 +0,1 ##
	Merged /vendor/llvm/dist:r321702-322299
	Index: head/lib/clang/include/clang/Basic/Version.inc
	===================================================================
	--- head/lib/clang/include/clang/Basic/Version.inc (revision 322319)
	+++ head/lib/clang/include/clang/Basic/Version.inc (revision 322320)
	@@ -1,11 +1,11 @@
	/* $FreeBSD$ */

	#define CLANG_VERSION 5.0.0
	#define CLANG_VERSION_STRING "5.0.0"
	#define CLANG_VERSION_MAJOR 5
	#define CLANG_VERSION_MINOR 0
	#define CLANG_VERSION_PATCHLEVEL 0

	#define CLANG_VENDOR "FreeBSD "

	-#define SVN_REVISION "309439"
	+#define SVN_REVISION "310316"
	Index: head/lib/clang/include/lld/Config/Version.inc
	===================================================================
	--- head/lib/clang/include/lld/Config/Version.inc (revision 322319)
	+++ head/lib/clang/include/lld/Config/Version.inc (revision 322320)
	@@ -1,8 +1,8 @@
	// $FreeBSD$

	#define LLD_VERSION 5.0.0
	#define LLD_VERSION_STRING "5.0.0"
	#define LLD_VERSION_MAJOR 5
	#define LLD_VERSION_MINOR 0
	-#define LLD_REVISION_STRING "309439"
	+#define LLD_REVISION_STRING "310316"
	#define LLD_REPOSITORY_STRING "FreeBSD"
	Index: head/lib/clang/include/llvm/Support/VCSRevision.h
	===================================================================
	--- head/lib/clang/include/llvm/Support/VCSRevision.h (revision 322319)
	+++ head/lib/clang/include/llvm/Support/VCSRevision.h (revision 322320)
	@@ -1,2 +1,2 @@
	/* $FreeBSD$ */
	-#define LLVM_REVISION "svn-r309439"
	+#define LLVM_REVISION "svn-r310316"

File Metadata

Mime Type: application/octet-stream
Expires: Tue, May 7, 11:13 AM (1 d, 23 h)
Storage Engine: chunks
Storage Format: Chunks
Storage Handle: 0enYZS_TojWt
Default Alt Text: (6 MB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions